From 9c06e71945215c7c6f91e3dc4a81e79a1fddf237 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 17 Dec 2018 21:29:46 +0100 Subject: [PATCH 001/130] Created tnl-benchmark-traversers. --- src/Benchmarks/CMakeLists.txt | 1 + src/Benchmarks/Traversers/CMakeLists.txt | 9 ++ .../Traversers/tnl-benchmark-traversers.cpp | 11 ++ .../Traversers/tnl-benchmark-traversers.cu | 11 ++ .../Traversers/tnl-benchmark-traversers.h | 102 ++++++++++++++++++ src/Benchmarks/scripts/cuda-profiler.conf | 7 -- .../scripts/process-cuda-profile.pl | 42 -------- 7 files changed, 134 insertions(+), 49 deletions(-) create mode 100644 src/Benchmarks/Traversers/CMakeLists.txt create mode 100644 src/Benchmarks/Traversers/tnl-benchmark-traversers.cpp create mode 100644 src/Benchmarks/Traversers/tnl-benchmark-traversers.cu create mode 100644 src/Benchmarks/Traversers/tnl-benchmark-traversers.h delete mode 100644 src/Benchmarks/scripts/cuda-profiler.conf delete mode 100644 src/Benchmarks/scripts/process-cuda-profile.pl diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt index e0637205f..d4c2258c9 100644 --- a/src/Benchmarks/CMakeLists.txt +++ b/src/Benchmarks/CMakeLists.txt @@ -3,6 +3,7 @@ add_subdirectory( BLAS ) add_subdirectory( SpMV ) add_subdirectory( DistSpMV ) add_subdirectory( LinearSolvers ) +add_subdirectory( Traversers ) set( headers Benchmarks.h diff --git a/src/Benchmarks/Traversers/CMakeLists.txt b/src/Benchmarks/Traversers/CMakeLists.txt new file mode 100644 index 000000000..b58c7d66f --- /dev/null +++ b/src/Benchmarks/Traversers/CMakeLists.txt @@ -0,0 +1,9 @@ +if( BUILD_CUDA ) + CUDA_ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cu ) + TARGET_LINK_LIBRARIES( tnl-benchmark-traversers tnl ${CUDA_cusparse_LIBRARY} ) +else() + ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cpp ) + TARGET_LINK_LIBRARIES( tnl-benchmark-traversers tnl ) +endif() + +install( TARGETS tnl-benchmark-traversers RUNTIME 
DESTINATION bin ) diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.cpp b/src/Benchmarks/Traversers/tnl-benchmark-traversers.cpp new file mode 100644 index 000000000..cf69b41dd --- /dev/null +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.cpp @@ -0,0 +1,11 @@ +/*************************************************************************** + tnl-benchmark-traversers.cpp - description + ------------------- + begin : Dec 17, 2018 + copyright : (C) 2018 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include "tnl-benchmark-traversers.h" \ No newline at end of file diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.cu b/src/Benchmarks/Traversers/tnl-benchmark-traversers.cu new file mode 100644 index 000000000..614b0d200 --- /dev/null +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.cu @@ -0,0 +1,11 @@ +/*************************************************************************** + tnl-benchmark-traversers.cu - description + ------------------- + begin : Dec 17, 2018 + copyright : (C) 2018 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include "tnl-benchmark-traversers.h" \ No newline at end of file diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h new file mode 100644 index 000000000..9d1af1ec9 --- /dev/null +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -0,0 +1,102 @@ +/*************************************************************************** + tnl-benchmark-traversers.h - description + ------------------- + begin : Dec 17, 2018 + copyright : (C) 2018 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ 
+ +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include "../Benchmarks.h" + +#include +#include +#include +#include + +using namespace TNL; +using namespace TNL::Benchmarks; + +void setupConfig( Config::ConfigDescription& config ) +{ + config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log"); + config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); + config.addEntryEnum( "append" ); + config.addEntryEnum( "overwrite" ); + config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); + config.addEntryEnum( "float" ); + config.addEntryEnum( "double" ); + config.addEntryEnum( "all" ); + config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 ); + config.addEntry< std::size_t >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 ); + config.addEntry< std::size_t >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 ); + config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 ); + config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); + config.addEntry< int >( "verbose", "Verbose mode.", 1 ); + + config.addDelimiter( "Device settings:" ); + Devices::Host::configSetup( config ); + Devices::Cuda::configSetup( config ); +} + +int main( int argc, char* argv[] ) +{ + Config::ConfigDescription config; + Config::ParameterContainer parameters; + + setupConfig( config ); + if( ! parseCommandLine( argc, argv, config, parameters ) ) { + config.printUsage( argv[ 0 ] ); + return EXIT_FAILURE; + } + + if( ! Devices::Host::setup( parameters ) || + ! 
Devices::Cuda::setup( parameters ) ) + return EXIT_FAILURE; + + const String & logFileName = parameters.getParameter< String >( "log-file" ); + const String & outputMode = parameters.getParameter< String >( "output-mode" ); + const String & precision = parameters.getParameter< String >( "precision" ); + // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(), + // which have a default value. The workaround below works for int values, but it is not possible + // to pass 64-bit integer values + // const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" ); + // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); + const int dimension = parameters.getParameter< int >( "dimension" ); + const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" ); + const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); + const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" ); + const unsigned loops = parameters.getParameter< unsigned >( "loops" ); + const unsigned verbose = parameters.getParameter< unsigned >( "verbose" ); + + bool status( false ); + if( ! 
dimension ) + { + status = performBenchmark< 1 >( parameters ); + status |= performBenchmark< 2 >( parameters ); + status |= performBenchmark< 3 >( parameters ); + } + else + { + switch( dimension ) + { + case 1: + status = performBenchmark< 1 >( parameters ); + break; + case 2: + status = performBenchmark< 2 >( parameters ); + break; + case 3: + status = performBenchmark< 3 >( parameters ); + break; + } + } + if( status == false ) + return EXIT_FAILURE; + return EXIT_SUCCES; +} \ No newline at end of file diff --git a/src/Benchmarks/scripts/cuda-profiler.conf b/src/Benchmarks/scripts/cuda-profiler.conf deleted file mode 100644 index 8ff91fe3b..000000000 --- a/src/Benchmarks/scripts/cuda-profiler.conf +++ /dev/null @@ -1,7 +0,0 @@ -== cuda-kernel.conf == -timestamp -threadblocksize -l1_global_load_hit -l1_global_load_miss -gld_incoherent -gst_incoherent \ No newline at end of file diff --git a/src/Benchmarks/scripts/process-cuda-profile.pl b/src/Benchmarks/scripts/process-cuda-profile.pl deleted file mode 100644 index 187623da9..000000000 --- a/src/Benchmarks/scripts/process-cuda-profile.pl +++ /dev/null @@ -1,42 +0,0 @@ -open( INPUT, "$ARGV[0]" ) - or die "Can not open file $ARGV[ 0 ]"; -$blockSize = 0; -$testNumber = 0; -while( $line = ) -{ - if( $line =~ m/.*sparseCSRMatrixVectorProductKernel.*threadblocksize=\[ (.*), 1, 1 \] occupancy=\[ (.*) \] tex_cache_hit=\[ (.*) \] tex_cache_miss=\[ (.*) \] gld_incoherent=\[ (.*) \] gst_incoherent=\[ (.*) \].*/ ) - { - if( $blockSize != $1 ) - { - $blockSize = $1; - $occupancy{$testNumber} = $2; - $texCacheHit{$testNumber} = $3; - $texCacheMiss{$testNumber} = $4; - $gldIncoherent{$testNumber} = $5; - $gstIncoherent{$testNumber} = $6; - $testNumber = $testNumber + 1; - } - } -} -close( INPUT ); - -print "There were $testNumber tests."; - -open( LOG, ">>$ARGV[1]" ) - or die "Can not open file $ARGV[1]"; -printf LOG "| %97s |", $ARGV[ 0 ]; -$testOutput = 0; -while( $testOutput < $testNumber ) -{ - printf LOG "%10.3f |", 
$occupancy{$testOutput}; - printf LOG "%10.3f |", $texCahceHit{$testOutput}; - printf LOG "%10.3f |", $texCacheMiss{$testOutput}; - printf LOG "%10.3f |", $gldIncoherent{$testOutput}; - printf LOG "%10.3f |", $gstIncoherent{$testOutput}; - $testOutput = $testOutput + 1; -} -print LOG "\n"; -close( LOG ); - - - -- GitLab From b3e88de0d849888ba23d48c94c41f8815f0d29e7 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Wed, 19 Dec 2018 14:29:28 +0100 Subject: [PATCH 002/130] Implementation of the traversers benchmark. --- .../Traversers/tnl-benchmark-traversers.h | 72 +++++++++++++------ 1 file changed, 50 insertions(+), 22 deletions(-) diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 9d1af1ec9..7e5189bfb 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -44,6 +44,48 @@ void setupConfig( Config::ConfigDescription& config ) Devices::Cuda::configSetup( config ); } +template< int Dimension > +bool runBenchmark( const Config::ParameterContainer& parameters, + Benchmark& benchmark, + Benchmark::MetadataMap& metadat ) +{ + // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(), + // which have a default value. 
The workaround below works for int values, but it is not possible + // to pass 64-bit integer values + // const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" ); + // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); + const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" ); + const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); + +} + +template< int Dimension > +bool setupBenchmark( const Config::ParameterContainer& parameters ) +{ + const String & logFileName = parameters.getParameter< String >( "log-file" ); + const String & outputMode = parameters.getParameter< String >( "output-mode" ); + const String & precision = parameters.getParameter< String >( "precision" ); + const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" ); + const unsigned loops = parameters.getParameter< unsigned >( "loops" ); + const unsigned verbose = parameters.getParameter< unsigned >( "verbose" ); + + Benchmark benchmark( loops, verbose ); + Benchmark::MetadataMap metadata = getHardwareMetadata(); + runBenchmark< Dimension >( parameters, benchmark, metadata ); + + auto mode = std::ios::out; + if( outputMode == "append" ) + mode |= std::ios::app; + std::ofstream logFile( logFileName.getString(), mode ); + + if( ! benchmark.save( logFile ) ) + { + std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl; + return false; + } + return true; +} + int main( int argc, char* argv[] ) { Config::ConfigDescription config; @@ -59,44 +101,30 @@ int main( int argc, char* argv[] ) ! 
Devices::Cuda::setup( parameters ) ) return EXIT_FAILURE; - const String & logFileName = parameters.getParameter< String >( "log-file" ); - const String & outputMode = parameters.getParameter< String >( "output-mode" ); - const String & precision = parameters.getParameter< String >( "precision" ); - // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(), - // which have a default value. The workaround below works for int values, but it is not possible - // to pass 64-bit integer values - // const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" ); - // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); const int dimension = parameters.getParameter< int >( "dimension" ); - const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" ); - const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); - const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" ); - const unsigned loops = parameters.getParameter< unsigned >( "loops" ); - const unsigned verbose = parameters.getParameter< unsigned >( "verbose" ); - bool status( false ); if( ! 
dimension ) { - status = performBenchmark< 1 >( parameters ); - status |= performBenchmark< 2 >( parameters ); - status |= performBenchmark< 3 >( parameters ); + status = setupBenchmark< 1 >( parameters ); + status |= setupBenchmark< 2 >( parameters ); + status |= setupBenchmark< 3 >( parameters ); } else { switch( dimension ) { case 1: - status = performBenchmark< 1 >( parameters ); + status = setupBenchmark< 1 >( parameters ); break; case 2: - status = performBenchmark< 2 >( parameters ); + status = setupBenchmark< 2 >( parameters ); break; case 3: - status = performBenchmark< 3 >( parameters ); + status = setupBenchmark< 3 >( parameters ); break; } } if( status == false ) return EXIT_FAILURE; - return EXIT_SUCCES; -} \ No newline at end of file + return EXIT_SUCCESS; +} -- GitLab From 65d6268c1cb363b9fa35aff0739fb4e30c4f94a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 19 Dec 2018 21:12:19 +0100 Subject: [PATCH 003/130] Fixed typo in vector operations benchmark comment. --- src/Benchmarks/BLAS/vector-operations.h | 2 +- src/Benchmarks/Traversers/WriteOne.h | 88 +++++++++++++++++++++ src/Benchmarks/Traversers/grid-traversing.h | 54 +++++++++++++ 3 files changed, 143 insertions(+), 1 deletion(-) create mode 100644 src/Benchmarks/Traversers/WriteOne.h create mode 100644 src/Benchmarks/Traversers/grid-traversing.h diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h index b9a68d618..8dd63de85 100644 --- a/src/Benchmarks/BLAS/vector-operations.h +++ b/src/Benchmarks/BLAS/vector-operations.h @@ -64,7 +64,7 @@ benchmarkVectorOperations( Benchmark & benchmark, deviceVector.setValue( 1.0 ); #endif // A relatively harmless call to keep the compiler from realizing we - // don't actually do any useful work with the result of the reduciton. + // don't actually do any useful work with the result of the reduction. 
srand48(resultHost); resultHost = resultDevice = 0.0; }; diff --git a/src/Benchmarks/Traversers/WriteOne.h b/src/Benchmarks/Traversers/WriteOne.h new file mode 100644 index 000000000..73bf0bfec --- /dev/null +++ b/src/Benchmarks/Traversers/WriteOne.h @@ -0,0 +1,88 @@ +/*************************************************************************** + WriteOne.h - description + ------------------- + begin : Dec 19, 2018 + copyright : (C) 2018 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include +#include +#include +#include + +namespace TNL { + namespace Benchmarks { + + +template< int Dimenions, + typename Device, + typename Real, + typename Index > +class WriteOne{}; + +template< typename Device, + typename Real, + typename Index > +class WriteOne< 1, Device, Real, Index > +{ + public: + + using Vector = Containers::Vector< Real, Device, Index >; + + static void run( std::size_t size ) + { + Vector v( size ); + auto writeOne = []( Index i, Real* data ) + { + data[ i ] = 1.0; + }; + + + ParallelFor< Devices::Host >::exec( ( std::size_t ) 0, size, writeOne, v.getData() ); + } +}; + + +template< typename Device, + typename Real, + typename Index > +class WriteOne< 2, Device, Real, Index > +{ + public: + + using Vector = Containers::Vector< Real, Device, Index >; + + static void run( std::size_t size ) + { + + } +}; + +template< typename Device, + typename Real, + typename Index > +class WriteOne< 3, Device, Real, Index > +{ + public: + + using Vector = Containers::Vector< Real, Device, Index >; + + static void run( std::size_t size ) + { + + } +}; + + + } // namespace Benchmarks +} // namespace TNL + + + diff --git a/src/Benchmarks/Traversers/grid-traversing.h b/src/Benchmarks/Traversers/grid-traversing.h new file mode 100644 index 000000000..df45b1d7f --- /dev/null +++ 
b/src/Benchmarks/Traversers/grid-traversing.h @@ -0,0 +1,54 @@ +/*************************************************************************** + grid-traversing.h - description + ------------------- + begin : Dec 19, 2018 + copyright : (C) 2018 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include "../Benchmarks.h" +#include "WriteOne.h" + +#include + +namespace TNL { + namespace Benchmarks { + +template< int Dimension, + typename Real = double, + typename Index = int > +class benchmarkTraversingFullGrid +{ + public: + + static void run ( Benchmark& benchmark, std::size_t size ) + { + auto reset = [&]() + {}; + + auto testHost = [&] () + { + WriteOne< Dimension, Devices::Host, Real, Index >::run( size ); + }; + + auto testCuda = [&] () + { + WriteOne< Dimension, Devices::Cuda, Real, Index >::run( size ); + }; + + benchmark.setOperation( "writeOne", size * sizeof( Real ) ); + benchmark.time( reset, "CPU", testHost ); +#ifdef HAVE_CUDA + benchmark.time( reset, "GPU", testCuda ); +#endif + + } +}; + } // namespace Benchmarks +} // namespace TNL \ No newline at end of file -- GitLab From a5e791efe5fff11f8073512522ec02143efbcfbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 19 Dec 2018 21:13:37 +0100 Subject: [PATCH 004/130] Implementation of grid traversers benchmarks. 
--- .../Traversers/tnl-benchmark-traversers.h | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 7e5189bfb..e227a258d 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -13,6 +13,7 @@ #pragma once #include "../Benchmarks.h" +#include "grid-traversing.h" #include #include @@ -33,8 +34,8 @@ void setupConfig( Config::ConfigDescription& config ) config.addEntryEnum( "double" ); config.addEntryEnum( "all" ); config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 ); - config.addEntry< std::size_t >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 ); - config.addEntry< std::size_t >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 ); + config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 ); + config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 ); config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 ); config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); config.addEntry< int >( "verbose", "Verbose mode.", 1 ); @@ -47,16 +48,26 @@ void setupConfig( Config::ConfigDescription& config ) template< int Dimension > bool runBenchmark( const Config::ParameterContainer& parameters, Benchmark& benchmark, - Benchmark::MetadataMap& metadat ) + Benchmark::MetadataMap& metadata ) { // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(), // which have a default value. 
The workaround below works for int values, but it is not possible // to pass 64-bit integer values // const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" ); // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); - const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" ); - const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); + const int minSize = parameters.getParameter< int >( "min-size" ); + const int maxSize = parameters.getParameter< int >( "max-size" ); + // Full grid traversing + benchmark.newBenchmark( String("Full grid traversing " + convertToString( Dimension ) + "D" ), metadata ); + for( std::size_t size = minSize; size <= maxSize; size *= 2 ) + { + benchmark.setMetadataColumns( Benchmark::MetadataColumns({ + {"size", convertToString( size ) }, + } )); + benchmarkTraversingFullGrid< Dimension >::run( benchmark, size ); + } + return true; } template< int Dimension > -- GitLab From f9d70a3d56e379ef013638294dacc0a87b4e9104 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 20 Dec 2018 14:06:01 +0100 Subject: [PATCH 005/130] Fixing lambda function for CUDA in traverser benchmark. 
--- src/Benchmarks/Traversers/WriteOne.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Benchmarks/Traversers/WriteOne.h b/src/Benchmarks/Traversers/WriteOne.h index 73bf0bfec..9fd269f10 100644 --- a/src/Benchmarks/Traversers/WriteOne.h +++ b/src/Benchmarks/Traversers/WriteOne.h @@ -39,13 +39,13 @@ class WriteOne< 1, Device, Real, Index > static void run( std::size_t size ) { Vector v( size ); - auto writeOne = []( Index i, Real* data ) + auto writeOne = [] __cuda_callable__ ( Index i, Real* data ) { data[ i ] = 1.0; }; - ParallelFor< Devices::Host >::exec( ( std::size_t ) 0, size, writeOne, v.getData() ); + ParallelFor< Device >::exec( ( std::size_t ) 0, size, writeOne, v.getData() ); } }; -- GitLab From 32a9a6d6bfc784eb08faff0d0d9ae57d5cd4a614 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 20 Dec 2018 15:17:18 +0100 Subject: [PATCH 006/130] Implemented write-one grid traverser becnhamrk in 2D and 3D. --- src/Benchmarks/Traversers/WriteOne.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/Benchmarks/Traversers/WriteOne.h b/src/Benchmarks/Traversers/WriteOne.h index 9fd269f10..4c39926aa 100644 --- a/src/Benchmarks/Traversers/WriteOne.h +++ b/src/Benchmarks/Traversers/WriteOne.h @@ -44,7 +44,6 @@ class WriteOne< 1, Device, Real, Index > data[ i ] = 1.0; }; - ParallelFor< Device >::exec( ( std::size_t ) 0, size, writeOne, v.getData() ); } }; @@ -61,7 +60,17 @@ class WriteOne< 2, Device, Real, Index > static void run( std::size_t size ) { + Vector v( size * size ); + auto writeOne = [=] __cuda_callable__ ( Index i, Index j, Real* data ) + { + data[ i * size + j ] = 1.0; + }; + ParallelFor2D< Device >::exec( ( std::size_t ) 0, + ( std::size_t ) 0, + size, + size, + writeOne, v.getData() ); } }; @@ -76,7 +85,19 @@ class WriteOne< 3, Device, Real, Index > static void run( std::size_t size ) { + Vector v( size * size * size ); + auto writeOne = [=] __cuda_callable__ ( Index i, Index j, 
Index k, Real* data ) + { + data[ ( i * size + j ) * size + k ] = 1.0; + }; + ParallelFor3D< Device >::exec( ( std::size_t ) 0, + ( std::size_t ) 0, + ( std::size_t ) 0, + size, + size, + size, + writeOne, v.getData() ); } }; -- GitLab From 88541617933501531ac8ec765d001942f985fa5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Thu, 20 Dec 2018 22:02:02 +0100 Subject: [PATCH 007/130] Added computation minimal time, config setup and setup to Benchmark. --- src/Benchmarks/Benchmarks.h | 42 +++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 559e27ee2..39973d0ba 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -26,6 +26,7 @@ #include #include #include +#include #include namespace TNL { @@ -40,6 +41,7 @@ double timeFunction( ComputeFunction compute, ResetFunction reset, int loops, + int minTime, Monitor && monitor = Monitor() ) { // the timer is constructed zero-initialized and stopped @@ -52,7 +54,11 @@ timeFunction( ComputeFunction compute, reset(); compute(); - for(int i = 0; i < loops; ++i) { + int i; + for( i = 0; + i < loops && ( ! 
minTime || timer.getRealTime() < ( double ) minTime ); + ++i) + { // abuse the monitor's "time" for loops monitor.setTime( i + 1 ); @@ -71,7 +77,7 @@ timeFunction( ComputeFunction compute, timer.stop(); } - return timer.getRealTime() / loops; + return timer.getRealTime() / ( double ) i; } @@ -89,6 +95,12 @@ public: : verbose(verbose) {} + void + setVerbose( bool verbose) + { + this->verbose = verbose; + } + void writeTitle( const String & title ) { @@ -309,12 +321,25 @@ public: using Logging::MetadataElement; using Logging::MetadataMap; using Logging::MetadataColumns; - + Benchmark( int loops = 10, bool verbose = true ) : Logging(verbose), loops(loops) {} + + static void configSetup( Config::ConfigDescription& config ) + { + config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); + config.addEntry< int >( "min-time", "Minimal real time in seconds for every computation.", 1 ); + } + void setup( const Config::ParameterContainer& parameters ) + { + this->loops = parameters.getParameter< unsigned >( "loops" ); + this->minTime = parameters.getParameter< unsigned >( "min-time" ); + const unsigned verbose = parameters.getParameter< unsigned >( "verbose" ); + Logging::setVerbose( verbose ); + } // TODO: ensure that this is not called in the middle of the benchmark // (or just remove it completely?) 
void @@ -322,6 +347,11 @@ public: { this->loops = loops; } + + void setMinTime( int minTime ) + { + this->minTime = minTime; + } // Marks the start of a new benchmark void @@ -424,10 +454,10 @@ public: if( verbose ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); - result.time = timeFunction( compute, reset, loops, monitor ); + result.time = timeFunction( compute, reset, loops, minTime, monitor ); } else { - result.time = timeFunction( compute, reset, loops, monitor ); + result.time = timeFunction( compute, reset, minTime, loops, monitor ); } } catch ( const std::exception& e ) { @@ -477,7 +507,7 @@ public: } protected: - int loops; + int loops, minTime = 1; double datasetSize = 0.0; double baseTime = 0.0; Solvers::IterativeSolverMonitor< double, int > monitor; -- GitLab From a167d1b58204e48bc9174668c807dbe64747f578 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Thu, 20 Dec 2018 22:02:46 +0100 Subject: [PATCH 008/130] Fixed grid traversers benchmark. 
--- src/Benchmarks/Traversers/WriteOne.h | 109 ------------------ src/Benchmarks/Traversers/grid-traversing.h | 20 +--- .../Traversers/tnl-benchmark-traversers.h | 86 +++++++++----- src/Benchmarks/scripts/CMakeLists.txt | 15 +-- 4 files changed, 63 insertions(+), 167 deletions(-) delete mode 100644 src/Benchmarks/Traversers/WriteOne.h diff --git a/src/Benchmarks/Traversers/WriteOne.h b/src/Benchmarks/Traversers/WriteOne.h deleted file mode 100644 index 4c39926aa..000000000 --- a/src/Benchmarks/Traversers/WriteOne.h +++ /dev/null @@ -1,109 +0,0 @@ -/*************************************************************************** - WriteOne.h - description - ------------------- - begin : Dec 19, 2018 - copyright : (C) 2018 by oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Tomas Oberhuber - -#pragma once - -#include -#include -#include -#include - -namespace TNL { - namespace Benchmarks { - - -template< int Dimenions, - typename Device, - typename Real, - typename Index > -class WriteOne{}; - -template< typename Device, - typename Real, - typename Index > -class WriteOne< 1, Device, Real, Index > -{ - public: - - using Vector = Containers::Vector< Real, Device, Index >; - - static void run( std::size_t size ) - { - Vector v( size ); - auto writeOne = [] __cuda_callable__ ( Index i, Real* data ) - { - data[ i ] = 1.0; - }; - - ParallelFor< Device >::exec( ( std::size_t ) 0, size, writeOne, v.getData() ); - } -}; - - -template< typename Device, - typename Real, - typename Index > -class WriteOne< 2, Device, Real, Index > -{ - public: - - using Vector = Containers::Vector< Real, Device, Index >; - - static void run( std::size_t size ) - { - Vector v( size * size ); - auto writeOne = [=] __cuda_callable__ ( Index i, Index j, Real* data ) - { - data[ i * size + j ] = 1.0; - }; - - ParallelFor2D< Device >::exec( ( std::size_t ) 0, 
- ( std::size_t ) 0, - size, - size, - writeOne, v.getData() ); - } -}; - -template< typename Device, - typename Real, - typename Index > -class WriteOne< 3, Device, Real, Index > -{ - public: - - using Vector = Containers::Vector< Real, Device, Index >; - - static void run( std::size_t size ) - { - Vector v( size * size * size ); - auto writeOne = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) - { - data[ ( i * size + j ) * size + k ] = 1.0; - }; - - ParallelFor3D< Device >::exec( ( std::size_t ) 0, - ( std::size_t ) 0, - ( std::size_t ) 0, - size, - size, - size, - writeOne, v.getData() ); - } -}; - - - } // namespace Benchmarks -} // namespace TNL - - - diff --git a/src/Benchmarks/Traversers/grid-traversing.h b/src/Benchmarks/Traversers/grid-traversing.h index df45b1d7f..c977fea1c 100644 --- a/src/Benchmarks/Traversers/grid-traversing.h +++ b/src/Benchmarks/Traversers/grid-traversing.h @@ -13,7 +13,7 @@ #pragma once #include "../Benchmarks.h" -#include "WriteOne.h" + #include @@ -29,24 +29,6 @@ class benchmarkTraversingFullGrid static void run ( Benchmark& benchmark, std::size_t size ) { - auto reset = [&]() - {}; - - auto testHost = [&] () - { - WriteOne< Dimension, Devices::Host, Real, Index >::run( size ); - }; - - auto testCuda = [&] () - { - WriteOne< Dimension, Devices::Cuda, Real, Index >::run( size ); - }; - - benchmark.setOperation( "writeOne", size * sizeof( Real ) ); - benchmark.time( reset, "CPU", testHost ); -#ifdef HAVE_CUDA - benchmark.time( reset, "GPU", testCuda ); -#endif } }; diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index e227a258d..3e13d52dd 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -13,7 +13,8 @@ #pragma once #include "../Benchmarks.h" -#include "grid-traversing.h" +//#include "grid-traversing.h" +#include "GridTraversersBenchmark.h" #include #include @@ -23,29 
+24,10 @@ using namespace TNL; using namespace TNL::Benchmarks; -void setupConfig( Config::ConfigDescription& config ) -{ - config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log"); - config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); - config.addEntryEnum( "append" ); - config.addEntryEnum( "overwrite" ); - config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); - config.addEntryEnum( "float" ); - config.addEntryEnum( "double" ); - config.addEntryEnum( "all" ); - config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 ); - config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 ); - config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 ); - config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 ); - config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); - config.addEntry< int >( "verbose", "Verbose mode.", 1 ); - - config.addDelimiter( "Device settings:" ); - Devices::Host::configSetup( config ); - Devices::Cuda::configSetup( config ); -} -template< int Dimension > +template< int Dimension, + typename Real = float, + typename Index = int > bool runBenchmark( const Config::ParameterContainer& parameters, Benchmark& benchmark, Benchmark::MetadataMap& metadata ) @@ -62,14 +44,59 @@ bool runBenchmark( const Config::ParameterContainer& parameters, benchmark.newBenchmark( String("Full grid traversing " + convertToString( Dimension ) + "D" ), metadata ); for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { - benchmark.setMetadataColumns( Benchmark::MetadataColumns({ - {"size", convertToString( size ) }, - } )); - benchmarkTraversingFullGrid< Dimension 
>::run( benchmark, size ); + + GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size ); + GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size ); + + auto reset = [&]() {}; + + benchmark.setMetadataColumns( + Benchmark::MetadataColumns( + { {"size", convertToString( size ) }, } ) ); + + auto hostWriteOne = [&] () + { + hostTraverserBenchmark.writeOne(); + }; + + auto cudaWriteOne = [&] () + { + cudaTraverserBenchmark.writeOne(); + }; + + benchmark.setOperation( "writeOne", size * sizeof( Real ) ); + benchmark.time( reset, "CPU", hostWriteOne ); +#ifdef HAVE_CUDA + benchmark.time( reset, "GPU", cudaWriteOne ); +#endif + } return true; } +void setupConfig( Config::ConfigDescription& config ) +{ + config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log"); + config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); + config.addEntryEnum( "append" ); + config.addEntryEnum( "overwrite" ); + config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); + config.addEntryEnum( "float" ); + config.addEntryEnum( "double" ); + config.addEntryEnum( "all" ); + config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 ); + config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 ); + config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 ); + config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. 
First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 ); + config.addEntry< bool >( "verbose", "Verbose mode.", true ); + + Benchmark::configSetup( config ); + + config.addDelimiter( "Device settings:" ); + Devices::Host::configSetup( config ); + Devices::Cuda::configSetup( config ); +} + template< int Dimension > bool setupBenchmark( const Config::ParameterContainer& parameters ) { @@ -77,10 +104,9 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) const String & outputMode = parameters.getParameter< String >( "output-mode" ); const String & precision = parameters.getParameter< String >( "precision" ); const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" ); - const unsigned loops = parameters.getParameter< unsigned >( "loops" ); - const unsigned verbose = parameters.getParameter< unsigned >( "verbose" ); + - Benchmark benchmark( loops, verbose ); + Benchmark benchmark; //( loops, verbose ); Benchmark::MetadataMap metadata = getHardwareMetadata(); runBenchmark< Dimension >( parameters, benchmark, metadata ); diff --git a/src/Benchmarks/scripts/CMakeLists.txt b/src/Benchmarks/scripts/CMakeLists.txt index 1388c7984..31acdeb7d 100644 --- a/src/Benchmarks/scripts/CMakeLists.txt +++ b/src/Benchmarks/scripts/CMakeLists.txt @@ -1,16 +1,13 @@ -INSTALL( FILES matrix-market - florida-matrix-market - get-matrices - convert-matrices - draw-matrices +INSTALL( FILES tnl-run-heat-equation-benchmark + run-tnl-benchmark-spmv + run-tnl-benchmark-traversers run-matrix-solvers-benchmark run-tnl-benchmark-spmv run-tnl-benchmark-linear-solvers - tnl-run-heat-equation-benchmark - cuda-profiler.conf - process-cuda-profile.pl + DESTINATION ${TNL_TARGET_DATA_DIRECTORY}/benchmark-scripts ) -INSTALL( FILES tnl-run-spmv-benchmark +INSTALL( FILES run-tnl-benchmark-spmv + run-tnl-benchmark-traversers DESTINATION bin PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ 
WORLD_EXECUTE ) -- GitLab From 2ad04b206862bb8b40df466acc13033000c90089 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 21 Dec 2018 19:57:15 +0100 Subject: [PATCH 009/130] Added script for running traversers benchmark. Fixing traversers benchmark. --- src/Benchmarks/Benchmarks.h | 2 +- .../Traversers/GridTraversersBenchmark.h | 137 ++++++++++++++++++ .../Traversers/tnl-benchmark-traversers.h | 1 + .../scripts/run-tnl-benchmark-traversers | 5 + 4 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 src/Benchmarks/Traversers/GridTraversersBenchmark.h create mode 100644 src/Benchmarks/scripts/run-tnl-benchmark-traversers diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 39973d0ba..13ba3a6d1 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -56,7 +56,7 @@ timeFunction( ComputeFunction compute, int i; for( i = 0; - i < loops && ( ! minTime || timer.getRealTime() < ( double ) minTime ); + i < loops || timer.getRealTime() < ( double ) minTime; ++i) { // abuse the monitor's "time" for loops diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h new file mode 100644 index 000000000..3302c4cb9 --- /dev/null +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -0,0 +1,137 @@ +/*************************************************************************** + WriteOne.h - description + ------------------- + begin : Dec 19, 2018 + copyright : (C) 2018 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include +#include +#include +#include + +namespace TNL { + namespace Benchmarks { + + +template< int Dimension, + typename Device, + typename Real, + typename Index > +class GridTraversersBenchmark{}; + +template< typename Device, + 
typename Real, + typename Index > +class GridTraversersBenchmark< 1, Device, Real, Index > +{ + public: + + using Vector = Containers::Vector< Real, Device, Index >; + + GridTraversersBenchmark( Index size ) + :v( size ), size( size ) + {} + + void writeOne() + { + + auto f = [] __cuda_callable__ ( Index i, Real* data ) + { + data[ i ] = i; + }; + + ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); + } + + protected: + + Index size; + Vector v; +}; + + +template< typename Device, + typename Real, + typename Index > +class GridTraversersBenchmark< 2, Device, Real, Index > +{ + public: + + using Vector = Containers::Vector< Real, Device, Index >; + + GridTraversersBenchmark( Index size ) + :size( size ), v( size * size ) { } + + void writeOne() + { + Index _size = this->size; + auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) + { + data[ i * _size + j ] = i + j; + }; + + ParallelFor2D< Device >::exec( ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + f, v.getData() ); + } + + protected: + + Index size; + + Vector v; + +}; + +template< typename Device, + typename Real, + typename Index > +class GridTraversersBenchmark< 3, Device, Real, Index > +{ + public: + + using Vector = Containers::Vector< Real, Device, Index >; + + GridTraversersBenchmark( Index size ) + : size( size ), v( size * size * size ) {} + + void writeOne() + { + Index _size = this->size; + auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) + { + data[ ( i * _size + j ) * _size + k ] = i + j + k; + }; + + ParallelFor3D< Device >::exec( ( Index ) 0, + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + this->size, + f, v.getData() ); + } + + protected: + + Index size; + Vector v; + +}; + + + } // namespace Benchmarks +} // namespace TNL + + + diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 3e13d52dd..9b69a3163 100644 --- 
a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -107,6 +107,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) Benchmark benchmark; //( loops, verbose ); + benchmark.setup( parameters ); Benchmark::MetadataMap metadata = getHardwareMetadata(); runBenchmark< Dimension >( parameters, benchmark, metadata ); diff --git a/src/Benchmarks/scripts/run-tnl-benchmark-traversers b/src/Benchmarks/scripts/run-tnl-benchmark-traversers new file mode 100644 index 000000000..00cd1e1ac --- /dev/null +++ b/src/Benchmarks/scripts/run-tnl-benchmark-traversers @@ -0,0 +1,5 @@ +#!/bin/bash + +tnl-benchmark-traversers --dimension 1 --loops 1 --min-size 16 --max-size 100000 --min-time 1 +tnl-benchmark-traversers --dimension 2 --loops 1 --min-size 16 --max-size 10000 --min-time 1 --output-mode append +tnl-benchmark-traversers --dimension 3 --loops 1 --min-size 16 --max-size 1000 --min-time 1 --output-mode append -- GitLab From 84d226023095c5d233cbd16b3c8bc75f28ac935f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 21 Dec 2018 21:46:04 +0100 Subject: [PATCH 010/130] Added constructors with dimensions to grids. --- src/TNL/Meshes/GridDetails/Grid1D.h | 2 ++ src/TNL/Meshes/GridDetails/Grid1D_impl.h | 11 +++++++++++ src/TNL/Meshes/GridDetails/Grid2D.h | 2 ++ src/TNL/Meshes/GridDetails/Grid2D_impl.h | 14 ++++++++++++++ src/TNL/Meshes/GridDetails/Grid3D.h | 2 ++ src/TNL/Meshes/GridDetails/Grid3D_impl.h | 22 ++++++++++++++++++++++ 6 files changed, 53 insertions(+) diff --git a/src/TNL/Meshes/GridDetails/Grid1D.h b/src/TNL/Meshes/GridDetails/Grid1D.h index 426428ae4..9a8f14600 100644 --- a/src/TNL/Meshes/GridDetails/Grid1D.h +++ b/src/TNL/Meshes/GridDetails/Grid1D.h @@ -60,6 +60,8 @@ class Grid< 1, Real, Device, Index > : public Object * \brief Basic constructor. 
*/ Grid(); + + Grid( const Index xSize ); /** * \brief Returns type of grid Real (value), Device type and the type of Index. diff --git a/src/TNL/Meshes/GridDetails/Grid1D_impl.h b/src/TNL/Meshes/GridDetails/Grid1D_impl.h index 1754edc58..995fa6dab 100644 --- a/src/TNL/Meshes/GridDetails/Grid1D_impl.h +++ b/src/TNL/Meshes/GridDetails/Grid1D_impl.h @@ -33,6 +33,17 @@ Grid< 1, Real, Device, Index >::Grid() { } +template< typename Real, + typename Device, + typename Index > +Grid< 1, Real, Device, Index >::Grid( const Index xSize ) +: numberOfCells( 0 ), + numberOfVertices( 0 ), + distGrid(nullptr) +{ + this->setDimensions( xSize ); +} + template< typename Real, typename Device, typename Index > diff --git a/src/TNL/Meshes/GridDetails/Grid2D.h b/src/TNL/Meshes/GridDetails/Grid2D.h index 84c6b4f33..896b61548 100644 --- a/src/TNL/Meshes/GridDetails/Grid2D.h +++ b/src/TNL/Meshes/GridDetails/Grid2D.h @@ -61,6 +61,8 @@ class Grid< 2, Real, Device, Index > : public Object /** * \brief See Grid1D::getType(). 
*/ + Grid( const Index xSize, const Index ySize ); + static String getType(); /** diff --git a/src/TNL/Meshes/GridDetails/Grid2D_impl.h b/src/TNL/Meshes/GridDetails/Grid2D_impl.h index b315d5d08..49ad91035 100644 --- a/src/TNL/Meshes/GridDetails/Grid2D_impl.h +++ b/src/TNL/Meshes/GridDetails/Grid2D_impl.h @@ -36,6 +36,20 @@ Grid< 2, Real, Device, Index > :: Grid() { } +template< typename Real, + typename Device, + typename Index > +Grid< 2, Real, Device, Index >::Grid( const Index xSize, const Index ySize ) +: numberOfCells( 0 ), + numberOfNxFaces( 0 ), + numberOfNyFaces( 0 ), + numberOfFaces( 0 ), + numberOfVertices( 0 ), + distGrid(nullptr) +{ + this->setDimensions( xSize, ySize ); +} + template< typename Real, typename Device, typename Index > diff --git a/src/TNL/Meshes/GridDetails/Grid3D.h b/src/TNL/Meshes/GridDetails/Grid3D.h index 565198077..3ddd44735 100644 --- a/src/TNL/Meshes/GridDetails/Grid3D.h +++ b/src/TNL/Meshes/GridDetails/Grid3D.h @@ -57,6 +57,8 @@ class Grid< 3, Real, Device, Index > : public Object * \brief See Grid1D::Grid(). */ Grid(); + + Grid( const Index xSize, const Index ySize, const Index zSize ); /** * \brief See Grid1D::getType(). 
diff --git a/src/TNL/Meshes/GridDetails/Grid3D_impl.h b/src/TNL/Meshes/GridDetails/Grid3D_impl.h index cc6805ac0..edbee0c00 100644 --- a/src/TNL/Meshes/GridDetails/Grid3D_impl.h +++ b/src/TNL/Meshes/GridDetails/Grid3D_impl.h @@ -43,6 +43,28 @@ Grid< 3, Real, Device, Index > :: Grid() { } +template< typename Real, + typename Device, + typename Index > +Grid< 3, Real, Device, Index >::Grid( const Index xSize, const Index ySize, const Index zSize ) +: numberOfCells( 0 ), + numberOfNxFaces( 0 ), + numberOfNyFaces( 0 ), + numberOfNzFaces( 0 ), + numberOfNxAndNyFaces( 0 ), + numberOfFaces( 0 ), + numberOfDxEdges( 0 ), + numberOfDyEdges( 0 ), + numberOfDzEdges( 0 ), + numberOfDxAndDyEdges( 0 ), + numberOfEdges( 0 ), + numberOfVertices( 0 ), + distGrid(nullptr) +{ + this->setDimensions( xSize, ySize, zSize ); +} + + template< typename Real, typename Device, typename Index > -- GitLab From 2b38a34bcfb7c5d1f6b1f51432b038306c64b0b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 21 Dec 2018 21:47:40 +0100 Subject: [PATCH 011/130] Fixed memory bandwidth in traversers benchmark. 
--- .../Traversers/GridTraversersBenchmark.h | 63 +++++++++++++++---- .../Traversers/tnl-benchmark-traversers.h | 14 ++--- src/TNL/Meshes/GridDetails/Grid2D.h | 4 +- src/TNL/Meshes/GridDetails/Grid2D_impl.h | 2 +- src/TNL/Meshes/GridDetails/Grid3D.h | 4 +- 5 files changed, 64 insertions(+), 23 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index 3302c4cb9..6f1019deb 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -16,6 +16,10 @@ #include #include #include +#include +#include +#include +#include namespace TNL { namespace Benchmarks { @@ -35,26 +39,52 @@ class GridTraversersBenchmark< 1, Device, Real, Index > public: using Vector = Containers::Vector< Real, Device, Index >; + using Grid = Meshes::Grid< 1, Real, Device, Index >; + using Coordinates = typename Grid::CoordinatesType; + using MeshFunction = Functions::MeshFunction< Grid >; + using Cell = typename Grid::EntityType< 1, Meshes::GridEntityNoStencilStorage >; + using Traverser = Meshes::Traverser< Grid, Cell >; GridTraversersBenchmark( Index size ) - :v( size ), size( size ) - {} + :v( size ), size( size ), grid( size ) + { + } - void writeOne() + void writeOneUsingParallelFor() { auto f = [] __cuda_callable__ ( Index i, Real* data ) { - data[ i ] = i; + data[ i ] = 1.0; }; ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } + void writeOneUsingTraverser() + { + class EntitiesProcessor + { + + }; + + class UserData + { + + }; + + Traverser traverser; + /*traverser.template processAllEntities< UserData, EntitiesProcessor > + ( meshPointer, + userData );*/ + + } + protected: Index size; Vector v; + Grid grid; }; @@ -66,16 +96,20 @@ class GridTraversersBenchmark< 2, Device, Real, Index > public: using Vector = Containers::Vector< Real, Device, Index >; + using Grid = Meshes::Grid< 2, Real, Device, Index >; + using Coordinates = typename 
Grid::CoordinatesType; GridTraversersBenchmark( Index size ) - :size( size ), v( size * size ) { } + :size( size ), v( size * size ), grid( size, size ) + { + } - void writeOne() + void writeOneUsingParallelFor() { Index _size = this->size; auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) { - data[ i * _size + j ] = i + j; + data[ i * _size + j ] = 1.0; }; ParallelFor2D< Device >::exec( ( Index ) 0, @@ -88,8 +122,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index > protected: Index size; - Vector v; + Grid grid; }; @@ -101,16 +135,22 @@ class GridTraversersBenchmark< 3, Device, Real, Index > public: using Vector = Containers::Vector< Real, Device, Index >; + using Grid = Meshes::Grid< 3, Real, Device, Index >; + using Coordinates = typename Grid::CoordinatesType; GridTraversersBenchmark( Index size ) - : size( size ), v( size * size * size ) {} + : size( size ), + v( size * size * size ), + grid( size, size, size ) + { + } - void writeOne() + void writeOneUsingParallelFor() { Index _size = this->size; auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) { - data[ ( i * _size + j ) * _size + k ] = i + j + k; + data[ ( i * _size + j ) * _size + k ] = 1.0; }; ParallelFor3D< Device >::exec( ( Index ) 0, @@ -126,6 +166,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > Index size; Vector v; + Grid grid; }; diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 9b69a3163..c6349f596 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -54,20 +54,20 @@ bool runBenchmark( const Config::ParameterContainer& parameters, Benchmark::MetadataColumns( { {"size", convertToString( size ) }, } ) ); - auto hostWriteOne = [&] () + auto hostWriteOneUsingParallelFor = [&] () { - hostTraverserBenchmark.writeOne(); + hostTraverserBenchmark.writeOneUsingParallelFor(); }; - auto cudaWriteOne = 
[&] () + auto cudaWriteOneUsingParallelFor = [&] () { - cudaTraverserBenchmark.writeOne(); + cudaTraverserBenchmark.writeOneUsingParallelFor(); }; - benchmark.setOperation( "writeOne", size * sizeof( Real ) ); - benchmark.time( reset, "CPU", hostWriteOne ); + benchmark.setOperation( "write 1 using parallel for", size * sizeof( Real ) / oneGB ); + benchmark.time( reset, "CPU", hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA - benchmark.time( reset, "GPU", cudaWriteOne ); + benchmark.time( reset, "GPU", cudaWriteOneUsingParallelFor ); #endif } diff --git a/src/TNL/Meshes/GridDetails/Grid2D.h b/src/TNL/Meshes/GridDetails/Grid2D.h index 896b61548..f2dbebc5c 100644 --- a/src/TNL/Meshes/GridDetails/Grid2D.h +++ b/src/TNL/Meshes/GridDetails/Grid2D.h @@ -82,8 +82,8 @@ class Grid< 2, Real, Device, Index > : public Object /** * \brief Sets the size of dimensions. - * \param xSize Size of dimesion x. - * \param ySize Size of dimesion y. + * \param xSize Size of dimension x. + * \param ySize Size of dimension y. */ void setDimensions( const Index xSize, const Index ySize ); diff --git a/src/TNL/Meshes/GridDetails/Grid2D_impl.h b/src/TNL/Meshes/GridDetails/Grid2D_impl.h index 49ad91035..41e05d8b5 100644 --- a/src/TNL/Meshes/GridDetails/Grid2D_impl.h +++ b/src/TNL/Meshes/GridDetails/Grid2D_impl.h @@ -43,7 +43,7 @@ Grid< 2, Real, Device, Index >::Grid( const Index xSize, const Index ySize ) : numberOfCells( 0 ), numberOfNxFaces( 0 ), numberOfNyFaces( 0 ), - numberOfFaces( 0 ), + numberOfFaces( 0 ), numberOfVertices( 0 ), distGrid(nullptr) { diff --git a/src/TNL/Meshes/GridDetails/Grid3D.h b/src/TNL/Meshes/GridDetails/Grid3D.h index 3ddd44735..617efe7f3 100644 --- a/src/TNL/Meshes/GridDetails/Grid3D.h +++ b/src/TNL/Meshes/GridDetails/Grid3D.h @@ -57,8 +57,8 @@ class Grid< 3, Real, Device, Index > : public Object * \brief See Grid1D::Grid(). 
*/ Grid(); - - Grid( const Index xSize, const Index ySize, const Index zSize ); + + Grid( const Index xSize, const Index ySize, const Index zSize ); /** * \brief See Grid1D::getType(). -- GitLab From 23bb05dfc578bbd42f30b37c682aca4251f9b557 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 21 Dec 2018 22:20:28 +0100 Subject: [PATCH 012/130] Changing minTime in Benchmark from int to double. --- src/Benchmarks/Benchmarks.h | 13 +++++++------ src/Benchmarks/Traversers/GridTraversersBenchmark.h | 12 +++++------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 13ba3a6d1..61452d074 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -41,7 +41,7 @@ double timeFunction( ComputeFunction compute, ResetFunction reset, int loops, - int minTime, + const double& minTime, Monitor && monitor = Monitor() ) { // the timer is constructed zero-initialized and stopped @@ -56,7 +56,7 @@ timeFunction( ComputeFunction compute, int i; for( i = 0; - i < loops || timer.getRealTime() < ( double ) minTime; + i < loops || timer.getRealTime() < minTime; ++i) { // abuse the monitor's "time" for loops @@ -330,13 +330,13 @@ public: static void configSetup( Config::ConfigDescription& config ) { config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); - config.addEntry< int >( "min-time", "Minimal real time in seconds for every computation.", 1 ); + config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 1 ); } void setup( const Config::ParameterContainer& parameters ) { this->loops = parameters.getParameter< unsigned >( "loops" ); - this->minTime = parameters.getParameter< unsigned >( "min-time" ); + this->minTime = parameters.getParameter< double >( "min-time" ); const unsigned verbose = parameters.getParameter< unsigned >( "verbose" ); Logging::setVerbose( verbose ); } @@ -348,7 +348,7 @@ public: 
this->loops = loops; } - void setMinTime( int minTime ) + void setMinTime( const double& minTime ) { this->minTime = minTime; } @@ -507,7 +507,8 @@ public: } protected: - int loops, minTime = 1; + int loops = 1; + double minTime = 1; double datasetSize = 0.0; double baseTime = 0.0; Solvers::IterativeSolverMonitor< double, int > monitor; diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index 6f1019deb..dcb6f5fdd 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -65,23 +65,21 @@ class GridTraversersBenchmark< 1, Device, Real, Index > { class EntitiesProcessor { - }; - + class UserData { - }; - + Traverser traverser; /*traverser.template processAllEntities< UserData, EntitiesProcessor > ( meshPointer, userData );*/ - + } - + protected: - + Index size; Vector v; Grid grid; -- GitLab From fe1ca902cd2f55a4c61b1bef3d070c709ff74af6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 22 Dec 2018 09:28:11 +0100 Subject: [PATCH 013/130] Fixed indexing and data set size in traversers benchmark. 
--- src/Benchmarks/Traversers/GridTraversersBenchmark.h | 4 ++-- src/Benchmarks/Traversers/tnl-benchmark-traversers.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index dcb6f5fdd..735d0a241 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -105,7 +105,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > void writeOneUsingParallelFor() { Index _size = this->size; - auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) + auto f = [=] __cuda_callable__ ( Index j, Index i, Real* data ) { data[ i * _size + j ] = 1.0; }; @@ -146,7 +146,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > void writeOneUsingParallelFor() { Index _size = this->size; - auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) + auto f = [=] __cuda_callable__ ( Index k, Index j, Index i, Real* data ) { data[ ( i * _size + j ) * _size + k ] = 1.0; }; diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index c6349f596..6f9a4575a 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -64,7 +64,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.writeOneUsingParallelFor(); }; - benchmark.setOperation( "write 1 using parallel for", size * sizeof( Real ) / oneGB ); + benchmark.setOperation( "write 1 using parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time( reset, "CPU", hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA benchmark.time( reset, "GPU", cudaWriteOneUsingParallelFor ); -- GitLab From 467521f72013a460cd7c8da185163b4ef958f9e1 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Sun, 23 Dec 2018 13:34:55 +0100 Subject: 
[PATCH 014/130] Fixed traversers benchmark test using traverser. --- .../Traversers/GridTraversersBenchmark.h | 107 ++++++++++++++---- .../Traversers/tnl-benchmark-traversers.h | 25 +++- 2 files changed, 110 insertions(+), 22 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index 735d0a241..0190532c3 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -20,11 +20,43 @@ #include #include #include +#include namespace TNL { namespace Benchmarks { +template< typename TraverserUserData > +class WriteOneEntitiesProcessor +{ + public: + + using MeshType = typename TraverserUserData::MeshType; + using DeviceType = typename MeshType::DeviceType; + + template< typename GridEntity > + __cuda_callable__ + static inline void processEntity( const MeshType& mesh, + TraverserUserData& userData, + const GridEntity& entity ) + { + auto& u = userData.u.template modifyData< DeviceType >(); + u( entity ) = 1.0; + } +}; + +template< typename MeshFunctionPointer > +class WriteOneUserData +{ + public: + + using MeshType = typename MeshFunctionPointer::ObjectType::MeshType; + + MeshFunctionPointer u; + +}; + + template< int Dimension, typename Device, typename Real, @@ -40,14 +72,19 @@ class GridTraversersBenchmark< 1, Device, Real, Index > using Vector = Containers::Vector< Real, Device, Index >; using Grid = Meshes::Grid< 1, Real, Device, Index >; + using GridPointer = Pointers::SharedPointer< Grid >; using Coordinates = typename Grid::CoordinatesType; using MeshFunction = Functions::MeshFunction< Grid >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; using Cell = typename Grid::EntityType< 1, Meshes::GridEntityNoStencilStorage >; using Traverser = Meshes::Traverser< Grid, Cell >; + using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; + using WriteOneEntitiesProcessorType = 
WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; GridTraversersBenchmark( Index size ) - :v( size ), size( size ), grid( size ) + :v( size ), size( size ), grid( size ), u( grid ) { + userData.u = this->u; } void writeOneUsingParallelFor() @@ -63,26 +100,18 @@ class GridTraversersBenchmark< 1, Device, Real, Index > void writeOneUsingTraverser() { - class EntitiesProcessor - { - }; - - class UserData - { - }; - - Traverser traverser; - /*traverser.template processAllEntities< UserData, EntitiesProcessor > - ( meshPointer, - userData );*/ - + traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + ( grid, userData ); } protected: Index size; Vector v; - Grid grid; + GridPointer grid; + MeshFunctionPointer u; + Traverser traverser; + WriteOneTraverserUserDataType userData; }; @@ -95,11 +124,20 @@ class GridTraversersBenchmark< 2, Device, Real, Index > using Vector = Containers::Vector< Real, Device, Index >; using Grid = Meshes::Grid< 2, Real, Device, Index >; + using GridPointer = Pointers::SharedPointer< Grid >; using Coordinates = typename Grid::CoordinatesType; + using MeshFunction = Functions::MeshFunction< Grid >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using Cell = typename Grid::EntityType< 2, Meshes::GridEntityNoStencilStorage >; + using Traverser = Meshes::Traverser< Grid, Cell >; + using TraverserUserData = WriteOneUserData< MeshFunctionPointer >; + using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; + using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; GridTraversersBenchmark( Index size ) - :size( size ), v( size * size ), grid( size, size ) + :size( size ), v( size * size ), grid( size, size ), u( grid ) { + userData.u = this->u; } void writeOneUsingParallelFor() @@ -116,13 +154,22 @@ class GridTraversersBenchmark< 2, Device, Real, Index > this->size, f, v.getData() ); } + + void 
writeOneUsingTraverser() + { + traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + ( grid, userData ); + } + protected: Index size; Vector v; - Grid grid; - + GridPointer grid; + MeshFunctionPointer u; + Traverser traverser; + WriteOneTraverserUserDataType userData; }; template< typename Device, @@ -134,13 +181,23 @@ class GridTraversersBenchmark< 3, Device, Real, Index > using Vector = Containers::Vector< Real, Device, Index >; using Grid = Meshes::Grid< 3, Real, Device, Index >; + using GridPointer = Pointers::SharedPointer< Grid >; using Coordinates = typename Grid::CoordinatesType; + using MeshFunction = Functions::MeshFunction< Grid >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using Cell = typename Grid::EntityType< 3, Meshes::GridEntityNoStencilStorage >; + using Traverser = Meshes::Traverser< Grid, Cell >; + using TraverserUserData = WriteOneUserData< MeshFunctionPointer >; + using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; + using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; GridTraversersBenchmark( Index size ) : size( size ), v( size * size * size ), - grid( size, size, size ) + grid( size, size, size ), + u( grid ) { + userData.u = this->u; } void writeOneUsingParallelFor() @@ -159,13 +216,21 @@ class GridTraversersBenchmark< 3, Device, Real, Index > this->size, f, v.getData() ); } + + void writeOneUsingTraverser() + { + traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + ( grid, userData ); + } protected: Index size; Vector v; - Grid grid; - + GridPointer grid; + MeshFunctionPointer u; + Traverser traverser; + WriteOneTraverserUserDataType userData; }; diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 6f9a4575a..4f839faf7 100644 --- 
a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -54,6 +54,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters, Benchmark::MetadataColumns( { {"size", convertToString( size ) }, } ) ); + /**** + * Write one using parallel for + */ auto hostWriteOneUsingParallelFor = [&] () { hostTraverserBenchmark.writeOneUsingParallelFor(); @@ -69,6 +72,26 @@ bool runBenchmark( const Config::ParameterContainer& parameters, #ifdef HAVE_CUDA benchmark.time( reset, "GPU", cudaWriteOneUsingParallelFor ); #endif + + /**** + * Write one using traverser + */ + auto hostWriteOneUsingTraverser = [&] () + { + hostTraverserBenchmark.writeOneUsingTraverser(); + }; + + auto cudaWriteOneUsingTraverser = [&] () + { + cudaTraverserBenchmark.writeOneUsingTraverser(); + }; + + benchmark.setOperation( "write 1 using traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time( reset, "CPU", hostWriteOneUsingTraverser ); +#ifdef HAVE_CUDA + benchmark.time( reset, "GPU", cudaWriteOneUsingTraverser ); +#endif + } return true; @@ -76,7 +99,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, void setupConfig( Config::ConfigDescription& config ) { - config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log"); + config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-traversers.log"); config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); config.addEntryEnum( "append" ); config.addEntryEnum( "overwrite" ); -- GitLab From 2496b2659e7c91267be5c1b8fc7f5a300bd54045 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Mon, 24 Dec 2018 15:42:57 +0100 Subject: [PATCH 015/130] Changing verbose form bool to int to have three levels of verbosity in Benchmark. 
--- src/Benchmarks/Benchmarks.h | 16 +++++++++------- .../Traversers/tnl-benchmark-traversers.h | 1 - 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 61452d074..7a6b12676 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -48,12 +48,13 @@ timeFunction( ComputeFunction compute, Timer timer; // set timer to the monitor - monitor.setTimer( timer ); + //monitor.setTimer( timer ); // warm up reset(); compute(); + //timer.start(); int i; for( i = 0; i < loops || timer.getRealTime() < minTime; @@ -91,12 +92,12 @@ public: using HeaderElements = std::vector< String >; using RowElements = std::vector< double >; - Logging( bool verbose = true ) + Logging( int verbose = true ) : verbose(verbose) {} void - setVerbose( bool verbose) + setVerbose( int verbose) { this->verbose = verbose; } @@ -286,7 +287,7 @@ protected: std::string header_indent; std::string body_indent; - bool verbose; + int verbose; MetadataColumns metadataColumns; bool header_changed = true; std::vector< std::pair< String, int > > horizontalGroups; @@ -331,13 +332,14 @@ public: { config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 1 ); + config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 ); } void setup( const Config::ParameterContainer& parameters ) { this->loops = parameters.getParameter< unsigned >( "loops" ); this->minTime = parameters.getParameter< double >( "min-time" ); - const unsigned verbose = parameters.getParameter< unsigned >( "verbose" ); + const int verbose = parameters.getParameter< unsigned >( "verbose" ); Logging::setVerbose( verbose ); } // TODO: ensure that this is not called in the middle of the benchmark @@ -451,13 +453,13 @@ public: { result.time = std::numeric_limits::quiet_NaN(); try { - if( verbose ) { + if( verbose 
> 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); result.time = timeFunction( compute, reset, loops, minTime, monitor ); } else { - result.time = timeFunction( compute, reset, minTime, loops, monitor ); + result.time = timeFunction( compute, reset, loops, minTime, monitor ); } } catch ( const std::exception& e ) { diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 4f839faf7..d9958e29c 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -111,7 +111,6 @@ void setupConfig( Config::ConfigDescription& config ) config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 ); config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 ); config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 ); - config.addEntry< bool >( "verbose", "Verbose mode.", true ); Benchmark::configSetup( config ); -- GitLab From 09467575801555fe35a275763980c0e07ebb0558 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Tue, 25 Dec 2018 13:11:48 +0100 Subject: [PATCH 016/130] Added pure-C test to traversers benchmark. 
--- src/Benchmarks/Benchmarks.h | 2 +- .../Traversers/GridTraversersBenchmark.h | 174 ++++++++++++++++-- .../Traversers/tnl-benchmark-traversers.h | 66 +++++-- 3 files changed, 208 insertions(+), 34 deletions(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 7a6b12676..c371e2dfb 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -48,7 +48,7 @@ timeFunction( ComputeFunction compute, Timer timer; // set timer to the monitor - //monitor.setTimer( timer ); + monitor.setTimer( timer ); // warm up reset(); diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index 0190532c3..ee18adfa6 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -53,9 +53,37 @@ class WriteOneUserData using MeshType = typename MeshFunctionPointer::ObjectType::MeshType; MeshFunctionPointer u; - }; - + +template< typename Real, + typename Index > +__global__ void simpleCudaKernel1D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( threadIdx_x < size ) + v_data[ threadIdx_x ] = 1.0; +} + +template< typename Real, + typename Index > +__global__ void simpleCudaKernel2D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + if( threadIdx_x < size && threadIdx_y < size ) + v_data[ threadIdx_y * size + threadIdx_x ] = 1.0; +} + +template< typename Real, + typename Index > +__global__ void simpleCudaKernel3D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * 
blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; + if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size ) + v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0; +} template< int Dimension, typename Device, @@ -85,19 +113,55 @@ class GridTraversersBenchmark< 1, Device, Real, Index > :v( size ), size( size ), grid( size ), u( grid ) { userData.u = this->u; + v_data = v.getData(); + } + + void reset() + { + v.setValue( 0.0 ); + u->getData().setValue( 0.0 ); + }; + + void writeOneUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + for( int i = 0; i < size; i++ ) + v_data[ i ] = 1.0; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size ); + dim3 gridIdx; + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + simpleCudaKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } } void writeOneUsingParallelFor() { - auto f = [] __cuda_callable__ ( Index i, Real* data ) { data[ i ] = 1.0; }; - ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } - + void writeOneUsingTraverser() { traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > @@ -108,6 +172,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > Index size; Vector v; + Real* v_data; GridPointer grid; MeshFunctionPointer u; Traverser traverser; @@ -133,11 +198,52 @@ class GridTraversersBenchmark< 2, Device, Real, Index > using TraverserUserData = WriteOneUserData< MeshFunctionPointer >; using WriteOneTraverserUserDataType 
= WriteOneUserData< MeshFunctionPointer >; using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; - + GridTraversersBenchmark( Index size ) :size( size ), v( size * size ), grid( size, size ), u( grid ) { userData.u = this->u; + v_data = v.getData(); + } + + void reset() + { + v.setValue( 0.0 ); + u->getData().setValue( 0.0 ); + }; + + void writeOneUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + for( int i = 0; i < size; i++ ) + for( int j = 0; j < size; j++ ) + v_data[ i * size + j ] = 1.0; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size, + size ); + dim3 gridIdx; + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + simpleCudaKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } } void writeOneUsingParallelFor() @@ -154,18 +260,18 @@ class GridTraversersBenchmark< 2, Device, Real, Index > this->size, f, v.getData() ); } - + void writeOneUsingTraverser() { traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > ( grid, userData ); } - protected: Index size; Vector v; + Real* v_data; GridPointer grid; MeshFunctionPointer u; Traverser traverser; @@ -178,7 +284,7 @@ template< typename Device, class GridTraversersBenchmark< 3, Device, Real, Index > { public: - + using Vector = Containers::Vector< Real, Device, Index >; using Grid = Meshes::Grid< 3, Real, Device, Index >; using GridPointer = Pointers::SharedPointer< Grid >; @@ -198,6 +304,50 @@ class GridTraversersBenchmark< 3, Device, Real, Index > u( grid ) { userData.u = this->u; + v_data = v.getData(); + } + + void reset() + { + v.setValue( 0.0 ); + 
u->getData().setValue( 0.0 ); + }; + + void writeOneUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + for( int i = 0; i < size; i++ ) + for( int j = 0; j < size; j++ ) + for( int k = 0; k < size; k++ ) + v_data[ ( i * size + j ) * size + k ] = 1.0; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size, + size, + size ); + dim3 gridIdx; + for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ ) + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + simpleCudaKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } } void writeOneUsingParallelFor() @@ -227,6 +377,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > Index size; Vector v; + Real* v_data; GridPointer grid; MeshFunctionPointer u; Traverser traverser; @@ -235,7 +386,4 @@ class GridTraversersBenchmark< 3, Device, Real, Index > } // namespace Benchmarks -} // namespace TNL - - - +} // namespace TNL \ No newline at end of file diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index d9958e29c..f1c4efeed 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -39,21 +39,50 @@ bool runBenchmark( const Config::ParameterContainer& parameters, // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); const int minSize = parameters.getParameter< int >( "min-size" ); const int maxSize = parameters.getParameter< int >( "max-size" ); - + // Full grid traversing - benchmark.newBenchmark( String("Full grid traversing " + convertToString( Dimension ) + "D" ), metadata ); + 
benchmark.newBenchmark( String("Full grid traversing - write 1" + convertToString( Dimension ) + "D" ), metadata ); for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size ); GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size ); - auto reset = [&]() {}; + auto noReset = []() {}; + + auto hostReset = [&]() + { + hostTraverserBenchmark.reset(); + }; + + auto cudaReset = [&]() + { + cudaTraverserBenchmark.reset(); + }; benchmark.setMetadataColumns( Benchmark::MetadataColumns( { {"size", convertToString( size ) }, } ) ); + /**** + * Write one using C for + */ + auto hostWriteOneUsingPureC = [&] () + { + hostTraverserBenchmark.writeOneUsingPureC(); + }; + + auto cudaWriteOneUsingPureC = [&] () + { + cudaTraverserBenchmark.writeOneUsingPureC(); + }; + + benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time( hostReset, "CPU", hostWriteOneUsingPureC ); +#ifdef HAVE_CUDA + benchmark.time( cudaReset, "GPU", cudaWriteOneUsingPureC ); +#endif + /**** * Write one using parallel for */ @@ -67,10 +96,10 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.writeOneUsingParallelFor(); }; - benchmark.setOperation( "write 1 using parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time( reset, "CPU", hostWriteOneUsingParallelFor ); + benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time( hostReset, "CPU", hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA - benchmark.time( reset, "GPU", cudaWriteOneUsingParallelFor ); + benchmark.time( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); #endif /**** @@ -84,16 +113,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters, auto cudaWriteOneUsingTraverser = 
[&] () { cudaTraverserBenchmark.writeOneUsingTraverser(); - }; - - benchmark.setOperation( "write 1 using traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time( reset, "CPU", hostWriteOneUsingTraverser ); + } + + benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time( hostReset, "CPU", hostWriteOneUsingTraverser ); #ifdef HAVE_CUDA - benchmark.time( reset, "GPU", cudaWriteOneUsingTraverser ); + benchmark.time( cudaReset, "GPU", cudaWriteOneUsingTraverser ); #endif - - - } + } return true; } @@ -107,16 +134,16 @@ void setupConfig( Config::ConfigDescription& config ) config.addEntryEnum( "float" ); config.addEntryEnum( "double" ); config.addEntryEnum( "all" ); - config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 ); + config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 ); config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 ); config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 ); config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. 
First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 ); Benchmark::configSetup( config ); - + config.addDelimiter( "Device settings:" ); Devices::Host::configSetup( config ); - Devices::Cuda::configSetup( config ); + Devices::Cuda::configSetup( config ); } template< int Dimension > @@ -126,18 +153,17 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) const String & outputMode = parameters.getParameter< String >( "output-mode" ); const String & precision = parameters.getParameter< String >( "precision" ); const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" ); - Benchmark benchmark; //( loops, verbose ); benchmark.setup( parameters ); Benchmark::MetadataMap metadata = getHardwareMetadata(); runBenchmark< Dimension >( parameters, benchmark, metadata ); - + auto mode = std::ios::out; if( outputMode == "append" ) mode |= std::ios::app; std::ofstream logFile( logFileName.getString(), mode ); - + if( ! benchmark.save( logFile ) ) { std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl; -- GitLab From 7c172b2fb04407b9c5d6175fe9e5d1ace61f5b1e Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Tue, 25 Dec 2018 17:42:34 +0100 Subject: [PATCH 017/130] CUDA device synchronization is performed only for CUDA benchmarks. 
--- src/Benchmarks/BLAS/array-operations.h | 24 ++-- src/Benchmarks/BLAS/spmv.h | 4 +- src/Benchmarks/BLAS/vector-operations.h | 58 +++++----- src/Benchmarks/Benchmarks.h | 103 ++++++++++-------- .../DistSpMV/tnl-benchmark-distributed-spmv.h | 4 +- src/Benchmarks/LinearSolvers/benchmarks.h | 4 +- .../Traversers/tnl-benchmark-traversers.h | 16 +-- 7 files changed, 111 insertions(+), 102 deletions(-) diff --git a/src/Benchmarks/BLAS/array-operations.h b/src/Benchmarks/BLAS/array-operations.h index 9ee6ff8a0..b5cf9ff58 100644 --- a/src/Benchmarks/BLAS/array-operations.h +++ b/src/Benchmarks/BLAS/array-operations.h @@ -72,9 +72,9 @@ benchmarkArrayOperations( Benchmark & benchmark, resultDevice = (int) deviceArray == deviceArray2; }; benchmark.setOperation( "comparison (operator==)", 2 * datasetSize ); - benchmark.time( reset1, "CPU", compareHost ); + benchmark.time< Devices::Host >( reset1, "CPU", compareHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", compareCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", compareCuda ); #endif @@ -87,9 +87,9 @@ benchmarkArrayOperations( Benchmark & benchmark, benchmark.setOperation( "copy (operator=)", 2 * datasetSize ); // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will // complain when compiling without CUDA - const double copyBasetime = benchmark.time( reset1, "CPU", copyAssignHostHost ); + const double copyBasetime = benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", copyAssignCudaCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", copyAssignCudaCuda ); #endif @@ -101,8 +101,8 @@ benchmarkArrayOperations( Benchmark & benchmark, }; #ifdef HAVE_CUDA benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime ); - benchmark.time( reset1, "CPU->GPU", copyAssignHostCuda ); - benchmark.time( reset1, "GPU->CPU", copyAssignCudaHost ); + benchmark.time< Devices::Cuda >( reset1, "CPU->GPU", copyAssignHostCuda ); + 
benchmark.time< Devices::Cuda >( reset1, "GPU->CPU", copyAssignCudaHost ); #endif @@ -113,9 +113,9 @@ benchmarkArrayOperations( Benchmark & benchmark, deviceArray.setValue( 3.0 ); }; benchmark.setOperation( "setValue", datasetSize ); - benchmark.time( reset1, "CPU", setValueHost ); + benchmark.time< Devices::Host >( reset1, "CPU", setValueHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", setValueCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", setValueCuda ); #endif @@ -132,9 +132,9 @@ benchmarkArrayOperations( Benchmark & benchmark, #endif }; benchmark.setOperation( "allocation (setSize)", datasetSize ); - benchmark.time( resetSize1, "CPU", setSizeHost ); + benchmark.time< Devices::Host >( resetSize1, "CPU", setSizeHost ); #ifdef HAVE_CUDA - benchmark.time( resetSize1, "GPU", setSizeCuda ); + benchmark.time< Devices::Cuda >( resetSize1, "GPU", setSizeCuda ); #endif @@ -151,9 +151,9 @@ benchmarkArrayOperations( Benchmark & benchmark, #endif }; benchmark.setOperation( "deallocation (reset)", datasetSize ); - benchmark.time( setSize1, "CPU", resetSizeHost ); + benchmark.time< Devices::Host >( setSize1, "CPU", resetSizeHost ); #ifdef HAVE_CUDA - benchmark.time( setSize1, "GPU", resetSizeCuda ); + benchmark.time< Devices::Cuda >( setSize1, "GPU", resetSizeCuda ); #endif return true; diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h index 9df40f4ec..7299f828a 100644 --- a/src/Benchmarks/BLAS/spmv.h +++ b/src/Benchmarks/BLAS/spmv.h @@ -163,9 +163,9 @@ benchmarkSpMV( Benchmark & benchmark, }; benchmark.setOperation( datasetSize ); - benchmark.time( reset, "CPU", spmvHost ); + benchmark.time< Devices::Host >( reset, "CPU", spmvHost ); #ifdef HAVE_CUDA - benchmark.time( reset, "GPU", spmvCuda ); + benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda ); #endif return true; diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h index 8dd63de85..e191b8fbb 100644 --- 
a/src/Benchmarks/BLAS/vector-operations.h +++ b/src/Benchmarks/BLAS/vector-operations.h @@ -90,9 +90,9 @@ benchmarkVectorOperations( Benchmark & benchmark, resultDevice = deviceVector.max(); }; benchmark.setOperation( "max", datasetSize ); - benchmark.time( reset1, "CPU", maxHost ); + benchmark.time< Devices::Host >( reset1, "CPU", maxHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", maxCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", maxCuda ); #endif @@ -103,9 +103,9 @@ benchmarkVectorOperations( Benchmark & benchmark, resultDevice = deviceVector.min(); }; benchmark.setOperation( "min", datasetSize ); - benchmark.time( reset1, "CPU", minHost ); + benchmark.time< Devices::Host >( reset1, "CPU", minHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", minCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", minCuda ); #endif @@ -125,10 +125,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "absMax", datasetSize ); - benchmark.time( reset1, "CPU", absMaxHost ); + benchmark.time< Devices::Host >( reset1, "CPU", absMaxHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", absMaxCuda ); - benchmark.time( reset1, "cuBLAS", absMaxCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", absMaxCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMaxCublas ); #endif @@ -148,10 +148,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "absMin", datasetSize ); - benchmark.time( reset1, "CPU", absMinHost ); + benchmark.time< Devices::Host >( reset1, "CPU", absMinHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", absMinCuda ); - benchmark.time( reset1, "cuBLAS", absMinCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", absMinCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMinCublas ); #endif @@ -162,9 +162,9 @@ benchmarkVectorOperations( Benchmark & benchmark, resultDevice = deviceVector.sum(); }; benchmark.setOperation( "sum", 
datasetSize ); - benchmark.time( reset1, "CPU", sumHost ); + benchmark.time< Devices::Host >( reset1, "CPU", sumHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", sumCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", sumCuda ); #endif @@ -182,10 +182,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "l1 norm", datasetSize ); - benchmark.time( reset1, "CPU", l1normHost ); + benchmark.time< Devices::Host >( reset1, "CPU", l1normHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", l1normCuda ); - benchmark.time( reset1, "cuBLAS", l1normCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", l1normCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l1normCublas ); #endif @@ -203,10 +203,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "l2 norm", datasetSize ); - benchmark.time( reset1, "CPU", l2normHost ); + benchmark.time< Devices::Host >( reset1, "CPU", l2normHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", l2normCuda ); - benchmark.time( reset1, "cuBLAS", l2normCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", l2normCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l2normCublas ); #endif @@ -217,9 +217,9 @@ benchmarkVectorOperations( Benchmark & benchmark, resultDevice = deviceVector.lpNorm( 3.0 ); }; benchmark.setOperation( "l3 norm", datasetSize ); - benchmark.time( reset1, "CPU", l3normHost ); + benchmark.time< Devices::Host >( reset1, "CPU", l3normHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", l3normCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", l3normCuda ); #endif @@ -238,10 +238,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "scalar product", 2 * datasetSize ); - benchmark.time( reset1, "CPU", scalarProductHost ); + benchmark.time< Devices::Host >( reset1, "CPU", scalarProductHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", scalarProductCuda ); - 
benchmark.time( reset1, "cuBLAS", scalarProductCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", scalarProductCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", scalarProductCublas ); #endif /* @@ -289,10 +289,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "scalar multiplication", 2 * datasetSize ); - benchmark.time( reset1, "CPU", multiplyHost ); + benchmark.time< Devices::Host >( reset1, "CPU", multiplyHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", multiplyCuda ); - benchmark.time( reset1, "cuBLAS", multiplyCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", multiplyCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", multiplyCublas ); #endif @@ -312,10 +312,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "vector addition", 3 * datasetSize ); - benchmark.time( reset1, "CPU", addVectorHost ); + benchmark.time< Devices::Host >( reset1, "CPU", addVectorHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", addVectorCuda ); - benchmark.time( reset1, "cuBLAS", addVectorCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", addVectorCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", addVectorCublas ); #endif diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index c371e2dfb..435e70373 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -34,53 +34,60 @@ namespace Benchmarks { const double oneGB = 1024.0 * 1024.0 * 1024.0; -template< typename ComputeFunction, - typename ResetFunction, - typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > -double -timeFunction( ComputeFunction compute, - ResetFunction reset, - int loops, - const double& minTime, - Monitor && monitor = Monitor() ) +template< typename Device > +class FunctionTimer { - // the timer is constructed zero-initialized and stopped - Timer timer; - - // set timer to the monitor - monitor.setTimer( 
timer ); - - // warm up - reset(); - compute(); - - //timer.start(); - int i; - for( i = 0; - i < loops || timer.getRealTime() < minTime; - ++i) - { - // abuse the monitor's "time" for loops - monitor.setTime( i + 1 ); - - reset(); - - // Explicit synchronization of the CUDA device - // TODO: not necessary for host computations -#ifdef HAVE_CUDA - cudaDeviceSynchronize(); + public: + using DeviceType = Device; + + template< typename ComputeFunction, + typename ResetFunction, + typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > + static double + timeFunction( ComputeFunction compute, + ResetFunction reset, + int loops, + const double& minTime, + Monitor && monitor = Monitor() ) + { + // the timer is constructed zero-initialized and stopped + Timer timer; + + // set timer to the monitor + monitor.setTimer( timer ); + + // warm up + reset(); + compute(); + + //timer.start(); + int i; + for( i = 0; + i < loops || timer.getRealTime() < minTime; + ++i) + { + // abuse the monitor's "time" for loops + monitor.setTime( i + 1 ); + + reset(); + + // Explicit synchronization of the CUDA device +#ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); #endif - timer.start(); - compute(); + timer.start(); + compute(); #ifdef HAVE_CUDA - cudaDeviceSynchronize(); + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); #endif - timer.stop(); - } - - return timer.getRealTime() / ( double ) i; -} + timer.stop(); + } + return timer.getRealTime() / ( double ) i; + } +}; class Logging { @@ -443,7 +450,8 @@ public: // "speedup" columns. // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation) // Also terminates the recursion of the following variadic template. 
- template< typename ResetFunction, + template< typename Device, + typename ResetFunction, typename ComputeFunction > double time( ResetFunction reset, @@ -456,10 +464,10 @@ public: if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); - result.time = timeFunction( compute, reset, loops, minTime, monitor ); + result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, monitor ); } else { - result.time = timeFunction( compute, reset, loops, minTime, monitor ); + result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, monitor ); } } catch ( const std::exception& e ) { @@ -477,7 +485,8 @@ public: return this->baseTime; } - template< typename ResetFunction, + template< typename Device, + typename ResetFunction, typename ComputeFunction, typename... NextComputations > inline double @@ -486,7 +495,7 @@ public: ComputeFunction & compute ) { BenchmarkResult result; - return time( reset, performer, compute, result ); + return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result ); } // Adds an error message to the log. 
Should be called in places where the diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h index a3bd76753..55c6bc156 100644 --- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h +++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h @@ -62,7 +62,7 @@ benchmarkSpmv( Benchmark& benchmark, matrix.vectorProduct( x, y ); }; - benchmark.time( reset, performer, compute ); + benchmark.time< typename Matrix::DeviceType >( reset, performer, compute ); } template< typename Matrix, typename Vector > @@ -114,7 +114,7 @@ benchmarkDistributedSpmv( Benchmark& benchmark, Matrix::CommunicatorType::Barrier( matrix.getCommunicationGroup() ); }; - benchmark.time( reset, performer, compute ); + benchmark.time< typename Matrix::DeviceType >( reset, performer, compute ); } template< typename Matrix, typename Vector > diff --git a/src/Benchmarks/LinearSolvers/benchmarks.h b/src/Benchmarks/LinearSolvers/benchmarks.h index a82ec2dc2..c6278a76b 100644 --- a/src/Benchmarks/LinearSolvers/benchmarks.h +++ b/src/Benchmarks/LinearSolvers/benchmarks.h @@ -73,7 +73,7 @@ benchmarkPreconditionerUpdate( Benchmark& benchmark, barrier( matrix ); }; - benchmark.time( reset, performer, compute ); + benchmark.time< typename Matrix::DeviceType >( reset, performer, compute ); } template< template class Solver, template class Preconditioner, typename Matrix, typename Vector > @@ -166,7 +166,7 @@ benchmarkSolver( Benchmark& benchmark, }; MyBenchmarkResult benchmarkResult( solver, matrix, x, b ); - benchmark.time( reset, performer, compute, benchmarkResult ); + benchmark.time< typename Matrix::DeviceType >( reset, performer, compute, benchmarkResult ); } #ifdef HAVE_ARMADILLO diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index f1c4efeed..9e80b0d06 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ 
b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -41,7 +41,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, const int maxSize = parameters.getParameter< int >( "max-size" ); // Full grid traversing - benchmark.newBenchmark( String("Full grid traversing - write 1" + convertToString( Dimension ) + "D" ), metadata ); + benchmark.newBenchmark( String("Full grid traversing - write 1 " + convertToString( Dimension ) + "D" ), metadata ); for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { @@ -78,9 +78,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time( hostReset, "CPU", hostWriteOneUsingPureC ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); #ifdef HAVE_CUDA - benchmark.time( cudaReset, "GPU", cudaWriteOneUsingPureC ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); #endif /**** @@ -97,9 +97,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time( hostReset, "CPU", hostWriteOneUsingParallelFor ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA - benchmark.time( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); #endif /**** @@ -113,12 +113,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters, auto cudaWriteOneUsingTraverser = [&] () { cudaTraverserBenchmark.writeOneUsingTraverser(); - } + }; benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time( hostReset, "CPU", hostWriteOneUsingTraverser ); + benchmark.time< Devices::Host >( hostReset, "CPU", 
hostWriteOneUsingTraverser ); #ifdef HAVE_CUDA - benchmark.time( cudaReset, "GPU", cudaWriteOneUsingTraverser ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser ); #endif } return true; -- GitLab From a81c62c62f475f15a21d7adf60d8e4dcf772613c Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Tue, 25 Dec 2018 20:59:18 +0100 Subject: [PATCH 018/130] Added benchmark function timing without reset function to measure CPU cache effect. --- src/Benchmarks/Benchmarks.h | 129 +++++++++++++++--- .../Traversers/tnl-benchmark-traversers.h | 9 +- 2 files changed, 114 insertions(+), 24 deletions(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 435e70373..6ca7c3830 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -46,46 +46,84 @@ class FunctionTimer static double timeFunction( ComputeFunction compute, ResetFunction reset, - int loops, - const double& minTime, - Monitor && monitor = Monitor() ) + int maxLoops, + const double& minTime, + int verbose = 1, + Monitor && monitor = Monitor(), + bool performReset = true ) { // the timer is constructed zero-initialized and stopped Timer timer; // set timer to the monitor - monitor.setTimer( timer ); + if( verbose > 1 ) + monitor.setTimer( timer ); // warm up reset(); compute(); - //timer.start(); - int i; - for( i = 0; - i < loops || timer.getRealTime() < minTime; - ++i) + int loops; + // If we do not perform reset function and don't need + // the monitor, the timer is not interrupted after each loop. + if( ! 
performReset && verbose < 2 ) { - // abuse the monitor's "time" for loops - monitor.setTime( i + 1 ); - - reset(); - + timer.start(); // Explicit synchronization of the CUDA device #ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); -#endif - timer.start(); - compute(); + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); +#endif + for( loops = 0; + loops < maxLoops || timer.getRealTime() < minTime; + ++loops) + compute(); + // Explicit synchronization of the CUDA device #ifdef HAVE_CUDA if( std::is_same< Device, Devices::Cuda >::value ) cudaDeviceSynchronize(); #endif timer.stop(); } + else + { + for( loops = 0; + loops < maxLoops || timer.getRealTime() < minTime; + ++loops) + { + // abuse the monitor's "time" for loops + monitor.setTime( loops + 1 ); + + reset(); + + // Explicit synchronization of the CUDA device +#ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); +#endif + timer.start(); + compute(); +#ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); +#endif + timer.stop(); + } + } + return timer.getRealTime() / ( double ) loops; + } - return timer.getRealTime() / ( double ) i; + template< typename ComputeFunction, + typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > + static double + timeFunction( ComputeFunction compute, + int maxLoops, + const double& minTime, + int verbose = 1, + Monitor && monitor = Monitor() ) + { + auto noReset = [] () {}; + return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor, false ); } }; @@ -464,10 +502,10 @@ public: if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); - result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, monitor ); + result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); } else { 
- result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, monitor ); + result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); } } catch ( const std::exception& e ) { @@ -497,6 +535,53 @@ public: BenchmarkResult result; return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result ); } + + /**** + * The same methods as above but without reset function + */ + template< typename Device, + typename ComputeFunction > + double + time( const String & performer, + ComputeFunction & compute, + BenchmarkResult & result ) + { + result.time = std::numeric_limits::quiet_NaN(); + try { + if( verbose > 1 ) { + // run the monitor main loop + Solvers::SolverMonitorThread monitor_thread( monitor ); + result.time = FunctionTimer< Device >::timeFunction( compute, loops, minTime, verbose, monitor ); + } + else { + result.time = FunctionTimer< Device >::timeFunction( compute, loops, minTime, verbose, monitor ); + } + } + catch ( const std::exception& e ) { + std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl; + } + + result.bandwidth = datasetSize / result.time; + result.speedup = this->baseTime / result.time; + if( this->baseTime == 0.0 ) + this->baseTime = result.time; + + writeTableHeader( performer, result.getTableHeader() ); + writeTableRow( performer, result.getRowElements() ); + + return this->baseTime; + } + + template< typename Device, + typename ComputeFunction, + typename... NextComputations > + inline double + time( const String & performer, + ComputeFunction & compute ) + { + BenchmarkResult result; + return time< Device, ComputeFunction >( performer, compute, result ); + } // Adds an error message to the log. Should be called in places where the // "time" method could not be called (e.g. due to failed allocation). 
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 9e80b0d06..6d2ed7cea 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -48,8 +48,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters, GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size ); GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size ); - auto noReset = []() {}; - auto hostReset = [&]() { hostTraverserBenchmark.reset(); @@ -78,10 +76,17 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingPureC ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC ); +#endif + + benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); #endif + /**** * Write one using parallel for -- GitLab From 2db4825dff1f092960db75dd2f08ae327e6e92c6 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Tue, 25 Dec 2018 21:58:19 +0100 Subject: [PATCH 019/130] Added traversers benchmark tests without reseting. 
--- .../Traversers/tnl-benchmark-traversers.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 6d2ed7cea..53b29b92a 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -102,6 +102,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelFor ); +#endif + + benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); @@ -113,7 +119,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, auto hostWriteOneUsingTraverser = [&] () { hostTraverserBenchmark.writeOneUsingTraverser(); - }; + }; auto cudaWriteOneUsingTraverser = [&] () { @@ -125,6 +131,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters, #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser ); #endif + + benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingTraverser ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingTraverser ); +#endif + } return true; } -- GitLab From b1676595638b180f19af034930510a8c421109ce Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Tue, 25 Dec 2018 23:20:53 +0100 Subject: [PATCH 020/130] Splitting Benchmarks.h into 
Benchmarks, Logging and FunctionTimer. --- src/Benchmarks/Benchmarks.h | 313 +-------------------------------- src/Benchmarks/CMakeLists.txt | 2 + src/Benchmarks/FunctionTimer.h | 119 +++++++++++++ src/Benchmarks/Logging.h | 240 +++++++++++++++++++++++++ 4 files changed, 366 insertions(+), 308 deletions(-) create mode 100644 src/Benchmarks/FunctionTimer.h create mode 100644 src/Benchmarks/Logging.h diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 6ca7c3830..0770680d2 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -8,20 +8,20 @@ /* See Copyright Notice in tnl/Copyright */ -// Implemented by: Jakub Klinkovsky +// Implemented by: Jakub Klinkovsky, +// Tomas Oberhuber #pragma once +#include "FunctionTimer.h" +#include "Logging.h" + #include #include -#include -#include #include #include -#include #include -#include #include #include @@ -34,309 +34,6 @@ namespace Benchmarks { const double oneGB = 1024.0 * 1024.0 * 1024.0; -template< typename Device > -class FunctionTimer -{ - public: - using DeviceType = Device; - - template< typename ComputeFunction, - typename ResetFunction, - typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > - static double - timeFunction( ComputeFunction compute, - ResetFunction reset, - int maxLoops, - const double& minTime, - int verbose = 1, - Monitor && monitor = Monitor(), - bool performReset = true ) - { - // the timer is constructed zero-initialized and stopped - Timer timer; - - // set timer to the monitor - if( verbose > 1 ) - monitor.setTimer( timer ); - - // warm up - reset(); - compute(); - - int loops; - // If we do not perform reset function and don't need - // the monitor, the timer is not interrupted after each loop. - if( ! 
performReset && verbose < 2 ) - { - timer.start(); - // Explicit synchronization of the CUDA device -#ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); -#endif - for( loops = 0; - loops < maxLoops || timer.getRealTime() < minTime; - ++loops) - compute(); - // Explicit synchronization of the CUDA device -#ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); -#endif - timer.stop(); - } - else - { - for( loops = 0; - loops < maxLoops || timer.getRealTime() < minTime; - ++loops) - { - // abuse the monitor's "time" for loops - monitor.setTime( loops + 1 ); - - reset(); - - // Explicit synchronization of the CUDA device -#ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); -#endif - timer.start(); - compute(); -#ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); -#endif - timer.stop(); - } - } - return timer.getRealTime() / ( double ) loops; - } - - template< typename ComputeFunction, - typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > - static double - timeFunction( ComputeFunction compute, - int maxLoops, - const double& minTime, - int verbose = 1, - Monitor && monitor = Monitor() ) - { - auto noReset = [] () {}; - return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor, false ); - } -}; - -class Logging -{ -public: - using MetadataElement = std::pair< const char*, String >; - using MetadataMap = std::map< const char*, String >; - using MetadataColumns = std::vector; - - using HeaderElements = std::vector< String >; - using RowElements = std::vector< double >; - - Logging( int verbose = true ) - : verbose(verbose) - {} - - void - setVerbose( int verbose) - { - this->verbose = verbose; - } - - void - writeTitle( const String & title ) - { - if( verbose ) - std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl; - log << ": title = " 
<< title << std::endl; - } - - void - writeMetadata( const MetadataMap & metadata ) - { - if( verbose ) - std::cout << "properties:" << std::endl; - - for( auto & it : metadata ) { - if( verbose ) - std::cout << " " << it.first << " = " << it.second << std::endl; - log << ": " << it.first << " = " << it.second << std::endl; - } - if( verbose ) - std::cout << std::endl; - } - - void - writeTableHeader( const String & spanningElement, - const HeaderElements & subElements ) - { - if( verbose && header_changed ) { - for( auto & it : metadataColumns ) { - std::cout << std::setw( 20 ) << it.first; - } - - // spanning element is printed as usual column to stdout, - // but is excluded from header - std::cout << std::setw( 15 ) << ""; - - for( auto & it : subElements ) { - std::cout << std::setw( 15 ) << it; - } - std::cout << std::endl; - - header_changed = false; - } - - // initial indent string - header_indent = "!"; - log << std::endl; - for( auto & it : metadataColumns ) { - log << header_indent << " " << it.first << std::endl; - } - - // dump stacked spanning columns - if( horizontalGroups.size() > 0 ) - while( horizontalGroups.back().second <= 0 ) { - horizontalGroups.pop_back(); - header_indent.pop_back(); - } - for( size_t i = 0; i < horizontalGroups.size(); i++ ) { - if( horizontalGroups[ i ].second > 0 ) { - log << header_indent << " " << horizontalGroups[ i ].first << std::endl; - header_indent += "!"; - } - } - - log << header_indent << " " << spanningElement << std::endl; - for( auto & it : subElements ) { - log << header_indent << "! 
" << it << std::endl; - } - - if( horizontalGroups.size() > 0 ) { - horizontalGroups.back().second--; - header_indent.pop_back(); - } - } - - void - writeTableRow( const String & spanningElement, - const RowElements & subElements ) - { - if( verbose ) { - for( auto & it : metadataColumns ) { - std::cout << std::setw( 20 ) << it.second; - } - // spanning element is printed as usual column to stdout - std::cout << std::setw( 15 ) << spanningElement; - for( auto & it : subElements ) { - std::cout << std::setw( 15 ); - if( it != 0.0 )std::cout << it; - else std::cout << "N/A"; - } - std::cout << std::endl; - } - - // only when changed (the header has been already adjusted) - // print each element on separate line - for( auto & it : metadataColumns ) { - log << it.second << std::endl; - } - - // benchmark data are indented - const String indent = " "; - for( auto & it : subElements ) { - if( it != 0.0 ) log << indent << it << std::endl; - else log << indent << "N/A" << std::endl; - } - } - - void - writeErrorMessage( const char* msg, - int colspan = 1 ) - { - // initial indent string - header_indent = "!"; - log << std::endl; - for( auto & it : metadataColumns ) { - log << header_indent << " " << it.first << std::endl; - } - - // make sure there is a header column for the message - if( horizontalGroups.size() == 0 ) - horizontalGroups.push_back( {"", 1} ); - - // dump stacked spanning columns - while( horizontalGroups.back().second <= 0 ) { - horizontalGroups.pop_back(); - header_indent.pop_back(); - } - for( size_t i = 0; i < horizontalGroups.size(); i++ ) { - if( horizontalGroups[ i ].second > 0 ) { - log << header_indent << " " << horizontalGroups[ i ].first << std::endl; - header_indent += "!"; - } - } - if( horizontalGroups.size() > 0 ) { - horizontalGroups.back().second -= colspan; - header_indent.pop_back(); - } - - // only when changed (the header has been already adjusted) - // print each element on separate line - for( auto & it : metadataColumns ) { - log << 
it.second << std::endl; - } - log << msg << std::endl; - } - - void - closeTable() - { - log << std::endl; - header_indent = body_indent = ""; - header_changed = true; - horizontalGroups.clear(); - } - - bool save( std::ostream & logFile ) - { - closeTable(); - logFile << log.str(); - if( logFile.good() ) { - log.str() = ""; - return true; - } - return false; - } - -protected: - - // manual double -> String conversion with fixed precision - static String - _to_string( double num, int precision = 0, bool fixed = false ) - { - std::stringstream str; - if( fixed ) - str << std::fixed; - if( precision ) - str << std::setprecision( precision ); - str << num; - return String( str.str().data() ); - } - - std::stringstream log; - std::string header_indent; - std::string body_indent; - - int verbose; - MetadataColumns metadataColumns; - bool header_changed = true; - std::vector< std::pair< String, int > > horizontalGroups; -}; struct BenchmarkResult diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt index d4c2258c9..556dc1604 100644 --- a/src/Benchmarks/CMakeLists.txt +++ b/src/Benchmarks/CMakeLists.txt @@ -7,6 +7,8 @@ add_subdirectory( Traversers ) set( headers Benchmarks.h + FunctionTimer.h + Logging.h ) install( FILES ${headers} DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY}/Benchmarks ) diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h new file mode 100644 index 000000000..091eb4a2a --- /dev/null +++ b/src/Benchmarks/FunctionTimer.h @@ -0,0 +1,119 @@ +/*************************************************************************** + FunctionTimer.h - description + ------------------- + begin : Dec 25, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky, +// Tomas Oberhuber + +#pragma once + +#include + +#include +#include + +namespace TNL { + namespace Benchmarks { + + +template< typename Device > +class FunctionTimer +{ + public: + using DeviceType = Device; + + template< typename ComputeFunction, + typename ResetFunction, + typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > + static double + timeFunction( ComputeFunction compute, + ResetFunction reset, + int maxLoops, + const double& minTime, + int verbose = 1, + Monitor && monitor = Monitor(), + bool performReset = true ) + { + // the timer is constructed zero-initialized and stopped + Timer timer; + + // set timer to the monitor + if( verbose > 1 ) + monitor.setTimer( timer ); + + // warm up + reset(); + compute(); + + int loops; + // If we do not perform reset function and don't need + // the monitor, the timer is not interrupted after each loop. + if( ! 
performReset && verbose < 2 ) + { + timer.start(); + // Explicit synchronization of the CUDA device +#ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); +#endif + for( loops = 0; + loops < maxLoops || timer.getRealTime() < minTime; + ++loops) + compute(); + // Explicit synchronization of the CUDA device +#ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); +#endif + timer.stop(); + } + else + { + for( loops = 0; + loops < maxLoops || timer.getRealTime() < minTime; + ++loops) + { + // abuse the monitor's "time" for loops + monitor.setTime( loops + 1 ); + + reset(); + + // Explicit synchronization of the CUDA device +#ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); +#endif + timer.start(); + compute(); +#ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); +#endif + timer.stop(); + } + } + return timer.getRealTime() / ( double ) loops; + } + + template< typename ComputeFunction, + typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > + static double + timeFunction( ComputeFunction compute, + int maxLoops, + const double& minTime, + int verbose = 1, + Monitor && monitor = Monitor() ) + { + auto noReset = [] () {}; + return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor, false ); + } +}; + + } // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Logging.h b/src/Benchmarks/Logging.h new file mode 100644 index 000000000..b10ab7199 --- /dev/null +++ b/src/Benchmarks/Logging.h @@ -0,0 +1,240 @@ +/*************************************************************************** + Logging.h - description + ------------------- + begin : Dec 25, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky, +// Tomas Oberhuber + +#pragma once + +#include +#include +#include +#include +#include + +namespace TNL { + namespace Benchmarks { + +class Logging +{ + public: + using MetadataElement = std::pair< const char*, String >; + using MetadataMap = std::map< const char*, String >; + using MetadataColumns = std::vector; + + using HeaderElements = std::vector< String >; + using RowElements = std::vector< double >; + + Logging( int verbose = true ) + : verbose(verbose) + {} + + void + setVerbose( int verbose) + { + this->verbose = verbose; + } + + void + writeTitle( const String & title ) + { + if( verbose ) + std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl; + log << ": title = " << title << std::endl; + } + + void + writeMetadata( const MetadataMap & metadata ) + { + if( verbose ) + std::cout << "properties:" << std::endl; + + for( auto & it : metadata ) { + if( verbose ) + std::cout << " " << it.first << " = " << it.second << std::endl; + log << ": " << it.first << " = " << it.second << std::endl; + } + if( verbose ) + std::cout << std::endl; + } + + void + writeTableHeader( const String & spanningElement, + const HeaderElements & subElements ) + { + if( verbose && header_changed ) { + for( auto & it : metadataColumns ) { + std::cout << std::setw( 20 ) << it.first; + } + + // spanning element is printed as usual column to stdout, + // but is excluded from header + std::cout << std::setw( 15 ) << ""; + + for( auto & it : subElements ) { + std::cout << std::setw( 15 ) << it; + } + std::cout << std::endl; + + header_changed = false; + } + + // initial indent string + header_indent = "!"; + log << std::endl; + for( auto & it : metadataColumns ) { + log << header_indent << " " << it.first << std::endl; + } + + // dump stacked spanning columns + 
if( horizontalGroups.size() > 0 ) + while( horizontalGroups.back().second <= 0 ) { + horizontalGroups.pop_back(); + header_indent.pop_back(); + } + for( size_t i = 0; i < horizontalGroups.size(); i++ ) { + if( horizontalGroups[ i ].second > 0 ) { + log << header_indent << " " << horizontalGroups[ i ].first << std::endl; + header_indent += "!"; + } + } + + log << header_indent << " " << spanningElement << std::endl; + for( auto & it : subElements ) { + log << header_indent << "! " << it << std::endl; + } + + if( horizontalGroups.size() > 0 ) { + horizontalGroups.back().second--; + header_indent.pop_back(); + } + } + + void + writeTableRow( const String & spanningElement, + const RowElements & subElements ) + { + if( verbose ) { + for( auto & it : metadataColumns ) { + std::cout << std::setw( 20 ) << it.second; + } + // spanning element is printed as usual column to stdout + std::cout << std::setw( 15 ) << spanningElement; + for( auto & it : subElements ) { + std::cout << std::setw( 15 ); + if( it != 0.0 )std::cout << it; + else std::cout << "N/A"; + } + std::cout << std::endl; + } + + // only when changed (the header has been already adjusted) + // print each element on separate line + for( auto & it : metadataColumns ) { + log << it.second << std::endl; + } + + // benchmark data are indented + const String indent = " "; + for( auto & it : subElements ) { + if( it != 0.0 ) log << indent << it << std::endl; + else log << indent << "N/A" << std::endl; + } + } + + void + writeErrorMessage( const char* msg, + int colspan = 1 ) + { + // initial indent string + header_indent = "!"; + log << std::endl; + for( auto & it : metadataColumns ) { + log << header_indent << " " << it.first << std::endl; + } + + // make sure there is a header column for the message + if( horizontalGroups.size() == 0 ) + horizontalGroups.push_back( {"", 1} ); + + // dump stacked spanning columns + while( horizontalGroups.back().second <= 0 ) { + horizontalGroups.pop_back(); + 
header_indent.pop_back(); + } + for( size_t i = 0; i < horizontalGroups.size(); i++ ) { + if( horizontalGroups[ i ].second > 0 ) { + log << header_indent << " " << horizontalGroups[ i ].first << std::endl; + header_indent += "!"; + } + } + if( horizontalGroups.size() > 0 ) { + horizontalGroups.back().second -= colspan; + header_indent.pop_back(); + } + + // only when changed (the header has been already adjusted) + // print each element on separate line + for( auto & it : metadataColumns ) { + log << it.second << std::endl; + } + log << msg << std::endl; + } + + void + closeTable() + { + log << std::endl; + header_indent = body_indent = ""; + header_changed = true; + horizontalGroups.clear(); + } + + bool save( std::ostream & logFile ) + { + closeTable(); + logFile << log.str(); + if( logFile.good() ) { + log.str() = ""; + return true; + } + return false; + } + + protected: + + // manual double -> String conversion with fixed precision + static String + _to_string( double num, int precision = 0, bool fixed = false ) + { + std::stringstream str; + if( fixed ) + str << std::fixed; + if( precision ) + str << std::setprecision( precision ); + str << num; + return String( str.str().data() ); + } + + std::stringstream log; + std::string header_indent; + std::string body_indent; + + int verbose; + MetadataColumns metadataColumns; + bool header_changed = true; + std::vector< std::pair< String, int > > horizontalGroups; +}; + + + } // namespace Benchmarks +} // namespace TNL + + -- GitLab From 353260bd44b985b88221d85eee413c3e22d24a23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Thu, 27 Dec 2018 16:48:39 +0100 Subject: [PATCH 021/130] Added traversers benchmarks with boundaries. 
--- .../Traversers/GridTraversersBenchmark.h | 247 +++++++++++++++--- src/Benchmarks/Traversers/cuda-kernels.h | 128 +++++++++ src/Benchmarks/Traversers/grid-traversing.h | 36 --- .../Traversers/tnl-benchmark-traversers.h | 114 +++++++- 4 files changed, 439 insertions(+), 86 deletions(-) create mode 100644 src/Benchmarks/Traversers/cuda-kernels.h delete mode 100644 src/Benchmarks/Traversers/grid-traversing.h diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index ee18adfa6..2f439f988 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -1,5 +1,5 @@ /*************************************************************************** - WriteOne.h - description + GridTraversersBenchmark.h - description ------------------- begin : Dec 19, 2018 copyright : (C) 2018 by oberhuber @@ -21,10 +21,11 @@ #include #include #include +#include "cuda-kernels.h" namespace TNL { namespace Benchmarks { - + namespace Traversers { template< typename TraverserUserData > class WriteOneEntitiesProcessor @@ -55,35 +56,6 @@ class WriteOneUserData MeshFunctionPointer u; }; -template< typename Real, - typename Index > -__global__ void simpleCudaKernel1D( const Index size, const dim3 gridIdx, Real* v_data ) -{ - const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - if( threadIdx_x < size ) - v_data[ threadIdx_x ] = 1.0; -} - -template< typename Real, - typename Index > -__global__ void simpleCudaKernel2D( const Index size, const dim3 gridIdx, Real* v_data ) -{ - const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; - if( threadIdx_x < size && threadIdx_y < size ) - v_data[ threadIdx_y * size + threadIdx_x ] = 1.0; -} - 
-template< typename Real, - typename Index > -__global__ void simpleCudaKernel3D( const Index size, const dim3 gridIdx, Real* v_data ) -{ - const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; - const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; - if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size ) - v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0; -} template< int Dimension, typename Device, @@ -147,12 +119,12 @@ class GridTraversersBenchmark< 1, Device, Real, Index > gridsCount, gridIdx, gridSize ); - simpleCudaKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + fullGridTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); } #endif } } - + void writeOneUsingParallelFor() { auto f = [] __cuda_callable__ ( Index i, Real* data ) @@ -168,6 +140,56 @@ class GridTraversersBenchmark< 1, Device, Real, Index > ( grid, userData ); } + void traverseUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + v_data[ 0 ] = 2; + for( int i = 1; i < size - 1; i++ ) + v_data[ i ] = 1.0; + v_data[ size - 1 ] = 2; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size ); + dim3 gridIdx; + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + boundariesTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + interiorTraverseKernel1D<<< 
gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } + } + + void traverseUsingTraverser() + { + // TODO !!!!!!!!!!!!!!!!!!!!!! + traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + ( grid, userData ); + } + protected: Index size; @@ -240,7 +262,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > gridsCount, gridIdx, gridSize ); - simpleCudaKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + fullGridTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); } #endif } @@ -267,6 +289,69 @@ class GridTraversersBenchmark< 2, Device, Real, Index > ( grid, userData ); } + void traverseUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + for( int i = 0; i < size; i++ ) + { + v_data[ i * size ] = 2.0; + v_data[ i * size + size - 1 ] = 2.0; + } + for( int j = 1; j < size - 1; j++ ) + { + v_data[ j ] = 2.0; + v_data[ ( size - 1 ) * size + j ] = 2.0; + } + + for( int i = 1; i < size - 1; i++ ) + for( int j = 1; j < size - 1; j++ ) + v_data[ i * size + j ] = 1.0; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size, + size ); + dim3 gridIdx; + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + boundariesTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + interiorTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } + } + + void traversingUsingTraverser() + { + // TODO 
!!!!!!!!!!!!!!!!!!!!!! + traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + ( grid, userData ); + } + protected: Index size; @@ -344,12 +429,12 @@ class GridTraversersBenchmark< 3, Device, Real, Index > gridsCount, gridIdx, gridSize ); - simpleCudaKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + fullGridTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); } #endif } } - + void writeOneUsingParallelFor() { Index _size = this->size; @@ -358,20 +443,96 @@ class GridTraversersBenchmark< 3, Device, Real, Index > data[ ( i * _size + j ) * _size + k ] = 1.0; }; - ParallelFor3D< Device >::exec( ( Index ) 0, - ( Index ) 0, - ( Index ) 0, + ParallelFor3D< Device >::exec( ( Index ) 0, + ( Index ) 0, + ( Index ) 0, this->size, this->size, this->size, - f, v.getData() ); + f, v.getData() ); } - + void writeOneUsingTraverser() { traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > ( grid, userData ); - } + } + + void traverseUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + for( int i = 0; i < size; i++ ) + for( int j = 0; j < size; j++ ) + { + v_data[ ( i * size + j ) * size ] = 2.0; + v_data[ ( i * size + j ) * size + size - 1 ] = 2.0; + } + for( int j = 0; j < size; j++ ) + for( int k = 1; k < size - 1; k++ ) + { + v_data[ j * size + k ] = 1.0; + v_data[ ( ( size - 1) * size + j ) * size + k ] = 1.0; + } + + for( int i = 1; i < size -1; i++ ) + for( int k = 1; k < size - 1; k++ ) + { + v_data[ ( i * size ) * size + k ] = 2.0; + v_data[ ( i * size + size - 1 ) * size + k ] = 2.0; + } + + for( int i = 1; i < size -1; i++ ) + for( int j = 1; j < size -1; j++ ) + for( int k = 1; k < size - 1; k++ ) + v_data[ ( i * size + j ) * size + k ] = 1.0; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + 
gridsCount, + size, + size, + size ); + dim3 gridIdx; + for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ ) + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + boundariesTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } + for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ ) + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + interiorTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } + } + + void traverseUsingTraverser() + { + // TODO !!!!!!!!!!!!!!!!!!!!!! + traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + ( grid, userData ); + } protected: @@ -384,6 +545,6 @@ class GridTraversersBenchmark< 3, Device, Real, Index > WriteOneTraverserUserDataType userData; }; - + } // namespace Traversers } // namespace Benchmarks } // namespace TNL \ No newline at end of file diff --git a/src/Benchmarks/Traversers/cuda-kernels.h b/src/Benchmarks/Traversers/cuda-kernels.h new file mode 100644 index 000000000..2cd8b1b56 --- /dev/null +++ b/src/Benchmarks/Traversers/cuda-kernels.h @@ -0,0 +1,128 @@ +/*************************************************************************** + cuda-kernels.h - description + ------------------- + begin : Dec 19, 2018 + copyright : (C) 2018 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +#ifdef HAVE_CUDA + +/**** + * Full grid 
traversing + */ +template< typename Real, + typename Index > +__global__ void fullGridTraverseKernel1D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( threadIdx_x < size ) + v_data[ threadIdx_x ] = 1.0; +} + +template< typename Real, + typename Index > +__global__ void fullGridTraverseKernel2D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + if( threadIdx_x < size && threadIdx_y < size ) + v_data[ threadIdx_y * size + threadIdx_x ] = 1.0; +} + +template< typename Real, + typename Index > +__global__ void fullGridTraverseKernel3D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; + if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size ) + v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0; +} + +/**** + * Traversing interior cells + */ +template< typename Real, + typename Index > +__global__ void interiorTraverseKernel1D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( threadIdx_x > 0 && threadIdx_x < size - 1 ) + v_data[ threadIdx_x ] = 1.0; +} + +template< typename Real, + typename Index > +__global__ void interiorTraverseKernel2D( const Index size, const dim3 gridIdx, Real* v_data 
) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + if( threadIdx_x > 0 && threadIdx_y > 0 && + threadIdx_x < size - 1 && threadIdx_y < size - 1 ) + v_data[ threadIdx_y * size + threadIdx_x ] = 1.0; +} + +template< typename Real, + typename Index > +__global__ void interiorTraverseKernel3D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; + if( threadIdx_x > 0 && threadIdx_y > 0 && threadIdx_z > 0 && + threadIdx_x < size - 1 && threadIdx_y < size - 1 && threadIdx_z < size - 1 ) + v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0; +} + +/**** + * Grid boundaries traversing + */ +template< typename Real, + typename Index > +__global__ void boundariesTraverseKernel1D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( threadIdx_x == 0 || threadIdx_x == size - 1 ) + v_data[ threadIdx_x ] = 2.0; +} + +template< typename Real, + typename Index > +__global__ void boundariesTraverseKernel2D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + if( threadIdx_x > 0 && threadIdx_y > 0 && + threadIdx_x < size - 1 && threadIdx_y < size - 1 ) + v_data[ 
threadIdx_y * size + threadIdx_x ] = 2.0; +} + +template< typename Real, + typename Index > +__global__ void boundariesTraverseKernel3D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; + if( threadIdx_x == 0 || threadIdx_y == 0 || threadIdx_z == 0 || + threadIdx_x == size - 1 || threadIdx_y == size - 1 || threadIdx_z == size - 1 ) + v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 2.0; +} + +#endif + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL + diff --git a/src/Benchmarks/Traversers/grid-traversing.h b/src/Benchmarks/Traversers/grid-traversing.h deleted file mode 100644 index c977fea1c..000000000 --- a/src/Benchmarks/Traversers/grid-traversing.h +++ /dev/null @@ -1,36 +0,0 @@ -/*************************************************************************** - grid-traversing.h - description - ------------------- - begin : Dec 19, 2018 - copyright : (C) 2018 by oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Tomas Oberhuber - -#pragma once - -#include "../Benchmarks.h" - - -#include - -namespace TNL { - namespace Benchmarks { - -template< int Dimension, - typename Real = double, - typename Index = int > -class benchmarkTraversingFullGrid -{ - public: - - static void run ( Benchmark& benchmark, std::size_t size ) - { - - } -}; - } // namespace Benchmarks -} // namespace TNL \ No newline at end of file diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h 
index 53b29b92a..276497f51 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -23,6 +23,7 @@ using namespace TNL; using namespace TNL::Benchmarks; +using namespace TNL::Benchmarks::Traversers; template< int Dimension, @@ -40,13 +41,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters, const int minSize = parameters.getParameter< int >( "min-size" ); const int maxSize = parameters.getParameter< int >( "max-size" ); - // Full grid traversing - benchmark.newBenchmark( String("Full grid traversing - write 1 " + convertToString( Dimension ) + "D" ), metadata ); + /**** + * Full grid traversing + */ + benchmark.newBenchmark( String("Traversing without boundary conditions" + convertToString( Dimension ) + "D" ), metadata ); for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { - GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size ); - GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size ); + GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size ); auto hostReset = [&]() { @@ -86,7 +88,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters, #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); #endif - /**** * Write one using parallel for @@ -94,12 +95,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters, auto hostWriteOneUsingParallelFor = [&] () { hostTraverserBenchmark.writeOneUsingParallelFor(); - }; + }; auto cudaWriteOneUsingParallelFor = [&] () { cudaTraverserBenchmark.writeOneUsingParallelFor(); - }; + }; benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor ); @@ -137,8 +138,107 @@ bool runBenchmark( const Config::ParameterContainer& parameters, 
#ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingTraverser ); #endif + } + + /**** + * Full grid traversing + */ + benchmark.newBenchmark( String("Traversing with boundary conditions" + convertToString( Dimension ) + "D" ), metadata ); + for( std::size_t size = minSize; size <= maxSize; size *= 2 ) + { + GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size ); + GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size ); + + auto hostReset = [&]() + { + hostTraverserBenchmark.reset(); + }; + + auto cudaReset = [&]() + { + cudaTraverserBenchmark.reset(); + }; + + benchmark.setMetadataColumns( + Benchmark::MetadataColumns( + { {"size", convertToString( size ) }, } ) ); + + /**** + * Write one using C for + */ + auto hostTraverseUsingPureC = [&] () + { + hostTraverserBenchmark.traverseUsingPureC(); + }; + + auto cudaTraverseUsingPureC = [&] () + { + cudaTraverserBenchmark.traverseUsingPureC(); + }; + + benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingPureC ); +#endif + + benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingPureC ); +#endif + + /**** + * Write one using parallel for + */ + auto hostTraverseUsingParallelFor = [&] () + { + hostTraverserBenchmark.writeOneUsingParallelFor(); + }; + + auto cudaTraverseUsingParallelFor = [&] () + { + cudaTraverserBenchmark.writeOneUsingParallelFor(); + }; + + benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( 
"CPU", hostTraverseUsingParallelFor ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor ); +#endif + + benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor ); +#endif + /**** + * Write one using traverser + */ + auto hostTraverseUsingTraverser = [&] () + { + hostTraverserBenchmark.writeOneUsingTraverser(); + }; + + auto cudaTraverseUsingTraverser = [&] () + { + cudaTraverserBenchmark.writeOneUsingTraverser(); + }; + + benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser ); +#endif + + benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser ); +#endif } + return true; } -- GitLab From eb9cff082e7983f431949b76d39a286b44a1caa1 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Wed, 2 Jan 2019 20:06:49 +0100 Subject: [PATCH 022/130] Timing can be turned off in the becnhmark - for better profiling. 
--- src/Benchmarks/Benchmarks.h | 25 ++++++++++++++++++++----- src/Benchmarks/FunctionTimer.h | 24 ++++++++++++++++-------- 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 0770680d2..71f808ad8 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -74,6 +74,7 @@ public: { config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 1 ); + config.addEntry< bool >( "timing", "Turns off (or on) the timing (for the purpose of profiling).", true ); config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 ); } @@ -81,6 +82,7 @@ public: { this->loops = parameters.getParameter< unsigned >( "loops" ); this->minTime = parameters.getParameter< double >( "min-time" ); + this->timing = parameters.getParameter< bool >( "timing" ); const int verbose = parameters.getParameter< unsigned >( "verbose" ); Logging::setVerbose( verbose ); } @@ -199,10 +201,16 @@ public: if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); - result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + if( this->timing ) + result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + else + result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); } else { - result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + if( this->timing ) + result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + else + result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); } } catch ( const std::exception& e ) { @@ 
-232,7 +240,7 @@ public: BenchmarkResult result; return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result ); } - + /**** * The same methods as above but without reset function */ @@ -248,10 +256,16 @@ public: if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); - result.time = FunctionTimer< Device >::timeFunction( compute, loops, minTime, verbose, monitor ); + if( this->timing ) + result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor ); + else + result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor ); } else { - result.time = FunctionTimer< Device >::timeFunction( compute, loops, minTime, verbose, monitor ); + if( this->timing ) + result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor ); + else + result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor ); } } catch ( const std::exception& e ) { @@ -304,6 +318,7 @@ protected: double minTime = 1; double datasetSize = 0.0; double baseTime = 0.0; + bool timing = true; Solvers::IterativeSolverMonitor< double, int > monitor; }; diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h index 091eb4a2a..35dbb719f 100644 --- a/src/Benchmarks/FunctionTimer.h +++ b/src/Benchmarks/FunctionTimer.h @@ -22,7 +22,8 @@ namespace TNL { namespace Benchmarks { -template< typename Device > +template< typename Device, + bool timing > class FunctionTimer { public: @@ -56,14 +57,15 @@ class FunctionTimer // the monitor, the timer is not interrupted after each loop. if( ! 
performReset && verbose < 2 ) { - timer.start(); + if( timing ) + timer.start(); // Explicit synchronization of the CUDA device #ifdef HAVE_CUDA if( std::is_same< Device, Devices::Cuda >::value ) cudaDeviceSynchronize(); #endif for( loops = 0; - loops < maxLoops || timer.getRealTime() < minTime; + loops < maxLoops || ( timing && timer.getRealTime() < minTime ); ++loops) compute(); // Explicit synchronization of the CUDA device @@ -71,12 +73,13 @@ class FunctionTimer if( std::is_same< Device, Devices::Cuda >::value ) cudaDeviceSynchronize(); #endif - timer.stop(); + if( timing ) + timer.stop(); } else { for( loops = 0; - loops < maxLoops || timer.getRealTime() < minTime; + loops < maxLoops || ( timing && timer.getRealTime() < minTime ); ++loops) { // abuse the monitor's "time" for loops @@ -89,16 +92,21 @@ class FunctionTimer if( std::is_same< Device, Devices::Cuda >::value ) cudaDeviceSynchronize(); #endif - timer.start(); + if( timing ) + timer.start(); compute(); #ifdef HAVE_CUDA if( std::is_same< Device, Devices::Cuda >::value ) cudaDeviceSynchronize(); #endif - timer.stop(); + if( timing ) + timer.stop(); } } - return timer.getRealTime() / ( double ) loops; + if( timing ) + return timer.getRealTime() / ( double ) loops; + else + return std::numeric_limits<double>::quiet_NaN(); } template< typename ComputeFunction, -- GitLab From 66243cb1236b0baf8b7d20328985f4a236a0ae0e Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Wed, 2 Jan 2019 20:07:46 +0100 Subject: [PATCH 023/130] Added flag -g to compilation of the traversers benchmark.
--- src/Benchmarks/Traversers/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Benchmarks/Traversers/CMakeLists.txt b/src/Benchmarks/Traversers/CMakeLists.txt index b58c7d66f..a80487135 100644 --- a/src/Benchmarks/Traversers/CMakeLists.txt +++ b/src/Benchmarks/Traversers/CMakeLists.txt @@ -5,5 +5,6 @@ else() ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cpp ) TARGET_LINK_LIBRARIES( tnl-benchmark-traversers tnl ) endif() +SET_TARGET_PROPERTIES( tnl-benchmark-traversers PROPERTIES COMPILE_OPTIONS "-g" ) install( TARGETS tnl-benchmark-traversers RUNTIME DESTINATION bin ) -- GitLab From 57f3b3557dee021e0ada680cd0d6a72708ebfb4b Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Wed, 2 Jan 2019 20:08:36 +0100 Subject: [PATCH 024/130] Fixed cell type in traversers benchmark. --- src/Benchmarks/Traversers/GridTraversersBenchmark.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index 2f439f988..2ea81ed14 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -76,7 +76,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > using Coordinates = typename Grid::CoordinatesType; using MeshFunction = Functions::MeshFunction< Grid >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::EntityType< 1, Meshes::GridEntityNoStencilStorage >; + using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; using Traverser = Meshes::Traverser< Grid, Cell >; using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; @@ -215,7 +215,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > using Coordinates = typename Grid::CoordinatesType; using 
MeshFunction = Functions::MeshFunction< Grid >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::EntityType< 2, Meshes::GridEntityNoStencilStorage >; + using Cell = typename Grid::template EntityType< 2, Meshes::GridEntityNoStencilStorage >; using Traverser = Meshes::Traverser< Grid, Cell >; using TraverserUserData = WriteOneUserData< MeshFunctionPointer >; using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; @@ -376,7 +376,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > using Coordinates = typename Grid::CoordinatesType; using MeshFunction = Functions::MeshFunction< Grid >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::EntityType< 3, Meshes::GridEntityNoStencilStorage >; + using Cell = typename Grid::template EntityType< 3, Meshes::GridEntityNoStencilStorage >; using Traverser = Meshes::Traverser< Grid, Cell >; using TraverserUserData = WriteOneUserData< MeshFunctionPointer >; using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; -- GitLab From ab6016d1d1703d63d93fe358881c83a2f9905451 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Wed, 2 Jan 2019 20:09:30 +0100 Subject: [PATCH 025/130] Traversers benchmark tests can be controlled from the command line.
--- .../Traversers/tnl-benchmark-traversers.h | 127 ++++++++++++------ 1 file changed, 87 insertions(+), 40 deletions(-) diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 276497f51..11899b369 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -33,6 +33,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, Benchmark& benchmark, Benchmark::MetadataMap& metadata ) { + const String tests = parameters.getParameter< String >( "tests" ); // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(), // which have a default value. The workaround below works for int values, but it is not possible // to pass 64-bit integer values @@ -72,22 +73,28 @@ bool runBenchmark( const Config::ParameterContainer& parameters, hostTraverserBenchmark.writeOneUsingPureC(); }; +#ifdef HAVE_CUDA auto cudaWriteOneUsingPureC = [&] () { cudaTraverserBenchmark.writeOneUsingPureC(); }; +#endif - benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingPureC ); + if( tests == "all" || tests == "no-bc-pure-c") + { + benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingPureC ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC ); + if( withCuda ) + benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC ); #endif - - benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); + benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< 
Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); + if( withCuda ) + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); #endif + } /**** * Write one using parallel for @@ -97,22 +104,29 @@ bool runBenchmark( const Config::ParameterContainer& parameters, hostTraverserBenchmark.writeOneUsingParallelFor(); }; +#ifdef HAVE_CUDA auto cudaWriteOneUsingParallelFor = [&] () { cudaTraverserBenchmark.writeOneUsingParallelFor(); }; +#endif - benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor ); + if( tests == "all" || tests == "no-bc-parallel-for" ) + { + benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelFor ); + if( withCuda ) + benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelFor ); #endif - - benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); + + benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); + if( withCuda ) + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); #endif + } /**** * Write one using traverser @@ -154,96 +168,129 @@ bool runBenchmark( const Config::ParameterContainer& parameters, hostTraverserBenchmark.reset(); }; +#ifdef HAVE_CUDA auto 
cudaReset = [&]() { cudaTraverserBenchmark.reset(); }; - +#endif + benchmark.setMetadataColumns( - Benchmark::MetadataColumns( + Benchmark::MetadataColumns( { {"size", convertToString( size ) }, } ) ); /**** - * Write one using C for + * Write one and two (as BC) using C for */ auto hostTraverseUsingPureC = [&] () { hostTraverserBenchmark.traverseUsingPureC(); }; +#ifdef HAVE_CUDA auto cudaTraverseUsingPureC = [&] () { cudaTraverserBenchmark.traverseUsingPureC(); }; +#endif - benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC ); + if( tests == "all" || tests == "bc-pure-c" ) + { + benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingPureC ); + if( withCuda ) + benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingPureC ); #endif - - benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC ); + + benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingPureC ); + if( withCuda ) + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingPureC ); #endif + } /**** - * Write one using parallel for + * Write one and two (as BC) using parallel for */ auto hostTraverseUsingParallelFor = [&] () { hostTraverserBenchmark.writeOneUsingParallelFor(); }; +#ifdef HAVE_CUDA auto cudaTraverseUsingParallelFor = [&] () { cudaTraverserBenchmark.writeOneUsingParallelFor(); }; +#endif - benchmark.setOperation( 
"parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor ); + if( tests == "all" || tests == "bc-parallel-for" ) + { + benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor ); + if( withCuda ) + benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor ); #endif - - benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor ); + + benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor ); + if( withCuda ) + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor ); #endif + } /**** - * Write one using traverser + * Write one and two (as BC) using traverser */ auto hostTraverseUsingTraverser = [&] () { hostTraverserBenchmark.writeOneUsingTraverser(); }; +#ifdef HAVE_CUDA auto cudaTraverseUsingTraverser = [&] () { cudaTraverserBenchmark.writeOneUsingTraverser(); }; +#endif - benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser ); + if( tests == "all" || tests == "bc-traverser" ) + { + benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser ); #ifdef HAVE_CUDA - 
benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser ); #endif - benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser ); + benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser ); + benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser ); #endif + } } - return true; } void setupConfig( Config::ConfigDescription& config ) { + config.addEntry< String >( "tests", "Tests to be performed.", "all" ); + config.addEntryEnum( "all" ); + config.addEntryEnum( "no-bc-pure-c" ); + config.addEntryEnum( "no-bc-parallel-for" ); + config.addEntryEnum( "no-bc-traverser" ); + config.addEntryEnum( "bc-pure-c" ); + config.addEntryEnum( "bc-parallel-for" ); + config.addEntryEnum( "bc-traverser" ); +#ifdef HAVE_CUDA + config.addEntry< bool >( "with-cuda", "Perform even the CUDA benchmarks.", true ); +#else + config.addEntry< bool >( "with-cuda", "Perform even the CUDA benchmarks.", false ); +#endif config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-traversers.log"); config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); config.addEntryEnum( "append" ); -- GitLab From 075740ec00f0b8b8242377115b323fab64efbad3 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Wed, 2 Jan 2019 20:10:40 +0100 Subject: [PATCH 026/130] Additional fixes of the traversers benchmark tests. 
--- .../Traversers/tnl-benchmark-traversers.h | 34 ++++++++++++++----- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 11899b369..60f672b22 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -41,26 +41,35 @@ bool runBenchmark( const Config::ParameterContainer& parameters, // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); const int minSize = parameters.getParameter< int >( "min-size" ); const int maxSize = parameters.getParameter< int >( "max-size" ); +#ifdef HAVE_CUDA + const bool withCuda = parameters.getParameter< bool >( "with-cuda" ); +#else + const bool withCuda = false; +#endif /**** - * Full grid traversing + * Full grid traversing with no boundary conditions */ benchmark.newBenchmark( String("Traversing without boundary conditions" + convertToString( Dimension ) + "D" ), metadata ); for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size ); +#ifdef HAVE_CUDA GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size ); +#endif auto hostReset = [&]() { hostTraverserBenchmark.reset(); }; +#ifdef HAVE_CUDA auto cudaReset = [&]() { cudaTraverserBenchmark.reset(); }; - +#endif + benchmark.setMetadataColumns( Benchmark::MetadataColumns( { {"size", convertToString( size ) }, } ) ); @@ -136,26 +145,33 @@ bool runBenchmark( const Config::ParameterContainer& parameters, hostTraverserBenchmark.writeOneUsingTraverser(); }; +#ifdef HAVE_CUDA auto cudaWriteOneUsingTraverser = [&] () { cudaTraverserBenchmark.writeOneUsingTraverser(); }; +#endif - benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, 
"CPU", hostWriteOneUsingTraverser ); + if( tests == "all" || tests == "no-bc-traverser" ) + { + benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser ); + if( withCuda ) + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser ); #endif - benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingTraverser ); + benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingTraverser ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingTraverser ); + if( withCuda ) + benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingTraverser ); #endif + } } /**** - * Full grid traversing + * Full grid traversing including boundary conditions */ benchmark.newBenchmark( String("Traversing with boundary conditions" + convertToString( Dimension ) + "D" ), metadata ); for( std::size_t size = minSize; size <= maxSize; size *= 2 ) -- GitLab From 2e26d884d8a2e42a067ec1c6a4403a1d30a1fa42 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Wed, 2 Jan 2019 21:57:58 +0100 Subject: [PATCH 027/130] Optimization of ParallelFor on CPU. --- src/TNL/ParallelFor.h | 61 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/src/TNL/ParallelFor.h b/src/TNL/ParallelFor.h index 78d449982..7eac7058c 100644 --- a/src/TNL/ParallelFor.h +++ b/src/TNL/ParallelFor.h @@ -37,10 +37,21 @@ struct ParallelFor static void exec( Index start, Index end, Function f, FunctionArgs... 
args ) { #ifdef HAVE_OPENMP - #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 ) -#endif + // Benchmarks show that this is significantly faster compared + // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )' + if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 ) + { +#pragma omp parallel for + for( Index i = start; i < end; i++ ) + f( i, args... ); + } + else + for( Index i = start; i < end; i++ ) + f( i, args... ); +#else for( Index i = start; i < end; i++ ) f( i, args... ); +#endif } }; @@ -53,11 +64,24 @@ struct ParallelFor2D static void exec( Index startX, Index startY, Index endX, Index endY, Function f, FunctionArgs... args ) { #ifdef HAVE_OPENMP - #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() ) -#endif + // Benchmarks show that this is significantly faster compared + // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )' + if( TNL::Devices::Host::isOMPEnabled() ) + { +#pragma omp parallel for + for( Index i = startX; i < endX; i++ ) + for( Index j = startY; j < endY; j++ ) + f( i, j, args... ); + } + else + for( Index i = startX; i < endX; i++ ) + for( Index j = startY; j < endY; j++ ) + f( i, j, args... ); +#else for( Index i = startX; i < endX; i++ ) - for( Index j = startY; j < endY; j++ ) - f( i, j, args... ); + for( Index j = startY; j < endY; j++ ) + f( i, j, args... ); +#endif } }; @@ -70,12 +94,27 @@ struct ParallelFor3D static void exec( Index startX, Index startY, Index startZ, Index endX, Index endY, Index endZ, Function f, FunctionArgs... 
args ) { #ifdef HAVE_OPENMP - #pragma omp parallel for collapse(2) if( TNL::Devices::Host::isOMPEnabled() ) -#endif + // Benchmarks show that this is significantly faster compared + // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )' + if( TNL::Devices::Host::isOMPEnabled() ) + { +#pragma omp parallel for collapse(2) for( Index i = startX; i < endX; i++ ) - for( Index j = startY; j < endY; j++ ) - for( Index k = startZ; k < endZ; k++ ) - f( i, j, k, args... ); + for( Index j = startY; j < endY; j++ ) + for( Index k = startZ; k < endZ; k++ ) + f( i, j, k, args... ); + } + else + for( Index i = startX; i < endX; i++ ) + for( Index j = startY; j < endY; j++ ) + for( Index k = startZ; k < endZ; k++ ) + f( i, j, k, args... ); +#else + for( Index i = startX; i < endX; i++ ) + for( Index j = startY; j < endY; j++ ) + for( Index k = startZ; k < endZ; k++ ) + f( i, j, k, args... ); +#endif } }; -- GitLab From eeb8b76bf6f9cb7270e7a9446e9a7ca68421cae0 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Wed, 2 Jan 2019 21:58:35 +0100 Subject: [PATCH 028/130] Fixing indexes ordering in parallel for in traversers benchmark. 
--- src/Benchmarks/Traversers/GridTraversersBenchmark.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index 2ea81ed14..5ae8c14b3 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -99,7 +99,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > if( std::is_same< Device, Devices::Host >::value ) { for( int i = 0; i < size; i++ ) - v_data[ i ] = 1.0; + v_data[ i ] += 1.0; } else // Device == Devices::Cuda { @@ -129,7 +129,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > { auto f = [] __cuda_callable__ ( Index i, Real* data ) { - data[ i ] = 1.0; + data[ i ] = +1.0; }; ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } @@ -271,7 +271,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > void writeOneUsingParallelFor() { Index _size = this->size; - auto f = [=] __cuda_callable__ ( Index j, Index i, Real* data ) + auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) { data[ i * _size + j ] = 1.0; }; @@ -438,7 +438,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > void writeOneUsingParallelFor() { Index _size = this->size; - auto f = [=] __cuda_callable__ ( Index k, Index j, Index i, Real* data ) + auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) { data[ ( i * _size + j ) * _size + k ] = 1.0; }; -- GitLab From cee8b06f47934ef0441e0ec48f38eb752586fddd Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 3 Jan 2019 12:02:53 +0100 Subject: [PATCH 029/130] Added traversers benchmark test - parallel for with a grid entity. 
--- .../Traversers/GridTraversersBenchmark.h | 45 ++++++++- .../Traversers/tnl-benchmark-traversers.h | 91 ++++++++++++------- .../Meshes/GridDetails/GridTraverser_impl.h | 35 ++++++- 3 files changed, 134 insertions(+), 37 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index 5ae8c14b3..508a68eec 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -42,7 +42,7 @@ class WriteOneEntitiesProcessor const GridEntity& entity ) { auto& u = userData.u.template modifyData< DeviceType >(); - u( entity ) = 1.0; + u( entity ) += 1.0; } }; @@ -134,6 +134,15 @@ class GridTraversersBenchmark< 1, Device, Real, Index > ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } + void writeOneUsingParallelForAndGridEntity() + { + auto f = [] __cuda_callable__ ( Index i, Real* data ) + { + data[ i ] = +1.0; + }; + ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); + } + void writeOneUsingTraverser() { traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > @@ -267,7 +276,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > #endif } } - + void writeOneUsingParallelFor() { Index _size = this->size; @@ -283,6 +292,21 @@ class GridTraversersBenchmark< 2, Device, Real, Index > f, v.getData() ); } + void writeOneUsingParallelForAndGridEntity() + { + Index _size = this->size; + auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) + { + data[ i * _size + j ] = 1.0; + }; + + ParallelFor2D< Device >::exec( ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + f, v.getData() ); + } + void writeOneUsingTraverser() { traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > @@ -452,6 +476,23 @@ class GridTraversersBenchmark< 3, Device, Real, Index > f, v.getData() ); } + void 
writeOneUsingParallelForAndGridEntity() + { + Index _size = this->size; + auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) + { + data[ ( i * _size + j ) * _size + k ] = 1.0; + }; + + ParallelFor3D< Device >::exec( ( Index ) 0, + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + this->size, + f, v.getData() ); + } + void writeOneUsingTraverser() { traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 60f672b22..9f7920e3c 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -77,28 +77,27 @@ bool runBenchmark( const Config::ParameterContainer& parameters, /**** * Write one using C for */ - auto hostWriteOneUsingPureC = [&] () - { - hostTraverserBenchmark.writeOneUsingPureC(); - }; - -#ifdef HAVE_CUDA - auto cudaWriteOneUsingPureC = [&] () - { - cudaTraverserBenchmark.writeOneUsingPureC(); - }; -#endif - if( tests == "all" || tests == "no-bc-pure-c") { benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + + auto hostWriteOneUsingPureC = [&] () + { + hostTraverserBenchmark.writeOneUsingPureC(); + }; benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingPureC ); + #ifdef HAVE_CUDA + auto cudaWriteOneUsingPureC = [&] () + { + cudaTraverserBenchmark.writeOneUsingPureC(); + }; if( withCuda ) benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC ); #endif benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); + #ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); @@ -108,27 +107,24 @@ bool runBenchmark( const Config::ParameterContainer& 
parameters, /**** * Write one using parallel for */ - auto hostWriteOneUsingParallelFor = [&] () - { - hostTraverserBenchmark.writeOneUsingParallelFor(); - }; - -#ifdef HAVE_CUDA - auto cudaWriteOneUsingParallelFor = [&] () - { - cudaTraverserBenchmark.writeOneUsingParallelFor(); - }; -#endif - if( tests == "all" || tests == "no-bc-parallel-for" ) { benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + + auto hostWriteOneUsingParallelFor = [&] () + { + hostTraverserBenchmark.writeOneUsingParallelFor(); + }; benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor ); + #ifdef HAVE_CUDA + auto cudaWriteOneUsingParallelFor = [&] () + { + cudaTraverserBenchmark.writeOneUsingParallelFor(); + }; if( withCuda ) benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelFor ); #endif - benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA @@ -138,25 +134,51 @@ bool runBenchmark( const Config::ParameterContainer& parameters, } /**** - * Write one using traverser + * Write one using parallel for with grid entity */ - auto hostWriteOneUsingTraverser = [&] () + if( tests == "all" || tests == "no-bc-parallel-for-and-grid-entity" ) { - hostTraverserBenchmark.writeOneUsingTraverser(); - }; + auto hostWriteOneUsingParallelForAndGridEntity = [&] () + { + hostTraverserBenchmark.writeOneUsingParallelForAndGridEntity(); + }; + benchmark.setOperation( "par.for+grid ent.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelForAndGridEntity ); #ifdef HAVE_CUDA - auto cudaWriteOneUsingTraverser = [&] () - { - cudaTraverserBenchmark.writeOneUsingTraverser(); - }; + auto cudaWriteOneUsingParallelForAndGridEntity = [&] () + { + 
cudaTraverserBenchmark.writeOneUsingParallelForAndGridEntity(); + }; + if( withCuda ) + benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelForAndGridEntity ); +#endif + + benchmark.setOperation( "par.for+grid ent. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity ); +#ifdef HAVE_CUDA + if( withCuda ) + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity ); #endif + } + /**** + * Write one using traverser + */ if( tests == "all" || tests == "no-bc-traverser" ) { benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + auto hostWriteOneUsingTraverser = [&] () + { + hostTraverserBenchmark.writeOneUsingTraverser(); + }; benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser ); + #ifdef HAVE_CUDA + auto cudaWriteOneUsingTraverser = [&] () + { + cudaTraverserBenchmark.writeOneUsingTraverser(); + }; if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser ); #endif @@ -298,6 +320,7 @@ void setupConfig( Config::ConfigDescription& config ) config.addEntryEnum( "all" ); config.addEntryEnum( "no-bc-pure-c" ); config.addEntryEnum( "no-bc-parallel-for" ); + config.addEntryEnum( "no-bc-parallel-for-and-grid-entity" ); config.addEntryEnum( "no-bc-traverser" ); config.addEntryEnum( "bc-pure-c" ); config.addEntryEnum( "bc-parallel-for" ); @@ -343,7 +366,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) auto mode = std::ios::out; if( outputMode == "append" ) mode |= std::ios::app; - std::ofstream logFile( logFileName.getString(), mode ); + std::ofstream logFile( logFileName.getString(), mode ); if( ! 
benchmark.save( logFile ) ) { diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h index 258325a76..ba6ab7e9b 100644 --- a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h +++ b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h @@ -64,6 +64,39 @@ processEntities( EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); }*/ #ifdef HAVE_OPENMP + if( Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 ) + { +#pragma omp parallel firstprivate( begin, end ) + GridEntity entity( *gridPointer ); +#pragma omp for + for( IndexType x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } + else + { + GridEntity entity( *gridPointer ); + for( IndexType x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } +#else + GridEntity entity( *gridPointer ); + for( IndexType x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +#endif + +/* #pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() ) #endif { @@ -77,7 +110,7 @@ processEntities( entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); } - } + }*/ } } -- GitLab From f3e4d1bd15677bec9235835b95048d16658cf71b Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 3 Jan 2019 12:37:17 +0100 Subject: [PATCH 030/130] Implemented traversers benchmark test - parallel for with a grid entity. 
--- .../Traversers/GridTraversersBenchmark.h | 70 ++++++++++++++----- 1 file changed, 54 insertions(+), 16 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index 508a68eec..ef89bf969 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -86,6 +86,8 @@ class GridTraversersBenchmark< 1, Device, Real, Index > { userData.u = this->u; v_data = v.getData(); + hostGrid = &grid.template getData< Devices::Host >(); + cudaGrid = &grid.template getData< Devices::Cuda >(); } void reset() @@ -136,9 +138,17 @@ class GridTraversersBenchmark< 1, Device, Real, Index > void writeOneUsingParallelForAndGridEntity() { - auto f = [] __cuda_callable__ ( Index i, Real* data ) + const Grid* currentGrid; + if( std::is_same< Device, Devices::Host >::value ) + currentGrid = hostGrid; + else + currentGrid = cudaGrid; + auto f = [=] __cuda_callable__ ( Index i, Real* data ) { - data[ i ] = +1.0; + Cell entity( *currentGrid ); + entity.getCoordinates().x() = i; + entity.refresh(); + data[ entity.getIndex() ] = +1.0; }; ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } @@ -199,15 +209,17 @@ class GridTraversersBenchmark< 1, Device, Real, Index > ( grid, userData ); } - protected: + protected: - Index size; - Vector v; - Real* v_data; - GridPointer grid; - MeshFunctionPointer u; - Traverser traverser; - WriteOneTraverserUserDataType userData; + Index size; + Vector v; + Real* v_data; + GridPointer grid; + const Grid* hostGrid; + const Grid* cudaGrid; + MeshFunctionPointer u; + Traverser traverser; + WriteOneTraverserUserDataType userData; }; @@ -235,6 +247,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index > { userData.u = this->u; v_data = v.getData(); + hostGrid = &grid.template getData< Devices::Host >(); + cudaGrid = &grid.template getData< Devices::Cuda >(); } void reset() @@ -282,7 +296,7 @@ class 
GridTraversersBenchmark< 2, Device, Real, Index > Index _size = this->size; auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) { - data[ i * _size + j ] = 1.0; + data[ i * _size + j ] += 1.0; }; ParallelFor2D< Device >::exec( ( Index ) 0, @@ -294,10 +308,18 @@ class GridTraversersBenchmark< 2, Device, Real, Index > void writeOneUsingParallelForAndGridEntity() { - Index _size = this->size; + const Grid* currentGrid; + if( std::is_same< Device, Devices::Host >::value ) + currentGrid = hostGrid; + else + currentGrid = cudaGrid; auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) { - data[ i * _size + j ] = 1.0; + Cell entity( *currentGrid ); + entity.getCoordinates().y() = i; + entity.getCoordinates().x() = j; + entity.refresh(); + data[ entity.getIndex() ] += 1.0; }; ParallelFor2D< Device >::exec( ( Index ) 0, @@ -382,6 +404,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index > Vector v; Real* v_data; GridPointer grid; + const Grid* hostGrid; + const Grid* cudaGrid; MeshFunctionPointer u; Traverser traverser; WriteOneTraverserUserDataType userData; @@ -414,6 +438,8 @@ class GridTraversersBenchmark< 3, Device, Real, Index > { userData.u = this->u; v_data = v.getData(); + hostGrid = &grid.template getData< Devices::Host >(); + cudaGrid = &grid.template getData< Devices::Cuda >(); } void reset() @@ -429,7 +455,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > for( int i = 0; i < size; i++ ) for( int j = 0; j < size; j++ ) for( int k = 0; k < size; k++ ) - v_data[ ( i * size + j ) * size + k ] = 1.0; + v_data[ ( i * size + j ) * size + k ] += 1.0; } else // Device == Devices::Cuda { @@ -464,7 +490,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > Index _size = this->size; auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) { - data[ ( i * _size + j ) * _size + k ] = 1.0; + data[ ( i * _size + j ) * _size + k ] += 1.0; }; ParallelFor3D< Device >::exec( ( Index ) 0, @@ -478,10 +504,20 @@ class 
GridTraversersBenchmark< 3, Device, Real, Index > void writeOneUsingParallelForAndGridEntity() { + const Grid* currentGrid; + if( std::is_same< Device, Devices::Host >::value ) + currentGrid = hostGrid; + else + currentGrid = cudaGrid; Index _size = this->size; auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) { - data[ ( i * _size + j ) * _size + k ] = 1.0; + Cell entity( *currentGrid ); + entity.getCoordinates().z() = i; + entity.getCoordinates().y() = j; + entity.getCoordinates().x() = k; + entity.refresh(); + data[ entity.getIndex() ] += 1.0; }; ParallelFor3D< Device >::exec( ( Index ) 0, @@ -581,6 +617,8 @@ class GridTraversersBenchmark< 3, Device, Real, Index > Vector v; Real* v_data; GridPointer grid; + const Grid* hostGrid; + const Grid* cudaGrid; MeshFunctionPointer u; Traverser traverser; WriteOneTraverserUserDataType userData; -- GitLab From f590e0e9ef45c187fc7192af55849b2f68f19c7d Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 3 Jan 2019 12:43:42 +0100 Subject: [PATCH 031/130] GridTraversersBenchmark.h splitted into GridTraversersBenchmark_1D.h, GridTraversersBenchmark_2D.h and GridTraversersBenchmark_3D.h. 
--- .../Traversers/GridTraversersBenchmark.h | 568 +----------------- 1 file changed, 5 insertions(+), 563 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index ef89bf969..c320dc591 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -56,574 +56,16 @@ class WriteOneUserData MeshFunctionPointer u; }; - template< int Dimension, typename Device, typename Real, typename Index > class GridTraversersBenchmark{}; -template< typename Device, - typename Real, - typename Index > -class GridTraversersBenchmark< 1, Device, Real, Index > -{ - public: - - using Vector = Containers::Vector< Real, Device, Index >; - using Grid = Meshes::Grid< 1, Real, Device, Index >; - using GridPointer = Pointers::SharedPointer< Grid >; - using Coordinates = typename Grid::CoordinatesType; - using MeshFunction = Functions::MeshFunction< Grid >; - using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; - using Traverser = Meshes::Traverser< Grid, Cell >; - using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; - using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; - - GridTraversersBenchmark( Index size ) - :v( size ), size( size ), grid( size ), u( grid ) - { - userData.u = this->u; - v_data = v.getData(); - hostGrid = &grid.template getData< Devices::Host >(); - cudaGrid = &grid.template getData< Devices::Cuda >(); - } - - void reset() - { - v.setValue( 0.0 ); - u->getData().setValue( 0.0 ); - }; - - void writeOneUsingPureC() - { - if( std::is_same< Device, Devices::Host >::value ) - { - for( int i = 0; i < size; i++ ) - v_data[ i ] += 1.0; - } - else // Device == Devices::Cuda - { -#ifdef HAVE_CUDA - dim3 blockSize( 256 ), blocksCount, gridsCount; - 
Devices::Cuda::setupThreads( - blockSize, - blocksCount, - gridsCount, - size ); - dim3 gridIdx; - for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) - { - dim3 gridSize; - Devices::Cuda::setupGrid( - blocksCount, - gridsCount, - gridIdx, - gridSize ); - fullGridTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); - } -#endif - } - } - - void writeOneUsingParallelFor() - { - auto f = [] __cuda_callable__ ( Index i, Real* data ) - { - data[ i ] = +1.0; - }; - ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); - } - - void writeOneUsingParallelForAndGridEntity() - { - const Grid* currentGrid; - if( std::is_same< Device, Devices::Host >::value ) - currentGrid = hostGrid; - else - currentGrid = cudaGrid; - auto f = [=] __cuda_callable__ ( Index i, Real* data ) - { - Cell entity( *currentGrid ); - entity.getCoordinates().x() = i; - entity.refresh(); - data[ entity.getIndex() ] = +1.0; - }; - ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); - } - - void writeOneUsingTraverser() - { - traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > - ( grid, userData ); - } - - void traverseUsingPureC() - { - if( std::is_same< Device, Devices::Host >::value ) - { - v_data[ 0 ] = 2; - for( int i = 1; i < size - 1; i++ ) - v_data[ i ] = 1.0; - v_data[ size - 1 ] = 2; - } - else // Device == Devices::Cuda - { -#ifdef HAVE_CUDA - dim3 blockSize( 256 ), blocksCount, gridsCount; - Devices::Cuda::setupThreads( - blockSize, - blocksCount, - gridsCount, - size ); - dim3 gridIdx; - for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) - { - dim3 gridSize; - Devices::Cuda::setupGrid( - blocksCount, - gridsCount, - gridIdx, - gridSize ); - boundariesTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); - } - for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) - { - dim3 gridSize; - Devices::Cuda::setupGrid( - blocksCount, - gridsCount, - gridIdx, - gridSize ); 
- interiorTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); - } -#endif - } - } - - void traverseUsingTraverser() - { - // TODO !!!!!!!!!!!!!!!!!!!!!! - traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > - ( grid, userData ); - } - - protected: - - Index size; - Vector v; - Real* v_data; - GridPointer grid; - const Grid* hostGrid; - const Grid* cudaGrid; - MeshFunctionPointer u; - Traverser traverser; - WriteOneTraverserUserDataType userData; -}; - - -template< typename Device, - typename Real, - typename Index > -class GridTraversersBenchmark< 2, Device, Real, Index > -{ - public: - - using Vector = Containers::Vector< Real, Device, Index >; - using Grid = Meshes::Grid< 2, Real, Device, Index >; - using GridPointer = Pointers::SharedPointer< Grid >; - using Coordinates = typename Grid::CoordinatesType; - using MeshFunction = Functions::MeshFunction< Grid >; - using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::template EntityType< 2, Meshes::GridEntityNoStencilStorage >; - using Traverser = Meshes::Traverser< Grid, Cell >; - using TraverserUserData = WriteOneUserData< MeshFunctionPointer >; - using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; - using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; - - GridTraversersBenchmark( Index size ) - :size( size ), v( size * size ), grid( size, size ), u( grid ) - { - userData.u = this->u; - v_data = v.getData(); - hostGrid = &grid.template getData< Devices::Host >(); - cudaGrid = &grid.template getData< Devices::Cuda >(); - } - - void reset() - { - v.setValue( 0.0 ); - u->getData().setValue( 0.0 ); - }; - - void writeOneUsingPureC() - { - if( std::is_same< Device, Devices::Host >::value ) - { - for( int i = 0; i < size; i++ ) - for( int j = 0; j < size; j++ ) - v_data[ i * size + j ] = 1.0; - } - else // Device == Devices::Cuda - { 
-#ifdef HAVE_CUDA - dim3 blockSize( 256 ), blocksCount, gridsCount; - Devices::Cuda::setupThreads( - blockSize, - blocksCount, - gridsCount, - size, - size ); - dim3 gridIdx; - for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) - for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) - { - dim3 gridSize; - Devices::Cuda::setupGrid( - blocksCount, - gridsCount, - gridIdx, - gridSize ); - fullGridTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); - } -#endif - } - } - - void writeOneUsingParallelFor() - { - Index _size = this->size; - auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) - { - data[ i * _size + j ] += 1.0; - }; - - ParallelFor2D< Device >::exec( ( Index ) 0, - ( Index ) 0, - this->size, - this->size, - f, v.getData() ); - } - - void writeOneUsingParallelForAndGridEntity() - { - const Grid* currentGrid; - if( std::is_same< Device, Devices::Host >::value ) - currentGrid = hostGrid; - else - currentGrid = cudaGrid; - auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) - { - Cell entity( *currentGrid ); - entity.getCoordinates().y() = i; - entity.getCoordinates().x() = j; - entity.refresh(); - data[ entity.getIndex() ] += 1.0; - }; - - ParallelFor2D< Device >::exec( ( Index ) 0, - ( Index ) 0, - this->size, - this->size, - f, v.getData() ); - } - - void writeOneUsingTraverser() - { - traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > - ( grid, userData ); - } - - void traverseUsingPureC() - { - if( std::is_same< Device, Devices::Host >::value ) - { - for( int i = 0; i < size; i++ ) - { - v_data[ i * size ] = 2.0; - v_data[ i * size + size - 1 ] = 2.0; - } - for( int j = 1; j < size - 1; j++ ) - { - v_data[ j ] = 2.0; - v_data[ ( size - 1 ) * size + j ] = 2.0; - } - - for( int i = 1; i < size - 1; i++ ) - for( int j = 1; j < size - 1; j++ ) - v_data[ i * size + j ] = 1.0; - } - else // Device == Devices::Cuda - { -#ifdef HAVE_CUDA - dim3 blockSize( 
256 ), blocksCount, gridsCount; - Devices::Cuda::setupThreads( - blockSize, - blocksCount, - gridsCount, - size, - size ); - dim3 gridIdx; - for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) - for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) - { - dim3 gridSize; - Devices::Cuda::setupGrid( - blocksCount, - gridsCount, - gridIdx, - gridSize ); - boundariesTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); - } - for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) - for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) - { - dim3 gridSize; - Devices::Cuda::setupGrid( - blocksCount, - gridsCount, - gridIdx, - gridSize ); - interiorTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); - } -#endif - } - } - - void traversingUsingTraverser() - { - // TODO !!!!!!!!!!!!!!!!!!!!!! - traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > - ( grid, userData ); - } - - protected: - - Index size; - Vector v; - Real* v_data; - GridPointer grid; - const Grid* hostGrid; - const Grid* cudaGrid; - MeshFunctionPointer u; - Traverser traverser; - WriteOneTraverserUserDataType userData; -}; - -template< typename Device, - typename Real, - typename Index > -class GridTraversersBenchmark< 3, Device, Real, Index > -{ - public: - - using Vector = Containers::Vector< Real, Device, Index >; - using Grid = Meshes::Grid< 3, Real, Device, Index >; - using GridPointer = Pointers::SharedPointer< Grid >; - using Coordinates = typename Grid::CoordinatesType; - using MeshFunction = Functions::MeshFunction< Grid >; - using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::template EntityType< 3, Meshes::GridEntityNoStencilStorage >; - using Traverser = Meshes::Traverser< Grid, Cell >; - using TraverserUserData = WriteOneUserData< MeshFunctionPointer >; - using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; - using 
WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; - - GridTraversersBenchmark( Index size ) - : size( size ), - v( size * size * size ), - grid( size, size, size ), - u( grid ) - { - userData.u = this->u; - v_data = v.getData(); - hostGrid = &grid.template getData< Devices::Host >(); - cudaGrid = &grid.template getData< Devices::Cuda >(); - } - - void reset() - { - v.setValue( 0.0 ); - u->getData().setValue( 0.0 ); - }; - - void writeOneUsingPureC() - { - if( std::is_same< Device, Devices::Host >::value ) - { - for( int i = 0; i < size; i++ ) - for( int j = 0; j < size; j++ ) - for( int k = 0; k < size; k++ ) - v_data[ ( i * size + j ) * size + k ] += 1.0; - } - else // Device == Devices::Cuda - { -#ifdef HAVE_CUDA - dim3 blockSize( 256 ), blocksCount, gridsCount; - Devices::Cuda::setupThreads( - blockSize, - blocksCount, - gridsCount, - size, - size, - size ); - dim3 gridIdx; - for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ ) - for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) - for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) - { - dim3 gridSize; - Devices::Cuda::setupGrid( - blocksCount, - gridsCount, - gridIdx, - gridSize ); - fullGridTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); - } -#endif - } - } - - void writeOneUsingParallelFor() - { - Index _size = this->size; - auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) - { - data[ ( i * _size + j ) * _size + k ] += 1.0; - }; - - ParallelFor3D< Device >::exec( ( Index ) 0, - ( Index ) 0, - ( Index ) 0, - this->size, - this->size, - this->size, - f, v.getData() ); - } - - void writeOneUsingParallelForAndGridEntity() - { - const Grid* currentGrid; - if( std::is_same< Device, Devices::Host >::value ) - currentGrid = hostGrid; - else - currentGrid = cudaGrid; - Index _size = this->size; - auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) - { - Cell entity( *currentGrid ); - 
entity.getCoordinates().z() = i; - entity.getCoordinates().y() = j; - entity.getCoordinates().x() = k; - entity.refresh(); - data[ entity.getIndex() ] += 1.0; - }; - - ParallelFor3D< Device >::exec( ( Index ) 0, - ( Index ) 0, - ( Index ) 0, - this->size, - this->size, - this->size, - f, v.getData() ); - } - - void writeOneUsingTraverser() - { - traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > - ( grid, userData ); - } - - void traverseUsingPureC() - { - if( std::is_same< Device, Devices::Host >::value ) - { - for( int i = 0; i < size; i++ ) - for( int j = 0; j < size; j++ ) - { - v_data[ ( i * size + j ) * size ] = 2.0; - v_data[ ( i * size + j ) * size + size - 1 ] = 2.0; - } - for( int j = 0; j < size; j++ ) - for( int k = 1; k < size - 1; k++ ) - { - v_data[ j * size + k ] = 1.0; - v_data[ ( ( size - 1) * size + j ) * size + k ] = 1.0; - } - - for( int i = 1; i < size -1; i++ ) - for( int k = 1; k < size - 1; k++ ) - { - v_data[ ( i * size ) * size + k ] = 2.0; - v_data[ ( i * size + size - 1 ) * size + k ] = 2.0; - } - - for( int i = 1; i < size -1; i++ ) - for( int j = 1; j < size -1; j++ ) - for( int k = 1; k < size - 1; k++ ) - v_data[ ( i * size + j ) * size + k ] = 1.0; - } - else // Device == Devices::Cuda - { -#ifdef HAVE_CUDA - dim3 blockSize( 256 ), blocksCount, gridsCount; - Devices::Cuda::setupThreads( - blockSize, - blocksCount, - gridsCount, - size, - size, - size ); - dim3 gridIdx; - for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ ) - for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) - for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) - { - dim3 gridSize; - Devices::Cuda::setupGrid( - blocksCount, - gridsCount, - gridIdx, - gridSize ); - boundariesTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); - } - for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ ) - for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) - for( gridIdx.x = 0; 
gridIdx.x < gridsCount.x; gridIdx.x++ ) - { - dim3 gridSize; - Devices::Cuda::setupGrid( - blocksCount, - gridsCount, - gridIdx, - gridSize ); - interiorTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); - } -#endif - } - } - - void traverseUsingTraverser() - { - // TODO !!!!!!!!!!!!!!!!!!!!!! - traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > - ( grid, userData ); - } - - protected: - - Index size; - Vector v; - Real* v_data; - GridPointer grid; - const Grid* hostGrid; - const Grid* cudaGrid; - MeshFunctionPointer u; - Traverser traverser; - WriteOneTraverserUserDataType userData; -}; - } // namespace Traversers } // namespace Benchmarks -} // namespace TNL \ No newline at end of file +} // namespace TNL + +#include "GridTraversersBenchmark_1D.h" +#include "GridTraversersBenchmark_2D.h" +#include "GridTraversersBenchmark_3D.h" \ No newline at end of file -- GitLab From 330c0621fcef7cff4f8e70fefb99c1f9f0daed5c Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 3 Jan 2019 12:52:39 +0100 Subject: [PATCH 032/130] GridTraversersBenchmark.h splitted into GridTraversersBenchmark_1D.h, GridTraversersBenchmark_2D.h and GridTraversersBenchmark_3D.h. 
--- .../Traversers/GridTraversersBenchmark_1D.h | 191 ++++++++++++++ .../Traversers/GridTraversersBenchmark_2D.h | 220 ++++++++++++++++ .../Traversers/GridTraversersBenchmark_3D.h | 245 ++++++++++++++++++ 3 files changed, 656 insertions(+) create mode 100644 src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h create mode 100644 src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h create mode 100644 src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h new file mode 100644 index 000000000..c270080fc --- /dev/null +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -0,0 +1,191 @@ +/*************************************************************************** + GridTraversersBenchmark_1D.h - description + ------------------- + begin : Jan 3, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda-kernels.h" + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +template< typename Device, + typename Real, + typename Index > +class GridTraversersBenchmark< 1, Device, Real, Index > +{ + public: + + using Vector = Containers::Vector< Real, Device, Index >; + using Grid = Meshes::Grid< 1, Real, Device, Index >; + using GridPointer = Pointers::SharedPointer< Grid >; + using Coordinates = typename Grid::CoordinatesType; + using MeshFunction = Functions::MeshFunction< Grid >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; + using Traverser = Meshes::Traverser< Grid, Cell >; + using 
WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; + using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; + + GridTraversersBenchmark( Index size ) + :v( size ), size( size ), grid( size ), u( grid ) + { + userData.u = this->u; + v_data = v.getData(); + hostGrid = &grid.template getData< Devices::Host >(); + cudaGrid = &grid.template getData< Devices::Cuda >(); + } + + void reset() + { + v.setValue( 0.0 ); + u->getData().setValue( 0.0 ); + }; + + void writeOneUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + for( int i = 0; i < size; i++ ) + v_data[ i ] += 1.0; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size ); + dim3 gridIdx; + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + fullGridTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } + } + + void writeOneUsingParallelFor() + { + auto f = [] __cuda_callable__ ( Index i, Real* data ) + { + data[ i ] = +1.0; + }; + ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); + } + + void writeOneUsingParallelForAndGridEntity() + { + const Grid* currentGrid; + if( std::is_same< Device, Devices::Host >::value ) + currentGrid = hostGrid; + else + currentGrid = cudaGrid; + auto f = [=] __cuda_callable__ ( Index i, Real* data ) + { + Cell entity( *currentGrid ); + entity.getCoordinates().x() = i; + entity.refresh(); + data[ entity.getIndex() ] = +1.0; + }; + ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); + } + + void writeOneUsingTraverser() + { + traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + ( grid, userData ); + } + + void traverseUsingPureC() + { + if( 
std::is_same< Device, Devices::Host >::value ) + { + v_data[ 0 ] = 2; + for( int i = 1; i < size - 1; i++ ) + v_data[ i ] = 1.0; + v_data[ size - 1 ] = 2; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size ); + dim3 gridIdx; + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + boundariesTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + interiorTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } + } + + void traverseUsingTraverser() + { + // TODO !!!!!!!!!!!!!!!!!!!!!! + traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + ( grid, userData ); + } + + protected: + + Index size; + Vector v; + Real* v_data; + GridPointer grid; + const Grid* hostGrid; + const Grid* cudaGrid; + MeshFunctionPointer u; + Traverser traverser; + WriteOneTraverserUserDataType userData; +}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL \ No newline at end of file diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h new file mode 100644 index 000000000..d8823c335 --- /dev/null +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h @@ -0,0 +1,220 @@ +/*************************************************************************** + GridTraversersBenchmark_2D.h - description + ------------------- + begin : Jan 3, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + 
+/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda-kernels.h" + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +template< typename Device, + typename Real, + typename Index > +class GridTraversersBenchmark< 2, Device, Real, Index > +{ + public: + + using Vector = Containers::Vector< Real, Device, Index >; + using Grid = Meshes::Grid< 2, Real, Device, Index >; + using GridPointer = Pointers::SharedPointer< Grid >; + using Coordinates = typename Grid::CoordinatesType; + using MeshFunction = Functions::MeshFunction< Grid >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using Cell = typename Grid::template EntityType< 2, Meshes::GridEntityNoStencilStorage >; + using Traverser = Meshes::Traverser< Grid, Cell >; + using TraverserUserData = WriteOneUserData< MeshFunctionPointer >; + using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; + using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; + + GridTraversersBenchmark( Index size ) + :size( size ), v( size * size ), grid( size, size ), u( grid ) + { + userData.u = this->u; + v_data = v.getData(); + hostGrid = &grid.template getData< Devices::Host >(); + cudaGrid = &grid.template getData< Devices::Cuda >(); + } + + void reset() + { + v.setValue( 0.0 ); + u->getData().setValue( 0.0 ); + }; + + void writeOneUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + for( int i = 0; i < size; i++ ) + for( int j = 0; j < size; j++ ) + v_data[ i * size + j ] = 1.0; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size, + size ); + dim3 gridIdx; + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + 
for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + fullGridTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } + } + + void writeOneUsingParallelFor() + { + Index _size = this->size; + auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) + { + data[ i * _size + j ] += 1.0; + }; + + ParallelFor2D< Device >::exec( ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + f, v.getData() ); + } + + void writeOneUsingParallelForAndGridEntity() + { + const Grid* currentGrid; + if( std::is_same< Device, Devices::Host >::value ) + currentGrid = hostGrid; + else + currentGrid = cudaGrid; + auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) + { + Cell entity( *currentGrid ); + entity.getCoordinates().y() = i; + entity.getCoordinates().x() = j; + entity.refresh(); + data[ entity.getIndex() ] += 1.0; + }; + + ParallelFor2D< Device >::exec( ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + f, v.getData() ); + } + + void writeOneUsingTraverser() + { + traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + ( grid, userData ); + } + + void traverseUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + for( int i = 0; i < size; i++ ) + { + v_data[ i * size ] = 2.0; + v_data[ i * size + size - 1 ] = 2.0; + } + for( int j = 1; j < size - 1; j++ ) + { + v_data[ j ] = 2.0; + v_data[ ( size - 1 ) * size + j ] = 2.0; + } + + for( int i = 1; i < size - 1; i++ ) + for( int j = 1; j < size - 1; j++ ) + v_data[ i * size + j ] = 1.0; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size, + size ); + dim3 gridIdx; + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < 
gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + boundariesTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + interiorTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } + } + + void traversingUsingTraverser() + { + // TODO !!!!!!!!!!!!!!!!!!!!!! + traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + ( grid, userData ); + } + + protected: + + Index size; + Vector v; + Real* v_data; + GridPointer grid; + const Grid* hostGrid; + const Grid* cudaGrid; + MeshFunctionPointer u; + Traverser traverser; + WriteOneTraverserUserDataType userData; +}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL \ No newline at end of file diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h new file mode 100644 index 000000000..8f3a55e19 --- /dev/null +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h @@ -0,0 +1,245 @@ +/*************************************************************************** + GridTraversersBenchmark_3D.h - description + ------------------- + begin : Jan 3, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda-kernels.h" + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +template< typename Device, + typename Real, + 
typename Index > +class GridTraversersBenchmark< 3, Device, Real, Index > +{ + public: + + using Vector = Containers::Vector< Real, Device, Index >; + using Grid = Meshes::Grid< 3, Real, Device, Index >; + using GridPointer = Pointers::SharedPointer< Grid >; + using Coordinates = typename Grid::CoordinatesType; + using MeshFunction = Functions::MeshFunction< Grid >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using Cell = typename Grid::template EntityType< 3, Meshes::GridEntityNoStencilStorage >; + using Traverser = Meshes::Traverser< Grid, Cell >; + using TraverserUserData = WriteOneUserData< MeshFunctionPointer >; + using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; + using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; + + GridTraversersBenchmark( Index size ) + : size( size ), + v( size * size * size ), + grid( size, size, size ), + u( grid ) + { + userData.u = this->u; + v_data = v.getData(); + hostGrid = &grid.template getData< Devices::Host >(); + cudaGrid = &grid.template getData< Devices::Cuda >(); + } + + void reset() + { + v.setValue( 0.0 ); + u->getData().setValue( 0.0 ); + }; + + void writeOneUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + for( int i = 0; i < size; i++ ) + for( int j = 0; j < size; j++ ) + for( int k = 0; k < size; k++ ) + v_data[ ( i * size + j ) * size + k ] += 1.0; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size, + size, + size ); + dim3 gridIdx; + for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ ) + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + fullGridTraverseKernel3D<<< 
gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } + } + + void writeOneUsingParallelFor() + { + Index _size = this->size; + auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) + { + data[ ( i * _size + j ) * _size + k ] += 1.0; + }; + + ParallelFor3D< Device >::exec( ( Index ) 0, + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + this->size, + f, v.getData() ); + } + + void writeOneUsingParallelForAndGridEntity() + { + const Grid* currentGrid; + if( std::is_same< Device, Devices::Host >::value ) + currentGrid = hostGrid; + else + currentGrid = cudaGrid; + Index _size = this->size; + auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) + { + Cell entity( *currentGrid ); + entity.getCoordinates().z() = i; + entity.getCoordinates().y() = j; + entity.getCoordinates().x() = k; + entity.refresh(); + data[ entity.getIndex() ] += 1.0; + }; + + ParallelFor3D< Device >::exec( ( Index ) 0, + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + this->size, + f, v.getData() ); + } + + void writeOneUsingTraverser() + { + traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + ( grid, userData ); + } + + void traverseUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + for( int i = 0; i < size; i++ ) + for( int j = 0; j < size; j++ ) + { + v_data[ ( i * size + j ) * size ] = 2.0; + v_data[ ( i * size + j ) * size + size - 1 ] = 2.0; + } + for( int j = 0; j < size; j++ ) + for( int k = 1; k < size - 1; k++ ) + { + v_data[ j * size + k ] = 1.0; + v_data[ ( ( size - 1) * size + j ) * size + k ] = 1.0; + } + + for( int i = 1; i < size -1; i++ ) + for( int k = 1; k < size - 1; k++ ) + { + v_data[ ( i * size ) * size + k ] = 2.0; + v_data[ ( i * size + size - 1 ) * size + k ] = 2.0; + } + + for( int i = 1; i < size -1; i++ ) + for( int j = 1; j < size -1; j++ ) + for( int k = 1; k < size - 1; k++ ) + v_data[ ( i * size + j ) * size + k ] = 1.0; + 
} + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size, + size, + size ); + dim3 gridIdx; + for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ ) + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + boundariesTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } + for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ ) + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + interiorTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } + } + + void traverseUsingTraverser() + { + // TODO !!!!!!!!!!!!!!!!!!!!!! + traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + ( grid, userData ); + } + + protected: + + Index size; + Vector v; + Real* v_data; + GridPointer grid; + const Grid* hostGrid; + const Grid* cudaGrid; + MeshFunctionPointer u; + Traverser traverser; + WriteOneTraverserUserDataType userData; +}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL \ No newline at end of file -- GitLab From 824c85a2f8e2386beb7a366a57094c37fce7f625 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 3 Jan 2019 12:52:59 +0100 Subject: [PATCH 033/130] Deleting old code. 
--- .../Meshes/GridDetails/GridTraverser_impl.h | 28 +------------------ 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h index ba6ab7e9b..e8e96b42e 100644 --- a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h +++ b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h @@ -54,15 +54,6 @@ processEntities( } else { - //TODO: This does not work with gcc-5.4 and older, should work at gcc 6.x -/*#pragma omp parallel for firstprivate( entity, begin, end ) if( Devices::Host::isOMPEnabled() ) - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - }*/ #ifdef HAVE_OPENMP if( Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 ) { @@ -95,23 +86,6 @@ processEntities( EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); } #endif - -/* -#pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() ) -#endif - { - GridEntity entity( *gridPointer ); -#ifdef HAVE_OPENMP -#pragma omp for -#endif - for( IndexType x = begin.x(); x <= end.x(); x ++ ) - { - entity.getCoordinates().x() = x; - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - }*/ - } } @@ -385,7 +359,7 @@ processEntities( entity.getCoordinates().y() = y; entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } + } } } } -- GitLab From 439479dc1324a84f108d3704b5724bc920af519c Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 3 Jan 2019 13:24:28 +0100 Subject: [PATCH 034/130] Added traversers benchmark test with mesh function. 
--- .../Traversers/GridTraversersBenchmark_1D.h | 24 ++++++++----- .../Traversers/GridTraversersBenchmark_2D.h | 31 +++++++++++----- .../Traversers/GridTraversersBenchmark_3D.h | 35 +++++++++++++------ .../Traversers/tnl-benchmark-traversers.h | 29 +++++++++++++++ 4 files changed, 91 insertions(+), 28 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h index c270080fc..32cdc3229 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -50,8 +50,6 @@ class GridTraversersBenchmark< 1, Device, Real, Index > { userData.u = this->u; v_data = v.getData(); - hostGrid = &grid.template getData< Devices::Host >(); - cudaGrid = &grid.template getData< Devices::Cuda >(); } void reset() @@ -102,11 +100,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > void writeOneUsingParallelForAndGridEntity() { - const Grid* currentGrid; - if( std::is_same< Device, Devices::Host >::value ) - currentGrid = hostGrid; - else - currentGrid = cudaGrid; + const Grid* currentGrid = &grid.template getData< Device >(); auto f = [=] __cuda_callable__ ( Index i, Real* data ) { Cell entity( *currentGrid ); @@ -117,6 +111,20 @@ class GridTraversersBenchmark< 1, Device, Real, Index > ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } + void writeOneUsingParallelForAndMeshFunction() + { + const Grid* currentGrid = &grid.template getData< Device >(); + MeshFunction* _u = &u.template modifyData< Device >(); + auto f = [=] __cuda_callable__ ( Index i, Real* data ) + { + Cell entity( *currentGrid ); + entity.getCoordinates().x() = i; + entity.refresh(); + ( *_u )( entity ) = +1.0; + }; + ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); + } + void writeOneUsingTraverser() { traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > @@ -179,8 +187,6 @@ class 
GridTraversersBenchmark< 1, Device, Real, Index > Vector v; Real* v_data; GridPointer grid; - const Grid* hostGrid; - const Grid* cudaGrid; MeshFunctionPointer u; Traverser traverser; WriteOneTraverserUserDataType userData; diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h index d8823c335..cc360c349 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h @@ -51,8 +51,6 @@ class GridTraversersBenchmark< 2, Device, Real, Index > { userData.u = this->u; v_data = v.getData(); - hostGrid = &grid.template getData< Devices::Host >(); - cudaGrid = &grid.template getData< Devices::Cuda >(); } void reset() @@ -112,11 +110,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > void writeOneUsingParallelForAndGridEntity() { - const Grid* currentGrid; - if( std::is_same< Device, Devices::Host >::value ) - currentGrid = hostGrid; - else - currentGrid = cudaGrid; + const Grid* currentGrid = &grid.template getData< Device >(); auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) { Cell entity( *currentGrid ); @@ -133,6 +127,27 @@ class GridTraversersBenchmark< 2, Device, Real, Index > f, v.getData() ); } + void writeOneUsingParallelForAndMeshFunction() + { + const Grid* currentGrid = &grid.template getData< Device >(); + MeshFunction* _u = &u.template modifyData< Device >(); + auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) + { + Cell entity( *currentGrid ); + entity.getCoordinates().y() = i; + entity.getCoordinates().x() = j; + entity.refresh(); + ( *_u )( entity ) += 1.0; + }; + + ParallelFor2D< Device >::exec( ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + f, v.getData() ); + } + + void writeOneUsingTraverser() { traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > @@ -208,8 +223,6 @@ class GridTraversersBenchmark< 2, Device, Real, Index > 
Vector v; Real* v_data; GridPointer grid; - const Grid* hostGrid; - const Grid* cudaGrid; MeshFunctionPointer u; Traverser traverser; WriteOneTraverserUserDataType userData; diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h index 8f3a55e19..07ea6e5f8 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h @@ -54,8 +54,6 @@ class GridTraversersBenchmark< 3, Device, Real, Index > { userData.u = this->u; v_data = v.getData(); - hostGrid = &grid.template getData< Devices::Host >(); - cudaGrid = &grid.template getData< Devices::Cuda >(); } void reset() @@ -120,12 +118,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > void writeOneUsingParallelForAndGridEntity() { - const Grid* currentGrid; - if( std::is_same< Device, Devices::Host >::value ) - currentGrid = hostGrid; - else - currentGrid = cudaGrid; - Index _size = this->size; + const Grid* currentGrid = &grid.template getData< Device >(); auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) { Cell entity( *currentGrid ); @@ -145,6 +138,30 @@ class GridTraversersBenchmark< 3, Device, Real, Index > f, v.getData() ); } + void writeOneUsingParallelForAndMeshFunction() + { + const Grid* currentGrid = &grid.template getData< Device >(); + MeshFunction* _u = &u.template modifyData< Device >(); + auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) + { + Cell entity( *currentGrid ); + entity.getCoordinates().z() = i; + entity.getCoordinates().y() = j; + entity.getCoordinates().x() = k; + entity.refresh(); + ( *_u )( entity ) += 1.0; + }; + + ParallelFor3D< Device >::exec( ( Index ) 0, + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + this->size, + f, v.getData() ); + } + + void writeOneUsingTraverser() { traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > @@ -233,8 +250,6 
@@ class GridTraversersBenchmark< 3, Device, Real, Index > Vector v; Real* v_data; GridPointer grid; - const Grid* hostGrid; - const Grid* cudaGrid; MeshFunctionPointer u; Traverser traverser; WriteOneTraverserUserDataType userData; diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 9f7920e3c..56fbc151c 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -162,6 +162,35 @@ bool runBenchmark( const Config::ParameterContainer& parameters, #endif } + /**** + * Write one using parallel for with mesh function + */ + if( tests == "all" || tests == "no-bc-parallel-for-and-mesh-function" ) + { + auto hostWriteOneUsingParallelForAndMeshFunction = [&] () + { + hostTraverserBenchmark.writeOneUsingParallelForAndMeshFunction(); + }; + benchmark.setOperation( "par.for+mesh fc.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelForAndMeshFunction ); + +#ifdef HAVE_CUDA + auto cudaWriteOneUsingParallelForAndMeshFunction = [&] () + { + cudaTraverserBenchmark.writeOneUsingParallelForAndMeshFunction(); + }; + if( withCuda ) + benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelForAndGridMeshFunction ); +#endif + + benchmark.setOperation( "par.for+mesh fc. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction ); +#ifdef HAVE_CUDA + if( withCuda ) + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndMeshFunction ); +#endif + } + /**** * Write one using traverser */ -- GitLab From a5d90a72c68d9cd1c852781faa859c6430877350 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 3 Jan 2019 19:41:52 +0100 Subject: [PATCH 035/130] Added configuration parameter 'reset' to Benchmark. 
--- src/Benchmarks/Benchmarks.h | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 71f808ad8..f31e21f6c 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -73,6 +73,7 @@ public: static void configSetup( Config::ConfigDescription& config ) { config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); + config.addEntry< bool >( "reset", "Call reset function between loops.", true ); config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 1 ); config.addEntry< bool >( "timing", "Turns off (or on) the timing (for the purpose of profiling).", true ); config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 ); @@ -81,6 +82,7 @@ public: void setup( const Config::ParameterContainer& parameters ) { this->loops = parameters.getParameter< unsigned >( "loops" ); + this->reset = parameters.getParameter< bool >( "reset" ); this->minTime = parameters.getParameter< double >( "min-time" ); this->timing = parameters.getParameter< bool >( "timing" ); const int verbose = parameters.getParameter< unsigned >( "verbose" ); @@ -114,8 +116,11 @@ public: { closeTable(); writeTitle( title ); - // add loops to metadata + // add loops and reset flag to metadata metadata["loops"] = convertToString(loops); + metadata["reset"] = convertToString( reset ); + metadata["minimal test time"] = convertToString( minTime ); + metadata["timing"] = convertToString( timing ); writeMetadata( metadata ); } @@ -202,15 +207,27 @@ public: // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); if( this->timing ) - result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + if( this->reset ) + result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + 
else + result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor ); else - result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + if( this->reset ) + result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + else + result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor ); } else { if( this->timing ) - result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + if( this->reset ) + result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + else + result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor ); else - result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + if( this->reset ) + result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + else + result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor ); } } catch ( const std::exception& e ) { @@ -319,6 +336,7 @@ protected: double datasetSize = 0.0; double baseTime = 0.0; bool timing = true; + bool reset = true; Solvers::IterativeSolverMonitor< double, int > monitor; }; -- GitLab From 31900f16049ae3a72b584711f2bd7bb660e89de9 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 3 Jan 2019 21:46:05 +0100 Subject: [PATCH 036/130] Optimized conditional OpenMP traversing in 2D and 3D grid traversers - cells only. 
--- .../Meshes/GridDetails/GridTraverser_impl.h | 162 +++++++++++------- 1 file changed, 101 insertions(+), 61 deletions(-) diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h index e8e96b42e..33b5e22eb 100644 --- a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h +++ b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h @@ -58,30 +58,35 @@ processEntities( if( Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 ) { #pragma omp parallel firstprivate( begin, end ) - GridEntity entity( *gridPointer ); -#pragma omp for - for( IndexType x = begin.x(); x <= end.x(); x ++ ) { - entity.getCoordinates().x() = x; - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + GridEntity entity( *gridPointer ); +#pragma omp for + // TODO: g++ 5.5 crashes when coding this loop without auxiliary x as bellow + for( IndexType x = begin.x(); x <= end.x(); x++ ) + { + entity.getCoordinates().x() = x; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } } } else { GridEntity entity( *gridPointer ); - for( IndexType x = begin.x(); x <= end.x(); x ++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) { - entity.getCoordinates().x() = x; entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); } } #else GridEntity entity( *gridPointer ); - for( IndexType x = begin.x(); x <= end.x(); x ++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) { - entity.getCoordinates().x() = x; entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); } @@ -332,35 +337,51 @@ processEntities( } else { - //TODO: This does not work with gcc-5.4 and older, should work at gcc 6.x -/*#pragma omp parallel for firstprivate( entity, begin, end ) if( 
Devices::Host::isOMPEnabled() ) - for( entity.getCoordinates().y() = begin.y(); - entity.getCoordinates().y() <= end.y(); - entity.getCoordinates().y() ++ ) - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - }*/ #ifdef HAVE_OPENMP -#pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() ) -#endif + if( Devices::Host::isOMPEnabled() ) { - GridEntity entity( *gridPointer, begin, gridEntityParameters... ); -#ifdef HAVE_OPENMP -#pragma omp for -#endif - for( IndexType y = begin.y(); y <= end.y(); y ++ ) - for( IndexType x = begin.x(); x <= end.x(); x ++ ) - { - entity.getCoordinates().x() = x; - entity.getCoordinates().y() = y; - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } +#pragma omp parallel firstprivate( begin, end ) + { + GridEntity entity( *gridPointer ); +#pragma omp for + // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow + for( IndexType y = begin.y(); y <= end.y(); y ++ ) + for( IndexType x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.getCoordinates().y() = y; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } + } + else + { + GridEntity entity( *gridPointer ); + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } } +#else + GridEntity entity( *gridPointer ); + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + for( 
entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +#endif } } @@ -426,7 +447,7 @@ GridTraverser2DBoundaryAlongX( typename GridType::CoordinatesType coordinates; coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.y() = fixedY; + coordinates.y() = fixedY; if( coordinates.x() <= endX ) { @@ -436,7 +457,7 @@ GridTraverser2DBoundaryAlongX( ( *grid, userData, entity ); - } + } } // Boundary traverser using streams @@ -648,7 +669,7 @@ processEntities( if( processOnlyBoundaryEntities && ( GridEntity::getEntityDimension() == 2 || GridEntity::getEntityDimension() == 0 ) ) { -#ifdef GRID_TRAVERSER_USE_STREAMS +#ifdef GRID_TRAVERSER_USE_STREAMS dim3 cudaBlockSize( 256 ); dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX, cudaBlocksCountAlongY, cudaGridsCountAlongY; @@ -960,8 +981,45 @@ processEntities( } else { - // TODO: this does not work with gcc-5.4 and older, should work at gcc 6.x -/*#pragma omp parallel for firstprivate( entity, begin, end ) if( Devices::Host::isOMPEnabled() ) +#ifdef HAVE_OPENMP + if( Devices::Host::isOMPEnabled() ) + { +#pragma omp parallel firstprivate( begin, end ) + { + GridEntity entity( *gridPointer ); +#pragma omp for + // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow + for( IndexType z = begin.z(); z <= end.z(); z ++ ) + for( IndexType y = begin.y(); y <= end.y(); y ++ ) + for( IndexType x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.getCoordinates().y() = y; + entity.getCoordinates().z() = z; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } + } + else + { + GridEntity entity( *gridPointer ); + for( entity.getCoordinates().z() = begin.z(); + entity.getCoordinates().z() <= end.z(); + entity.getCoordinates().z() ++ ) + 
for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } +#else + GridEntity entity( *gridPointer ); for( entity.getCoordinates().z() = begin.z(); entity.getCoordinates().z() <= end.z(); entity.getCoordinates().z() ++ ) @@ -971,29 +1029,11 @@ processEntities( for( entity.getCoordinates().x() = begin.x(); entity.getCoordinates().x() <= end.x(); entity.getCoordinates().x() ++ ) - { - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - }*/ -#ifdef HAVE_OPENMP -#pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() ) -#endif - { - GridEntity entity( *gridPointer, begin, gridEntityParameters... ); -#ifdef HAVE_OPENMP -#pragma omp for -#endif - for( IndexType z = begin.z(); z <= end.z(); z ++ ) - for( IndexType y = begin.y(); y <= end.y(); y ++ ) - for( IndexType x = begin.x(); x <= end.x(); x ++ ) { - entity.getCoordinates().x() = x; - entity.getCoordinates().y() = y; - entity.getCoordinates().z() = z; entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - } + } +#endif } } -- GitLab From 7104d860d90872eb7d075721ee041d1c65c236eb Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 3 Jan 2019 21:47:44 +0100 Subject: [PATCH 037/130] Analyzing grid traversers. 
--- .../Traversers/GridTraversersBenchmark_1D.h | 29 ++++++++++--- .../Traversers/GridTraversersBenchmark_2D.h | 21 +++++++++- .../Traversers/GridTraversersBenchmark_3D.h | 2 +- .../Traversers/tnl-benchmark-traversers.h | 41 ++++++++++--------- 4 files changed, 66 insertions(+), 27 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h index 32cdc3229..91097ecac 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -22,6 +22,7 @@ #include #include #include "cuda-kernels.h" +#include "GridTraversersBenchmark.h" namespace TNL { namespace Benchmarks { @@ -46,7 +47,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; GridTraversersBenchmark( Index size ) - :v( size ), size( size ), grid( size ), u( grid ) + :size( size ), v( size ), grid( size ), u( grid ) { userData.u = this->u; v_data = v.getData(); @@ -93,7 +94,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > { auto f = [] __cuda_callable__ ( Index i, Real* data ) { - data[ i ] = +1.0; + data[ i ] += 1.0; }; ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } @@ -106,7 +107,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > Cell entity( *currentGrid ); entity.getCoordinates().x() = i; entity.refresh(); - data[ entity.getIndex() ] = +1.0; + data[ entity.getIndex() ] += 1.0; }; ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } @@ -117,18 +118,36 @@ class GridTraversersBenchmark< 1, Device, Real, Index > MeshFunction* _u = &u.template modifyData< Device >(); auto f = [=] __cuda_callable__ ( Index i, Real* data ) { - Cell entity( *currentGrid ); + Cell entity( grid.template getData< Device >() ); entity.getCoordinates().x() = i; entity.refresh(); - ( *_u )( entity ) = +1.0; + //( *_u )( entity ) += 1.0; 
+ WriteOneEntitiesProcessorType::processEntity( *currentGrid, userData, entity ); }; ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } void writeOneUsingTraverser() { + using CoordinatesType = typename Grid::CoordinatesType; traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > ( grid, userData ); + + /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >( + grid, + CoordinatesType( 0 ), + grid->getDimensions() - CoordinatesType( 1 ), + userData );*/ + /*const CoordinatesType begin( 0 ); + const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); + MeshFunction* _u = &u.template modifyData< Device >(); + Cell entity( *grid ); + for( Index x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.refresh(); + WriteOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity ); + }*/ } void traverseUsingPureC() diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h index cc360c349..d62d56f91 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h @@ -65,7 +65,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > { for( int i = 0; i < size; i++ ) for( int j = 0; j < size; j++ ) - v_data[ i * size + j ] = 1.0; + v_data[ i * size + j ] += 1.0; } else // Device == Devices::Cuda { @@ -150,8 +150,27 @@ class GridTraversersBenchmark< 2, Device, Real, Index > void writeOneUsingTraverser() { + using CoordinatesType = typename Grid::CoordinatesType; traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > ( grid, userData ); + + /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >( + grid, + 
CoordinatesType( 0 ), + grid->getDimensions() - CoordinatesType( 1 ), + userData );*/ + /*const CoordinatesType begin( 0 ); + const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); + MeshFunction* _u = &u.template modifyData< Device >(); + Cell entity( *grid ); + for( Index y = begin.y(); y <= end.y(); y ++ ) + for( Index x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.getCoordinates().y() = y; + entity.refresh(); + WriteOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity ); + }*/ } void traverseUsingPureC() diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h index 07ea6e5f8..383640d39 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h @@ -252,7 +252,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > GridPointer grid; MeshFunctionPointer u; Traverser traverser; - WriteOneTraverserUserDataType userData; + WriteOneTraverserUserDataType userData; }; } // namespace Traversers diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 56fbc151c..96a131f48 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -39,8 +39,8 @@ bool runBenchmark( const Config::ParameterContainer& parameters, // to pass 64-bit integer values // const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" ); // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); - const int minSize = parameters.getParameter< int >( "min-size" ); - const int maxSize = parameters.getParameter< int >( "max-size" ); + const std::size_t minSize = parameters.getParameter< int >( "min-size" ); + const std::size_t maxSize = parameters.getParameter< int >( "max-size" ); #ifdef HAVE_CUDA const 
bool withCuda = parameters.getParameter< bool >( "with-cuda" ); #else @@ -85,7 +85,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, { hostTraverserBenchmark.writeOneUsingPureC(); }; - benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingPureC ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); #ifdef HAVE_CUDA auto cudaWriteOneUsingPureC = [&] () @@ -95,13 +95,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( withCuda ) benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC ); #endif - benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + /*benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); #ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); -#endif +#endif*/ } /**** @@ -115,7 +115,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, { hostTraverserBenchmark.writeOneUsingParallelFor(); }; - benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA auto cudaWriteOneUsingParallelFor = [&] () @@ -123,14 +123,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.writeOneUsingParallelFor(); }; if( withCuda ) - benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelFor ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); #endif - benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + /*benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", 
hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); -#endif +#endif*/ } /**** @@ -143,7 +143,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, hostTraverserBenchmark.writeOneUsingParallelForAndGridEntity(); }; benchmark.setOperation( "par.for+grid ent.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelForAndGridEntity ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity ); #ifdef HAVE_CUDA auto cudaWriteOneUsingParallelForAndGridEntity = [&] () @@ -151,15 +151,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.writeOneUsingParallelForAndGridEntity(); }; if( withCuda ) - benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelForAndGridEntity ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity ); #endif - benchmark.setOperation( "par.for+grid ent. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + /*benchmark.setOperation( "par.for+grid ent. 
RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity ); #ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity ); -#endif +#endif*/ } /**** @@ -172,7 +172,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, hostTraverserBenchmark.writeOneUsingParallelForAndMeshFunction(); }; benchmark.setOperation( "par.for+mesh fc.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelForAndMeshFunction ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction ); #ifdef HAVE_CUDA auto cudaWriteOneUsingParallelForAndMeshFunction = [&] () @@ -180,15 +180,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.writeOneUsingParallelForAndMeshFunction(); }; if( withCuda ) - benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelForAndGridMeshFunction ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridMeshFunction ); #endif - benchmark.setOperation( "par.for+mesh fc. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + /*benchmark.setOperation( "par.for+mesh fc. 
RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction ); #ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndMeshFunction ); -#endif +#endif*/ } /**** @@ -211,14 +211,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser ); #endif - +/* benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingTraverser ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser ); #ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingTraverser ); -#endif +#endif*/ } + std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl; } /**** -- GitLab From fa981bc8ded19e3d361edc8ba6fe82dcc5e12629 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 3 Jan 2019 21:48:16 +0100 Subject: [PATCH 038/130] Refactoring. 
--- src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h b/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h index d3d2a129c..99ea85876 100644 --- a/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h +++ b/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h @@ -146,7 +146,7 @@ processAllEntities( gridPointer, CoordinatesType( 0 ), gridPointer->getDimensions() - CoordinatesType( 1 ), - userData ); + userData ); } else //Distributed { -- GitLab From 413f4e6fae234a956899d1def5a1c4762b327644 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 4 Jan 2019 12:00:24 +0100 Subject: [PATCH 039/130] Added method containsValue to List. --- src/TNL/Containers/List.h | 9 ++++++++- src/TNL/Containers/List_impl.h | 8 ++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/TNL/Containers/List.h b/src/TNL/Containers/List.h index 2c175bcce..0cf6f762d 100644 --- a/src/TNL/Containers/List.h +++ b/src/TNL/Containers/List.h @@ -109,6 +109,13 @@ template< class T > class List template< typename Array > void toArray( Array& array ); + /*** + * \brief Checks if there is an element with value \e v in given array. + * + * \param v Reference to a value. + */ + bool containsValue( const T& v ) const; + /// Erases data element at given position. /// /// \param ind Index of the data element one chooses to remove. @@ -146,7 +153,7 @@ template< class T > class List /// /// \param file Name of file. bool DeepLoad( File& file ); - + protected: /// Pointer to the first element. 
ListDataElement< T >* first; diff --git a/src/TNL/Containers/List_impl.h b/src/TNL/Containers/List_impl.h index e67be136c..36fd5dbdc 100644 --- a/src/TNL/Containers/List_impl.h +++ b/src/TNL/Containers/List_impl.h @@ -207,6 +207,14 @@ void List< T >::toArray( Array& array ) for( int i = 0; i < this->getSize(); i++ ) array[ i ] = ( *this )[ i ]; } +template< typename T > +bool List< T >::containsValue( const T& v ) const +{ + for( int i = 0; i < this->getSize(); i++ ) + if( ( *this )[ i ] == v ) + return true; + return false; +} template< typename T > void List< T >::Erase( const int& ind ) -- GitLab From 10d2d333c51c887c7fbd3555754d369891894ba5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 4 Jan 2019 12:00:53 +0100 Subject: [PATCH 040/130] Traversers benchmark tests can be configures as list of tests. --- .../Traversers/tnl-benchmark-traversers.h | 56 ++++--------------- 1 file changed, 12 insertions(+), 44 deletions(-) diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 96a131f48..fd14ba25c 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -20,6 +20,7 @@ #include #include #include +#include using namespace TNL; using namespace TNL::Benchmarks; @@ -33,7 +34,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, Benchmark& benchmark, Benchmark::MetadataMap& metadata ) { - const String tests = parameters.getParameter< String >( "tests" ); + const Containers::List< String >& tests = parameters.getParameter< Containers::List< String > >( "tests" ); // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(), // which have a default value. 
The workaround below works for int values, but it is not possible // to pass 64-bit integer values @@ -77,7 +78,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, /**** * Write one using C for */ - if( tests == "all" || tests == "no-bc-pure-c") + if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-pure-c" ) ) { benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); @@ -95,19 +96,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( withCuda ) benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC ); #endif - /*benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); - -#ifdef HAVE_CUDA - if( withCuda ) - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); -#endif*/ } /**** * Write one using parallel for */ - if( tests == "all" || tests == "no-bc-parallel-for" ) + if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for" ) ) { benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); @@ -125,18 +119,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); #endif - /*benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); -#ifdef HAVE_CUDA - if( withCuda ) - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); -#endif*/ } /**** * Write one using parallel for with grid entity */ - if( tests == "all" || tests == "no-bc-parallel-for-and-grid-entity" ) + if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for-and-grid-entity" ) ) 
{ auto hostWriteOneUsingParallelForAndGridEntity = [&] () { @@ -153,19 +141,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity ); #endif - - /*benchmark.setOperation( "par.for+grid ent. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity ); -#ifdef HAVE_CUDA - if( withCuda ) - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity ); -#endif*/ } /**** * Write one using parallel for with mesh function */ - if( tests == "all" || tests == "no-bc-parallel-for-and-mesh-function" ) + if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for-and-mesh-function" ) ) { auto hostWriteOneUsingParallelForAndMeshFunction = [&] () { @@ -180,21 +161,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.writeOneUsingParallelForAndMeshFunction(); }; if( withCuda ) - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridMeshFunction ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndMeshFunction ); #endif - /*benchmark.setOperation( "par.for+mesh fc. 
RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction ); -#ifdef HAVE_CUDA - if( withCuda ) - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndMeshFunction ); -#endif*/ } /**** * Write one using traverser */ - if( tests == "all" || tests == "no-bc-traverser" ) + if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-traverser" ) ) { benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); auto hostWriteOneUsingTraverser = [&] () @@ -211,13 +186,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser ); #endif -/* - benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser ); -#ifdef HAVE_CUDA - if( withCuda ) - benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingTraverser ); -#endif*/ } std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl; } @@ -262,7 +230,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; #endif - if( tests == "all" || tests == "bc-pure-c" ) + if( tests.containsValue( "all" ) || tests.containsValue( "bc-pure-c" ) ) { benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC ); @@ -294,7 +262,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; #endif - if( tests == "all" || tests == "bc-parallel-for" ) + if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) ) { benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) 
Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor ); @@ -326,7 +294,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; #endif - if( tests == "all" || tests == "bc-traverser" ) + if( tests.containsValue( "all" ) || tests.containsValue( "bc-traverser" ) ) { benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser ); @@ -346,7 +314,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, void setupConfig( Config::ConfigDescription& config ) { - config.addEntry< String >( "tests", "Tests to be performed.", "all" ); + config.addList< String >( "tests", "Tests to be performed.", "all" ); config.addEntryEnum( "all" ); config.addEntryEnum( "no-bc-pure-c" ); config.addEntryEnum( "no-bc-parallel-for" ); -- GitLab From f56a60b10ec8058351358a7e22136f4bb1a5a355 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 4 Jan 2019 12:21:40 +0100 Subject: [PATCH 041/130] Fixed CUDA travresers benchmark tests. 
--- src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h index 91097ecac..93ee77385 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -116,15 +116,15 @@ class GridTraversersBenchmark< 1, Device, Real, Index > { const Grid* currentGrid = &grid.template getData< Device >(); MeshFunction* _u = &u.template modifyData< Device >(); - auto f = [=] __cuda_callable__ ( Index i, Real* data ) + auto f = [=] __cuda_callable__ ( Index i ) { - Cell entity( grid.template getData< Device >() ); + Cell entity( *currentGrid ); entity.getCoordinates().x() = i; entity.refresh(); - //( *_u )( entity ) += 1.0; - WriteOneEntitiesProcessorType::processEntity( *currentGrid, userData, entity ); + ( *_u )( entity ) += 1.0; + //WriteOneEntitiesProcessorType::processEntity( *currentGrid, userData, entity ); }; - ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); + ParallelFor< Device >::exec( ( Index ) 0, size, f ); } void writeOneUsingTraverser() -- GitLab From bb4a7186b887f7333a71bd57587b4763440a1332 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 4 Jan 2019 15:38:30 +0100 Subject: [PATCH 042/130] Fixing traversers benchmark kernels. 
--- src/Benchmarks/Traversers/cuda-kernels.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/Benchmarks/Traversers/cuda-kernels.h b/src/Benchmarks/Traversers/cuda-kernels.h index 2cd8b1b56..2802b73eb 100644 --- a/src/Benchmarks/Traversers/cuda-kernels.h +++ b/src/Benchmarks/Traversers/cuda-kernels.h @@ -27,7 +27,7 @@ __global__ void fullGridTraverseKernel1D( const Index size, const dim3 gridIdx, { const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( threadIdx_x < size ) - v_data[ threadIdx_x ] = 1.0; + v_data[ threadIdx_x ] += 1.0; } template< typename Real, @@ -37,7 +37,7 @@ __global__ void fullGridTraverseKernel2D( const Index size, const dim3 gridIdx, const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; if( threadIdx_x < size && threadIdx_y < size ) - v_data[ threadIdx_y * size + threadIdx_x ] = 1.0; + v_data[ threadIdx_y * size + threadIdx_x ] += 1.0; } template< typename Real, @@ -48,7 +48,7 @@ __global__ void fullGridTraverseKernel3D( const Index size, const dim3 gridIdx, const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size ) - v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0; + v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += 1.0; } /**** @@ -60,7 +60,7 @@ __global__ void interiorTraverseKernel1D( const Index size, const dim3 gridIdx, { const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( threadIdx_x > 0 && threadIdx_x < size - 1 
) - v_data[ threadIdx_x ] = 1.0; + v_data[ threadIdx_x ] += 1.0; } template< typename Real, @@ -71,7 +71,7 @@ __global__ void interiorTraverseKernel2D( const Index size, const dim3 gridIdx, const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; if( threadIdx_x > 0 && threadIdx_y > 0 && threadIdx_x < size - 1 && threadIdx_y < size - 1 ) - v_data[ threadIdx_y * size + threadIdx_x ] = 1.0; + v_data[ threadIdx_y * size + threadIdx_x ] += 1.0; } template< typename Real, @@ -83,7 +83,7 @@ __global__ void interiorTraverseKernel3D( const Index size, const dim3 gridIdx, const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; if( threadIdx_x > 0 && threadIdx_y > 0 && threadIdx_z > 0 && threadIdx_x < size - 1 && threadIdx_y < size - 1 && threadIdx_z < size - 1 ) - v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0; + v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += 1.0; } /**** @@ -95,7 +95,7 @@ __global__ void boundariesTraverseKernel1D( const Index size, const dim3 gridIdx { const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( threadIdx_x == 0 || threadIdx_x == size - 1 ) - v_data[ threadIdx_x ] = 2.0; + v_data[ threadIdx_x ] += 2.0; } template< typename Real, @@ -106,7 +106,7 @@ __global__ void boundariesTraverseKernel2D( const Index size, const dim3 gridIdx const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; if( threadIdx_x > 0 && threadIdx_y > 0 && threadIdx_x < size - 1 && threadIdx_y < size - 1 ) - v_data[ threadIdx_y * size + threadIdx_x ] = 2.0; + v_data[ threadIdx_y * size + threadIdx_x ] += 2.0; } template< typename Real, @@ -118,7 +118,7 @@ __global__ void boundariesTraverseKernel3D( const Index size, const dim3 gridIdx const Index threadIdx_z = ( gridIdx.z * 
Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; if( threadIdx_x == 0 || threadIdx_y == 0 || threadIdx_z == 0 || threadIdx_x == size - 1 || threadIdx_y == size - 1 || threadIdx_z == size - 1 ) - v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 2.0; + v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += 2.0; } #endif -- GitLab From 41662ed72e36422f89cb90d9d151bdce892401a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 4 Jan 2019 16:55:59 +0100 Subject: [PATCH 043/130] Fixed tnl-benchmark-traversers.h --- src/Benchmarks/Traversers/tnl-benchmark-traversers.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index fd14ba25c..9f70589c9 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -94,7 +94,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.writeOneUsingPureC(); }; if( withCuda ) - benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); #endif } @@ -297,15 +297,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( tests.containsValue( "all" ) || tests.containsValue( "bc-traverser" ) ) { benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser ); + benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser ); + benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser ); #endif benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * 
sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser ); #endif } } -- GitLab From a90a64d1c2873dcd9089d405b8f08cbb63897747 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 4 Jan 2019 18:32:06 +0100 Subject: [PATCH 044/130] GridTraverser_impl.h splitted into GridTraverser_1D.hpp, GridTraverser_2D.hpp and GridTraverser_3D.hpp. --- CMakeLists.txt | 4 +- .../Meshes/GridDetails/GridTraverser_1D.hpp | 290 ++++ .../Meshes/GridDetails/GridTraverser_2D.hpp | 648 ++++++++ .../Meshes/GridDetails/GridTraverser_3D.hpp | 551 +++++++ .../Meshes/GridDetails/GridTraverser_impl.h | 1436 ----------------- 5 files changed, 1491 insertions(+), 1438 deletions(-) create mode 100644 src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp create mode 100644 src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp create mode 100644 src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp delete mode 100644 src/TNL/Meshes/GridDetails/GridTraverser_impl.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 8784170f8..2c1adce6b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,7 +78,7 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") endif() # set Debug/Release options -set( CMAKE_CXX_FLAGS "-std=c++11 -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable" ) +set( CMAKE_CXX_FLAGS "-std=c++11 -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable -g" ) set( CMAKE_CXX_FLAGS_DEBUG "-g" ) set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG" ) #set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG -ftree-vectorizer-verbose=1 -ftree-vectorize -fopt-info-vec-missed -funroll-loops" ) @@ -233,7 +233,7 @@ if( ${WITH_CUDA} ) endif() endif() endif() 
- set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; ${CUDA_ARCH} -D_FORCE_INLINES ) + set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; ${CUDA_ARCH} -D_FORCE_INLINES --generate-line-info) # TODO: this is necessary only due to a bug in cmake set( CUDA_ADD_LIBRARY_OPTIONS -shared ) endif() diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp new file mode 100644 index 000000000..90148f8e8 --- /dev/null +++ b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp @@ -0,0 +1,290 @@ +/*************************************************************************** + GridTraverser_1D.hpp - description + ------------------- + begin : Jan 4, 2019 + copyright : (C) 2019 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber, +// Jakub Klinkovsky, +// Vit Hanousek + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace TNL { +namespace Meshes { + +/**** + * 1D traverser, host + */ +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities > +void +GridTraverser< Meshes::Grid< 1, Real, Devices::Host, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType begin, + const CoordinatesType end, + UserData& userData, + const int& stream ) +{ + GridEntity entity( *gridPointer ); + if( processOnlyBoundaryEntities ) + { + GridEntity entity( *gridPointer ); + + entity.getCoordinates() = begin; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + entity.getCoordinates() = end; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + else + { +#ifdef HAVE_OPENMP + if( Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 ) + { 
+#pragma omp parallel firstprivate( begin, end ) + { + GridEntity entity( *gridPointer ); +#pragma omp for + // TODO: g++ 5.5 crashes when coding this loop without auxiliary x as bellow + for( IndexType x = begin.x(); x <= end.x(); x++ ) + { + entity.getCoordinates().x() = x; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } + } + else + { + GridEntity entity( *gridPointer ); + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } +#else + GridEntity entity( *gridPointer ); + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +#endif + } +} + +/**** + * 1D traverser, CUDA + */ +#ifdef HAVE_CUDA +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor > +__global__ void +GridTraverser1D( + const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid, + UserData userData, + const typename GridEntity::CoordinatesType begin, + const typename GridEntity::CoordinatesType end, + const Index gridIdx ) +{ + typedef Real RealType; + typedef Index IndexType; + typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( coordinates <= end ) + { + GridEntity entity( *grid, coordinates ); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +} + +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor > +__global__ void +GridBoundaryTraverser1D( + 
const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid, + UserData userData, + const typename GridEntity::CoordinatesType begin, + const typename GridEntity::CoordinatesType end ) +{ + typedef Real RealType; + typedef Index IndexType; + typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + if( threadIdx.x == 0 ) + { + coordinates.x() = begin.x(); + GridEntity entity( *grid, coordinates ); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + if( threadIdx.x == 1 ) + { + coordinates.x() = end.x(); + GridEntity entity( *grid, coordinates ); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +} + +#endif + +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities > +void +GridTraverser< Meshes::Grid< 1, Real, Devices::Cuda, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType& begin, + const CoordinatesType& end, + UserData& userData, + const int& stream ) +{ +#ifdef HAVE_CUDA + auto& pool = CudaStreamPool::getInstance(); + const cudaStream_t& s = pool.getStream( stream ); + + Devices::Cuda::synchronizeDevice(); + if( processOnlyBoundaryEntities ) + { + dim3 cudaBlockSize( 2 ); + dim3 cudaBlocks( 1 ); + GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > + <<< cudaBlocks, cudaBlockSize, 0, s >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin, + end ); + } + else + { + dim3 cudaBlockSize( 256 ); + dim3 cudaBlocks; + cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); + const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); + + for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) + GridTraverser1D< Real, Index, GridEntity, UserData, 
EntitiesProcessor > + <<< cudaBlocks, cudaBlockSize, 0, s >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin, + end, + gridXIdx ); + } + + // only launches into the stream 0 are synchronized + /*if( stream == 0 ) + { + cudaStreamSynchronize( s ); + TNL_CHECK_CUDA_DEVICE; + }*/ +#else + throw Exceptions::CudaSupportMissing(); +#endif +} + +/**** + * 1D traverser, MIC + */ + +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities > +void +GridTraverser< Meshes::Grid< 1, Real, Devices::MIC, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType& begin, + const CoordinatesType& end, + UserData& userData, + const int& stream ) +{ + std::cout << "Not Implemented yet Grid Traverser <1, Real, Device::MIC>" << std::endl; +/* + auto& pool = CudaStreamPool::getInstance(); + const cudaStream_t& s = pool.getStream( stream ); + + Devices::Cuda::synchronizeDevice(); + if( processOnlyBoundaryEntities ) + { + dim3 cudaBlockSize( 2 ); + dim3 cudaBlocks( 1 ); + GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > + <<< cudaBlocks, cudaBlockSize, 0, s >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin, + end ); + } + else + { + dim3 cudaBlockSize( 256 ); + dim3 cudaBlocks; + cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); + const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); + + for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) + GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > + <<< cudaBlocks, cudaBlockSize, 0, s >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin, + end, + gridXIdx ); + } + + // only launches into the stream 0 are synchronized + if( stream == 0 ) + { + cudaStreamSynchronize( s ); + TNL_CHECK_CUDA_DEVICE; + } +*/ 
+} + + } // namespace Meshes +} // namespace TNL diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp new file mode 100644 index 000000000..84e496017 --- /dev/null +++ b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp @@ -0,0 +1,648 @@ +/*************************************************************************** + GridTraverser_2D.hpp - description + ------------------- + begin : Jan 4, 2019 + copyright : (C) 2019 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace TNL { +namespace Meshes { + +//#define GRID_TRAVERSER_USE_STREAMS + + +/**** + * 2D traverser, host + */ +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities, + int XOrthogonalBoundary, + int YOrthogonalBoundary, + typename... GridEntityParameters > +void +GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType begin, + const CoordinatesType end, + UserData& userData, + const int& stream, + const GridEntityParameters&... gridEntityParameters ) +{ + if( processOnlyBoundaryEntities ) + { + GridEntity entity( *gridPointer, begin, gridEntityParameters... 
); + + if( YOrthogonalBoundary ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.getCoordinates().y() = begin.y(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + entity.getCoordinates().y() = end.y(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + if( XOrthogonalBoundary ) + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + { + entity.getCoordinates().x() = begin.x(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + entity.getCoordinates().x() = end.x(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } + else + { +#ifdef HAVE_OPENMP + if( Devices::Host::isOMPEnabled() ) + { +#pragma omp parallel firstprivate( begin, end ) + { + GridEntity entity( *gridPointer ); +#pragma omp for + // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow + for( IndexType y = begin.y(); y <= end.y(); y ++ ) + for( IndexType x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.getCoordinates().y() = y; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } + } + else + { + GridEntity entity( *gridPointer ); + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } +#else + GridEntity entity( *gridPointer ); + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) 
+ for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +#endif + } +} + +/**** + * 2D traverser, CUDA + */ +#ifdef HAVE_CUDA +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor, + bool processOnlyBoundaryEntities, + typename... GridEntityParameters > +__global__ void +GridTraverser2D( + const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, + UserData userData, + const typename GridEntity::CoordinatesType begin, + const typename GridEntity::CoordinatesType end, + const dim3 gridIdx, + const GridEntityParameters... gridEntityParameters ) +{ + typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); + + if( coordinates <= end ) + { + GridEntity entity( *grid, coordinates, gridEntityParameters... ); + entity.refresh(); + if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() ) + { + EntitiesProcessor::processEntity + ( *grid, + userData, + entity ); + } + } +} + +// Boundary traverser using streams +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor, + bool processOnlyBoundaryEntities, + typename... GridEntityParameters > +__global__ void +GridTraverser2DBoundaryAlongX( + const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, + UserData userData, + const Index beginX, + const Index endX, + const Index fixedY, + const dim3 gridIdx, + const GridEntityParameters... 
gridEntityParameters ) +{ + typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.y() = fixedY; + + if( coordinates.x() <= endX ) + { + GridEntity entity( *grid, coordinates, gridEntityParameters... ); + entity.refresh(); + EntitiesProcessor::processEntity + ( *grid, + userData, + entity ); + } +} + +// Boundary traverser using streams +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor, + bool processOnlyBoundaryEntities, + typename... GridEntityParameters > +__global__ void +GridTraverser2DBoundaryAlongY( + const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, + UserData userData, + const Index beginY, + const Index endY, + const Index fixedX, + const dim3 gridIdx, + const GridEntityParameters... gridEntityParameters ) +{ + typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = fixedX; + coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + + if( coordinates.y() <= endY ) + { + GridEntity entity( *grid, coordinates, gridEntityParameters... ); + entity.refresh(); + EntitiesProcessor::processEntity + ( *grid, + userData, + entity ); + } +} + + +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor, + bool processOnlyBoundaryEntities, + typename... GridEntityParameters > +__global__ void +GridTraverser2DBoundary( + const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, + UserData userData, + const Index beginX, + const Index endX, + const Index beginY, + const Index endY, + const Index blocksPerFace, + const dim3 gridIdx, + const GridEntityParameters... 
gridEntityParameters ) +{ + using GridType = Meshes::Grid< 2, Real, Devices::Cuda, Index >; + using CoordinatesType = typename GridType::CoordinatesType; + + const Index faceIdx = blockIdx.x / blocksPerFace; + const Index faceBlockIdx = blockIdx.x % blocksPerFace; + const Index threadId = faceBlockIdx * blockDim. x + threadIdx.x; + if( faceIdx < 2 ) + { + const Index entitiesAlongX = endX - beginX + 1; + if( threadId < entitiesAlongX ) + { + GridEntity entity( *grid, + CoordinatesType( beginX + threadId, faceIdx == 0 ? beginY : endY ), + gridEntityParameters... ); + //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); + entity.refresh(); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + } + else + { + const Index entitiesAlongY = endY - beginY - 1; + if( threadId < entitiesAlongY ) + { + GridEntity entity( *grid, + CoordinatesType( faceIdx == 2 ? beginX : endX, beginY + threadId + 1 ), + gridEntityParameters... ); + //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); + entity.refresh(); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + } + + + + /*const Index aux = max( entitiesAlongX, entitiesAlongY ); + const Index& warpSize = Devices::Cuda::getWarpSize(); + const Index threadsPerAxis = warpSize * ( aux / warpSize + ( aux % warpSize != 0 ) ); + + Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + GridEntity entity( *grid, + CoordinatesType( 0, 0 ), + gridEntityParameters... 
); + CoordinatesType& coordinates = entity.getCoordinates(); + const Index axisIndex = threadId / threadsPerAxis; + //printf( "axisIndex %d, threadId %d thradsPerAxis %d \n", axisIndex, threadId, threadsPerAxis ); + threadId -= axisIndex * threadsPerAxis; + switch( axisIndex ) + { + case 1: + coordinates = CoordinatesType( beginX + threadId, beginY ); + if( threadId < entitiesAlongX ) + { + //printf( "X1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() ); + entity.refresh(); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + break; + case 2: + coordinates = CoordinatesType( beginX + threadId, endY ); + if( threadId < entitiesAlongX ) + { + //printf( "X2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() ); + entity.refresh(); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + break; + case 3: + coordinates = CoordinatesType( beginX, beginY + threadId + 1 ); + if( threadId < entitiesAlongY ) + { + //printf( "Y1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() ); + entity.refresh(); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + break; + case 4: + coordinates = CoordinatesType( endX, beginY + threadId + 1 ); + if( threadId < entitiesAlongY ) + { + //printf( "Y2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() ); + entity.refresh(); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + break; + }*/ + + /*if( threadId < entitiesAlongX ) + { + GridEntity entity( *grid, + CoordinatesType( beginX + threadId, beginY ), + gridEntityParameters... 
); + //printf( "X1: Thread %d -> %d %d x %d %d \n ", threadId, + // entity.getCoordinates().x(), entity.getCoordinates().y(), + // grid->getDimensions().x(), grid->getDimensions().y() ); + entity.refresh(); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + else if( ( threadId -= entitiesAlongX ) < entitiesAlongX && threadId >= 0 ) + { + GridEntity entity( *grid, + CoordinatesType( beginX + threadId, endY ), + gridEntityParameters... ); + entity.refresh(); + //printf( "X2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + else if( ( ( threadId -= entitiesAlongX ) < entitiesAlongY - 1 ) && threadId >= 0 ) + { + GridEntity entity( *grid, + CoordinatesType( beginX, beginY + threadId + 1 ), + gridEntityParameters... ); + entity.refresh(); + //printf( "Y1: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + else if( ( ( threadId -= entitiesAlongY - 1 ) < entitiesAlongY - 1 ) && threadId >= 0 ) + { + GridEntity entity( *grid, + CoordinatesType( endX, beginY + threadId + 1 ), + gridEntityParameters... ); + entity.refresh(); + //printf( "Y2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); + EntitiesProcessor::processEntity( *grid, userData, entity ); + }*/ +} + + +#endif // HAVE_CUDA + +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities, + int XOrthogonalBoundary, + int YOrthogonalBoundary, + typename... 
GridEntityParameters > +void +GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType& begin, + const CoordinatesType& end, + UserData& userData, + const int& stream, + const GridEntityParameters&... gridEntityParameters ) +{ +#ifdef HAVE_CUDA + if( processOnlyBoundaryEntities && + ( GridEntity::getEntityDimension() == 2 || GridEntity::getEntityDimension() == 0 ) ) + { +#ifdef GRID_TRAVERSER_USE_STREAMS + dim3 cudaBlockSize( 256 ); + dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX, + cudaBlocksCountAlongY, cudaGridsCountAlongY; + Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 ); + Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 ); + + auto& pool = CudaStreamPool::getInstance(); + Devices::Cuda::synchronizeDevice(); + + const cudaStream_t& s1 = pool.getStream( stream ); + const cudaStream_t& s2 = pool.getStream( stream + 1 ); + dim3 gridIdx, cudaGridSize; + for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongX.x; gridIdx.x++ ) + { + Devices::Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize ); + //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX ); + GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaGridSize, cudaBlockSize, 0, s1 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.x(), + end.x(), + begin.y(), + gridIdx, + gridEntityParameters... ); + GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... 
> + <<< cudaGridSize, cudaBlockSize, 0, s2 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.x(), + end.x(), + end.y(), + gridIdx, + gridEntityParameters... ); + } + const cudaStream_t& s3 = pool.getStream( stream + 2 ); + const cudaStream_t& s4 = pool.getStream( stream + 3 ); + for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongY.x; gridIdx.x++ ) + { + Devices::Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize ); + GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaGridSize, cudaBlockSize, 0, s3 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.y() + 1, + end.y() - 1, + begin.x(), + gridIdx, + gridEntityParameters... ); + GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaGridSize, cudaBlockSize, 0, s4 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.y() + 1, + end.y() - 1, + end.x(), + gridIdx, + gridEntityParameters... 
); + } + cudaStreamSynchronize( s1 ); + cudaStreamSynchronize( s2 ); + cudaStreamSynchronize( s3 ); + cudaStreamSynchronize( s4 ); +#else // not defined GRID_TRAVERSER_USE_STREAMS + dim3 cudaBlockSize( 256 ); + dim3 cudaBlocksCount, cudaGridsCount; + const IndexType entitiesAlongX = end.x() - begin.x() + 1; + const IndexType entitiesAlongY = end.x() - begin.x() - 1; + const IndexType maxFaceSize = max( entitiesAlongX, entitiesAlongY ); + const IndexType blocksPerFace = maxFaceSize / cudaBlockSize.x + ( maxFaceSize % cudaBlockSize.x != 0 ); + IndexType cudaThreadsCount = 4 * cudaBlockSize.x * blocksPerFace; + Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount ); + //std::cerr << "blocksPerFace = " << blocksPerFace << "Threads count = " << cudaThreadsCount + // << "cudaBlockCount = " << cudaBlocksCount.x << std::endl; + dim3 gridIdx, cudaGridSize; + Devices::Cuda::synchronizeDevice(); + for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ ) + { + Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize ); + //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX ); + GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaGridSize, cudaBlockSize >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.x(), + end.x(), + begin.y(), + end.y(), + blocksPerFace, + gridIdx, + gridEntityParameters... 
); + } +#endif //GRID_TRAVERSER_USE_STREAMS + //getchar(); + TNL_CHECK_CUDA_DEVICE; + } + else + { + dim3 cudaBlockSize( 16, 16 ); + dim3 cudaBlocksCount, cudaGridsCount; + Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, + end.x() - begin.x() + 1, + end.y() - begin.y() + 1 ); + + auto& pool = CudaStreamPool::getInstance(); + const cudaStream_t& s = pool.getStream( stream ); + + Devices::Cuda::synchronizeDevice(); + dim3 gridIdx, cudaGridSize; + for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ ) + for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ ) + { + Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize ); + //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount ); + GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaGridSize, cudaBlockSize, 0, s >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin, + end, + gridIdx, + gridEntityParameters... ); + } + + // only launches into the stream 0 are synchronized + if( stream == 0 ) + { + cudaStreamSynchronize( s ); + TNL_CHECK_CUDA_DEVICE; + } + } +#else + throw Exceptions::CudaSupportMissing(); +#endif +} + + +/**** + * 2D traverser, MIC + */ +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities, + int XOrthogonalBoundary, + int YOrthogonalBoundary, + typename... GridEntityParameters > +void +GridTraverser< Meshes::Grid< 2, Real, Devices::MIC, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType& begin, + const CoordinatesType& end, + UserData& userData, + const int& stream, + const GridEntityParameters&... 
gridEntityParameters ) +{ + + +#ifdef HAVE_MIC + Devices::MIC::synchronizeDevice(); + + //TOHLE JE PRUSER -- nemim poslat vypustku -- + //GridEntity entity( gridPointer.template getData< Devices::MIC >(), begin, gridEntityParameters... ); + + + Devices::MICHider hMicGrid; + hMicGrid.pointer=& gridPointer.template getData< Devices::MIC >(); + Devices::MICHider hMicUserData; + hMicUserData.pointer=& userDataPointer.template modifyData(); + TNLMICSTRUCT(begin, const CoordinatesType); + TNLMICSTRUCT(end, const CoordinatesType); + + #pragma offload target(mic) in(sbegin,send,hMicUserData,hMicGrid) + { + + #pragma omp parallel firstprivate( sbegin, send ) + { + TNLMICSTRUCTUSE(begin, const CoordinatesType); + TNLMICSTRUCTUSE(end, const CoordinatesType); + GridEntity entity( *(hMicGrid.pointer), *(kernelbegin) ); + + if( processOnlyBoundaryEntities ) + { + if( YOrthogonalBoundary ) + #pragma omp for + for( auto k = kernelbegin->x(); + k <= kernelend->x(); + k ++ ) + { + entity.getCoordinates().x() = k; + entity.getCoordinates().y() = kernelbegin->y(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity ); + entity.getCoordinates().y() = kernelend->y(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity ); + } + if( XOrthogonalBoundary ) + #pragma omp for + for( auto k = kernelbegin->y(); + k <= kernelend->y(); + k ++ ) + { + entity.getCoordinates().y() = k; + entity.getCoordinates().x() = kernelbegin->x(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity ); + entity.getCoordinates().x() = kernelend->x(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity ); + } + } + else + { + #pragma omp for + for( IndexType y = kernelbegin->y(); y <= kernelend->y(); y ++ ) + for( IndexType x = kernelbegin->x(); x <= kernelend->x(); x ++ ) + { + // std::cerr << x << " " 
< +#include +#include +#include +#include +#include + +namespace TNL { +namespace Meshes { + + +/**** + * 3D traverser, host + */ +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities, + int XOrthogonalBoundary, + int YOrthogonalBoundary, + int ZOrthogonalBoundary, + typename... GridEntityParameters > +void +GridTraverser< Meshes::Grid< 3, Real, Devices::Host, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType begin, + const CoordinatesType end, + UserData& userData, + const int& stream, + const GridEntityParameters&... gridEntityParameters ) +{ + if( processOnlyBoundaryEntities ) + { + GridEntity entity( *gridPointer, begin, gridEntityParameters... ); + + if( ZOrthogonalBoundary ) + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.getCoordinates().z() = begin.z(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + entity.getCoordinates().z() = end.z(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + if( YOrthogonalBoundary ) + for( entity.getCoordinates().z() = begin.z(); + entity.getCoordinates().z() <= end.z(); + entity.getCoordinates().z() ++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.getCoordinates().y() = begin.y(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + entity.getCoordinates().y() = end.y(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + if( XOrthogonalBoundary ) + for( entity.getCoordinates().z() = 
begin.z(); + entity.getCoordinates().z() <= end.z(); + entity.getCoordinates().z() ++ ) + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + { + entity.getCoordinates().x() = begin.x(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + entity.getCoordinates().x() = end.x(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } + else + { +#ifdef HAVE_OPENMP + if( Devices::Host::isOMPEnabled() ) + { +#pragma omp parallel firstprivate( begin, end ) + { + GridEntity entity( *gridPointer ); +#pragma omp for + // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow + for( IndexType z = begin.z(); z <= end.z(); z ++ ) + for( IndexType y = begin.y(); y <= end.y(); y ++ ) + for( IndexType x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.getCoordinates().y() = y; + entity.getCoordinates().z() = z; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } + } + else + { + GridEntity entity( *gridPointer ); + for( entity.getCoordinates().z() = begin.z(); + entity.getCoordinates().z() <= end.z(); + entity.getCoordinates().z() ++ ) + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } +#else + GridEntity entity( *gridPointer ); + for( entity.getCoordinates().z() = begin.z(); + entity.getCoordinates().z() <= end.z(); + entity.getCoordinates().z() ++ ) + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + for( entity.getCoordinates().x() = 
begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +#endif + } +} + +/**** + * 3D traverser, CUDA + */ +#ifdef HAVE_CUDA +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor, + bool processOnlyBoundaryEntities, + typename... GridEntityParameters > +__global__ void +GridTraverser3D( + const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, + UserData userData, + const typename GridEntity::CoordinatesType begin, + const typename GridEntity::CoordinatesType end, + const dim3 gridIdx, + const GridEntityParameters... gridEntityParameters ) +{ + typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); + coordinates.z() = begin.z() + Devices::Cuda::getGlobalThreadIdx_z( gridIdx ); + + if( coordinates <= end ) + { + GridEntity entity( *grid, coordinates, gridEntityParameters... ); + entity.refresh(); + if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() ) + { + EntitiesProcessor::processEntity + ( *grid, + userData, + entity ); + } + } +} + +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor, + bool processOnlyBoundaryEntities, + typename... GridEntityParameters > +__global__ void +GridTraverser3DBoundaryAlongXY( + const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, + UserData userData, + const Index beginX, + const Index endX, + const Index beginY, + const Index endY, + const Index fixedZ, + const dim3 gridIdx, + const GridEntityParameters... 
gridEntityParameters ) +{ + typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); + coordinates.z() = fixedZ; + + if( coordinates.x() <= endX && coordinates.y() <= endY ) + { + GridEntity entity( *grid, coordinates, gridEntityParameters... ); + entity.refresh(); + EntitiesProcessor::processEntity + ( *grid, + userData, + entity ); + } +} + +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor, + bool processOnlyBoundaryEntities, + typename... GridEntityParameters > +__global__ void +GridTraverser3DBoundaryAlongXZ( + const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, + UserData userData, + const Index beginX, + const Index endX, + const Index beginZ, + const Index endZ, + const Index fixedY, + const dim3 gridIdx, + const GridEntityParameters... gridEntityParameters ) +{ + typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.y() = fixedY; + coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); + + if( coordinates.x() <= endX && coordinates.z() <= endZ ) + { + GridEntity entity( *grid, coordinates, gridEntityParameters... ); + entity.refresh(); + EntitiesProcessor::processEntity + ( *grid, + userData, + entity ); + } +} + +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor, + bool processOnlyBoundaryEntities, + typename... 
GridEntityParameters > +__global__ void +GridTraverser3DBoundaryAlongYZ( + const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, + UserData userData, + const Index beginY, + const Index endY, + const Index beginZ, + const Index endZ, + const Index fixedX, + const dim3 gridIdx, + const GridEntityParameters... gridEntityParameters ) +{ + typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = fixedX; + coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); + + if( coordinates.y() <= endY && coordinates.z() <= endZ ) + { + GridEntity entity( *grid, coordinates, gridEntityParameters... ); + entity.refresh(); + EntitiesProcessor::processEntity + ( *grid, + userData, + entity ); + } +} +#endif + +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities, + int XOrthogonalBoundary, + int YOrthogonalBoundary, + int ZOrthogonalBoundary, + typename... GridEntityParameters > +void +GridTraverser< Meshes::Grid< 3, Real, Devices::Cuda, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType& begin, + const CoordinatesType& end, + UserData& userData, + const int& stream, + const GridEntityParameters&... 
gridEntityParameters ) +{ +#ifdef HAVE_CUDA + if( processOnlyBoundaryEntities && + ( GridEntity::getEntityDimension() == 3 || GridEntity::getEntityDimension() == 0 ) ) + { + dim3 cudaBlockSize( 16, 16 ); + const IndexType entitiesAlongX = end.x() - begin.x() + 1; + const IndexType entitiesAlongY = end.y() - begin.y() + 1; + const IndexType entitiesAlongZ = end.z() - begin.z() + 1; + + dim3 cudaBlocksCountAlongXY, cudaBlocksCountAlongXZ, cudaBlocksCountAlongYZ, + cudaGridsCountAlongXY, cudaGridsCountAlongXZ, cudaGridsCountAlongYZ; + + Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXY, cudaGridsCountAlongXY, entitiesAlongX, entitiesAlongY ); + Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, entitiesAlongX, entitiesAlongZ - 2 ); + Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, entitiesAlongY - 2, entitiesAlongZ - 2 ); + + auto& pool = CudaStreamPool::getInstance(); + Devices::Cuda::synchronizeDevice(); + + const cudaStream_t& s1 = pool.getStream( stream ); + const cudaStream_t& s2 = pool.getStream( stream + 1 ); + const cudaStream_t& s3 = pool.getStream( stream + 2 ); + const cudaStream_t& s4 = pool.getStream( stream + 3 ); + const cudaStream_t& s5 = pool.getStream( stream + 4 ); + const cudaStream_t& s6 = pool.getStream( stream + 5 ); + + dim3 gridIdx, gridSize; + for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXY.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXY.x; gridIdx.x++ ) + { + Devices::Cuda::setupGrid( cudaBlocksCountAlongXY, cudaGridsCountAlongXY, gridIdx, gridSize ); + GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaBlocksCountAlongXY, cudaBlockSize, 0 , s1 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.x(), + end.x(), + begin.y(), + end.y(), + begin.z(), + gridIdx, + gridEntityParameters... 
); + GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaBlocksCountAlongXY, cudaBlockSize, 0, s2 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.x(), + end.x(), + begin.y(), + end.y(), + end.z(), + gridIdx, + gridEntityParameters... ); + } + for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXZ.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXZ.x; gridIdx.x++ ) + { + Devices::Cuda::setupGrid( cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, gridIdx, gridSize ); + GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s3 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.x(), + end.x(), + begin.z() + 1, + end.z() - 1, + begin.y(), + gridIdx, + gridEntityParameters... ); + GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s4 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.x(), + end.x(), + begin.z() + 1, + end.z() - 1, + end.y(), + gridIdx, + gridEntityParameters... ); + } + for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongYZ.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongYZ.x; gridIdx.x++ ) + { + Devices::Cuda::setupGrid( cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, gridIdx, gridSize ); + GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s5 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.y() + 1, + end.y() - 1, + begin.z() + 1, + end.z() - 1, + begin.x(), + gridIdx, + gridEntityParameters... 
); + GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s6 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.y() + 1, + end.y() - 1, + begin.z() + 1, + end.z() - 1, + end.x(), + gridIdx, + gridEntityParameters... ); + } + cudaStreamSynchronize( s1 ); + cudaStreamSynchronize( s2 ); + cudaStreamSynchronize( s3 ); + cudaStreamSynchronize( s4 ); + cudaStreamSynchronize( s5 ); + cudaStreamSynchronize( s6 ); + TNL_CHECK_CUDA_DEVICE; + } + else + { + dim3 cudaBlockSize( 8, 8, 8 ); + dim3 cudaBlocksCount, cudaGridsCount; + + Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, + end.x() - begin.x() + 1, + end.y() - begin.y() + 1, + end.z() - begin.z() + 1 ); + + auto& pool = CudaStreamPool::getInstance(); + const cudaStream_t& s = pool.getStream( stream ); + + Devices::Cuda::synchronizeDevice(); + dim3 gridIdx, gridSize; + for( gridIdx.z = 0; gridIdx.z < cudaGridsCount.z; gridIdx.z ++ ) + for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ ) + for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ ) + { + Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, gridSize ); + GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< gridSize, cudaBlockSize, 0, s >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin, + end, + gridIdx, + gridEntityParameters... 
); + } + + // only launches into the stream 0 are synchronized + if( stream == 0 ) + { + cudaStreamSynchronize( s ); + TNL_CHECK_CUDA_DEVICE; + } + } +#else + throw Exceptions::CudaSupportMissing(); +#endif +} + +/**** + * 3D traverser, MIC + */ +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities, + int XOrthogonalBoundary, + int YOrthogonalBoundary, + int ZOrthogonalBoundary, + typename... GridEntityParameters > +void +GridTraverser< Meshes::Grid< 3, Real, Devices::MIC, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType& begin, + const CoordinatesType& end, + UserData& userData, + const int& stream, + const GridEntityParameters&... gridEntityParameters ) +{ + std::cout << "Not Implemented yet Grid Traverser <3, Real, Device::MIC>" << std::endl; + +/* HAVE_CUDA + dim3 cudaBlockSize( 8, 8, 8 ); + dim3 cudaBlocks; + cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); + cudaBlocks.y = Devices::Cuda::getNumberOfBlocks( end.y() - begin.y() + 1, cudaBlockSize.y ); + cudaBlocks.z = Devices::Cuda::getNumberOfBlocks( end.z() - begin.z() + 1, cudaBlockSize.z ); + const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); + const IndexType cudaYGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.y ); + const IndexType cudaZGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.z ); + + auto& pool = CudaStreamPool::getInstance(); + const cudaStream_t& s = pool.getStream( stream ); + + Devices::Cuda::synchronizeDevice(); + for( IndexType gridZIdx = 0; gridZIdx < cudaZGrids; gridZIdx ++ ) + for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ ) + for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) + GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... 
> + <<< cudaBlocks, cudaBlockSize, 0, s >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin, + end, + gridXIdx, + gridYIdx, + gridZIdx, + gridEntityParameters... ); + + // only launches into the stream 0 are synchronized + if( stream == 0 ) + { + cudaStreamSynchronize( s ); + TNL_CHECK_CUDA_DEVICE; + } + */ +} + } // namespace Meshes +} // namespace TNL diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h deleted file mode 100644 index 33b5e22eb..000000000 --- a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h +++ /dev/null @@ -1,1436 +0,0 @@ -/*************************************************************************** - GridTraverser_impl.h - description - ------------------- - begin : Jan 2, 2016 - copyright : (C) 2016 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#include - -#pragma once - -//#define GRID_TRAVERSER_USE_STREAMS - -#include "GridTraverser.h" - -#include - -namespace TNL { -namespace Meshes { - -/**** - * 1D traverser, host - */ -template< typename Real, - typename Index > - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities > -void -GridTraverser< Meshes::Grid< 1, Real, Devices::Host, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType begin, - const CoordinatesType end, - UserData& userData, - const int& stream ) -{ - GridEntity entity( *gridPointer ); - if( processOnlyBoundaryEntities ) - { - GridEntity entity( *gridPointer ); - - entity.getCoordinates() = begin; - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - entity.getCoordinates() = end; - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - else - { -#ifdef HAVE_OPENMP - if( 
Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 ) - { -#pragma omp parallel firstprivate( begin, end ) - { - GridEntity entity( *gridPointer ); -#pragma omp for - // TODO: g++ 5.5 crashes when coding this loop without auxiliary x as bellow - for( IndexType x = begin.x(); x <= end.x(); x++ ) - { - entity.getCoordinates().x() = x; - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - } - } - else - { - GridEntity entity( *gridPointer ); - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - } -#else - GridEntity entity( *gridPointer ); - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } -#endif - } -} - -/**** - * 1D traverser, CUDA - */ -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor > -__global__ void -GridTraverser1D( - const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid, - UserData userData, - const typename GridEntity::CoordinatesType begin, - const typename GridEntity::CoordinatesType end, - const Index gridIdx ) -{ - typedef Real RealType; - typedef Index IndexType; - typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - if( coordinates <= end ) - { - GridEntity entity( *grid, coordinates ); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } -} - -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename 
EntitiesProcessor > -__global__ void -GridBoundaryTraverser1D( - const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid, - UserData userData, - const typename GridEntity::CoordinatesType begin, - const typename GridEntity::CoordinatesType end ) -{ - typedef Real RealType; - typedef Index IndexType; - typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - if( threadIdx.x == 0 ) - { - coordinates.x() = begin.x(); - GridEntity entity( *grid, coordinates ); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - if( threadIdx.x == 1 ) - { - coordinates.x() = end.x(); - GridEntity entity( *grid, coordinates ); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } -} - -#endif - -template< typename Real, - typename Index > - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities > -void -GridTraverser< Meshes::Grid< 1, Real, Devices::Cuda, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType& begin, - const CoordinatesType& end, - UserData& userData, - const int& stream ) -{ -#ifdef HAVE_CUDA - auto& pool = CudaStreamPool::getInstance(); - const cudaStream_t& s = pool.getStream( stream ); - - Devices::Cuda::synchronizeDevice(); - if( processOnlyBoundaryEntities ) - { - dim3 cudaBlockSize( 2 ); - dim3 cudaBlocks( 1 ); - GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > - <<< cudaBlocks, cudaBlockSize, 0, s >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin, - end ); - } - else - { - dim3 cudaBlockSize( 256 ); - dim3 cudaBlocks; - cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); - const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); - - for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) 
- GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > - <<< cudaBlocks, cudaBlockSize, 0, s >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin, - end, - gridXIdx ); - } - - // only launches into the stream 0 are synchronized - if( stream == 0 ) - { - cudaStreamSynchronize( s ); - TNL_CHECK_CUDA_DEVICE; - } -#else - throw Exceptions::CudaSupportMissing(); -#endif -} - -/**** - * 1D traverser, MIC - */ - -template< typename Real, - typename Index > - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities > -void -GridTraverser< Meshes::Grid< 1, Real, Devices::MIC, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType& begin, - const CoordinatesType& end, - UserData& userData, - const int& stream ) -{ - std::cout << "Not Implemented yet Grid Traverser <1, Real, Device::MIC>" << std::endl; -/* - auto& pool = CudaStreamPool::getInstance(); - const cudaStream_t& s = pool.getStream( stream ); - - Devices::Cuda::synchronizeDevice(); - if( processOnlyBoundaryEntities ) - { - dim3 cudaBlockSize( 2 ); - dim3 cudaBlocks( 1 ); - GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > - <<< cudaBlocks, cudaBlockSize, 0, s >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin, - end ); - } - else - { - dim3 cudaBlockSize( 256 ); - dim3 cudaBlocks; - cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); - const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); - - for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) - GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > - <<< cudaBlocks, cudaBlockSize, 0, s >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin, - end, - gridXIdx ); - } - - // only launches into the stream 0 are synchronized - if( stream == 0 ) - { - 
cudaStreamSynchronize( s ); - TNL_CHECK_CUDA_DEVICE; - } -*/ -} - -/**** - * 2D traverser, host - */ -template< typename Real, - typename Index > - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities, - int XOrthogonalBoundary, - int YOrthogonalBoundary, - typename... GridEntityParameters > -void -GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType begin, - const CoordinatesType end, - UserData& userData, - const int& stream, - const GridEntityParameters&... gridEntityParameters ) -{ - if( processOnlyBoundaryEntities ) - { - GridEntity entity( *gridPointer, begin, gridEntityParameters... ); - - if( YOrthogonalBoundary ) - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.getCoordinates().y() = begin.y(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - entity.getCoordinates().y() = end.y(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - if( XOrthogonalBoundary ) - for( entity.getCoordinates().y() = begin.y(); - entity.getCoordinates().y() <= end.y(); - entity.getCoordinates().y() ++ ) - { - entity.getCoordinates().x() = begin.x(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - entity.getCoordinates().x() = end.x(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - } - else - { -#ifdef HAVE_OPENMP - if( Devices::Host::isOMPEnabled() ) - { -#pragma omp parallel firstprivate( begin, end ) - { - GridEntity entity( *gridPointer ); -#pragma omp for - // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow - for( IndexType y = begin.y(); y <= end.y(); y ++ ) - for( IndexType x = begin.x(); x <= end.x(); x ++ ) - 
{ - entity.getCoordinates().x() = x; - entity.getCoordinates().y() = y; - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - } - } - else - { - GridEntity entity( *gridPointer ); - for( entity.getCoordinates().y() = begin.y(); - entity.getCoordinates().y() <= end.y(); - entity.getCoordinates().y() ++ ) - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - } -#else - GridEntity entity( *gridPointer ); - for( entity.getCoordinates().y() = begin.y(); - entity.getCoordinates().y() <= end.y(); - entity.getCoordinates().y() ++ ) - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } -#endif - } -} - -/**** - * 2D traverser, CUDA - */ -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor, - bool processOnlyBoundaryEntities, - typename... GridEntityParameters > -__global__ void -GridTraverser2D( - const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, - UserData userData, - const typename GridEntity::CoordinatesType begin, - const typename GridEntity::CoordinatesType end, - const dim3 gridIdx, - const GridEntityParameters... gridEntityParameters ) -{ - typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); - - if( coordinates <= end ) - { - GridEntity entity( *grid, coordinates, gridEntityParameters... ); - entity.refresh(); - if( ! 
processOnlyBoundaryEntities || entity.isBoundaryEntity() ) - { - EntitiesProcessor::processEntity - ( *grid, - userData, - entity ); - } - } -} - -// Boundary traverser using streams -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor, - bool processOnlyBoundaryEntities, - typename... GridEntityParameters > -__global__ void -GridTraverser2DBoundaryAlongX( - const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, - UserData userData, - const Index beginX, - const Index endX, - const Index fixedY, - const dim3 gridIdx, - const GridEntityParameters... gridEntityParameters ) -{ - typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.y() = fixedY; - - if( coordinates.x() <= endX ) - { - GridEntity entity( *grid, coordinates, gridEntityParameters... ); - entity.refresh(); - EntitiesProcessor::processEntity - ( *grid, - userData, - entity ); - } -} - -// Boundary traverser using streams -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor, - bool processOnlyBoundaryEntities, - typename... GridEntityParameters > -__global__ void -GridTraverser2DBoundaryAlongY( - const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, - UserData userData, - const Index beginY, - const Index endY, - const Index fixedX, - const dim3 gridIdx, - const GridEntityParameters... gridEntityParameters ) -{ - typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - coordinates.x() = fixedX; - coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - - if( coordinates.y() <= endY ) - { - GridEntity entity( *grid, coordinates, gridEntityParameters... 
); - entity.refresh(); - EntitiesProcessor::processEntity - ( *grid, - userData, - entity ); - } -} - - -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor, - bool processOnlyBoundaryEntities, - typename... GridEntityParameters > -__global__ void -GridTraverser2DBoundary( - const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, - UserData userData, - const Index beginX, - const Index endX, - const Index beginY, - const Index endY, - const Index blocksPerFace, - const dim3 gridIdx, - const GridEntityParameters... gridEntityParameters ) -{ - using GridType = Meshes::Grid< 2, Real, Devices::Cuda, Index >; - using CoordinatesType = typename GridType::CoordinatesType; - - const Index faceIdx = blockIdx.x / blocksPerFace; - const Index faceBlockIdx = blockIdx.x % blocksPerFace; - const Index threadId = faceBlockIdx * blockDim. x + threadIdx.x; - if( faceIdx < 2 ) - { - const Index entitiesAlongX = endX - beginX + 1; - if( threadId < entitiesAlongX ) - { - GridEntity entity( *grid, - CoordinatesType( beginX + threadId, faceIdx == 0 ? beginY : endY ), - gridEntityParameters... ); - //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); - entity.refresh(); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - } - else - { - const Index entitiesAlongY = endY - beginY - 1; - if( threadId < entitiesAlongY ) - { - GridEntity entity( *grid, - CoordinatesType( faceIdx == 2 ? beginX : endX, beginY + threadId + 1 ), - gridEntityParameters... 
); - //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); - entity.refresh(); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - } - - - - /*const Index aux = max( entitiesAlongX, entitiesAlongY ); - const Index& warpSize = Devices::Cuda::getWarpSize(); - const Index threadsPerAxis = warpSize * ( aux / warpSize + ( aux % warpSize != 0 ) ); - - Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - GridEntity entity( *grid, - CoordinatesType( 0, 0 ), - gridEntityParameters... ); - CoordinatesType& coordinates = entity.getCoordinates(); - const Index axisIndex = threadId / threadsPerAxis; - //printf( "axisIndex %d, threadId %d thradsPerAxis %d \n", axisIndex, threadId, threadsPerAxis ); - threadId -= axisIndex * threadsPerAxis; - switch( axisIndex ) - { - case 1: - coordinates = CoordinatesType( beginX + threadId, beginY ); - if( threadId < entitiesAlongX ) - { - //printf( "X1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() ); - entity.refresh(); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - break; - case 2: - coordinates = CoordinatesType( beginX + threadId, endY ); - if( threadId < entitiesAlongX ) - { - //printf( "X2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() ); - entity.refresh(); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - break; - case 3: - coordinates = CoordinatesType( beginX, beginY + threadId + 1 ); - if( threadId < entitiesAlongY ) - { - //printf( "Y1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() ); - entity.refresh(); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - break; - case 4: - coordinates = CoordinatesType( endX, beginY + threadId + 1 ); - if( threadId < entitiesAlongY ) - { - //printf( "Y2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() ); - entity.refresh(); - 
EntitiesProcessor::processEntity( *grid, userData, entity ); - } - break; - }*/ - - /*if( threadId < entitiesAlongX ) - { - GridEntity entity( *grid, - CoordinatesType( beginX + threadId, beginY ), - gridEntityParameters... ); - //printf( "X1: Thread %d -> %d %d x %d %d \n ", threadId, - // entity.getCoordinates().x(), entity.getCoordinates().y(), - // grid->getDimensions().x(), grid->getDimensions().y() ); - entity.refresh(); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - else if( ( threadId -= entitiesAlongX ) < entitiesAlongX && threadId >= 0 ) - { - GridEntity entity( *grid, - CoordinatesType( beginX + threadId, endY ), - gridEntityParameters... ); - entity.refresh(); - //printf( "X2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - else if( ( ( threadId -= entitiesAlongX ) < entitiesAlongY - 1 ) && threadId >= 0 ) - { - GridEntity entity( *grid, - CoordinatesType( beginX, beginY + threadId + 1 ), - gridEntityParameters... ); - entity.refresh(); - //printf( "Y1: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - else if( ( ( threadId -= entitiesAlongY - 1 ) < entitiesAlongY - 1 ) && threadId >= 0 ) - { - GridEntity entity( *grid, - CoordinatesType( endX, beginY + threadId + 1 ), - gridEntityParameters... ); - entity.refresh(); - //printf( "Y2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); - EntitiesProcessor::processEntity( *grid, userData, entity ); - }*/ -} - - -#endif // HAVE_CUDA - -template< typename Real, - typename Index > - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities, - int XOrthogonalBoundary, - int YOrthogonalBoundary, - typename... 
GridEntityParameters > -void -GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType& begin, - const CoordinatesType& end, - UserData& userData, - const int& stream, - const GridEntityParameters&... gridEntityParameters ) -{ -#ifdef HAVE_CUDA - if( processOnlyBoundaryEntities && - ( GridEntity::getEntityDimension() == 2 || GridEntity::getEntityDimension() == 0 ) ) - { -#ifdef GRID_TRAVERSER_USE_STREAMS - dim3 cudaBlockSize( 256 ); - dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX, - cudaBlocksCountAlongY, cudaGridsCountAlongY; - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 ); - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 ); - - auto& pool = CudaStreamPool::getInstance(); - Devices::Cuda::synchronizeDevice(); - - const cudaStream_t& s1 = pool.getStream( stream ); - const cudaStream_t& s2 = pool.getStream( stream + 1 ); - dim3 gridIdx, cudaGridSize; - for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongX.x; gridIdx.x++ ) - { - Devices::Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize ); - //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX ); - GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaGridSize, cudaBlockSize, 0, s1 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.x(), - end.x(), - begin.y(), - gridIdx, - gridEntityParameters... ); - GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... 
> - <<< cudaGridSize, cudaBlockSize, 0, s2 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.x(), - end.x(), - end.y(), - gridIdx, - gridEntityParameters... ); - } - const cudaStream_t& s3 = pool.getStream( stream + 2 ); - const cudaStream_t& s4 = pool.getStream( stream + 3 ); - for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongY.x; gridIdx.x++ ) - { - Devices::Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize ); - GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaGridSize, cudaBlockSize, 0, s3 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.y() + 1, - end.y() - 1, - begin.x(), - gridIdx, - gridEntityParameters... ); - GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaGridSize, cudaBlockSize, 0, s4 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.y() + 1, - end.y() - 1, - end.x(), - gridIdx, - gridEntityParameters... 
); - } - cudaStreamSynchronize( s1 ); - cudaStreamSynchronize( s2 ); - cudaStreamSynchronize( s3 ); - cudaStreamSynchronize( s4 ); -#else // not defined GRID_TRAVERSER_USE_STREAMS - dim3 cudaBlockSize( 256 ); - dim3 cudaBlocksCount, cudaGridsCount; - const IndexType entitiesAlongX = end.x() - begin.x() + 1; - const IndexType entitiesAlongY = end.x() - begin.x() - 1; - const IndexType maxFaceSize = max( entitiesAlongX, entitiesAlongY ); - const IndexType blocksPerFace = maxFaceSize / cudaBlockSize.x + ( maxFaceSize % cudaBlockSize.x != 0 ); - IndexType cudaThreadsCount = 4 * cudaBlockSize.x * blocksPerFace; - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount ); - //std::cerr << "blocksPerFace = " << blocksPerFace << "Threads count = " << cudaThreadsCount - // << "cudaBlockCount = " << cudaBlocksCount.x << std::endl; - dim3 gridIdx, cudaGridSize; - Devices::Cuda::synchronizeDevice(); - for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ ) - { - Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize ); - //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX ); - GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaGridSize, cudaBlockSize >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.x(), - end.x(), - begin.y(), - end.y(), - blocksPerFace, - gridIdx, - gridEntityParameters... 
); - } -#endif //GRID_TRAVERSER_USE_STREAMS - //getchar(); - TNL_CHECK_CUDA_DEVICE; - } - else - { - dim3 cudaBlockSize( 16, 16 ); - dim3 cudaBlocksCount, cudaGridsCount; - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, - end.x() - begin.x() + 1, - end.y() - begin.y() + 1 ); - - auto& pool = CudaStreamPool::getInstance(); - const cudaStream_t& s = pool.getStream( stream ); - - Devices::Cuda::synchronizeDevice(); - dim3 gridIdx, cudaGridSize; - for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ ) - for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ ) - { - Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize ); - //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount ); - GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaGridSize, cudaBlockSize, 0, s >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin, - end, - gridIdx, - gridEntityParameters... ); - } - - // only launches into the stream 0 are synchronized - if( stream == 0 ) - { - cudaStreamSynchronize( s ); - TNL_CHECK_CUDA_DEVICE; - } - } -#else - throw Exceptions::CudaSupportMissing(); -#endif -} - - -/**** - * 2D traverser, MIC - */ -template< typename Real, - typename Index > - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities, - int XOrthogonalBoundary, - int YOrthogonalBoundary, - typename... GridEntityParameters > -void -GridTraverser< Meshes::Grid< 2, Real, Devices::MIC, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType& begin, - const CoordinatesType& end, - UserData& userData, - const int& stream, - const GridEntityParameters&... 
gridEntityParameters ) -{ - - -#ifdef HAVE_MIC - Devices::MIC::synchronizeDevice(); - - //TOHLE JE PRUSER -- nemim poslat vypustku -- - //GridEntity entity( gridPointer.template getData< Devices::MIC >(), begin, gridEntityParameters... ); - - - Devices::MICHider hMicGrid; - hMicGrid.pointer=& gridPointer.template getData< Devices::MIC >(); - Devices::MICHider hMicUserData; - hMicUserData.pointer=& userDataPointer.template modifyData(); - TNLMICSTRUCT(begin, const CoordinatesType); - TNLMICSTRUCT(end, const CoordinatesType); - - #pragma offload target(mic) in(sbegin,send,hMicUserData,hMicGrid) - { - - #pragma omp parallel firstprivate( sbegin, send ) - { - TNLMICSTRUCTUSE(begin, const CoordinatesType); - TNLMICSTRUCTUSE(end, const CoordinatesType); - GridEntity entity( *(hMicGrid.pointer), *(kernelbegin) ); - - if( processOnlyBoundaryEntities ) - { - if( YOrthogonalBoundary ) - #pragma omp for - for( auto k = kernelbegin->x(); - k <= kernelend->x(); - k ++ ) - { - entity.getCoordinates().x() = k; - entity.getCoordinates().y() = kernelbegin->y(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity ); - entity.getCoordinates().y() = kernelend->y(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity ); - } - if( XOrthogonalBoundary ) - #pragma omp for - for( auto k = kernelbegin->y(); - k <= kernelend->y(); - k ++ ) - { - entity.getCoordinates().y() = k; - entity.getCoordinates().x() = kernelbegin->x(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity ); - entity.getCoordinates().x() = kernelend->x(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity ); - } - } - else - { - #pragma omp for - for( IndexType y = kernelbegin->y(); y <= kernelend->y(); y ++ ) - for( IndexType x = kernelbegin->x(); x <= kernelend->x(); x ++ ) - { - // std::cerr << x << " " 
< - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities, - int XOrthogonalBoundary, - int YOrthogonalBoundary, - int ZOrthogonalBoundary, - typename... GridEntityParameters > -void -GridTraverser< Meshes::Grid< 3, Real, Devices::Host, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType begin, - const CoordinatesType end, - UserData& userData, - const int& stream, - const GridEntityParameters&... gridEntityParameters ) -{ - if( processOnlyBoundaryEntities ) - { - GridEntity entity( *gridPointer, begin, gridEntityParameters... ); - - if( ZOrthogonalBoundary ) - for( entity.getCoordinates().y() = begin.y(); - entity.getCoordinates().y() <= end.y(); - entity.getCoordinates().y() ++ ) - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.getCoordinates().z() = begin.z(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - entity.getCoordinates().z() = end.z(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - if( YOrthogonalBoundary ) - for( entity.getCoordinates().z() = begin.z(); - entity.getCoordinates().z() <= end.z(); - entity.getCoordinates().z() ++ ) - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.getCoordinates().y() = begin.y(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - entity.getCoordinates().y() = end.y(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - if( XOrthogonalBoundary ) - for( entity.getCoordinates().z() = begin.z(); - entity.getCoordinates().z() <= end.z(); - entity.getCoordinates().z() ++ ) - for( entity.getCoordinates().y() = begin.y(); - entity.getCoordinates().y() <= end.y(); - 
entity.getCoordinates().y() ++ ) - { - entity.getCoordinates().x() = begin.x(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - entity.getCoordinates().x() = end.x(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - } - else - { -#ifdef HAVE_OPENMP - if( Devices::Host::isOMPEnabled() ) - { -#pragma omp parallel firstprivate( begin, end ) - { - GridEntity entity( *gridPointer ); -#pragma omp for - // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow - for( IndexType z = begin.z(); z <= end.z(); z ++ ) - for( IndexType y = begin.y(); y <= end.y(); y ++ ) - for( IndexType x = begin.x(); x <= end.x(); x ++ ) - { - entity.getCoordinates().x() = x; - entity.getCoordinates().y() = y; - entity.getCoordinates().z() = z; - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - } - } - else - { - GridEntity entity( *gridPointer ); - for( entity.getCoordinates().z() = begin.z(); - entity.getCoordinates().z() <= end.z(); - entity.getCoordinates().z() ++ ) - for( entity.getCoordinates().y() = begin.y(); - entity.getCoordinates().y() <= end.y(); - entity.getCoordinates().y() ++ ) - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - } -#else - GridEntity entity( *gridPointer ); - for( entity.getCoordinates().z() = begin.z(); - entity.getCoordinates().z() <= end.z(); - entity.getCoordinates().z() ++ ) - for( entity.getCoordinates().y() = begin.y(); - entity.getCoordinates().y() <= end.y(); - entity.getCoordinates().y() ++ ) - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity 
); - } -#endif - } -} - -/**** - * 3D traverser, CUDA - */ -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor, - bool processOnlyBoundaryEntities, - typename... GridEntityParameters > -__global__ void -GridTraverser3D( - const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, - UserData userData, - const typename GridEntity::CoordinatesType begin, - const typename GridEntity::CoordinatesType end, - const dim3 gridIdx, - const GridEntityParameters... gridEntityParameters ) -{ - typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); - coordinates.z() = begin.z() + Devices::Cuda::getGlobalThreadIdx_z( gridIdx ); - - if( coordinates <= end ) - { - GridEntity entity( *grid, coordinates, gridEntityParameters... ); - entity.refresh(); - if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() ) - { - EntitiesProcessor::processEntity - ( *grid, - userData, - entity ); - } - } -} - -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor, - bool processOnlyBoundaryEntities, - typename... GridEntityParameters > -__global__ void -GridTraverser3DBoundaryAlongXY( - const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, - UserData userData, - const Index beginX, - const Index endX, - const Index beginY, - const Index endY, - const Index fixedZ, - const dim3 gridIdx, - const GridEntityParameters... 
gridEntityParameters ) -{ - typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); - coordinates.z() = fixedZ; - - if( coordinates.x() <= endX && coordinates.y() <= endY ) - { - GridEntity entity( *grid, coordinates, gridEntityParameters... ); - entity.refresh(); - EntitiesProcessor::processEntity - ( *grid, - userData, - entity ); - } -} - -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor, - bool processOnlyBoundaryEntities, - typename... GridEntityParameters > -__global__ void -GridTraverser3DBoundaryAlongXZ( - const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, - UserData userData, - const Index beginX, - const Index endX, - const Index beginZ, - const Index endZ, - const Index fixedY, - const dim3 gridIdx, - const GridEntityParameters... gridEntityParameters ) -{ - typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.y() = fixedY; - coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); - - if( coordinates.x() <= endX && coordinates.z() <= endZ ) - { - GridEntity entity( *grid, coordinates, gridEntityParameters... ); - entity.refresh(); - EntitiesProcessor::processEntity - ( *grid, - userData, - entity ); - } -} - -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor, - bool processOnlyBoundaryEntities, - typename... 
GridEntityParameters > -__global__ void -GridTraverser3DBoundaryAlongYZ( - const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, - UserData userData, - const Index beginY, - const Index endY, - const Index beginZ, - const Index endZ, - const Index fixedX, - const dim3 gridIdx, - const GridEntityParameters... gridEntityParameters ) -{ - typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - coordinates.x() = fixedX; - coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); - - if( coordinates.y() <= endY && coordinates.z() <= endZ ) - { - GridEntity entity( *grid, coordinates, gridEntityParameters... ); - entity.refresh(); - EntitiesProcessor::processEntity - ( *grid, - userData, - entity ); - } -} -#endif - -template< typename Real, - typename Index > - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities, - int XOrthogonalBoundary, - int YOrthogonalBoundary, - int ZOrthogonalBoundary, - typename... GridEntityParameters > -void -GridTraverser< Meshes::Grid< 3, Real, Devices::Cuda, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType& begin, - const CoordinatesType& end, - UserData& userData, - const int& stream, - const GridEntityParameters&... 
gridEntityParameters ) -{ -#ifdef HAVE_CUDA - if( processOnlyBoundaryEntities && - ( GridEntity::getEntityDimension() == 3 || GridEntity::getEntityDimension() == 0 ) ) - { - dim3 cudaBlockSize( 16, 16 ); - const IndexType entitiesAlongX = end.x() - begin.x() + 1; - const IndexType entitiesAlongY = end.y() - begin.y() + 1; - const IndexType entitiesAlongZ = end.z() - begin.z() + 1; - - dim3 cudaBlocksCountAlongXY, cudaBlocksCountAlongXZ, cudaBlocksCountAlongYZ, - cudaGridsCountAlongXY, cudaGridsCountAlongXZ, cudaGridsCountAlongYZ; - - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXY, cudaGridsCountAlongXY, entitiesAlongX, entitiesAlongY ); - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, entitiesAlongX, entitiesAlongZ - 2 ); - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, entitiesAlongY - 2, entitiesAlongZ - 2 ); - - auto& pool = CudaStreamPool::getInstance(); - Devices::Cuda::synchronizeDevice(); - - const cudaStream_t& s1 = pool.getStream( stream ); - const cudaStream_t& s2 = pool.getStream( stream + 1 ); - const cudaStream_t& s3 = pool.getStream( stream + 2 ); - const cudaStream_t& s4 = pool.getStream( stream + 3 ); - const cudaStream_t& s5 = pool.getStream( stream + 4 ); - const cudaStream_t& s6 = pool.getStream( stream + 5 ); - - dim3 gridIdx, gridSize; - for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXY.y; gridIdx.y++ ) - for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXY.x; gridIdx.x++ ) - { - Devices::Cuda::setupGrid( cudaBlocksCountAlongXY, cudaGridsCountAlongXY, gridIdx, gridSize ); - GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaBlocksCountAlongXY, cudaBlockSize, 0 , s1 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.x(), - end.x(), - begin.y(), - end.y(), - begin.z(), - gridIdx, - gridEntityParameters... 
); - GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaBlocksCountAlongXY, cudaBlockSize, 0, s2 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.x(), - end.x(), - begin.y(), - end.y(), - end.z(), - gridIdx, - gridEntityParameters... ); - } - for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXZ.y; gridIdx.y++ ) - for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXZ.x; gridIdx.x++ ) - { - Devices::Cuda::setupGrid( cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, gridIdx, gridSize ); - GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s3 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.x(), - end.x(), - begin.z() + 1, - end.z() - 1, - begin.y(), - gridIdx, - gridEntityParameters... ); - GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s4 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.x(), - end.x(), - begin.z() + 1, - end.z() - 1, - end.y(), - gridIdx, - gridEntityParameters... ); - } - for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongYZ.y; gridIdx.y++ ) - for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongYZ.x; gridIdx.x++ ) - { - Devices::Cuda::setupGrid( cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, gridIdx, gridSize ); - GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s5 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.y() + 1, - end.y() - 1, - begin.z() + 1, - end.z() - 1, - begin.x(), - gridIdx, - gridEntityParameters... 
); - GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s6 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.y() + 1, - end.y() - 1, - begin.z() + 1, - end.z() - 1, - end.x(), - gridIdx, - gridEntityParameters... ); - } - cudaStreamSynchronize( s1 ); - cudaStreamSynchronize( s2 ); - cudaStreamSynchronize( s3 ); - cudaStreamSynchronize( s4 ); - cudaStreamSynchronize( s5 ); - cudaStreamSynchronize( s6 ); - TNL_CHECK_CUDA_DEVICE; - } - else - { - dim3 cudaBlockSize( 8, 8, 8 ); - dim3 cudaBlocksCount, cudaGridsCount; - - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, - end.x() - begin.x() + 1, - end.y() - begin.y() + 1, - end.z() - begin.z() + 1 ); - - auto& pool = CudaStreamPool::getInstance(); - const cudaStream_t& s = pool.getStream( stream ); - - Devices::Cuda::synchronizeDevice(); - dim3 gridIdx, gridSize; - for( gridIdx.z = 0; gridIdx.z < cudaGridsCount.z; gridIdx.z ++ ) - for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ ) - for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ ) - { - Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, gridSize ); - GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< gridSize, cudaBlockSize, 0, s >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin, - end, - gridIdx, - gridEntityParameters... 
); - } - - // only launches into the stream 0 are synchronized - if( stream == 0 ) - { - cudaStreamSynchronize( s ); - TNL_CHECK_CUDA_DEVICE; - } - } -#else - throw Exceptions::CudaSupportMissing(); -#endif -} - -/**** - * 3D traverser, MIC - */ -template< typename Real, - typename Index > - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities, - int XOrthogonalBoundary, - int YOrthogonalBoundary, - int ZOrthogonalBoundary, - typename... GridEntityParameters > -void -GridTraverser< Meshes::Grid< 3, Real, Devices::MIC, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType& begin, - const CoordinatesType& end, - UserData& userData, - const int& stream, - const GridEntityParameters&... gridEntityParameters ) -{ - std::cout << "Not Implemented yet Grid Traverser <3, Real, Device::MIC>" << std::endl; - -/* HAVE_CUDA - dim3 cudaBlockSize( 8, 8, 8 ); - dim3 cudaBlocks; - cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); - cudaBlocks.y = Devices::Cuda::getNumberOfBlocks( end.y() - begin.y() + 1, cudaBlockSize.y ); - cudaBlocks.z = Devices::Cuda::getNumberOfBlocks( end.z() - begin.z() + 1, cudaBlockSize.z ); - const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); - const IndexType cudaYGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.y ); - const IndexType cudaZGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.z ); - - auto& pool = CudaStreamPool::getInstance(); - const cudaStream_t& s = pool.getStream( stream ); - - Devices::Cuda::synchronizeDevice(); - for( IndexType gridZIdx = 0; gridZIdx < cudaZGrids; gridZIdx ++ ) - for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ ) - for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) - GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... 
> - <<< cudaBlocks, cudaBlockSize, 0, s >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin, - end, - gridXIdx, - gridYIdx, - gridZIdx, - gridEntityParameters... ); - - // only launches into the stream 0 are synchronized - if( stream == 0 ) - { - cudaStreamSynchronize( s ); - TNL_CHECK_CUDA_DEVICE; - } - */ -} - -} // namespace Meshes -} // namespace TNL -- GitLab From d9e5bf693e39b88b51a12bd3c8545310790ea1e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 4 Jan 2019 18:34:03 +0100 Subject: [PATCH 045/130] GridTraverser_impl.h splitted into GridTraverser_1D.hpp, GridTraverser_2D.hpp and GridTraverser_3D.hpp. --- src/Benchmarks/FunctionTimer.h | 9 +++++---- src/TNL/Meshes/GridDetails/CMakeLists.txt | 4 +++- src/TNL/Meshes/GridDetails/GridTraverser.h | 4 +++- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h index 35dbb719f..601cfc16c 100644 --- a/src/Benchmarks/FunctionTimer.h +++ b/src/Benchmarks/FunctionTimer.h @@ -57,13 +57,14 @@ class FunctionTimer // the monitor, the timer is not interrupted after each loop. if( ! 
performReset && verbose < 2 ) { - if( timing ) - timer.start(); // Explicit synchronization of the CUDA device -#ifdef HAVE_CUDA +#ifdef HAVE_CUDA if( std::is_same< Device, Devices::Cuda >::value ) cudaDeviceSynchronize(); -#endif +#endif + if( timing ) + timer.start(); + for( loops = 0; loops < maxLoops || ( timing && timer.getRealTime() < minTime ); ++loops) diff --git a/src/TNL/Meshes/GridDetails/CMakeLists.txt b/src/TNL/Meshes/GridDetails/CMakeLists.txt index 0da067f14..3386ec242 100644 --- a/src/TNL/Meshes/GridDetails/CMakeLists.txt +++ b/src/TNL/Meshes/GridDetails/CMakeLists.txt @@ -14,7 +14,9 @@ SET( headers BoundaryGridEntityChecker.h GridEntityMeasureGetter.h GridEntityTopology.h GridTraverser.h - GridTraverser_impl.h + GridTraverser_1D.hpp + GridTraverser_2D.hpp + GridTraverser_3D.hpp NeighborGridEntitiesStorage.h NeighborGridEntityGetter1D_impl.h NeighborGridEntityGetter2D_impl.h diff --git a/src/TNL/Meshes/GridDetails/GridTraverser.h b/src/TNL/Meshes/GridDetails/GridTraverser.h index 3a74c085b..881367d3f 100644 --- a/src/TNL/Meshes/GridDetails/GridTraverser.h +++ b/src/TNL/Meshes/GridDetails/GridTraverser.h @@ -351,5 +351,7 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::MIC, Index > > } // namespace Meshes } // namespace TNL -#include +#include +#include +#include -- GitLab From 7f504457664feb8164060337ce3e2773a9ea974c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 4 Jan 2019 18:35:04 +0100 Subject: [PATCH 046/130] Fixes in traversers benchmark. 
--- src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h | 6 +++--- src/Benchmarks/Traversers/tnl-benchmark-traversers.h | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h index 93ee77385..1683cc868 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -154,10 +154,10 @@ class GridTraversersBenchmark< 1, Device, Real, Index > { if( std::is_same< Device, Devices::Host >::value ) { - v_data[ 0 ] = 2; + v_data[ 0 ] = +2; for( int i = 1; i < size - 1; i++ ) - v_data[ i ] = 1.0; - v_data[ size - 1 ] = 2; + v_data[ i ] = +1.0; + v_data[ size - 1 ] = +2; } else // Device == Devices::Cuda { diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 9f70589c9..6adc0d8e3 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -189,6 +189,8 @@ bool runBenchmark( const Config::ParameterContainer& parameters, } std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl; } + return true; + /**** * Full grid traversing including boundary conditions -- GitLab From 2063d7a64aab7271bc555109fbac4fbd67e0dd2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 4 Jan 2019 22:44:30 +0100 Subject: [PATCH 047/130] Fixed ordering of indices in ParallelFor to be consistent for Host and Cuda --- .../DistributedMeshes/BufferEntitiesHelper.h | 40 +++++++++---------- .../DistributedMeshes/CopyEntitiesHelper.h | 29 ++++++-------- src/TNL/ParallelFor.h | 36 +++++++++-------- 3 files changed, 51 insertions(+), 54 deletions(-) diff --git a/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h b/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h index 
0b3c7b363..9b7ed0c4e 100644 --- a/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h +++ b/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h @@ -15,8 +15,8 @@ #include namespace TNL { -namespace Meshes { -namespace DistributedMeshes { +namespace Meshes { +namespace DistributedMeshes { template < typename MeshFunctionType, @@ -38,7 +38,7 @@ template < typename MeshFunctionType, class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 1, RealType, Device, Index > { public: - static void BufferEntities( + static void BufferEntities( MeshFunctionType& meshFunction, const MaskPointer& maskPointer, RealType* buffer, @@ -66,15 +66,15 @@ class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 1, RealType, Device, } }; ParallelFor< Device >::exec( 0, sizex, kernel ); - }; + }; }; template< typename MeshFunctionType, - typename MaskPointer, + typename MaskPointer, typename RealType, typename Device, - typename Index > + typename Index > class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 2, RealType, Device, Index > { public: @@ -90,7 +90,7 @@ class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 2, RealType, Device, bool tobuffer) { auto mesh=meshFunction.getMesh(); - RealType* meshFunctionData = meshFunction.getData().getData(); + RealType* meshFunctionData = meshFunction.getData().getData(); const typename MaskPointer::ObjectType* mask( nullptr ); if( maskPointer ) mask = &maskPointer.template getData< Device >(); @@ -98,18 +98,18 @@ class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 2, RealType, Device, auto kernel = [ tobuffer, mask, mesh, buffer, isBoundary, meshFunctionData, beginx, sizex, beginy] __cuda_callable__ ( Index i, Index j ) { typename MeshFunctionType::MeshType::Cell entity(mesh); - entity.getCoordinates().x() = beginx + j; - entity.getCoordinates().y() = beginy + i; + entity.getCoordinates().x() = beginx + i; + entity.getCoordinates().y() = beginy + j; entity.refresh(); if( ! isBoundary || ! 
mask || ( *mask )[ entity.getIndex() ] ) { if( tobuffer ) - buffer[ i * sizex + j ] = meshFunctionData[ entity.getIndex() ]; + buffer[ j * sizex + i ] = meshFunctionData[ entity.getIndex() ]; else - meshFunctionData[ entity.getIndex() ] = buffer[ i * sizex + j ]; + meshFunctionData[ entity.getIndex() ] = buffer[ j * sizex + i ]; } }; - ParallelFor2D< Device >::exec( 0, 0, sizey, sizex, kernel ); + ParallelFor2D< Device >::exec( 0, 0, sizex, sizey, kernel ); }; }; @@ -135,29 +135,27 @@ class BufferEntitiesHelper< MeshFunctionType, MaskPointer, 3, RealType, Device, const Index& sizez, bool tobuffer) { - auto mesh=meshFunction.getMesh(); RealType * meshFunctionData=meshFunction.getData().getData(); const typename MaskPointer::ObjectType* mask( nullptr ); if( maskPointer ) - mask = &maskPointer.template getData< Device >(); - auto kernel = [ tobuffer, mesh, mask, buffer, isBoundary, meshFunctionData, beginx, sizex, beginy, sizey, beginz] __cuda_callable__ ( Index k, Index i, Index j ) + mask = &maskPointer.template getData< Device >(); + auto kernel = [ tobuffer, mesh, mask, buffer, isBoundary, meshFunctionData, beginx, sizex, beginy, sizey, beginz] __cuda_callable__ ( Index i, Index j, Index k ) { typename MeshFunctionType::MeshType::Cell entity(mesh); - entity.getCoordinates().x() = beginx + j; + entity.getCoordinates().x() = beginx + i; + entity.getCoordinates().y() = beginy + j; entity.getCoordinates().z() = beginz + k; - entity.getCoordinates().y() = beginy + i; entity.refresh(); if( ! isBoundary || ! 
mask || ( *mask )[ entity.getIndex() ] ) { if( tobuffer ) - buffer[ k * sizex * sizey + i * sizex + j ] = - meshFunctionData[ entity.getIndex() ]; + buffer[ k * sizex * sizey + j * sizex + i ] = meshFunctionData[ entity.getIndex() ]; else - meshFunctionData[ entity.getIndex() ] = buffer[ k * sizex * sizey + i * sizex + j ]; + meshFunctionData[ entity.getIndex() ] = buffer[ k * sizex * sizey + j * sizex + i ]; } }; - ParallelFor3D< Device >::exec( 0, 0, 0, sizez, sizey, sizex, kernel ); + ParallelFor3D< Device >::exec( 0, 0, 0, sizex, sizey, sizez, kernel ); }; }; diff --git a/src/TNL/Meshes/DistributedMeshes/CopyEntitiesHelper.h b/src/TNL/Meshes/DistributedMeshes/CopyEntitiesHelper.h index fe2f82cff..df36543f3 100644 --- a/src/TNL/Meshes/DistributedMeshes/CopyEntitiesHelper.h +++ b/src/TNL/Meshes/DistributedMeshes/CopyEntitiesHelper.h @@ -15,8 +15,8 @@ #include namespace TNL { -namespace Meshes { -namespace DistributedMeshes { +namespace Meshes { +namespace DistributedMeshes { template @@ -40,7 +40,7 @@ class CopyEntitiesHelper typedef typename MeshFunctionType::MeshType::GlobalIndexType Index; static void Copy(MeshFunctionType &from, MeshFunctionType &to, CoordinatesType &fromBegin, CoordinatesType &toBegin, CoordinatesType &size) - { + { auto toData=to.getData().getData(); auto fromData=from.getData().getData(); auto fromMesh=from.getMesh(); @@ -49,9 +49,9 @@ class CopyEntitiesHelper { Cell fromEntity(fromMesh); Cell toEntity(toMesh); - toEntity.getCoordinates().x()=toBegin.x()+i; + toEntity.getCoordinates().x()=toBegin.x()+i; toEntity.refresh(); - fromEntity.getCoordinates().x()=fromBegin.x()+i; + fromEntity.getCoordinates().x()=fromBegin.x()+i; fromEntity.refresh(); toData[toEntity.getIndex()]=fromData[fromEntity.getIndex()]; }; @@ -77,20 +77,19 @@ class CopyEntitiesHelper auto fromData=from.getData().getData(); auto fromMesh=from.getMesh(); auto toMesh=to.getMesh(); - auto kernel = [fromData,toData, fromMesh, toMesh, fromBegin, toBegin] __cuda_callable__ ( 
Index j, Index i ) + auto kernel = [fromData,toData, fromMesh, toMesh, fromBegin, toBegin] __cuda_callable__ ( Index i, Index j ) { Cell fromEntity(fromMesh); Cell toEntity(toMesh); toEntity.getCoordinates().x()=toBegin.x()+i; - toEntity.getCoordinates().y()=toBegin.y()+j; + toEntity.getCoordinates().y()=toBegin.y()+j; toEntity.refresh(); fromEntity.getCoordinates().x()=fromBegin.x()+i; - fromEntity.getCoordinates().y()=fromBegin.y()+j; + fromEntity.getCoordinates().y()=fromBegin.y()+j; fromEntity.refresh(); toData[toEntity.getIndex()]=fromData[fromEntity.getIndex()]; }; - ParallelFor2D< typename MeshFunctionType::MeshType::DeviceType >::exec( (Index)0,(Index)0,(Index)size.y(), (Index)size.x(), kernel ); - + ParallelFor2D< typename MeshFunctionType::MeshType::DeviceType >::exec( (Index)0,(Index)0,(Index)size.x(), (Index)size.y(), kernel ); } }; @@ -110,27 +109,25 @@ class CopyEntitiesHelper auto fromData=from.getData().getData(); auto fromMesh=from.getMesh(); auto toMesh=to.getMesh(); - auto kernel = [fromData,toData, fromMesh, toMesh, fromBegin, toBegin] __cuda_callable__ ( Index k, Index j, Index i ) + auto kernel = [fromData,toData, fromMesh, toMesh, fromBegin, toBegin] __cuda_callable__ ( Index i, Index j, Index k ) { Cell fromEntity(fromMesh); Cell toEntity(toMesh); toEntity.getCoordinates().x()=toBegin.x()+i; toEntity.getCoordinates().y()=toBegin.y()+j; - toEntity.getCoordinates().z()=toBegin.z()+k; + toEntity.getCoordinates().z()=toBegin.z()+k; toEntity.refresh(); fromEntity.getCoordinates().x()=fromBegin.x()+i; fromEntity.getCoordinates().y()=fromBegin.y()+j; - fromEntity.getCoordinates().z()=fromBegin.z()+k; + fromEntity.getCoordinates().z()=fromBegin.z()+k; fromEntity.refresh(); toData[toEntity.getIndex()]=fromData[fromEntity.getIndex()]; }; - ParallelFor3D< typename MeshFunctionType::MeshType::DeviceType >::exec( (Index)0,(Index)0,(Index)0,(Index)size.z() ,(Index)size.y(), (Index)size.x(), kernel ); + ParallelFor3D< typename 
MeshFunctionType::MeshType::DeviceType >::exec( (Index)0,(Index)0,(Index)0,(Index)size.x(),(Index)size.y(), (Index)size.z(), kernel ); } }; - - } // namespace DistributedMeshes } // namespace Meshes } // namespace TNL diff --git a/src/TNL/ParallelFor.h b/src/TNL/ParallelFor.h index 7eac7058c..0505aac23 100644 --- a/src/TNL/ParallelFor.h +++ b/src/TNL/ParallelFor.h @@ -69,18 +69,19 @@ struct ParallelFor2D if( TNL::Devices::Host::isOMPEnabled() ) { #pragma omp parallel for + for( Index j = startY; j < endY; j++ ) for( Index i = startX; i < endX; i++ ) - for( Index j = startY; j < endY; j++ ) - f( i, j, args... ); + f( i, j, args... ); } - else + else { + for( Index j = startY; j < endY; j++ ) for( Index i = startX; i < endX; i++ ) - for( Index j = startY; j < endY; j++ ) - f( i, j, args... ); + f( i, j, args... ); + } #else + for( Index j = startY; j < endY; j++ ) for( Index i = startX; i < endX; i++ ) - for( Index j = startY; j < endY; j++ ) - f( i, j, args... ); + f( i, j, args... ); #endif } }; @@ -99,21 +100,22 @@ struct ParallelFor3D if( TNL::Devices::Host::isOMPEnabled() ) { #pragma omp parallel for collapse(2) + for( Index k = startZ; k < endZ; k++ ) + for( Index j = startY; j < endY; j++ ) for( Index i = startX; i < endX; i++ ) - for( Index j = startY; j < endY; j++ ) - for( Index k = startZ; k < endZ; k++ ) - f( i, j, k, args... ); + f( i, j, k, args... ); } - else + else { + for( Index k = startZ; k < endZ; k++ ) + for( Index j = startY; j < endY; j++ ) for( Index i = startX; i < endX; i++ ) - for( Index j = startY; j < endY; j++ ) - for( Index k = startZ; k < endZ; k++ ) - f( i, j, k, args... ); + f( i, j, k, args... ); + } #else + for( Index k = startZ; k < endZ; k++ ) + for( Index j = startY; j < endY; j++ ) for( Index i = startX; i < endX; i++ ) - for( Index j = startY; j < endY; j++ ) - for( Index k = startZ; k < endZ; k++ ) - f( i, j, k, args... ); + f( i, j, k, args... 
); #endif } }; -- GitLab From 5d9dc62787b854596479a94b4f7756c5d6f87b4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 4 Jan 2019 23:03:17 +0100 Subject: [PATCH 048/130] Fixed order of indices in the traverser benchmarks --- .../Traversers/GridTraversersBenchmark_2D.h | 12 ++++++------ .../Traversers/GridTraversersBenchmark_3D.h | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h index d62d56f91..48f11bfb9 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h @@ -98,7 +98,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > Index _size = this->size; auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) { - data[ i * _size + j ] += 1.0; + data[ j * _size + i ] += 1.0; }; ParallelFor2D< Device >::exec( ( Index ) 0, @@ -114,8 +114,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index > auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) { Cell entity( *currentGrid ); - entity.getCoordinates().y() = i; - entity.getCoordinates().x() = j; + entity.getCoordinates().x() = i; + entity.getCoordinates().y() = j; entity.refresh(); data[ entity.getIndex() ] += 1.0; }; @@ -134,8 +134,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index > auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) { Cell entity( *currentGrid ); - entity.getCoordinates().y() = i; - entity.getCoordinates().x() = j; + entity.getCoordinates().x() = i; + entity.getCoordinates().y() = j; entity.refresh(); ( *_u )( entity ) += 1.0; }; @@ -249,4 +249,4 @@ class GridTraversersBenchmark< 2, Device, Real, Index > } // namespace Traversers } // namespace Benchmarks -} // namespace TNL \ No newline at end of file +} // namespace TNL diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h 
b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h index 383640d39..cceffa328 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h @@ -104,7 +104,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > Index _size = this->size; auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) { - data[ ( i * _size + j ) * _size + k ] += 1.0; + data[ ( k * _size + j ) * _size + i ] += 1.0; }; ParallelFor3D< Device >::exec( ( Index ) 0, @@ -122,9 +122,9 @@ class GridTraversersBenchmark< 3, Device, Real, Index > auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) { Cell entity( *currentGrid ); - entity.getCoordinates().z() = i; + entity.getCoordinates().x() = i; entity.getCoordinates().y() = j; - entity.getCoordinates().x() = k; + entity.getCoordinates().z() = k; entity.refresh(); data[ entity.getIndex() ] += 1.0; }; @@ -145,9 +145,9 @@ class GridTraversersBenchmark< 3, Device, Real, Index > auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) { Cell entity( *currentGrid ); - entity.getCoordinates().z() = i; + entity.getCoordinates().x() = i; entity.getCoordinates().y() = j; - entity.getCoordinates().x() = k; + entity.getCoordinates().z() = k; entity.refresh(); ( *_u )( entity ) += 1.0; }; @@ -257,4 +257,4 @@ class GridTraversersBenchmark< 3, Device, Real, Index > } // namespace Traversers } // namespace Benchmarks -} // namespace TNL \ No newline at end of file +} // namespace TNL -- GitLab From 8ea590e97d436f869ed9fd0d79b288e62ce07aaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 4 Jan 2019 23:05:34 +0100 Subject: [PATCH 049/130] Traverser benchmarks: added explicit cast to Real Because constants 1.0 and 2.0 have type double. 
--- .../Traversers/GridTraversersBenchmark.h | 4 ++-- .../Traversers/GridTraversersBenchmark_1D.h | 16 +++++++------- .../Traversers/GridTraversersBenchmark_2D.h | 18 +++++++-------- .../Traversers/GridTraversersBenchmark_3D.h | 22 +++++++++---------- src/Benchmarks/Traversers/cuda-kernels.h | 18 +++++++-------- 5 files changed, 39 insertions(+), 39 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index c320dc591..bd748ed09 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -42,7 +42,7 @@ class WriteOneEntitiesProcessor const GridEntity& entity ) { auto& u = userData.u.template modifyData< DeviceType >(); - u( entity ) += 1.0; + u( entity ) += (typename MeshType::RealType) 1.0; } }; @@ -68,4 +68,4 @@ class GridTraversersBenchmark{}; #include "GridTraversersBenchmark_1D.h" #include "GridTraversersBenchmark_2D.h" -#include "GridTraversersBenchmark_3D.h" \ No newline at end of file +#include "GridTraversersBenchmark_3D.h" diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h index 1683cc868..e626b17e3 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -64,7 +64,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > if( std::is_same< Device, Devices::Host >::value ) { for( int i = 0; i < size; i++ ) - v_data[ i ] += 1.0; + v_data[ i ] += (Real) 1.0; } else // Device == Devices::Cuda { @@ -94,7 +94,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > { auto f = [] __cuda_callable__ ( Index i, Real* data ) { - data[ i ] += 1.0; + data[ i ] += (Real) 1.0; }; ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } @@ -107,7 +107,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > Cell entity( *currentGrid ); 
entity.getCoordinates().x() = i; entity.refresh(); - data[ entity.getIndex() ] += 1.0; + data[ entity.getIndex() ] += (Real) 1.0; }; ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } @@ -121,7 +121,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > Cell entity( *currentGrid ); entity.getCoordinates().x() = i; entity.refresh(); - ( *_u )( entity ) += 1.0; + ( *_u )( entity ) += (Real) 1.0; //WriteOneEntitiesProcessorType::processEntity( *currentGrid, userData, entity ); }; ParallelFor< Device >::exec( ( Index ) 0, size, f ); @@ -154,10 +154,10 @@ class GridTraversersBenchmark< 1, Device, Real, Index > { if( std::is_same< Device, Devices::Host >::value ) { - v_data[ 0 ] = +2; + v_data[ 0 ] += (Real) 2; for( int i = 1; i < size - 1; i++ ) - v_data[ i ] = +1.0; - v_data[ size - 1 ] = +2; + v_data[ i ] += (Real) 1.0; + v_data[ size - 1 ] += (Real) 2; } else // Device == Devices::Cuda { @@ -213,4 +213,4 @@ class GridTraversersBenchmark< 1, Device, Real, Index > } // namespace Traversers } // namespace Benchmarks -} // namespace TNL \ No newline at end of file +} // namespace TNL diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h index 48f11bfb9..1296a9a46 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h @@ -65,7 +65,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > { for( int i = 0; i < size; i++ ) for( int j = 0; j < size; j++ ) - v_data[ i * size + j ] += 1.0; + v_data[ i * size + j ] += (Real) 1.0; } else // Device == Devices::Cuda { @@ -98,7 +98,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > Index _size = this->size; auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) { - data[ j * _size + i ] += 1.0; + data[ j * _size + i ] += (Real) 1.0; }; ParallelFor2D< Device >::exec( ( Index ) 0, @@ -117,7 +117,7 @@ class GridTraversersBenchmark< 2, Device, Real, 
Index > entity.getCoordinates().x() = i; entity.getCoordinates().y() = j; entity.refresh(); - data[ entity.getIndex() ] += 1.0; + data[ entity.getIndex() ] += (Real) 1.0; }; ParallelFor2D< Device >::exec( ( Index ) 0, @@ -137,7 +137,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > entity.getCoordinates().x() = i; entity.getCoordinates().y() = j; entity.refresh(); - ( *_u )( entity ) += 1.0; + ( *_u )( entity ) += (Real) 1.0; }; ParallelFor2D< Device >::exec( ( Index ) 0, @@ -179,18 +179,18 @@ class GridTraversersBenchmark< 2, Device, Real, Index > { for( int i = 0; i < size; i++ ) { - v_data[ i * size ] = 2.0; - v_data[ i * size + size - 1 ] = 2.0; + v_data[ i * size ] += (Real) 2.0; + v_data[ i * size + size - 1 ] += (Real) 2.0; } for( int j = 1; j < size - 1; j++ ) { - v_data[ j ] = 2.0; - v_data[ ( size - 1 ) * size + j ] = 2.0; + v_data[ j ] += (Real) 2.0; + v_data[ ( size - 1 ) * size + j ] += (Real) 2.0; } for( int i = 1; i < size - 1; i++ ) for( int j = 1; j < size - 1; j++ ) - v_data[ i * size + j ] = 1.0; + v_data[ i * size + j ] += (Real) 1.0; } else // Device == Devices::Cuda { diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h index cceffa328..35863a3c9 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h @@ -69,7 +69,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > for( int i = 0; i < size; i++ ) for( int j = 0; j < size; j++ ) for( int k = 0; k < size; k++ ) - v_data[ ( i * size + j ) * size + k ] += 1.0; + v_data[ ( i * size + j ) * size + k ] += (Real) 1.0; } else // Device == Devices::Cuda { @@ -104,7 +104,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > Index _size = this->size; auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) { - data[ ( k * _size + j ) * _size + i ] += 1.0; + data[ ( k * _size + j ) * _size + i ] += (Real) 1.0; }; 
ParallelFor3D< Device >::exec( ( Index ) 0, @@ -126,7 +126,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > entity.getCoordinates().y() = j; entity.getCoordinates().z() = k; entity.refresh(); - data[ entity.getIndex() ] += 1.0; + data[ entity.getIndex() ] += (Real) 1.0; }; ParallelFor3D< Device >::exec( ( Index ) 0, @@ -149,7 +149,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > entity.getCoordinates().y() = j; entity.getCoordinates().z() = k; entity.refresh(); - ( *_u )( entity ) += 1.0; + ( *_u )( entity ) += (Real) 1.0; }; ParallelFor3D< Device >::exec( ( Index ) 0, @@ -175,27 +175,27 @@ class GridTraversersBenchmark< 3, Device, Real, Index > for( int i = 0; i < size; i++ ) for( int j = 0; j < size; j++ ) { - v_data[ ( i * size + j ) * size ] = 2.0; - v_data[ ( i * size + j ) * size + size - 1 ] = 2.0; + v_data[ ( i * size + j ) * size ] += (Real) 2.0; + v_data[ ( i * size + j ) * size + size - 1 ] += (Real) 2.0; } for( int j = 0; j < size; j++ ) for( int k = 1; k < size - 1; k++ ) { - v_data[ j * size + k ] = 1.0; - v_data[ ( ( size - 1) * size + j ) * size + k ] = 1.0; + v_data[ j * size + k ] += (Real) 1.0; + v_data[ ( ( size - 1) * size + j ) * size + k ] += (Real) 1.0; } for( int i = 1; i < size -1; i++ ) for( int k = 1; k < size - 1; k++ ) { - v_data[ ( i * size ) * size + k ] = 2.0; - v_data[ ( i * size + size - 1 ) * size + k ] = 2.0; + v_data[ ( i * size ) * size + k ] += (Real) 2.0; + v_data[ ( i * size + size - 1 ) * size + k ] += (Real) 2.0; } for( int i = 1; i < size -1; i++ ) for( int j = 1; j < size -1; j++ ) for( int k = 1; k < size - 1; k++ ) - v_data[ ( i * size + j ) * size + k ] = 1.0; + v_data[ ( i * size + j ) * size + k ] += (Real) 1.0; } else // Device == Devices::Cuda { diff --git a/src/Benchmarks/Traversers/cuda-kernels.h b/src/Benchmarks/Traversers/cuda-kernels.h index 2802b73eb..a90baf5b0 100644 --- a/src/Benchmarks/Traversers/cuda-kernels.h +++ b/src/Benchmarks/Traversers/cuda-kernels.h @@ -27,7 +27,7 @@ 
__global__ void fullGridTraverseKernel1D( const Index size, const dim3 gridIdx, { const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( threadIdx_x < size ) - v_data[ threadIdx_x ] += 1.0; + v_data[ threadIdx_x ] += (Real) 1.0; } template< typename Real, @@ -37,7 +37,7 @@ __global__ void fullGridTraverseKernel2D( const Index size, const dim3 gridIdx, const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; if( threadIdx_x < size && threadIdx_y < size ) - v_data[ threadIdx_y * size + threadIdx_x ] += 1.0; + v_data[ threadIdx_y * size + threadIdx_x ] += (Real) 1.0; } template< typename Real, @@ -48,7 +48,7 @@ __global__ void fullGridTraverseKernel3D( const Index size, const dim3 gridIdx, const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size ) - v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += 1.0; + v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += (Real) 1.0; } /**** @@ -60,7 +60,7 @@ __global__ void interiorTraverseKernel1D( const Index size, const dim3 gridIdx, { const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( threadIdx_x > 0 && threadIdx_x < size - 1 ) - v_data[ threadIdx_x ] += 1.0; + v_data[ threadIdx_x ] += (Real) 1.0; } template< typename Real, @@ -71,7 +71,7 @@ __global__ void interiorTraverseKernel2D( const Index size, const dim3 gridIdx, const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; if( threadIdx_x > 0 && 
threadIdx_y > 0 && threadIdx_x < size - 1 && threadIdx_y < size - 1 ) - v_data[ threadIdx_y * size + threadIdx_x ] += 1.0; + v_data[ threadIdx_y * size + threadIdx_x ] += (Real) 1.0; } template< typename Real, @@ -83,7 +83,7 @@ __global__ void interiorTraverseKernel3D( const Index size, const dim3 gridIdx, const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; if( threadIdx_x > 0 && threadIdx_y > 0 && threadIdx_z > 0 && threadIdx_x < size - 1 && threadIdx_y < size - 1 && threadIdx_z < size - 1 ) - v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += 1.0; + v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += (Real) 1.0; } /**** @@ -95,7 +95,7 @@ __global__ void boundariesTraverseKernel1D( const Index size, const dim3 gridIdx { const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( threadIdx_x == 0 || threadIdx_x == size - 1 ) - v_data[ threadIdx_x ] += 2.0; + v_data[ threadIdx_x ] += (Real) 2.0; } template< typename Real, @@ -106,7 +106,7 @@ __global__ void boundariesTraverseKernel2D( const Index size, const dim3 gridIdx const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; if( threadIdx_x > 0 && threadIdx_y > 0 && threadIdx_x < size - 1 && threadIdx_y < size - 1 ) - v_data[ threadIdx_y * size + threadIdx_x ] += 2.0; + v_data[ threadIdx_y * size + threadIdx_x ] += (Real) 2.0; } template< typename Real, @@ -118,7 +118,7 @@ __global__ void boundariesTraverseKernel3D( const Index size, const dim3 gridIdx const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; if( threadIdx_x == 0 || threadIdx_y == 0 || threadIdx_z == 0 || threadIdx_x == size - 1 || threadIdx_y == size - 1 || threadIdx_z == size - 1 ) - v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += 2.0; + v_data[ ( 
threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += (Real) 2.0; } #endif -- GitLab From f5274369ddbe9ad1289972eabefedd645efa9d15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 4 Jan 2019 23:09:18 +0100 Subject: [PATCH 050/130] Fixed calculation of bandwidth in the traverser benchmarks --- .../Traversers/tnl-benchmark-traversers.h | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 6adc0d8e3..ff6d25624 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -80,7 +80,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, */ if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-pure-c" ) ) { - benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); auto hostWriteOneUsingPureC = [&] () { @@ -103,7 +103,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, */ if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for" ) ) { - benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); auto hostWriteOneUsingParallelFor = [&] () { @@ -130,7 +130,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, { hostTraverserBenchmark.writeOneUsingParallelForAndGridEntity(); }; - benchmark.setOperation( "par.for+grid ent.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.setOperation( "par.for+grid ent.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); 
benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity ); #ifdef HAVE_CUDA @@ -152,7 +152,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, { hostTraverserBenchmark.writeOneUsingParallelForAndMeshFunction(); }; - benchmark.setOperation( "par.for+mesh fc.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.setOperation( "par.for+mesh fc.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction ); #ifdef HAVE_CUDA @@ -171,7 +171,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, */ if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-traverser" ) ) { - benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); auto hostWriteOneUsingTraverser = [&] () { hostTraverserBenchmark.writeOneUsingTraverser(); @@ -234,14 +234,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( tests.containsValue( "all" ) || tests.containsValue( "bc-pure-c" ) ) { - benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC ); #ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingPureC ); #endif - benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.setOperation( "Pure C RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC ); #ifdef HAVE_CUDA if( 
withCuda ) @@ -266,14 +266,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) ) { - benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor ); #ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor ); #endif - benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.setOperation( "parallel for RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor ); #ifdef HAVE_CUDA if( withCuda ) @@ -298,13 +298,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( tests.containsValue( "all" ) || tests.containsValue( "bc-traverser" ) ) { - benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser ); #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser ); #endif - benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.setOperation( "traverser RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser ); #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser ); -- GitLab From 1ace5365d2dc74120e49ed2adaa9b0ffa76bf4e8 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 5 Jan 2019 08:30:26 +0100 Subject: [PATCH 051/130] Added synchrounous/asynchronous modes for grid traversers. --- src/TNL/Meshes/GridDetails/GridTraverser.h | 29 ++++++++++++---- .../Meshes/GridDetails/GridTraverser_1D.hpp | 18 +++++++--- .../Meshes/GridDetails/GridTraverser_2D.hpp | 20 +++++++---- .../Meshes/GridDetails/GridTraverser_3D.hpp | 13 ++++--- .../GridDetails/Traverser_Grid1D_impl.h | 30 ++++++++++------ .../GridDetails/Traverser_Grid2D_impl.h | 18 ++++++++++ .../GridDetails/Traverser_Grid3D_impl.h | 34 ++++++++++++++++++- 7 files changed, 129 insertions(+), 33 deletions(-) diff --git a/src/TNL/Meshes/GridDetails/GridTraverser.h b/src/TNL/Meshes/GridDetails/GridTraverser.h index 881367d3f..fb6b34da1 100644 --- a/src/TNL/Meshes/GridDetails/GridTraverser.h +++ b/src/TNL/Meshes/GridDetails/GridTraverser.h @@ -25,6 +25,8 @@ class GridTraverser { }; +enum GridTraverserMode { synchronousMode, asynchronousMode }; + /**** * 1D grid, Devices::Host */ @@ -52,6 +54,7 @@ class GridTraverser< Meshes::Grid< 1, Real, Devices::Host, Index > > const CoordinatesType begin, const CoordinatesType end, UserData& userData, + GridTraverserMode mode = synchronousMode, const int& stream = 0 ); }; @@ -82,6 +85,7 @@ class GridTraverser< Meshes::Grid< 1, Real, Devices::Cuda, Index > > const CoordinatesType& begin, const CoordinatesType& end, UserData& userData, + GridTraverserMode mode = synchronousMode, const int& stream = 0 ); }; @@ -112,6 +116,7 @@ class GridTraverser< Meshes::Grid< 1, Real, Devices::MIC, Index > > const CoordinatesType& begin, const CoordinatesType& end, UserData& userData, + GridTraverserMode mode = synchronousMode, const int& stream = 0 ); }; @@ -148,7 +153,9 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > > const CoordinatesType end, UserData& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) -// const int& stream = 0, + 
//GridTraverserMode mode = synchronousMode, + GridTraverserMode mode, + // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. orientation and basis for faces) @@ -186,7 +193,9 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > > const CoordinatesType& end, UserData& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) -// const int& stream = 0, + //GridTraverserMode mode = synchronousMode, + GridTraverserMode mode, + // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. orientation and basis for faces) @@ -224,7 +233,9 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::MIC, Index > > const CoordinatesType& end, UserData& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) -// const int& stream = 0, + //GridTraverserMode mode = synchronousMode, + GridTraverserMode mode, + // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. orientation and basis for faces) @@ -263,7 +274,9 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::Host, Index > > const CoordinatesType end, UserData& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) -// const int& stream = 0, + //GridTraverserMode mode = synchronousMode, + GridTraverserMode mode, + // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. 
orientation and basis for faces and edges) @@ -302,7 +315,9 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::Cuda, Index > > const CoordinatesType& end, UserData& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) -// const int& stream = 0, + //GridTraverserMode mode = synchronousMode, + GridTraverserMode mode, + // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. orientation and basis for faces and edges) @@ -341,7 +356,9 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::MIC, Index > > const CoordinatesType& end, UserData& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) -// const int& stream = 0, + //GridTraverserMode mode = synchronousMode, + GridTraverserMode mode, + // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. orientation and basis for faces and edges) diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp index 90148f8e8..505f9c3d7 100644 --- a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp +++ b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp @@ -41,6 +41,7 @@ processEntities( const CoordinatesType begin, const CoordinatesType end, UserData& userData, + GridTraverserMode mode, const int& stream ) { GridEntity entity( *gridPointer ); @@ -177,13 +178,14 @@ processEntities( const CoordinatesType& begin, const CoordinatesType& end, UserData& userData, + GridTraverserMode mode, const int& stream ) { #ifdef HAVE_CUDA auto& pool = CudaStreamPool::getInstance(); const cudaStream_t& s = pool.getStream( stream ); - Devices::Cuda::synchronizeDevice(); + //Devices::Cuda::synchronizeDevice(); if( processOnlyBoundaryEntities ) { dim3 cudaBlockSize( 2 ); @@ -209,15 +211,20 @@ processEntities( userData, begin, end, - gridXIdx ); + gridXIdx );*/ } - // only launches into the stream 
0 are synchronized - /*if( stream == 0 ) +#ifdef NDEBUG + if( mode == synchronousMode ) { cudaStreamSynchronize( s ); TNL_CHECK_CUDA_DEVICE; - }*/ + } +#else + cudaStreamSynchronize( s ); + TNL_CHECK_CUDA_DEVICE; +#endif + #else throw Exceptions::CudaSupportMissing(); #endif @@ -241,6 +248,7 @@ processEntities( const CoordinatesType& begin, const CoordinatesType& end, UserData& userData, + GridTraverserMode mode, const int& stream ) { std::cout << "Not Implemented yet Grid Traverser <1, Real, Device::MIC>" << std::endl; diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp index 84e496017..50b30c019 100644 --- a/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp +++ b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp @@ -43,6 +43,7 @@ processEntities( const CoordinatesType begin, const CoordinatesType end, UserData& userData, + GridTraverserMode mode, const int& stream, const GridEntityParameters&... gridEntityParameters ) { @@ -402,6 +403,7 @@ processEntities( const CoordinatesType& begin, const CoordinatesType& end, UserData& userData, + GridTraverserMode mode, const int& stream, const GridEntityParameters&... gridEntityParameters ) { @@ -534,13 +536,18 @@ processEntities( gridEntityParameters... ); } - // only launches into the stream 0 are synchronized - if( stream == 0 ) - { - cudaStreamSynchronize( s ); - TNL_CHECK_CUDA_DEVICE; - } +#ifdef NDEBUG + if( mode == synchronousMode ) + { + cudaStreamSynchronize( s ); + TNL_CHECK_CUDA_DEVICE; } +#else + cudaStreamSynchronize( s ); + TNL_CHECK_CUDA_DEVICE; +#endif + } + #else throw Exceptions::CudaSupportMissing(); #endif @@ -567,6 +574,7 @@ processEntities( const CoordinatesType& begin, const CoordinatesType& end, UserData& userData, + GridTraverserMode mode, const int& stream, const GridEntityParameters&... 
gridEntityParameters ) { diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp index d63b81f46..9259da9bf 100644 --- a/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp +++ b/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp @@ -42,6 +42,7 @@ processEntities( const CoordinatesType begin, const CoordinatesType end, UserData& userData, + GridTraverserMode mode, const int& stream, const GridEntityParameters&... gridEntityParameters ) { @@ -324,6 +325,7 @@ processEntities( const CoordinatesType& begin, const CoordinatesType& end, UserData& userData, + GridTraverserMode mode, const int& stream, const GridEntityParameters&... gridEntityParameters ) { @@ -390,7 +392,7 @@ processEntities( ( &gridPointer.template getData< Devices::Cuda >(), userData, begin.x(), - end.x(), + end.x(), begin.z() + 1, end.z() - 1, begin.y(), @@ -401,7 +403,7 @@ processEntities( ( &gridPointer.template getData< Devices::Cuda >(), userData, begin.x(), - end.x(), + end.x(), begin.z() + 1, end.z() - 1, end.y(), @@ -417,7 +419,7 @@ processEntities( ( &gridPointer.template getData< Devices::Cuda >(), userData, begin.y() + 1, - end.y() - 1, + end.y() - 1, begin.z() + 1, end.z() - 1, begin.x(), @@ -428,7 +430,7 @@ processEntities( ( &gridPointer.template getData< Devices::Cuda >(), userData, begin.y() + 1, - end.y() - 1, + end.y() - 1, begin.z() + 1, end.z() - 1, end.x(), @@ -440,7 +442,7 @@ processEntities( cudaStreamSynchronize( s3 ); cudaStreamSynchronize( s4 ); cudaStreamSynchronize( s5 ); - cudaStreamSynchronize( s6 ); + cudaStreamSynchronize( s6 ); TNL_CHECK_CUDA_DEVICE; } else @@ -506,6 +508,7 @@ processEntities( const CoordinatesType& begin, const CoordinatesType& end, UserData& userData, + GridTraverserMode mode, const int& stream, const GridEntityParameters&... 
gridEntityParameters ) { diff --git a/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h b/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h index 99ea85876..5669f6e83 100644 --- a/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h +++ b/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h @@ -43,7 +43,8 @@ processBoundaryEntities( const GridPointer& gridPointer, gridPointer, CoordinatesType( 0 ), gridPointer->getDimensions() - CoordinatesType( 1 ), - userData ); + userData, + asynchronousMode ); } else //Distributed { @@ -54,7 +55,8 @@ processBoundaryEntities( const GridPointer& gridPointer, gridPointer, CoordinatesType( 0 ) + distributedGrid->getLowerOverlap(), CoordinatesType( 0 ) + distributedGrid->getLowerOverlap(), - userData ); + userData, + asynchronousMode ); } if( neighbors[ Meshes::DistributedMeshes::Right ] == -1 ) @@ -63,7 +65,8 @@ processBoundaryEntities( const GridPointer& gridPointer, gridPointer, gridPointer->getDimensions() - CoordinatesType( 1 ) - distributedGrid->getUpperOverlap(), gridPointer->getDimensions() - CoordinatesType( 1 ) - distributedGrid->getUpperOverlap(), - userData ); + userData, + asynchronousMode ); } } @@ -92,7 +95,8 @@ processInteriorEntities( const GridPointer& gridPointer, gridPointer, CoordinatesType( 1 ), gridPointer->getDimensions() - CoordinatesType( 2 ), - userData ); + userData, + asynchronousMode ); } else //Distributed { @@ -117,7 +121,8 @@ processInteriorEntities( const GridPointer& gridPointer, gridPointer, begin, end, - userData ); + userData, + asynchronousMode ); } } @@ -146,7 +151,8 @@ processAllEntities( gridPointer, CoordinatesType( 0 ), gridPointer->getDimensions() - CoordinatesType( 1 ), - userData ); + userData, + asynchronousMode ); } else //Distributed { @@ -157,7 +163,8 @@ processAllEntities( gridPointer, begin, end, - userData ); + userData, + asynchronousMode ); } } @@ -185,7 +192,8 @@ processBoundaryEntities( const GridPointer& gridPointer, gridPointer, CoordinatesType( 0 ), 
gridPointer->getDimensions(), - userData ); + userData, + asynchronousMode ); } template< typename Real, @@ -208,7 +216,8 @@ processInteriorEntities( const GridPointer& gridPointer, gridPointer, CoordinatesType( 1 ), gridPointer->getDimensions() - CoordinatesType( 1 ), - userData ); + userData, + asynchronousMode ); } template< typename Real, @@ -232,7 +241,8 @@ processAllEntities( gridPointer, CoordinatesType( 0 ), gridPointer->getDimensions(), - userData ); + userData, + asynchronousMode ); } } // namespace Meshes diff --git a/src/TNL/Meshes/GridDetails/Traverser_Grid2D_impl.h b/src/TNL/Meshes/GridDetails/Traverser_Grid2D_impl.h index 23d93d7e0..4d87b18ba 100644 --- a/src/TNL/Meshes/GridDetails/Traverser_Grid2D_impl.h +++ b/src/TNL/Meshes/GridDetails/Traverser_Grid2D_impl.h @@ -42,6 +42,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1 ), userData, + asynchronousMode, 0 ); } else //Distributed @@ -57,6 +58,7 @@ processBoundaryEntities( const GridPointer& gridPointer, begin, CoordinatesType( begin.x(), end.y() ), userData, + asynchronousMode, 0 ); } @@ -67,6 +69,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( end.x(), begin.y() ), end, userData, + asynchronousMode, 0 ); } @@ -78,6 +81,7 @@ processBoundaryEntities( const GridPointer& gridPointer, begin, CoordinatesType( end.x(), begin.y() ), userData, + asynchronousMode, 0 ); } @@ -88,6 +92,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( begin.x(), end.y() ), end, userData, + asynchronousMode, 0 ); } } @@ -117,6 +122,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 1, 1 ), gridPointer->getDimensions() - CoordinatesType( 2, 2 ), userData, + asynchronousMode, 0 ); } else // distributed @@ -142,6 +148,7 @@ processInteriorEntities( const GridPointer& gridPointer, begin, end, userData, + asynchronousMode, 0); } } @@ -170,6 +177,7 @@ 
processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1 ), userData, + asynchronousMode, 0 ); } else @@ -183,6 +191,7 @@ processAllEntities( const GridPointer& gridPointer, begin, end, userData, + asynchronousMode, 0); } } @@ -211,6 +220,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 0, 1 ), userData, + asynchronousMode, 1, CoordinatesType( 1, 0 ), CoordinatesType( 0, 1 ) ); @@ -220,6 +230,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 0 ), userData, + asynchronousMode, 0, CoordinatesType( 0, 1 ), CoordinatesType( 1, 0 ) ); @@ -246,6 +257,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 1, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1 ), userData, + asynchronousMode, 1, CoordinatesType( 1, 0 ), CoordinatesType( 0, 1 ) ); @@ -255,6 +267,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 0, 1 ), gridPointer->getDimensions() - CoordinatesType( 1, 1 ), userData, + asynchronousMode, 0, CoordinatesType( 0, 1 ), CoordinatesType( 1, 0 ) ); @@ -281,6 +294,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 0, 1 ), userData, + asynchronousMode, 1, CoordinatesType( 1, 0 ), CoordinatesType( 0, 1 ) ); @@ -290,6 +304,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 0 ), userData, + asynchronousMode, 0, CoordinatesType( 0, 1 ), CoordinatesType( 1, 0 ) ); @@ -316,6 +331,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0 ), gridPointer->getDimensions(), userData, + asynchronousMode, 0 ); } @@ -340,6 +356,7 @@ processInteriorEntities( const GridPointer& gridPointer, 
CoordinatesType( 1, 1 ), gridPointer->getDimensions() - CoordinatesType( 1, 1 ), userData, + asynchronousMode, 0 ); } @@ -364,6 +381,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0 ), gridPointer->getDimensions(), userData, + asynchronousMode, 0 ); } diff --git a/src/TNL/Meshes/GridDetails/Traverser_Grid3D_impl.h b/src/TNL/Meshes/GridDetails/Traverser_Grid3D_impl.h index 3c9fffd81..f4575dfec 100644 --- a/src/TNL/Meshes/GridDetails/Traverser_Grid3D_impl.h +++ b/src/TNL/Meshes/GridDetails/Traverser_Grid3D_impl.h @@ -44,6 +44,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ), userData, + asynchronousMode, 0 ); } else // distributed @@ -59,6 +60,7 @@ processBoundaryEntities( const GridPointer& gridPointer, begin, CoordinatesType( begin.x(), end.y(), end.z() ), userData, + asynchronousMode, 0 ); } @@ -69,6 +71,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( end.x() , begin.y(), begin.z() ), end, userData, + asynchronousMode, 0 ); } @@ -79,6 +82,7 @@ processBoundaryEntities( const GridPointer& gridPointer, begin, CoordinatesType( end.x(), begin.y(), end.z() ), userData, + asynchronousMode, 0 ); } @@ -89,6 +93,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( begin.x(), end.y(), begin.z() ), end, userData, + asynchronousMode, 0 ); } @@ -99,6 +104,7 @@ processBoundaryEntities( const GridPointer& gridPointer, begin, CoordinatesType( end.x(), end.y(), begin.z() ), userData, + asynchronousMode, 0 ); } @@ -109,6 +115,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( begin.x(), begin.y(), end.z() ), end, userData, + asynchronousMode, 0 ); } } @@ -138,6 +145,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 1, 1, 1 ), gridPointer->getDimensions() - CoordinatesType( 2, 2, 2 ), userData, + asynchronousMode, 0 ); } else @@ -169,7 +177,8 @@ 
processInteriorEntities( const GridPointer& gridPointer, begin, end, userData, - 0); + asynchronousMode, + 0 ); } } @@ -197,6 +206,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ), userData, + asynchronousMode, 0 ); } else @@ -209,6 +219,7 @@ processAllEntities( const GridPointer& gridPointer, begin, end, userData, + asynchronousMode, 0 ); } } @@ -237,6 +248,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 0, 1, 1 ), userData, + asynchronousMode, 2, CoordinatesType( 1, 0, 0 ), CoordinatesType( 0, 1, 1 ) ); @@ -246,6 +258,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 0, 1 ), userData, + asynchronousMode, 1, CoordinatesType( 0, 1, 0 ), CoordinatesType( 1, 0, 1 ) ); @@ -255,6 +268,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 0 ), userData, + asynchronousMode, 0, CoordinatesType( 0, 0, 1 ), CoordinatesType( 1, 1, 0 ) ); @@ -281,6 +295,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 1, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ), userData, + asynchronousMode, 2, CoordinatesType( 1, 0, 0 ), CoordinatesType( 0, 1, 1 ) ); @@ -290,6 +305,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 0, 1, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ), userData, + asynchronousMode, 1, CoordinatesType( 0, 1, 0 ), CoordinatesType( 1, 0, 1 ) ); @@ -299,6 +315,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 1 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ), userData, + asynchronousMode, 0, CoordinatesType( 0, 0, 1 ), CoordinatesType( 1, 1, 0 ) ); @@ -324,6 +341,7 @@ 
processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 0, 1, 1 ), userData, + asynchronousMode, 2, CoordinatesType( 1, 0, 0 ), CoordinatesType( 0, 1, 1 ) ); @@ -333,6 +351,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 0, 1 ), userData, + asynchronousMode, 1, CoordinatesType( 0, 1, 0 ), CoordinatesType( 1, 0, 1 ) ); @@ -342,6 +361,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 0 ), userData, + asynchronousMode, 0, CoordinatesType( 0, 0, 1 ), CoordinatesType( 1, 1, 0 ) ); @@ -371,6 +391,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 0, 0 ), userData, + asynchronousMode, 2, CoordinatesType( 0, 1, 1 ), CoordinatesType( 1, 0, 0 ) ); @@ -380,6 +401,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 0, 1, 0 ), userData, + asynchronousMode, 1, CoordinatesType( 1, 0, 1 ), CoordinatesType( 0, 1, 0 ) ); @@ -389,6 +411,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 0, 0, 1 ), userData, + asynchronousMode, 0, CoordinatesType( 1, 1, 0 ), CoordinatesType( 0, 0, 1 ) ); @@ -415,6 +438,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 0, 1, 1 ), gridPointer->getDimensions() - CoordinatesType( 0, 1, 1 ), userData, + asynchronousMode, 2, CoordinatesType( 0, 1, 1 ), CoordinatesType( 1, 0, 0 ) ); @@ -424,6 +448,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 1, 0, 1 ), gridPointer->getDimensions() - CoordinatesType( 1, 0, 1 ), userData, + asynchronousMode, 1, CoordinatesType( 1, 0, 1 ), CoordinatesType( 0, 1, 0 
) ); @@ -433,6 +458,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 1, 1, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 0 ), userData, + asynchronousMode, 0, CoordinatesType( 1, 1, 0 ), CoordinatesType( 0, 0, 1 ) ); @@ -458,6 +484,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 0, 0 ), userData, + asynchronousMode, 2, CoordinatesType( 0, 1, 1 ), CoordinatesType( 1, 0, 0 ) ); @@ -467,6 +494,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 0, 1, 0 ), userData, + asynchronousMode, 1, CoordinatesType( 1, 0, 1 ), CoordinatesType( 0, 1, 0 ) ); @@ -476,6 +504,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 0, 0, 1 ), userData, + asynchronousMode, 0, CoordinatesType( 1, 1, 0 ), CoordinatesType( 0, 0, 1 ) ); @@ -505,6 +534,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions(), userData, + asynchronousMode, 0 ); } @@ -529,6 +559,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 1, 1, 1 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ), userData, + asynchronousMode, 0 ); } @@ -553,6 +584,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions(), userData, + asynchronousMode, 0 ); } -- GitLab From cb834c849b1ce2af64f33f3370be88c9227c453d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 5 Jan 2019 08:33:50 +0100 Subject: [PATCH 052/130] Added GridTraverserBenchmarkHelper. 
--- .../Traversers/GridTraversersBenchmark_1D.h | 122 +++++++++++++++--- 1 file changed, 104 insertions(+), 18 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h index e626b17e3..22f1d6899 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -28,13 +28,110 @@ namespace TNL { namespace Benchmarks { namespace Traversers { +template< typename Grid, + typename Device = typename Grid::DeviceType > +class GridTraverserBenchmarkHelper{}; + +template< typename Grid > +class GridTraverserBenchmarkHelper< Grid, Devices::Host > +{ + public: + + using GridType = Grid; + using GridPointer = Pointers::SharedPointer< Grid >; + using RealType = typename GridType::RealType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename Grid::CoordinatesType; + using MeshFunction = Functions::MeshFunction< Grid >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; + using Traverser = Meshes::Traverser< Grid, Cell >; + using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; + using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; + + + static void noBCTraverserTest( const GridPointer& grid, + WriteOneTraverserUserDataType& userData, + std::size_t size ) + { + /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >( + grid, + CoordinatesType( 0 ), + grid->getDimensions() - CoordinatesType( 1 ), + userData );*/ + + const CoordinatesType begin( 0 ); + const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); + //MeshFunction* _u = &u.template modifyData< Device >(); + Cell entity( *grid ); + for( IndexType x = begin.x(); x <= 
end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.refresh(); + WriteOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity ); + } + + } +}; + +template< typename Grid > +class GridTraverserBenchmarkHelper< Grid, Devices::Cuda > +{ + public: + + using GridType = Grid; + using GridPointer = Pointers::SharedPointer< Grid >; + using RealType = typename GridType::RealType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename Grid::CoordinatesType; + using MeshFunction = Functions::MeshFunction< Grid >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; + using Traverser = Meshes::Traverser< Grid, Cell >; + using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; + using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; + + + static void noBCTraverserTest( const GridPointer& grid, + WriteOneTraverserUserDataType& userData, + std::size_t size ) + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size ); + dim3 gridIdx; + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + Meshes::GridTraverser1D< RealType, IndexType, Cell, WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + <<< blocksCount, blockSize >>> + ( &grid.template getData< Devices::Cuda >(), + userData, + CoordinatesType( 0 ), + CoordinatesType( size ) - CoordinatesType( 1 ), + gridIdx.x ); + + } +#endif + } +}; + template< typename Device, typename Real, typename Index > class GridTraversersBenchmark< 1, Device, Real, Index > { public: - + using Vector = Containers::Vector< Real, Device, Index >; using Grid = Meshes::Grid< 1, Real, Device, Index >; using 
GridPointer = Pointers::SharedPointer< Grid >; @@ -130,24 +227,13 @@ class GridTraversersBenchmark< 1, Device, Real, Index > void writeOneUsingTraverser() { using CoordinatesType = typename Grid::CoordinatesType; - traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > - ( grid, userData ); + //traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + // ( grid, userData ); - /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >( - grid, - CoordinatesType( 0 ), - grid->getDimensions() - CoordinatesType( 1 ), - userData );*/ - /*const CoordinatesType begin( 0 ); - const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); - MeshFunction* _u = &u.template modifyData< Device >(); - Cell entity( *grid ); - for( Index x = begin.x(); x <= end.x(); x ++ ) - { - entity.getCoordinates().x() = x; - entity.refresh(); - WriteOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity ); - }*/ + GridTraverserBenchmarkHelper< Grid >::noBCTraverserTest( + grid, + userData, + size ); } void traverseUsingPureC() -- GitLab From a31a7e6db7910bd208b6b556ae7227705fe20557 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 5 Jan 2019 08:34:20 +0100 Subject: [PATCH 053/130] Refactoring of Grid 1D traverser. 
--- .../Meshes/GridDetails/GridTraverser_1D.hpp | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp index 505f9c3d7..5b35d5be9 100644 --- a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp +++ b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp @@ -199,7 +199,31 @@ processEntities( } else { - dim3 cudaBlockSize( 256 ); + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + end.x() - begin.x() + 1 ); + dim3 gridIdx; + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > + <<< blocksCount, blockSize, 0, s >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin, + end, + gridIdx.x ); + } + + /*dim3 cudaBlockSize( 256 ); dim3 cudaBlocks; cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); -- GitLab From 2c26ffc9685b34c718c0aac6e814dedc6cd4b797 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 5 Jan 2019 09:47:43 +0100 Subject: [PATCH 054/130] TRaversers benchmark refactoring, --- .../Traversers/AddOneEntitiesProcessor.h | 43 +++++ .../Traversers/BenchmarkTraverserUserData.h | 32 ++++ .../Traversers/GridTraverserBenchmarkHelper.h | 152 ++++++++++++++++++ .../Traversers/GridTraversersBenchmark.h | 30 +--- .../Traversers/GridTraversersBenchmark_1D.h | 116 ++----------- .../Traversers/GridTraversersBenchmark_2D.h | 23 ++- .../Traversers/GridTraversersBenchmark_3D.h | 28 ++-- .../Traversers/tnl-benchmark-traversers.h | 28 ++-- 8 files changed, 280 insertions(+), 172 deletions(-) create mode 100644 
src/Benchmarks/Traversers/AddOneEntitiesProcessor.h create mode 100644 src/Benchmarks/Traversers/BenchmarkTraverserUserData.h create mode 100644 src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h diff --git a/src/Benchmarks/Traversers/AddOneEntitiesProcessor.h b/src/Benchmarks/Traversers/AddOneEntitiesProcessor.h new file mode 100644 index 000000000..6b136d074 --- /dev/null +++ b/src/Benchmarks/Traversers/AddOneEntitiesProcessor.h @@ -0,0 +1,43 @@ +/*************************************************************************** + BenchmarkTraverserUserData.h - description + ------------------- + begin : Jan 5, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +template< typename TraverserUserData > +class AddOneEntitiesProcessor +{ + public: + + using MeshType = typename TraverserUserData::MeshType; + using DeviceType = typename MeshType::DeviceType; + using RealType = typename MeshType::RealType; + + template< typename GridEntity > + __cuda_callable__ + static inline void processEntity( const MeshType& mesh, + TraverserUserData& userData, + const GridEntity& entity ) + { + auto& u = *userData.u; + u( entity ) += ( RealType ) 1.0; + } +}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h b/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h new file mode 100644 index 000000000..5a2f179fa --- /dev/null +++ b/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h @@ -0,0 +1,32 @@ +/*************************************************************************** + BenchmarkTraverserUserData.h - description + ------------------- + begin : Jan 5, 2019 + copyright : (C) 2019 by oberhuber + 
email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +template< typename MeshFunction > +class BenchmarkTraverserUserData +{ + public: + + using MeshType = typename MeshFunction::MeshType; + + MeshFunction* u; +}; + + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h new file mode 100644 index 000000000..df43f93cd --- /dev/null +++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h @@ -0,0 +1,152 @@ +/*************************************************************************** + GridTraversersBenchmarkHelper.h - description + ------------------- + begin : Jan 5, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include "AddOneEntitiesProcessor.h" +#include "BenchmarkTraverserUserData.h" + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +#ifdef HAVE_CUDA +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor > +__global__ void +_GridTraverser1D( + const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid, + UserData userData, + const typename GridEntity::CoordinatesType begin, + const typename GridEntity::CoordinatesType end, + const Index gridIdx ) +{ + typedef Real RealType; + typedef Index IndexType; + typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = begin.x() + ( gridIdx * 
Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( coordinates <= end ) + { + GridEntity entity( *grid, coordinates ); + entity.refresh(); + ( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0; + //( *userData.u )( entity) += 1.0; + //EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +} +#endif + +template< typename Grid, + typename Device = typename Grid::DeviceType > +class GridTraverserBenchmarkHelper{}; + +template< typename Grid > +class GridTraverserBenchmarkHelper< Grid, Devices::Host > +{ + public: + + using GridType = Grid; + using GridPointer = Pointers::SharedPointer< Grid >; + using RealType = typename GridType::RealType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename Grid::CoordinatesType; + using MeshFunction = Functions::MeshFunction< Grid >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; + using Traverser = Meshes::Traverser< Grid, Cell >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + + static void noBCTraverserTest( const GridPointer& grid, + UserDataType& userData, + std::size_t size ) + { + /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >( + grid, + CoordinatesType( 0 ), + grid->getDimensions() - CoordinatesType( 1 ), + userData );*/ + + const CoordinatesType begin( 0 ); + const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); + //MeshFunction* _u = &u.template modifyData< Device >(); + Cell entity( *grid ); + for( IndexType x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.refresh(); + AddOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity ); + } + + } +}; + 
+template< typename Grid > +class GridTraverserBenchmarkHelper< Grid, Devices::Cuda > +{ + public: + + using GridType = Grid; + using GridPointer = Pointers::SharedPointer< Grid >; + using RealType = typename GridType::RealType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename Grid::CoordinatesType; + using MeshFunction = Functions::MeshFunction< Grid >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; + using Traverser = Meshes::Traverser< Grid, Cell >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + + static void noBCTraverserTest( const GridPointer& grid, + UserDataType& userData, + std::size_t size ) + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size ); + dim3 gridIdx; + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + _GridTraverser1D< RealType, IndexType, Cell, UserDataType, AddOneEntitiesProcessorType > + <<< blocksCount, blockSize >>> + ( &grid.template getData< Devices::Cuda >(), + userData, + CoordinatesType( 0 ), + CoordinatesType( size ) - CoordinatesType( 1 ), + gridIdx.x ); + + } +#endif + } +}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL + + diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index bd748ed09..be4f41d31 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -21,40 +21,16 @@ #include #include #include + +#include "GridTraverserBenchmarkHelper.h" +#include "BenchmarkTraverserUserData.h" #include 
"cuda-kernels.h" namespace TNL { namespace Benchmarks { namespace Traversers { -template< typename TraverserUserData > -class WriteOneEntitiesProcessor -{ - public: - - using MeshType = typename TraverserUserData::MeshType; - using DeviceType = typename MeshType::DeviceType; - - template< typename GridEntity > - __cuda_callable__ - static inline void processEntity( const MeshType& mesh, - TraverserUserData& userData, - const GridEntity& entity ) - { - auto& u = userData.u.template modifyData< DeviceType >(); - u( entity ) += (typename MeshType::RealType) 1.0; - } -}; -template< typename MeshFunctionPointer > -class WriteOneUserData -{ - public: - - using MeshType = typename MeshFunctionPointer::ObjectType::MeshType; - - MeshFunctionPointer u; -}; template< int Dimension, typename Device, diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h index 22f1d6899..bdce2d746 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -28,102 +28,6 @@ namespace TNL { namespace Benchmarks { namespace Traversers { -template< typename Grid, - typename Device = typename Grid::DeviceType > -class GridTraverserBenchmarkHelper{}; - -template< typename Grid > -class GridTraverserBenchmarkHelper< Grid, Devices::Host > -{ - public: - - using GridType = Grid; - using GridPointer = Pointers::SharedPointer< Grid >; - using RealType = typename GridType::RealType; - using IndexType = typename GridType::IndexType; - using CoordinatesType = typename Grid::CoordinatesType; - using MeshFunction = Functions::MeshFunction< Grid >; - using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; - using Traverser = Meshes::Traverser< Grid, Cell >; - using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; - using 
WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; - - - static void noBCTraverserTest( const GridPointer& grid, - WriteOneTraverserUserDataType& userData, - std::size_t size ) - { - /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >( - grid, - CoordinatesType( 0 ), - grid->getDimensions() - CoordinatesType( 1 ), - userData );*/ - - const CoordinatesType begin( 0 ); - const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); - //MeshFunction* _u = &u.template modifyData< Device >(); - Cell entity( *grid ); - for( IndexType x = begin.x(); x <= end.x(); x ++ ) - { - entity.getCoordinates().x() = x; - entity.refresh(); - WriteOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity ); - } - - } -}; - -template< typename Grid > -class GridTraverserBenchmarkHelper< Grid, Devices::Cuda > -{ - public: - - using GridType = Grid; - using GridPointer = Pointers::SharedPointer< Grid >; - using RealType = typename GridType::RealType; - using IndexType = typename GridType::IndexType; - using CoordinatesType = typename Grid::CoordinatesType; - using MeshFunction = Functions::MeshFunction< Grid >; - using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; - using Traverser = Meshes::Traverser< Grid, Cell >; - using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; - using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; - - - static void noBCTraverserTest( const GridPointer& grid, - WriteOneTraverserUserDataType& userData, - std::size_t size ) - { -#ifdef HAVE_CUDA - dim3 blockSize( 256 ), blocksCount, gridsCount; - Devices::Cuda::setupThreads( - blockSize, - blocksCount, - gridsCount, - size ); - dim3 gridIdx; - for( gridIdx.x = 0; gridIdx.x < 
gridsCount.x; gridIdx.x++ ) - { - dim3 gridSize; - Devices::Cuda::setupGrid( - blocksCount, - gridsCount, - gridIdx, - gridSize ); - Meshes::GridTraverser1D< RealType, IndexType, Cell, WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > - <<< blocksCount, blockSize >>> - ( &grid.template getData< Devices::Cuda >(), - userData, - CoordinatesType( 0 ), - CoordinatesType( size ) - CoordinatesType( 1 ), - gridIdx.x ); - - } -#endif - } -}; template< typename Device, typename Real, @@ -140,13 +44,13 @@ class GridTraversersBenchmark< 1, Device, Real, Index > using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; using Traverser = Meshes::Traverser< Grid, Cell >; - using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; - using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; GridTraversersBenchmark( Index size ) :size( size ), v( size ), grid( size ), u( grid ) { - userData.u = this->u; + userData.u = &this->u.template modifyData< Device >(); v_data = v.getData(); } @@ -156,7 +60,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > u->getData().setValue( 0.0 ); }; - void writeOneUsingPureC() + void addOneUsingPureC() { if( std::is_same< Device, Devices::Host >::value ) { @@ -187,7 +91,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > } } - void writeOneUsingParallelFor() + void addOneUsingParallelFor() { auto f = [] __cuda_callable__ ( Index i, Real* data ) { @@ -196,7 +100,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } - void writeOneUsingParallelForAndGridEntity() + void addOneUsingParallelForAndGridEntity() { const Grid* currentGrid = 
&grid.template getData< Device >(); auto f = [=] __cuda_callable__ ( Index i, Real* data ) @@ -209,7 +113,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } - void writeOneUsingParallelForAndMeshFunction() + void addOneUsingParallelForAndMeshFunction() { const Grid* currentGrid = &grid.template getData< Device >(); MeshFunction* _u = &u.template modifyData< Device >(); @@ -224,7 +128,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > ParallelFor< Device >::exec( ( Index ) 0, size, f ); } - void writeOneUsingTraverser() + void addOneUsingTraverser() { using CoordinatesType = typename Grid::CoordinatesType; //traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > @@ -282,7 +186,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > void traverseUsingTraverser() { // TODO !!!!!!!!!!!!!!!!!!!!!! - traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > ( grid, userData ); } @@ -294,7 +198,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > GridPointer grid; MeshFunctionPointer u; Traverser traverser; - WriteOneTraverserUserDataType userData; + UserDataType userData; }; } // namespace Traversers diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h index 1296a9a46..6fb0e52d4 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h @@ -42,14 +42,13 @@ class GridTraversersBenchmark< 2, Device, Real, Index > using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; using Cell = typename Grid::template EntityType< 2, Meshes::GridEntityNoStencilStorage >; using Traverser = Meshes::Traverser< Grid, Cell >; - using TraverserUserData = 
WriteOneUserData< MeshFunctionPointer >; - using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; - using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; GridTraversersBenchmark( Index size ) :size( size ), v( size * size ), grid( size, size ), u( grid ) { - userData.u = this->u; + userData.u = &this->u.template modifyData< Device >(); v_data = v.getData(); } @@ -59,7 +58,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > u->getData().setValue( 0.0 ); }; - void writeOneUsingPureC() + void addOneUsingPureC() { if( std::is_same< Device, Devices::Host >::value ) { @@ -93,7 +92,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > } } - void writeOneUsingParallelFor() + void addOneUsingParallelFor() { Index _size = this->size; auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) @@ -108,7 +107,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > f, v.getData() ); } - void writeOneUsingParallelForAndGridEntity() + void addOneUsingParallelForAndGridEntity() { const Grid* currentGrid = &grid.template getData< Device >(); auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) @@ -127,7 +126,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > f, v.getData() ); } - void writeOneUsingParallelForAndMeshFunction() + void addOneUsingParallelForAndMeshFunction() { const Grid* currentGrid = &grid.template getData< Device >(); MeshFunction* _u = &u.template modifyData< Device >(); @@ -148,10 +147,10 @@ class GridTraversersBenchmark< 2, Device, Real, Index > } - void writeOneUsingTraverser() + void addOneUsingTraverser() { using CoordinatesType = typename Grid::CoordinatesType; - traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + traverser.template processAllEntities< 
UserDataType, AddOneEntitiesProcessorType > ( grid, userData ); /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >( @@ -232,7 +231,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > void traversingUsingTraverser() { // TODO !!!!!!!!!!!!!!!!!!!!!! - traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > ( grid, userData ); } @@ -244,7 +243,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > GridPointer grid; MeshFunctionPointer u; Traverser traverser; - WriteOneTraverserUserDataType userData; + UserDataType userData; }; } // namespace Traversers diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h index 35863a3c9..977809563 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h @@ -21,7 +21,10 @@ #include #include #include + #include "cuda-kernels.h" +#include "AddOneEntitiesProcessor.h" +#include "BenchmarkTraverserUserData.h" namespace TNL { namespace Benchmarks { @@ -42,17 +45,16 @@ class GridTraversersBenchmark< 3, Device, Real, Index > using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; using Cell = typename Grid::template EntityType< 3, Meshes::GridEntityNoStencilStorage >; using Traverser = Meshes::Traverser< Grid, Cell >; - using TraverserUserData = WriteOneUserData< MeshFunctionPointer >; - using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; - using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; - + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + GridTraversersBenchmark( Index size ) : size( size ), 
v( size * size * size ), grid( size, size, size ), u( grid ) { - userData.u = this->u; + userData.u = &this->u.template modifyData< Device >(); v_data = v.getData(); } @@ -62,7 +64,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > u->getData().setValue( 0.0 ); }; - void writeOneUsingPureC() + void addOneUsingPureC() { if( std::is_same< Device, Devices::Host >::value ) { @@ -99,7 +101,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > } } - void writeOneUsingParallelFor() + void addOneUsingParallelFor() { Index _size = this->size; auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) @@ -116,7 +118,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > f, v.getData() ); } - void writeOneUsingParallelForAndGridEntity() + void addOneUsingParallelForAndGridEntity() { const Grid* currentGrid = &grid.template getData< Device >(); auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) @@ -138,7 +140,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > f, v.getData() ); } - void writeOneUsingParallelForAndMeshFunction() + void addOneUsingParallelForAndMeshFunction() { const Grid* currentGrid = &grid.template getData< Device >(); MeshFunction* _u = &u.template modifyData< Device >(); @@ -162,9 +164,9 @@ class GridTraversersBenchmark< 3, Device, Real, Index > } - void writeOneUsingTraverser() + void addOneUsingTraverser() { - traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > ( grid, userData ); } @@ -240,7 +242,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > void traverseUsingTraverser() { // TODO !!!!!!!!!!!!!!!!!!!!!! 
- traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > ( grid, userData ); } @@ -252,7 +254,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > GridPointer grid; MeshFunctionPointer u; Traverser traverser; - WriteOneTraverserUserDataType userData; + UserDataType userData; }; } // namespace Traversers diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index ff6d25624..c6423e452 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -84,14 +84,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters, auto hostWriteOneUsingPureC = [&] () { - hostTraverserBenchmark.writeOneUsingPureC(); + hostTraverserBenchmark.addOneUsingPureC(); }; benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); #ifdef HAVE_CUDA auto cudaWriteOneUsingPureC = [&] () { - cudaTraverserBenchmark.writeOneUsingPureC(); + cudaTraverserBenchmark.addOneUsingPureC(); }; if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); @@ -107,14 +107,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters, auto hostWriteOneUsingParallelFor = [&] () { - hostTraverserBenchmark.writeOneUsingParallelFor(); + hostTraverserBenchmark.addOneUsingParallelFor(); }; benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA auto cudaWriteOneUsingParallelFor = [&] () { - cudaTraverserBenchmark.writeOneUsingParallelFor(); + cudaTraverserBenchmark.addOneUsingParallelFor(); }; if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); @@ -128,7 +128,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, { auto hostWriteOneUsingParallelForAndGridEntity = [&] () { - 
hostTraverserBenchmark.writeOneUsingParallelForAndGridEntity(); + hostTraverserBenchmark.addOneUsingParallelForAndGridEntity(); }; benchmark.setOperation( "par.for+grid ent.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity ); @@ -136,7 +136,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, #ifdef HAVE_CUDA auto cudaWriteOneUsingParallelForAndGridEntity = [&] () { - cudaTraverserBenchmark.writeOneUsingParallelForAndGridEntity(); + cudaTraverserBenchmark.addOneUsingParallelForAndGridEntity(); }; if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity ); @@ -150,7 +150,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, { auto hostWriteOneUsingParallelForAndMeshFunction = [&] () { - hostTraverserBenchmark.writeOneUsingParallelForAndMeshFunction(); + hostTraverserBenchmark.addOneUsingParallelForAndMeshFunction(); }; benchmark.setOperation( "par.for+mesh fc.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction ); @@ -158,7 +158,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, #ifdef HAVE_CUDA auto cudaWriteOneUsingParallelForAndMeshFunction = [&] () { - cudaTraverserBenchmark.writeOneUsingParallelForAndMeshFunction(); + cudaTraverserBenchmark.addOneUsingParallelForAndMeshFunction(); }; if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndMeshFunction ); @@ -174,14 +174,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters, benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); auto hostWriteOneUsingTraverser = [&] () { - hostTraverserBenchmark.writeOneUsingTraverser(); + 
hostTraverserBenchmark.addOneUsingTraverser(); }; benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser ); #ifdef HAVE_CUDA auto cudaWriteOneUsingTraverser = [&] () { - cudaTraverserBenchmark.writeOneUsingTraverser(); + cudaTraverserBenchmark.addOneUsingTraverser(); }; if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser ); @@ -254,13 +254,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters, */ auto hostTraverseUsingParallelFor = [&] () { - hostTraverserBenchmark.writeOneUsingParallelFor(); + hostTraverserBenchmark.addOneUsingParallelFor(); }; #ifdef HAVE_CUDA auto cudaTraverseUsingParallelFor = [&] () { - cudaTraverserBenchmark.writeOneUsingParallelFor(); + cudaTraverserBenchmark.addOneUsingParallelFor(); }; #endif @@ -286,13 +286,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters, */ auto hostTraverseUsingTraverser = [&] () { - hostTraverserBenchmark.writeOneUsingTraverser(); + hostTraverserBenchmark.addOneUsingTraverser(); }; #ifdef HAVE_CUDA auto cudaTraverseUsingTraverser = [&] () { - cudaTraverserBenchmark.writeOneUsingTraverser(); + cudaTraverserBenchmark.addOneUsingTraverser(); }; #endif -- GitLab From 64ae289e5062053d874fc5bb1a17c506abbffd2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 5 Jan 2019 13:06:30 +0100 Subject: [PATCH 055/130] Analyzing grid entity efficiency. 
--- src/Benchmarks/Benchmarks.h | 2 +- .../Traversers/BenchmarkTraverserUserData.h | 9 ++- .../Traversers/GridTraverserBenchmarkHelper.h | 30 ++++++---- .../Traversers/GridTraversersBenchmark_1D.h | 4 +- .../Traversers/GridTraversersBenchmark_2D.h | 4 +- .../Traversers/GridTraversersBenchmark_3D.h | 4 +- .../Traversers/tnl-benchmark-traversers.h | 57 ++++++++++++------- 7 files changed, 68 insertions(+), 42 deletions(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index f31e21f6c..355fb4671 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -74,7 +74,7 @@ public: { config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); config.addEntry< bool >( "reset", "Call reset function between loops.", true ); - config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 1 ); + config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 0.0 ); config.addEntry< bool >( "timing", "Turns off (or on) the timing (for the purpose of profiling).", true ); config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 ); } diff --git a/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h b/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h index 5a2f179fa..2ae00ec69 100644 --- a/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h +++ b/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h @@ -20,10 +20,17 @@ template< typename MeshFunction > class BenchmarkTraverserUserData { public: - + using MeshType = typename MeshFunction::MeshType; + using RealType = typename MeshType::RealType; + using DeviceType = typename MeshType::DeviceType; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + BenchmarkTraverserUserData( MeshFunctionPointer& f ) + : u( &f.template modifyData< DeviceType >() ), data( f->getData().getData() ){} + MeshFunction* u; + RealType* data; }; diff --git 
a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h index df43f93cd..8b00e060a 100644 --- a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h +++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h @@ -14,6 +14,7 @@ #include "AddOneEntitiesProcessor.h" #include "BenchmarkTraverserUserData.h" +#include "SimpleCell.h" namespace TNL { namespace Benchmarks { @@ -38,13 +39,16 @@ _GridTraverser1D( typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; typename GridType::CoordinatesType coordinates; + GridEntity entity;//( *grid, ); + //entity.getCoordinates().x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( coordinates <= end ) - { - GridEntity entity( *grid, coordinates ); - entity.refresh(); - ( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0; - //( *userData.u )( entity) += 1.0; + { + //entity.refresh(); + //( userData.u->getData() )[ entity.getIndex( coordinates ) ] += ( RealType ) 1.0; + //( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0; + userData.data[ coordinates.x() ] += ( RealType ) 1.0; + //( *userData.u )( entity ) += ( RealType ) 1.0; //EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); } } @@ -66,8 +70,9 @@ class GridTraverserBenchmarkHelper< Grid, Devices::Host > using CoordinatesType = typename Grid::CoordinatesType; using MeshFunction = Functions::MeshFunction< Grid >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; - using Traverser = Meshes::Traverser< Grid, Cell >; + using CellType = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = 
Meshes::Traverser< Grid, CellType >; using UserDataType = BenchmarkTraverserUserData< MeshFunction >; using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; @@ -84,13 +89,13 @@ class GridTraverserBenchmarkHelper< Grid, Devices::Host > const CoordinatesType begin( 0 ); const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); //MeshFunction* _u = &u.template modifyData< Device >(); - Cell entity( *grid ); + /*SimpleCellType entity( *grid ); for( IndexType x = begin.x(); x <= end.x(); x ++ ) { entity.getCoordinates().x() = x; entity.refresh(); AddOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity ); - } + }*/ } }; @@ -107,8 +112,9 @@ class GridTraverserBenchmarkHelper< Grid, Devices::Cuda > using CoordinatesType = typename Grid::CoordinatesType; using MeshFunction = Functions::MeshFunction< Grid >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; - using Traverser = Meshes::Traverser< Grid, Cell >; + using CellType = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< Grid, CellType >; using UserDataType = BenchmarkTraverserUserData< MeshFunction >; using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; @@ -132,7 +138,7 @@ class GridTraverserBenchmarkHelper< Grid, Devices::Cuda > gridsCount, gridIdx, gridSize ); - _GridTraverser1D< RealType, IndexType, Cell, UserDataType, AddOneEntitiesProcessorType > + _GridTraverser1D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType > <<< blocksCount, blockSize >>> ( &grid.template getData< Devices::Cuda >(), userData, diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h index bdce2d746..006b0316f 100644 --- 
a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -48,9 +48,9 @@ class GridTraversersBenchmark< 1, Device, Real, Index > using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; GridTraversersBenchmark( Index size ) - :size( size ), v( size ), grid( size ), u( grid ) + :size( size ), v( size ), grid( size ), u( grid ), + userData( this->u ) { - userData.u = &this->u.template modifyData< Device >(); v_data = v.getData(); } diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h index 6fb0e52d4..7c90a5064 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h @@ -46,9 +46,9 @@ class GridTraversersBenchmark< 2, Device, Real, Index > using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; GridTraversersBenchmark( Index size ) - :size( size ), v( size * size ), grid( size, size ), u( grid ) + :size( size ), v( size * size ), grid( size, size ), u( grid ), + userData( u ) { - userData.u = &this->u.template modifyData< Device >(); v_data = v.getData(); } diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h index 977809563..2a32184ea 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h @@ -52,9 +52,9 @@ class GridTraversersBenchmark< 3, Device, Real, Index > : size( size ), v( size * size * size ), grid( size, size, size ), - u( grid ) + u( grid ), + userData( u ) { - userData.u = &this->u.template modifyData< Device >(); v_data = v.getData(); } diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index c6423e452..2963bb792 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ 
b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -42,6 +42,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); const std::size_t minSize = parameters.getParameter< int >( "min-size" ); const std::size_t maxSize = parameters.getParameter< int >( "max-size" ); + const bool withHost = parameters.getParameter< bool >( "with-host" ); #ifdef HAVE_CUDA const bool withCuda = parameters.getParameter< bool >( "with-cuda" ); #else @@ -78,7 +79,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, /**** * Write one using C for */ - if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-pure-c" ) ) + if( tests.containsValue( "all" ) || tests.containsValue( "add-one-pure-c" ) ) { benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); @@ -86,7 +87,8 @@ bool runBenchmark( const Config::ParameterContainer& parameters, { hostTraverserBenchmark.addOneUsingPureC(); }; - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); + if( withHost ) + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); #ifdef HAVE_CUDA auto cudaWriteOneUsingPureC = [&] () @@ -101,7 +103,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, /**** * Write one using parallel for */ - if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for" ) ) + if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for" ) ) { benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); @@ -109,7 +111,8 @@ bool runBenchmark( const Config::ParameterContainer& parameters, { hostTraverserBenchmark.addOneUsingParallelFor(); }; - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); + if( withHost ) + benchmark.time< Devices::Host >( hostReset, "CPU", 
hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA auto cudaWriteOneUsingParallelFor = [&] () @@ -124,14 +127,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters, /**** * Write one using parallel for with grid entity */ - if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for-and-grid-entity" ) ) + if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for-and-grid-entity" ) ) { auto hostWriteOneUsingParallelForAndGridEntity = [&] () { hostTraverserBenchmark.addOneUsingParallelForAndGridEntity(); }; benchmark.setOperation( "par.for+grid ent.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity ); + if( withHost ) + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity ); #ifdef HAVE_CUDA auto cudaWriteOneUsingParallelForAndGridEntity = [&] () @@ -146,14 +150,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters, /**** * Write one using parallel for with mesh function */ - if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for-and-mesh-function" ) ) + if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for-and-mesh-function" ) ) { auto hostWriteOneUsingParallelForAndMeshFunction = [&] () { hostTraverserBenchmark.addOneUsingParallelForAndMeshFunction(); }; benchmark.setOperation( "par.for+mesh fc.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction ); + if( withHost ) + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction ); #ifdef HAVE_CUDA auto cudaWriteOneUsingParallelForAndMeshFunction = [&] () @@ -169,14 +174,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters, /**** * Write one using traverser */ - if( 
tests.containsValue( "all" ) || tests.containsValue( "no-bc-traverser" ) ) + if( tests.containsValue( "all" ) || tests.containsValue( "add-one-traverser" ) ) { benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); auto hostWriteOneUsingTraverser = [&] () { hostTraverserBenchmark.addOneUsingTraverser(); }; - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser ); + if( withHost ) + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser ); #ifdef HAVE_CUDA auto cudaWriteOneUsingTraverser = [&] () @@ -235,14 +241,16 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( tests.containsValue( "all" ) || tests.containsValue( "bc-pure-c" ) ) { benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC ); + if( withHost ) + benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC ); #ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingPureC ); #endif benchmark.setOperation( "Pure C RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC ); + if( withHost ) + benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC ); #ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingPureC ); @@ -267,14 +275,16 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) ) { benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor ); + if( withHost ) + benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor ); #ifdef 
HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor ); #endif benchmark.setOperation( "parallel for RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor ); + if( withHost ) + benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor ); #ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor ); @@ -299,13 +309,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( tests.containsValue( "all" ) || tests.containsValue( "bc-traverser" ) ) { benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser ); + if( withHost ) + benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser ); #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser ); #endif benchmark.setOperation( "traverser RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser ); + if( withHost ) + benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser ); #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser ); #endif @@ -318,17 +330,18 @@ void setupConfig( Config::ConfigDescription& config ) { config.addList< String >( "tests", "Tests to be performed.", "all" ); config.addEntryEnum( "all" ); - config.addEntryEnum( "no-bc-pure-c" ); - config.addEntryEnum( "no-bc-parallel-for" ); - config.addEntryEnum( "no-bc-parallel-for-and-grid-entity" ); - config.addEntryEnum( "no-bc-traverser" ); + config.addEntryEnum( "add-one-pure-c" ); + config.addEntryEnum( "add-one-parallel-for" ); + config.addEntryEnum( "add-one-parallel-for-and-grid-entity" ); + 
config.addEntryEnum( "add-one-traverser" ); config.addEntryEnum( "bc-pure-c" ); config.addEntryEnum( "bc-parallel-for" ); config.addEntryEnum( "bc-traverser" ); + config.addEntry< bool >( "with-host", "Perform CPU benchmarks.", true ); #ifdef HAVE_CUDA - config.addEntry< bool >( "with-cuda", "Perform even the CUDA benchmarks.", true ); + config.addEntry< bool >( "with-cuda", "Perform CUDA benchmarks.", true ); #else - config.addEntry< bool >( "with-cuda", "Perform even the CUDA benchmarks.", false ); + config.addEntry< bool >( "with-cuda", "Perform CUDA benchmarks.", false ); #endif config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-traversers.log"); config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); -- GitLab From cd5d21ac15929722ff459d89430b69b4e39bd9d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 5 Jan 2019 20:17:09 +0100 Subject: [PATCH 056/130] Implemented SimpleCell traverser benchmark test. 
--- .../Traversers/GridTraverserBenchmarkHelper.h | 32 ++++--- src/Benchmarks/Traversers/SimpleCell.h | 95 +++++++++++++++++++ 2 files changed, 113 insertions(+), 14 deletions(-) create mode 100644 src/Benchmarks/Traversers/SimpleCell.h diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h index 8b00e060a..c13ec3ab7 100644 --- a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h +++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h @@ -37,18 +37,19 @@ _GridTraverser1D( typedef Real RealType; typedef Index IndexType; typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; + //typename GridType::CoordinatesType coordinates; - GridEntity entity;//( *grid, ); - //entity.getCoordinates().x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - if( coordinates <= end ) + GridEntity entity( *grid ); + entity.getCoordinates().x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + //coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( entity.getCoordinates() <= end ) { - //entity.refresh(); + entity.refresh(); //( userData.u->getData() )[ entity.getIndex( coordinates ) ] += ( RealType ) 1.0; //( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0; - userData.data[ coordinates.x() ] += ( RealType ) 1.0; - //( *userData.u )( entity ) += ( RealType ) 1.0; + //userData.data[ entity.getIndex() ] += ( RealType ) 1.0; + //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0; + ( *userData.u )( entity ) += ( RealType ) 1.0; //EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); } } @@ -80,22 +81,25 @@ class 
GridTraverserBenchmarkHelper< Grid, Devices::Host > UserDataType& userData, std::size_t size ) { - /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >( + /*Meshes::GridTraverser< Grid >::template processEntities< CellType, AddOneEntitiesProcessorType, UserDataType, false >( grid, CoordinatesType( 0 ), grid->getDimensions() - CoordinatesType( 1 ), - userData );*/ - + userData ); + */ + const CoordinatesType begin( 0 ); const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); //MeshFunction* _u = &u.template modifyData< Device >(); - /*SimpleCellType entity( *grid ); + SimpleCellType entity( *grid ); for( IndexType x = begin.x(); x <= end.x(); x ++ ) { entity.getCoordinates().x() = x; entity.refresh(); - AddOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity ); - }*/ + //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0; + ( *userData.u )( entity ) += ( RealType ) 1.0; + //AddOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity ); + } } }; diff --git a/src/Benchmarks/Traversers/SimpleCell.h b/src/Benchmarks/Traversers/SimpleCell.h new file mode 100644 index 000000000..c70f64fda --- /dev/null +++ b/src/Benchmarks/Traversers/SimpleCell.h @@ -0,0 +1,95 @@ +/*************************************************************************** + SimpleCell.h - description + ------------------- + begin : Jan 5, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include +#include + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +template< typename Grid > +class SimpleCell{}; + +template< typename Real, + typename Device, + typename Index > +class SimpleCell< Meshes::Grid< 1, Real, Device, Index 
> > +{ + public: + using GridType = Meshes::Grid< 1, Real, Device, Index >; + using RealType = typename GridType::RealType; + using DeviceType = typename GridType::DeviceType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + + constexpr static int getEntityDimension() { return 1; }; + + __cuda_callable__ + SimpleCell( const GridType& grid ) : + grid( grid ){}; + + __cuda_callable__ + const GridType& getMesh() const { return this->grid;}; + + __cuda_callable__ + CoordinatesType& getCoordinates() { return this->coordinates; }; + + __cuda_callable__ + void refresh() {index = coordinates.x();}; + + __cuda_callable__ + const IndexType& getIndex() const { return this->index; }; + + protected: + const GridType& grid; + CoordinatesType coordinates; + IndexType index; +}; + +template< typename Real, + typename Device, + typename Index > +class SimpleCell< Meshes::Grid< 2, Real, Device, Index > > +{ + public: + using GridType = Meshes::Grid< 1, Real, Device, Index >; + using RealType = typename GridType::RealType; + using DeviceType = typename GridType::DeviceType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + + constexpr static int getEntityDimension() { return 2; }; + +}; + +template< typename Real, + typename Device, + typename Index > +class SimpleCell< Meshes::Grid< 3, Real, Device, Index > > +{ + public: + using GridType = Meshes::Grid< 1, Real, Device, Index >; + using RealType = typename GridType::RealType; + using DeviceType = typename GridType::DeviceType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + + constexpr static int getEntityDimension() { return 3; }; + +}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL -- GitLab From ce1886b6e9ab03d7960cb2ed4b14175c51f91f69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= 
Date: Sat, 5 Jan 2019 20:17:33 +0100 Subject: [PATCH 057/130] MeshFunction refactoring. --- src/TNL/Functions/MeshFunction.h | 103 +++++++++++++------------- src/TNL/Functions/MeshFunction_impl.h | 7 +- 2 files changed, 52 insertions(+), 58 deletions(-) diff --git a/src/TNL/Functions/MeshFunction.h b/src/TNL/Functions/MeshFunction.h index 4ccdab9f3..32d54ec21 100644 --- a/src/TNL/Functions/MeshFunction.h +++ b/src/TNL/Functions/MeshFunction.h @@ -20,7 +20,7 @@ namespace TNL { -namespace Functions { +namespace Functions { template< typename Mesh, int MeshEntityDimension = Mesh::getMeshDimension(), @@ -32,155 +32,152 @@ class MeshFunction : //static_assert( Mesh::DeviceType::DeviceType == Vector::DeviceType::DeviceType, // "Both mesh and vector of a mesh function must reside on the same device."); public: - + using MeshType = Mesh; using DeviceType = typename MeshType::DeviceType; using IndexType = typename MeshType::GlobalIndexType; - using MeshPointer = Pointers::SharedPointer< MeshType >; + using MeshPointer = Pointers::SharedPointer< MeshType >; using RealType = Real; using VectorType = Containers::Vector< RealType, DeviceType, IndexType >; using ThisType = Functions::MeshFunction< MeshType, MeshEntityDimension, RealType >; using DistributedMeshType = Meshes::DistributedMeshes::DistributedMesh; using DistributedMeshSynchronizerType = Meshes::DistributedMeshes::DistributedMeshSynchronizer; - + static constexpr int getEntitiesDimension() { return MeshEntityDimension; } - + static constexpr int getMeshDimension() { return MeshType::getMeshDimension(); } - + MeshFunction(); - - MeshFunction( const MeshPointer& meshPointer ); - + + MeshFunction( const MeshPointer& meshPointer ); + MeshFunction( const ThisType& meshFunction ); - + template< typename Vector > MeshFunction( const MeshPointer& meshPointer, Vector& data, - const IndexType& offset = 0 ); - - + const IndexType& offset = 0 ); + template< typename Vector > MeshFunction( const MeshPointer& meshPointer, 
Pointers::SharedPointer< Vector >& data, - const IndexType& offset = 0 ); - + const IndexType& offset = 0 ); + static String getType(); - + String getTypeVirtual() const; - + static String getSerializationType(); virtual String getSerializationTypeVirtual() const; - + static void configSetup( Config::ConfigDescription& config, const String& prefix = "" ); bool setup( const MeshPointer& meshPointer, const Config::ParameterContainer& parameters, const String& prefix = "" ); - + void bind( ThisType& meshFunction ); - + template< typename Vector > void bind( const Vector& data, const IndexType& offset = 0 ); - + template< typename Vector > void bind( const MeshPointer& meshPointer, const Vector& data, const IndexType& offset = 0 ); - + template< typename Vector > void bind( const MeshPointer& meshPointer, const Pointers::SharedPointer< Vector >& dataPtr, const IndexType& offset = 0 ); - + void setMesh( const MeshPointer& meshPointer ); - + template< typename Device = Devices::Host > __cuda_callable__ const MeshType& getMesh() const; - + const MeshPointer& getMeshPointer() const; - + static IndexType getDofs( const MeshPointer& meshPointer ); - - __cuda_callable__ const VectorType& getData() const; - + + __cuda_callable__ const VectorType& getData() const; + __cuda_callable__ VectorType& getData(); - + bool refresh( const RealType& time = 0.0 ) const; - + bool deepRefresh( const RealType& time = 0.0 ) const; - + template< typename EntityType > RealType getValue( const EntityType& meshEntity ) const; - + template< typename EntityType > void setValue( const EntityType& meshEntity, const RealType& value ); - + template< typename EntityType > __cuda_callable__ RealType& operator()( const EntityType& meshEntity, - const RealType& time = 0.0 ); - + const RealType& time = 0 ); + template< typename EntityType > __cuda_callable__ const RealType& operator()( const EntityType& meshEntity, - const RealType& time = 0.0 ) const; - + const RealType& time = 0 ) const; + 
__cuda_callable__ RealType& operator[]( const IndexType& meshEntityIndex ); - __cuda_callable__ const RealType& operator[]( const IndexType& meshEntityIndex ) const; template< typename Function > ThisType& operator = ( const Function& f ); - + template< typename Function > ThisType& operator -= ( const Function& f ); template< typename Function > ThisType& operator += ( const Function& f ); - + RealType getLpNorm( const RealType& p ) const; - + RealType getMaxNorm() const; - + bool save( File& file ) const; bool load( File& file ); - + bool boundLoad( File& file ); - + bool write( const String& fileName, const String& format = "vtk", const double& scale = 1.0 ) const; - + using Object::save; - + using Object::load; - + using Object::boundLoad; template< typename CommunicatorType, typename PeriodicBoundariesMaskType = MeshFunction< Mesh, MeshEntityDimension, bool > > void synchronize( bool withPeriodicBoundaryConditions = false, const Pointers::SharedPointer< PeriodicBoundariesMaskType, DeviceType >& mask = - Pointers::SharedPointer< PeriodicBoundariesMaskType, DeviceType >( nullptr ) ); + Pointers::SharedPointer< PeriodicBoundariesMaskType, DeviceType >( nullptr ) ); - protected: //DistributedMeshSynchronizerType synchronizer; Meshes::DistributedMeshes::DistributedMeshSynchronizer< Functions::MeshFunction< MeshType, MeshEntityDimension, RealType > > synchronizer; - + MeshPointer meshPointer; - + VectorType data; - + template< typename, typename > friend class MeshFunctionEvaluator; private: diff --git a/src/TNL/Functions/MeshFunction_impl.h b/src/TNL/Functions/MeshFunction_impl.h index 49b75d52f..16d17914d 100644 --- a/src/TNL/Functions/MeshFunction_impl.h +++ b/src/TNL/Functions/MeshFunction_impl.h @@ -19,7 +19,7 @@ #pragma once namespace TNL { -namespace Functions { + namespace Functions { template< typename Mesh, int MeshEntityDimension, @@ -48,7 +48,6 @@ template< typename Mesh, MeshFunction< Mesh, MeshEntityDimension, Real >:: MeshFunction( const ThisType& 
meshFunction ) { - setupSynchronizer(meshFunction.meshPointer->getDistributedMesh()); this->meshPointer=meshFunction.meshPointer; @@ -241,7 +240,6 @@ bind( const MeshPointer& meshPointer, this->data.bind( *data, offset, getMesh().template getEntitiesCount< typename Mesh::template EntityType< MeshEntityDimension > >() ); } - template< typename Mesh, int MeshEntityDimension, typename Real > @@ -578,7 +576,6 @@ operator << ( std::ostream& str, const MeshFunction< Mesh, MeshEntityDimension, return str; } - -} // namespace Functions + } // namespace Functions } // namespace TNL -- GitLab From cd43ce96b8415b188ae1e18fe3dba6f16fe09f8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 5 Jan 2019 22:21:57 +0100 Subject: [PATCH 058/130] Added asynchronous mode to ParallelFor. --- src/TNL/ParallelFor.h | 46 ++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/src/TNL/ParallelFor.h b/src/TNL/ParallelFor.h index 0505aac23..c27eda393 100644 --- a/src/TNL/ParallelFor.h +++ b/src/TNL/ParallelFor.h @@ -15,7 +15,7 @@ #include #include -/* +/**** * The implementation of ParallelFor is not meant to provide maximum performance * at every cost, but maximum flexibility for operating with data stored on the * device. 
@@ -28,7 +28,10 @@ namespace TNL { -template< typename Device = Devices::Host > +enum ParallelForMode { SynchronousMode, AsynchronousMode }; + +template< typename Device = Devices::Host, + ParallelForMode Mode = SynchronousMode > struct ParallelFor { template< typename Index, @@ -55,7 +58,8 @@ struct ParallelFor } }; -template< typename Device = Devices::Host > +template< typename Device = Devices::Host, + ParallelForMode Mode = SynchronousMode > struct ParallelFor2D { template< typename Index, @@ -86,7 +90,8 @@ struct ParallelFor2D } }; -template< typename Device = Devices::Host > +template< typename Device = Devices::Host, + ParallelForMode Mode = SynchronousMode > struct ParallelFor3D { template< typename Index, @@ -185,8 +190,8 @@ ParallelFor3DKernel( Index startX, Index startY, Index startZ, Index endX, Index } #endif -template<> -struct ParallelFor< Devices::Cuda > +template< ParallelForMode Mode > +struct ParallelFor< Devices::Cuda, Mode > { template< typename Index, typename Function, @@ -208,8 +213,11 @@ struct ParallelFor< Devices::Cuda > ParallelForKernel< true ><<< gridSize, blockSize >>>( start, end, f, args... ); } - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; + if( Mode == SynchronousMode ) + { + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + } } #else throw Exceptions::CudaSupportMissing(); @@ -217,8 +225,8 @@ struct ParallelFor< Devices::Cuda > } }; -template<> -struct ParallelFor2D< Devices::Cuda > +template< ParallelForMode Mode > +struct ParallelFor2D< Devices::Cuda, Mode > { template< typename Index, typename Function, @@ -264,8 +272,11 @@ struct ParallelFor2D< Devices::Cuda > ParallelFor2DKernel< true, true ><<< gridSize, blockSize >>> ( startX, startY, endX, endY, f, args... 
); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; + if( Mode == SynchronousMode ) + { + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + } } #else throw Exceptions::CudaSupportMissing(); @@ -273,8 +284,8 @@ struct ParallelFor2D< Devices::Cuda > } }; -template<> -struct ParallelFor3D< Devices::Cuda > +template< ParallelForMode Mode > +struct ParallelFor3D< Devices::Cuda, Mode > { template< typename Index, typename Function, @@ -343,8 +354,11 @@ struct ParallelFor3D< Devices::Cuda > ParallelFor3DKernel< true, true, true ><<< gridSize, blockSize >>> ( startX, startY, startZ, endX, endY, endZ, f, args... ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; + if( Mode == SynchronousMode ) + { + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + } } #else throw Exceptions::CudaSupportMissing(); -- GitLab From 239f6a75c969c9f6be4eeb1849098b08df0b280e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 5 Jan 2019 22:22:16 +0100 Subject: [PATCH 059/130] Traversers benchmark is using asynchronous parallel for. 
--- .../Traversers/GridTraversersBenchmark_1D.h | 6 +-- .../Traversers/GridTraversersBenchmark_2D.h | 33 +++++++------- .../Traversers/GridTraversersBenchmark_3D.h | 45 ++++++++++--------- 3 files changed, 45 insertions(+), 39 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h index 006b0316f..41391d625 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -97,7 +97,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > { data[ i ] += (Real) 1.0; }; - ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); + ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() ); } void addOneUsingParallelForAndGridEntity() @@ -110,7 +110,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > entity.refresh(); data[ entity.getIndex() ] += (Real) 1.0; }; - ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); + ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() ); } void addOneUsingParallelForAndMeshFunction() @@ -125,7 +125,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > ( *_u )( entity ) += (Real) 1.0; //WriteOneEntitiesProcessorType::processEntity( *currentGrid, userData, entity ); }; - ParallelFor< Device >::exec( ( Index ) 0, size, f ); + ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f ); } void addOneUsingTraverser() diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h index 7c90a5064..1da182a54 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h @@ -100,11 +100,12 @@ class GridTraversersBenchmark< 2, Device, Real, Index > data[ j * _size + i ] += (Real) 1.0; }; - ParallelFor2D< Device >::exec( ( Index ) 0, - ( Index ) 0, - this->size, - 
this->size, - f, v.getData() ); + ParallelFor2D< Device, AsynchronousMode >::exec( + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + f, v.getData() ); } void addOneUsingParallelForAndGridEntity() @@ -119,11 +120,12 @@ class GridTraversersBenchmark< 2, Device, Real, Index > data[ entity.getIndex() ] += (Real) 1.0; }; - ParallelFor2D< Device >::exec( ( Index ) 0, - ( Index ) 0, - this->size, - this->size, - f, v.getData() ); + ParallelFor2D< Device, AsynchronousMode >::exec( + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + f, v.getData() ); } void addOneUsingParallelForAndMeshFunction() @@ -139,11 +141,12 @@ class GridTraversersBenchmark< 2, Device, Real, Index > ( *_u )( entity ) += (Real) 1.0; }; - ParallelFor2D< Device >::exec( ( Index ) 0, - ( Index ) 0, - this->size, - this->size, - f, v.getData() ); + ParallelFor2D< Device, AsynchronousMode >::exec( + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + f, v.getData() ); } diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h index 2a32184ea..858a4d1db 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h @@ -109,13 +109,14 @@ class GridTraversersBenchmark< 3, Device, Real, Index > data[ ( k * _size + j ) * _size + i ] += (Real) 1.0; }; - ParallelFor3D< Device >::exec( ( Index ) 0, - ( Index ) 0, - ( Index ) 0, - this->size, - this->size, - this->size, - f, v.getData() ); + ParallelFor3D< Device, AsynchronousMode >::exec( + ( Index ) 0, + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + this->size, + f, v.getData() ); } void addOneUsingParallelForAndGridEntity() @@ -131,13 +132,14 @@ class GridTraversersBenchmark< 3, Device, Real, Index > data[ entity.getIndex() ] += (Real) 1.0; }; - ParallelFor3D< Device >::exec( ( Index ) 0, - ( Index ) 0, - ( Index ) 0, - this->size, - this->size, - this->size, - f, v.getData() ); + ParallelFor3D< 
Device, AsynchronousMode >::exec( + ( Index ) 0, + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + this->size, + f, v.getData() ); } void addOneUsingParallelForAndMeshFunction() @@ -154,13 +156,14 @@ class GridTraversersBenchmark< 3, Device, Real, Index > ( *_u )( entity ) += (Real) 1.0; }; - ParallelFor3D< Device >::exec( ( Index ) 0, - ( Index ) 0, - ( Index ) 0, - this->size, - this->size, - this->size, - f, v.getData() ); + ParallelFor3D< Device, AsynchronousMode >::exec( + ( Index ) 0, + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + this->size, + f, v.getData() ); } -- GitLab From 3910154b5ca25f0275cc700550c274daba38c786 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sun, 6 Jan 2019 15:50:51 +0100 Subject: [PATCH 060/130] Added simple cell test. --- .../Traversers/GridTraverserBenchmarkHelper.h | 136 +-------------- .../GridTraverserBenchmarkHelper_1D.h | 154 +++++++++++++++++ .../GridTraverserBenchmarkHelper_2D.h | 152 +++++++++++++++++ .../GridTraverserBenchmarkHelper_3D.h | 156 ++++++++++++++++++ .../Traversers/GridTraversersBenchmark_1D.h | 44 ++--- .../Traversers/GridTraversersBenchmark_2D.h | 38 +++-- .../Traversers/GridTraversersBenchmark_3D.h | 36 ++-- src/Benchmarks/Traversers/SimpleCell.h | 57 ++++++- .../Traversers/tnl-benchmark-traversers.h | 24 +-- 9 files changed, 602 insertions(+), 195 deletions(-) create mode 100644 src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h create mode 100644 src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h create mode 100644 src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h index c13ec3ab7..6da7ec09b 100644 --- a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h +++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h @@ -20,143 +20,15 @@ namespace TNL { namespace Benchmarks { namespace Traversers 
{ -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor > -__global__ void -_GridTraverser1D( - const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid, - UserData userData, - const typename GridEntity::CoordinatesType begin, - const typename GridEntity::CoordinatesType end, - const Index gridIdx ) -{ - typedef Real RealType; - typedef Index IndexType; - typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; - //typename GridType::CoordinatesType coordinates; - - GridEntity entity( *grid ); - entity.getCoordinates().x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - //coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - if( entity.getCoordinates() <= end ) - { - entity.refresh(); - //( userData.u->getData() )[ entity.getIndex( coordinates ) ] += ( RealType ) 1.0; - //( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0; - //userData.data[ entity.getIndex() ] += ( RealType ) 1.0; - //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0; - ( *userData.u )( entity ) += ( RealType ) 1.0; - //EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } -} -#endif - -template< typename Grid, - typename Device = typename Grid::DeviceType > -class GridTraverserBenchmarkHelper{}; - template< typename Grid > -class GridTraverserBenchmarkHelper< Grid, Devices::Host > -{ - public: - - using GridType = Grid; - using GridPointer = Pointers::SharedPointer< Grid >; - using RealType = typename GridType::RealType; - using IndexType = typename GridType::IndexType; - using CoordinatesType = typename Grid::CoordinatesType; - using MeshFunction = Functions::MeshFunction< Grid >; - using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using CellType = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage 
>; - using SimpleCellType = SimpleCell< GridType >; - using Traverser = Meshes::Traverser< Grid, CellType >; - using UserDataType = BenchmarkTraverserUserData< MeshFunction >; - using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; - - static void noBCTraverserTest( const GridPointer& grid, - UserDataType& userData, - std::size_t size ) - { - /*Meshes::GridTraverser< Grid >::template processEntities< CellType, AddOneEntitiesProcessorType, UserDataType, false >( - grid, - CoordinatesType( 0 ), - grid->getDimensions() - CoordinatesType( 1 ), - userData ); - */ - - const CoordinatesType begin( 0 ); - const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); - //MeshFunction* _u = &u.template modifyData< Device >(); - SimpleCellType entity( *grid ); - for( IndexType x = begin.x(); x <= end.x(); x ++ ) - { - entity.getCoordinates().x() = x; - entity.refresh(); - //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0; - ( *userData.u )( entity ) += ( RealType ) 1.0; - //AddOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity ); - } - - } -}; - -template< typename Grid > -class GridTraverserBenchmarkHelper< Grid, Devices::Cuda > -{ - public: - - using GridType = Grid; - using GridPointer = Pointers::SharedPointer< Grid >; - using RealType = typename GridType::RealType; - using IndexType = typename GridType::IndexType; - using CoordinatesType = typename Grid::CoordinatesType; - using MeshFunction = Functions::MeshFunction< Grid >; - using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using CellType = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; - using SimpleCellType = SimpleCell< GridType >; - using Traverser = Meshes::Traverser< Grid, CellType >; - using UserDataType = BenchmarkTraverserUserData< MeshFunction >; - using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; - - static void noBCTraverserTest( const GridPointer& grid, 
- UserDataType& userData, - std::size_t size ) - { -#ifdef HAVE_CUDA - dim3 blockSize( 256 ), blocksCount, gridsCount; - Devices::Cuda::setupThreads( - blockSize, - blocksCount, - gridsCount, - size ); - dim3 gridIdx; - for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) - { - dim3 gridSize; - Devices::Cuda::setupGrid( - blocksCount, - gridsCount, - gridIdx, - gridSize ); - _GridTraverser1D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType > - <<< blocksCount, blockSize >>> - ( &grid.template getData< Devices::Cuda >(), - userData, - CoordinatesType( 0 ), - CoordinatesType( size ) - CoordinatesType( 1 ), - gridIdx.x ); +class GridTraverserBenchmarkHelper{}; - } -#endif - } -}; } // namespace Traversers } // namespace Benchmarks } // namespace TNL +#include "GridTraverserBenchmarkHelper_1D.h" +#include "GridTraverserBenchmarkHelper_2D.h" +#include "GridTraverserBenchmarkHelper_3D.h" diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h new file mode 100644 index 000000000..e460a8bca --- /dev/null +++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h @@ -0,0 +1,154 @@ +/*************************************************************************** + GridTraversersBenchmarkHelper_1D.h - description + ------------------- + begin : Jan 6, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include "GridTraverserBenchmarkHelper.h" +#include "AddOneEntitiesProcessor.h" +#include "BenchmarkTraverserUserData.h" +#include "SimpleCell.h" + + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +#ifdef HAVE_CUDA +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor 
> +__global__ void +_GridTraverser1D( + const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid, + UserData userData, + const typename GridEntity::CoordinatesType begin, + const typename GridEntity::CoordinatesType end, + const Index gridIdx ) +{ + typedef Real RealType; + typedef Index IndexType; + typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; + //typename GridType::CoordinatesType coordinates; + + GridEntity entity( *grid ); + entity.getCoordinates().x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + //coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( entity.getCoordinates() <= end ) + { + entity.refresh(); + //( userData.u->getData() )[ entity.getIndex( coordinates ) ] += ( RealType ) 1.0; + //( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0; + //userData.data[ entity.getIndex() ] += ( RealType ) 1.0; + //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0; + ( *userData.u )( entity ) += ( RealType ) 1.0; + //EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +} +#endif + +template< typename Real, + typename Index > +class GridTraverserBenchmarkHelper< Meshes::Grid< 1, Real, Devices::Host, Index > > +{ + public: + + constexpr static int Dimension = 1; + using GridType = Meshes::Grid< Dimension, Real, Devices::Host, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using RealType = typename GridType::RealType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< 
GridType, CellType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + + static void simpleCellTest( const GridPointer& grid, + UserDataType& userData, + std::size_t size ) + { + const CoordinatesType begin( 0 ); + const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); + SimpleCellType entity( *grid ); + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0; + ( *userData.u )( entity ) += ( RealType ) 1.0; + } + + } +}; + +template< typename Real, + typename Index > +class GridTraverserBenchmarkHelper< Meshes::Grid< 1, Real, Devices::Cuda, Index > > +{ + public: + + constexpr static int Dimension = 1; + using GridType = Meshes::Grid< Dimension, Real, Devices::Cuda, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using RealType = typename GridType::RealType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + + static void simpleCellTest( const GridPointer& grid, + UserDataType& userData, + std::size_t size ) + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size ); + dim3 gridIdx; + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; 
gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + _GridTraverser1D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType > + <<< blocksCount, blockSize >>> + ( &grid.template getData< Devices::Cuda >(), + userData, + CoordinatesType( 0 ), + CoordinatesType( size ) - CoordinatesType( 1 ), + gridIdx.x ); + + } +#endif + } +}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h new file mode 100644 index 000000000..eca6c7fee --- /dev/null +++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h @@ -0,0 +1,152 @@ +/*************************************************************************** + GridTraversersBenchmarkHelper_2D.h - description + ------------------- + begin : Jan 6, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include "GridTraverserBenchmarkHelper.h" +#include "AddOneEntitiesProcessor.h" +#include "BenchmarkTraverserUserData.h" +#include "SimpleCell.h" + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +#ifdef HAVE_CUDA +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor > +__global__ void +_GridTraverser2D( + const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, + UserData userData, + const typename GridEntity::CoordinatesType begin, + const typename GridEntity::CoordinatesType end, + const dim3 gridIdx ) +{ + typedef Real RealType; + typedef Index IndexType; + typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; + + GridEntity entity( *grid ); + entity.getCoordinates().x() = 
begin.x() + ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + entity.getCoordinates().y() = begin.y() + ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + if( entity.getCoordinates() <= end ) + { + entity.refresh(); + ( *userData.u )( entity ) += ( RealType ) 1.0; + } +} +#endif + +template< typename Real, + typename Index > +class GridTraverserBenchmarkHelper< Meshes::Grid< 2, Real, Devices::Host, Index > > +{ + public: + + constexpr static int Dimension = 2; + using GridType = Meshes::Grid< Dimension, Real, Devices::Host, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using RealType = typename GridType::RealType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + + static void simpleCellTest( const GridPointer& grid, + UserDataType& userData, + std::size_t size ) + { + const CoordinatesType begin( 0 ); + const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); + SimpleCellType entity( *grid ); + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y()++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0; + ( *userData.u )( entity ) += ( RealType ) 1.0; + } + + } +}; + 
+template< typename Real, + typename Index > +class GridTraverserBenchmarkHelper< Meshes::Grid< 2, Real, Devices::Cuda, Index > > +{ + public: + + constexpr static int Dimension = 2; + using GridType = Meshes::Grid< Dimension, Real, Devices::Cuda, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using RealType = typename GridType::RealType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + + static void simpleCellTest( const GridPointer& grid, + UserDataType& userData, + std::size_t size ) + { +#ifdef HAVE_CUDA + dim3 blockSize( 16, 16 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size, + size ); + dim3 gridIdx; + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + _GridTraverser2D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType > + <<< blocksCount, blockSize >>> + ( &grid.template getData< Devices::Cuda >(), + userData, + CoordinatesType( 0 ), + CoordinatesType( size ) - CoordinatesType( 1 ), + gridIdx.x ); + } +#endif + } +}; + + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h 
b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h new file mode 100644 index 000000000..4a5da6fd4 --- /dev/null +++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h @@ -0,0 +1,156 @@ +/*************************************************************************** + GridTraversersBenchmarkHelper_3D.h - description + ------------------- + begin : Jan 6, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include "GridTraverserBenchmarkHelper.h" +#include "AddOneEntitiesProcessor.h" +#include "BenchmarkTraverserUserData.h" +#include "SimpleCell.h" + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +#ifdef HAVE_CUDA +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor > +__global__ void +_GridTraverser3D( + const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, + UserData userData, + const typename GridEntity::CoordinatesType begin, + const typename GridEntity::CoordinatesType end, + const dim3 gridIdx ) +{ + typedef Real RealType; + typedef Index IndexType; + typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; + + GridEntity entity( *grid ); + entity.getCoordinates().x() = begin.x() + ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + entity.getCoordinates().y() = begin.y() + ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + entity.getCoordinates().z() = begin.z() + ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; + + if( entity.getCoordinates() <= end ) + { + entity.refresh(); + ( *userData.u )( entity ) += ( RealType ) 1.0; + } +} +#endif + +template< typename Real, + typename Index > +class 
GridTraverserBenchmarkHelper< Meshes::Grid< 3, Real, Devices::Host, Index > > +{ + public: + + constexpr static int Dimension = 3; + using GridType = Meshes::Grid< Dimension, Real, Devices::Host, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using RealType = typename GridType::RealType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + + static void simpleCellTest( const GridPointer& grid, + UserDataType& userData, + std::size_t size ) + { + const CoordinatesType begin( 0 ); + const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); + SimpleCellType entity( *grid ); + for( entity.getCoordinates().z() = begin.z(); + entity.getCoordinates().z() <= end.z(); + entity.getCoordinates().z()++ ) + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y()++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + ( *userData.u )( entity ) += ( RealType ) 1.0; + } + } +}; + +template< typename Real, + typename Index > +class GridTraverserBenchmarkHelper< Meshes::Grid< 3, Real, Devices::Cuda, Index > > +{ + public: + + constexpr static int Dimension = 3; + using GridType = Meshes::Grid< Dimension, Real, Devices::Cuda, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using RealType = typename GridType::RealType; 
+ using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + + static void simpleCellTest( const GridPointer& grid, + UserDataType& userData, + std::size_t size ) + { +#ifdef HAVE_CUDA + dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size, + size, + size ); + dim3 gridIdx; + for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ ) + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + _GridTraverser3D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType > + <<< blocksCount, blockSize >>> + ( &grid.template getData< Devices::Cuda >(), + userData, + CoordinatesType( 0 ), + CoordinatesType( size ) - CoordinatesType( 1 ), + gridIdx.x ); + } +#endif + } +}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h index 41391d625..145f42ca9 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -23,6 +23,7 @@ #include #include "cuda-kernels.h" #include "GridTraversersBenchmark.h" +#include "SimpleCell.h" namespace TNL { namespace 
Benchmarks { @@ -37,13 +38,14 @@ class GridTraversersBenchmark< 1, Device, Real, Index > public: using Vector = Containers::Vector< Real, Device, Index >; - using Grid = Meshes::Grid< 1, Real, Device, Index >; - using GridPointer = Pointers::SharedPointer< Grid >; - using Coordinates = typename Grid::CoordinatesType; - using MeshFunction = Functions::MeshFunction< Grid >; + using GridType = Meshes::Grid< 1, Real, Device, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using Coordinates = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; - using Traverser = Meshes::Traverser< Grid, Cell >; + using CellType = typename GridType::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; using UserDataType = BenchmarkTraverserUserData< MeshFunction >; using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; @@ -100,44 +102,48 @@ class GridTraversersBenchmark< 1, Device, Real, Index > ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() ); } - void addOneUsingParallelForAndGridEntity() + void addOneUsingSimpleCell() { - const Grid* currentGrid = &grid.template getData< Device >(); + /*const GridType* currentGrid = &grid.template getData< Device >(); auto f = [=] __cuda_callable__ ( Index i, Real* data ) { - Cell entity( *currentGrid ); + SimpleCellType entity( *currentGrid ); entity.getCoordinates().x() = i; entity.refresh(); data[ entity.getIndex() ] += (Real) 1.0; }; - ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() ); + ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() );*/ + GridTraverserBenchmarkHelper< GridType 
>::simpleCellTest( + grid, + userData, + size ); } void addOneUsingParallelForAndMeshFunction() { - const Grid* currentGrid = &grid.template getData< Device >(); + const GridType* currentGrid = &grid.template getData< Device >(); MeshFunction* _u = &u.template modifyData< Device >(); auto f = [=] __cuda_callable__ ( Index i ) { - Cell entity( *currentGrid ); + SimpleCellType entity( *currentGrid ); entity.getCoordinates().x() = i; entity.refresh(); - ( *_u )( entity ) += (Real) 1.0; - //WriteOneEntitiesProcessorType::processEntity( *currentGrid, userData, entity ); + _u->getData().getData()[ entity.getIndex() ] += (Real) 1.0; + // ( *_u )( entity ) += (Real) 1.0; }; ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f ); } void addOneUsingTraverser() { - using CoordinatesType = typename Grid::CoordinatesType; - //traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > - // ( grid, userData ); + using CoordinatesType = typename GridType::CoordinatesType; + traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > + ( grid, userData ); - GridTraverserBenchmarkHelper< Grid >::noBCTraverserTest( + /*GridTraverserBenchmarkHelper< GridType >::noBCTraverserTest( grid, userData, - size ); + size );*/ } void traverseUsingPureC() diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h index 1da182a54..66462eb1a 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h @@ -22,6 +22,7 @@ #include #include #include "cuda-kernels.h" +#include "SimpleCell.h" namespace TNL { namespace Benchmarks { @@ -35,13 +36,14 @@ class GridTraversersBenchmark< 2, Device, Real, Index > public: using Vector = Containers::Vector< Real, Device, Index >; - using Grid = Meshes::Grid< 2, Real, Device, Index >; - using GridPointer = Pointers::SharedPointer< Grid >; - using 
Coordinates = typename Grid::CoordinatesType; - using MeshFunction = Functions::MeshFunction< Grid >; + using GridType = Meshes::Grid< 2, Real, Device, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using Coordinates = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::template EntityType< 2, Meshes::GridEntityNoStencilStorage >; - using Traverser = Meshes::Traverser< Grid, Cell >; + using CellType = typename GridType::template EntityType< 2, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; using UserDataType = BenchmarkTraverserUserData< MeshFunction >; using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; @@ -108,12 +110,12 @@ class GridTraversersBenchmark< 2, Device, Real, Index > f, v.getData() ); } - void addOneUsingParallelForAndGridEntity() + void addOneUsingSimpleCell() { - const Grid* currentGrid = &grid.template getData< Device >(); + /*const GridType* currentGrid = &grid.template getData< Device >(); auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) { - Cell entity( *currentGrid ); + SimpleCellType entity( *currentGrid ); entity.getCoordinates().x() = i; entity.getCoordinates().y() = j; entity.refresh(); @@ -125,20 +127,26 @@ class GridTraversersBenchmark< 2, Device, Real, Index > ( Index ) 0, this->size, this->size, - f, v.getData() ); + f, v.getData() );*/ + GridTraverserBenchmarkHelper< GridType >::simpleCellTest( + grid, + userData, + size ); + } void addOneUsingParallelForAndMeshFunction() { - const Grid* currentGrid = &grid.template getData< Device >(); + const GridType* currentGrid = &grid.template getData< Device >(); MeshFunction* _u = &u.template modifyData< Device >(); auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) { - Cell 
entity( *currentGrid ); + SimpleCellType entity( *currentGrid ); entity.getCoordinates().x() = i; entity.getCoordinates().y() = j; entity.refresh(); - ( *_u )( entity ) += (Real) 1.0; + //( *_u )( entity ) += (Real) 1.0; + _u->getData().getData()[ entity.getIndex() ] += (Real) 1.0; }; ParallelFor2D< Device, AsynchronousMode >::exec( @@ -152,7 +160,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > void addOneUsingTraverser() { - using CoordinatesType = typename Grid::CoordinatesType; + using CoordinatesType = typename GridType::CoordinatesType; traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > ( grid, userData ); @@ -197,7 +205,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > else // Device == Devices::Cuda { #ifdef HAVE_CUDA - dim3 blockSize( 256 ), blocksCount, gridsCount; + dim3 blockSize( 32, 8 ), blocksCount, gridsCount; Devices::Cuda::setupThreads( blockSize, blocksCount, diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h index 858a4d1db..b6f9bd4e1 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h @@ -25,6 +25,7 @@ #include "cuda-kernels.h" #include "AddOneEntitiesProcessor.h" #include "BenchmarkTraverserUserData.h" +#include "SimpleCell.h" namespace TNL { namespace Benchmarks { @@ -38,13 +39,14 @@ class GridTraversersBenchmark< 3, Device, Real, Index > public: using Vector = Containers::Vector< Real, Device, Index >; - using Grid = Meshes::Grid< 3, Real, Device, Index >; - using GridPointer = Pointers::SharedPointer< Grid >; - using Coordinates = typename Grid::CoordinatesType; - using MeshFunction = Functions::MeshFunction< Grid >; + using GridType = Meshes::Grid< 3, Real, Device, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using Coordinates = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< 
GridType >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::template EntityType< 3, Meshes::GridEntityNoStencilStorage >; - using Traverser = Meshes::Traverser< Grid, Cell >; + using CellType = typename GridType::template EntityType< 3, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; using UserDataType = BenchmarkTraverserUserData< MeshFunction >; using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; @@ -119,12 +121,12 @@ class GridTraversersBenchmark< 3, Device, Real, Index > f, v.getData() ); } - void addOneUsingParallelForAndGridEntity() + void addOneUsingSimpleCell() { - const Grid* currentGrid = &grid.template getData< Device >(); + /*const GridType* currentGrid = &grid.template getData< Device >(); auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) { - Cell entity( *currentGrid ); + SimpleCellType entity( *currentGrid ); entity.getCoordinates().x() = i; entity.getCoordinates().y() = j; entity.getCoordinates().z() = k; @@ -139,21 +141,27 @@ class GridTraversersBenchmark< 3, Device, Real, Index > this->size, this->size, this->size, - f, v.getData() ); + f, v.getData() );*/ + GridTraverserBenchmarkHelper< GridType >::simpleCellTest( + grid, + userData, + size ); + } void addOneUsingParallelForAndMeshFunction() { - const Grid* currentGrid = &grid.template getData< Device >(); + const GridType* currentGrid = &grid.template getData< Device >(); MeshFunction* _u = &u.template modifyData< Device >(); auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) { - Cell entity( *currentGrid ); + SimpleCellType entity( *currentGrid ); entity.getCoordinates().x() = i; entity.getCoordinates().y() = j; entity.getCoordinates().z() = k; entity.refresh(); - ( *_u )( entity ) += (Real) 1.0; + //( *_u )( entity ) += (Real) 1.0; + _u->getData().getData()[ entity.getIndex() 
] += (Real) 1.0; }; ParallelFor3D< Device, AsynchronousMode >::exec( @@ -205,7 +213,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > else // Device == Devices::Cuda { #ifdef HAVE_CUDA - dim3 blockSize( 256 ), blocksCount, gridsCount; + dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount; Devices::Cuda::setupThreads( blockSize, blocksCount, diff --git a/src/Benchmarks/Traversers/SimpleCell.h b/src/Benchmarks/Traversers/SimpleCell.h index c70f64fda..9776ef26c 100644 --- a/src/Benchmarks/Traversers/SimpleCell.h +++ b/src/Benchmarks/Traversers/SimpleCell.h @@ -47,7 +47,10 @@ class SimpleCell< Meshes::Grid< 1, Real, Device, Index > > CoordinatesType& getCoordinates() { return this->coordinates; }; __cuda_callable__ - void refresh() {index = coordinates.x();}; + const CoordinatesType& getCoordinates() const { return this->coordinates; }; + + __cuda_callable__ + void refresh() {index = this->grid.getEntityIndex( *this );}; __cuda_callable__ const IndexType& getIndex() const { return this->index; }; @@ -64,7 +67,7 @@ template< typename Real, class SimpleCell< Meshes::Grid< 2, Real, Device, Index > > { public: - using GridType = Meshes::Grid< 1, Real, Device, Index >; + using GridType = Meshes::Grid< 2, Real, Device, Index >; using RealType = typename GridType::RealType; using DeviceType = typename GridType::DeviceType; using IndexType = typename GridType::IndexType; @@ -72,6 +75,30 @@ class SimpleCell< Meshes::Grid< 2, Real, Device, Index > > constexpr static int getEntityDimension() { return 2; }; + __cuda_callable__ + SimpleCell( const GridType& grid ) : + grid( grid ){}; + + __cuda_callable__ + const GridType& getMesh() const { return this->grid;}; + + __cuda_callable__ + CoordinatesType& getCoordinates() { return this->coordinates; }; + + __cuda_callable__ + const CoordinatesType& getCoordinates() const { return this->coordinates; }; + + __cuda_callable__ + void refresh() {index = this->grid.getEntityIndex( *this );}; + + __cuda_callable__ + const 
IndexType& getIndex() const { return this->index; }; + + protected: + const GridType& grid; + CoordinatesType coordinates; + IndexType index; + }; template< typename Real, @@ -80,7 +107,7 @@ template< typename Real, class SimpleCell< Meshes::Grid< 3, Real, Device, Index > > { public: - using GridType = Meshes::Grid< 1, Real, Device, Index >; + using GridType = Meshes::Grid< 3, Real, Device, Index >; using RealType = typename GridType::RealType; using DeviceType = typename GridType::DeviceType; using IndexType = typename GridType::IndexType; @@ -88,6 +115,30 @@ class SimpleCell< Meshes::Grid< 3, Real, Device, Index > > constexpr static int getEntityDimension() { return 3; }; + __cuda_callable__ + SimpleCell( const GridType& grid ) : + grid( grid ){}; + + __cuda_callable__ + const GridType& getMesh() const { return this->grid;}; + + __cuda_callable__ + CoordinatesType& getCoordinates() { return this->coordinates; }; + + __cuda_callable__ + const CoordinatesType& getCoordinates() const { return this->coordinates; }; + + __cuda_callable__ + void refresh() { index = this->grid.getEntityIndex( *this ); }; + + __cuda_callable__ + const IndexType& getIndex() const { return this->index; }; + + protected: + const GridType& grid; + CoordinatesType coordinates; + IndexType index; + }; } // namespace Traversers diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 2963bb792..f329d5640 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -127,23 +127,23 @@ bool runBenchmark( const Config::ParameterContainer& parameters, /**** * Write one using parallel for with grid entity */ - if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for-and-grid-entity" ) ) + if( tests.containsValue( "all" ) || tests.containsValue( "add-one-simple-cell" ) ) { - auto hostWriteOneUsingParallelForAndGridEntity = [&] () + auto 
hostAddOneUsingSimpleCell = [&] () { - hostTraverserBenchmark.addOneUsingParallelForAndGridEntity(); + hostTraverserBenchmark.addOneUsingSimpleCell(); }; - benchmark.setOperation( "par.for+grid ent.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.setOperation( "simple cell", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); if( withHost ) - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingSimpleCell ); #ifdef HAVE_CUDA - auto cudaWriteOneUsingParallelForAndGridEntity = [&] () + auto cudaAddOneUsingSimpleCell = [&] () { - cudaTraverserBenchmark.addOneUsingParallelForAndGridEntity(); + cudaTraverserBenchmark.addOneUsingSimpleCell(); }; if( withCuda ) - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingSimpleCell ); #endif } @@ -152,21 +152,21 @@ bool runBenchmark( const Config::ParameterContainer& parameters, */ if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for-and-mesh-function" ) ) { - auto hostWriteOneUsingParallelForAndMeshFunction = [&] () + auto hostAddOneUsingParallelForAndMeshFunction = [&] () { hostTraverserBenchmark.addOneUsingParallelForAndMeshFunction(); }; benchmark.setOperation( "par.for+mesh fc.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); if( withHost ) - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingParallelForAndMeshFunction ); #ifdef HAVE_CUDA - auto cudaWriteOneUsingParallelForAndMeshFunction = [&] () + auto cudaAddOneUsingParallelForAndMeshFunction = [&] () { cudaTraverserBenchmark.addOneUsingParallelForAndMeshFunction(); }; if( withCuda ) - benchmark.time< Devices::Cuda >( 
cudaReset, "GPU", cudaWriteOneUsingParallelForAndMeshFunction ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingParallelForAndMeshFunction ); #endif } -- GitLab From cdbedfa40ffbdeee650e88c8cb4db569bb44dd32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Tue, 8 Jan 2019 10:41:17 +0100 Subject: [PATCH 061/130] Benchmarks: set minTime = 0.0 by default due to backwards compatibility --- src/Benchmarks/Benchmarks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 355fb4671..48e496c1e 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -332,7 +332,7 @@ public: protected: int loops = 1; - double minTime = 1; + double minTime = 0.0; double datasetSize = 0.0; double baseTime = 0.0; bool timing = true; -- GitLab From 2220c328c68c732d344699a7e4b1878bb6a40d43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Tue, 8 Jan 2019 19:50:28 +0100 Subject: [PATCH 062/130] Added check of the benchmark results. 
--- src/Benchmarks/Benchmarks.h | 42 +++++++---- src/Benchmarks/FunctionTimer.h | 27 ++++--- .../Traversers/GridTraversersBenchmark_1D.h | 10 ++- .../Traversers/GridTraversersBenchmark_2D.h | 11 ++- .../Traversers/GridTraversersBenchmark_3D.h | 12 ++- .../Traversers/tnl-benchmark-traversers.h | 74 +++++++++++++++++-- 6 files changed, 141 insertions(+), 35 deletions(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 48e496c1e..b05958f17 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -202,33 +202,35 @@ public: BenchmarkResult & result ) { result.time = std::numeric_limits::quiet_NaN(); + FunctionTimer< Device > functionTimer; try { if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); if( this->timing ) if( this->reset ) - result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor ); else - result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor ); + result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); else if( this->reset ) - result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor ); else - result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor ); + result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); } else { if( this->timing ) if( this->reset ) - result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + result.time = functionTimer. 
template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor ); else - result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor ); + result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); else if( this->reset ) - result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor ); else - result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor ); + result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); } + this->performedLoops = functionTimer.getPerformedLoops(); } catch ( const std::exception& e ) { std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl; @@ -269,24 +271,25 @@ public: BenchmarkResult & result ) { result.time = std::numeric_limits::quiet_NaN(); + FunctionTimer< Device > functionTimer; try { if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); if( this->timing ) - result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor ); + result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); else - result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor ); + result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); } else { if( this->timing ) - result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor ); + result.time = functionTimer. 
template timeFunction< true >( compute, loops, minTime, verbose, monitor ); else - result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor ); + result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); } } catch ( const std::exception& e ) { - std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl; + std::cerr << "Function timer failed due to a C++ exception with description: " << e.what() << std::endl; } result.bandwidth = datasetSize / result.time; @@ -320,6 +323,7 @@ public: // each computation has 3 subcolumns const int colspan = 3 * numberOfComputations; writeErrorMessage( msg, colspan ); + std::cerr << msg << std::endl; } using Logging::save; @@ -330,8 +334,18 @@ public: return monitor; } + int getPerformedLoops() const + { + return this->performedLoops; + } + + bool isResetingOn() const + { + return reset; + } + protected: - int loops = 1; + int loops = 1, performedLoops = 0; double minTime = 0.0; double datasetSize = 0.0; double baseTime = 0.0; diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h index 601cfc16c..05b59d28a 100644 --- a/src/Benchmarks/FunctionTimer.h +++ b/src/Benchmarks/FunctionTimer.h @@ -22,17 +22,17 @@ namespace TNL { namespace Benchmarks { -template< typename Device, - bool timing > +template< typename Device > class FunctionTimer { public: using DeviceType = Device; - template< typename ComputeFunction, + template< bool timing, + typename ComputeFunction, typename ResetFunction, typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > - static double + double timeFunction( ComputeFunction compute, ResetFunction reset, int maxLoops, @@ -52,7 +52,6 @@ class FunctionTimer reset(); compute(); - int loops; // If we do not perform reset function and don't need // the monitor, the timer is not interrupted after each loop. if( ! 
performReset && verbose < 2 ) @@ -67,7 +66,7 @@ class FunctionTimer for( loops = 0; loops < maxLoops || ( timing && timer.getRealTime() < minTime ); - ++loops) + ++loops) compute(); // Explicit synchronization of the CUDA device #ifdef HAVE_CUDA @@ -85,7 +84,6 @@ class FunctionTimer { // abuse the monitor's "time" for loops monitor.setTime( loops + 1 ); - reset(); // Explicit synchronization of the CUDA device @@ -104,15 +102,17 @@ class FunctionTimer timer.stop(); } } + std::cerr << loops << std::endl; if( timing ) return timer.getRealTime() / ( double ) loops; else return std::numeric_limits::quiet_NaN(); } - template< typename ComputeFunction, + template< bool timing, + typename ComputeFunction, typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > - static double + double timeFunction( ComputeFunction compute, int maxLoops, const double& minTime, @@ -120,8 +120,15 @@ class FunctionTimer Monitor && monitor = Monitor() ) { auto noReset = [] () {}; - return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor, false ); + return timeFunction< timing >( compute, noReset, maxLoops, minTime, verbose, monitor, false ); } + + int getPerformedLoops() const + { + return this->loops; + } + protected: + int loops; }; } // namespace Benchmarks diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h index 145f42ca9..fb79acfc8 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -54,12 +54,12 @@ class GridTraversersBenchmark< 1, Device, Real, Index > userData( this->u ) { v_data = v.getData(); + u->getData().bind( v ); } void reset() { v.setValue( 0.0 ); - u->getData().setValue( 0.0 ); }; void addOneUsingPureC() @@ -146,6 +146,14 @@ class GridTraversersBenchmark< 1, Device, Real, Index > size );*/ } + bool checkAddOne( int loops, bool reseting ) + { + std::cout << loops << " -> " << v << std::endl; 
+ if( reseting ) + return v.containsOnlyValue( 1.0 ); + return v.containsOnlyValue( ( Real ) loops ); + } + void traverseUsingPureC() { if( std::is_same< Device, Devices::Host >::value ) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h index 66462eb1a..a707d0e9c 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h @@ -52,12 +52,12 @@ class GridTraversersBenchmark< 2, Device, Real, Index > userData( u ) { v_data = v.getData(); + u->getData().bind( v ); } void reset() { v.setValue( 0.0 ); - u->getData().setValue( 0.0 ); }; void addOneUsingPureC() @@ -71,7 +71,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > else // Device == Devices::Cuda { #ifdef HAVE_CUDA - dim3 blockSize( 256 ), blocksCount, gridsCount; + dim3 blockSize( 16, 16 ), blocksCount, gridsCount; Devices::Cuda::setupThreads( blockSize, blocksCount, @@ -183,6 +183,13 @@ class GridTraversersBenchmark< 2, Device, Real, Index > }*/ } + bool checkAddOne( int loops, bool reseting ) + { + if( reseting ) + return v.containsOnlyValue( 1.0 ); + return v.containsOnlyValue( ( Real ) loops ); + } + void traverseUsingPureC() { if( std::is_same< Device, Devices::Host >::value ) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h index b6f9bd4e1..833c15126 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h @@ -58,12 +58,12 @@ class GridTraversersBenchmark< 3, Device, Real, Index > userData( u ) { v_data = v.getData(); + u->getData().bind( v ); } void reset() { v.setValue( 0.0 ); - u->getData().setValue( 0.0 ); }; void addOneUsingPureC() @@ -78,7 +78,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > else // Device == Devices::Cuda { #ifdef HAVE_CUDA - dim3 blockSize( 256 ), blocksCount, gridsCount; + 
dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount; Devices::Cuda::setupThreads( blockSize, blocksCount, @@ -174,13 +174,19 @@ class GridTraversersBenchmark< 3, Device, Real, Index > f, v.getData() ); } - void addOneUsingTraverser() { traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > ( grid, userData ); } + bool checkAddOne( int loops, bool reseting ) + { + if( reseting ) + return v.containsOnlyValue( 1.0 ); + return v.containsOnlyValue( ( Real ) loops ); + } + void traverseUsingPureC() { if( std::is_same< Device, Devices::Host >::value ) diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index f329d5640..59441bbbb 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -48,6 +48,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, #else const bool withCuda = false; #endif + const bool check = parameters.getParameter< bool >( "check" ); /**** * Full grid traversing with no boundary conditions @@ -77,7 +78,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, { {"size", convertToString( size ) }, } ) ); /**** - * Write one using C for + * Add one using pure C code */ if( tests.containsValue( "all" ) || tests.containsValue( "add-one-pure-c" ) ) { @@ -88,7 +89,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters, hostTraverserBenchmark.addOneUsingPureC(); }; if( withHost ) + { benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); + if( check && ! hostTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." 
); + } #ifdef HAVE_CUDA auto cudaWriteOneUsingPureC = [&] () @@ -96,12 +103,18 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.addOneUsingPureC(); }; if( withCuda ) + { benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); + if( check && ! cudaTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." ); + } #endif } /**** - * Write one using parallel for + * Add one using parallel for */ if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for" ) ) { @@ -112,7 +125,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters, hostTraverserBenchmark.addOneUsingParallelFor(); }; if( withHost ) + { benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); + if( check && ! hostTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." ); + } #ifdef HAVE_CUDA auto cudaWriteOneUsingParallelFor = [&] () @@ -120,12 +139,19 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.addOneUsingParallelFor(); }; if( withCuda ) + { benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); + if( check && ! cudaTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." 
); + } + #endif } /**** - * Write one using parallel for with grid entity + * Add one using parallel for with grid entity */ if( tests.containsValue( "all" ) || tests.containsValue( "add-one-simple-cell" ) ) { @@ -135,7 +161,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; benchmark.setOperation( "simple cell", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); if( withHost ) + { benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingSimpleCell ); + if( check && ! hostTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." ); + } #ifdef HAVE_CUDA auto cudaAddOneUsingSimpleCell = [&] () @@ -143,12 +175,19 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.addOneUsingSimpleCell(); }; if( withCuda ) + { benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingSimpleCell ); + if( check && ! cudaTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." ); + } + #endif } /**** - * Write one using parallel for with mesh function + * Add one using parallel for with mesh function */ if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for-and-mesh-function" ) ) { @@ -158,7 +197,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; benchmark.setOperation( "par.for+mesh fc.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); if( withHost ) + { benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingParallelForAndMeshFunction ); + if( check && ! hostTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." 
); + } #ifdef HAVE_CUDA auto cudaAddOneUsingParallelForAndMeshFunction = [&] () @@ -166,13 +211,19 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.addOneUsingParallelForAndMeshFunction(); }; if( withCuda ) + { benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingParallelForAndMeshFunction ); + if( check && ! cudaTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." ); + } #endif } /**** - * Write one using traverser + * Add one using traverser */ if( tests.containsValue( "all" ) || tests.containsValue( "add-one-traverser" ) ) { @@ -182,7 +233,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters, hostTraverserBenchmark.addOneUsingTraverser(); }; if( withHost ) + { benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser ); + if( check && ! hostTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." ); + } #ifdef HAVE_CUDA auto cudaWriteOneUsingTraverser = [&] () @@ -190,7 +247,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.addOneUsingTraverser(); }; if( withCuda ) + { benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser ); + if( check && ! cudaTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." 
); + } #endif } std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl; @@ -343,6 +406,7 @@ void setupConfig( Config::ConfigDescription& config ) #else config.addEntry< bool >( "with-cuda", "Perform CUDA benchmarks.", false ); #endif + config.addEntry< bool >( "check", "Checking correct results of benchmark tests.", false ); config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-traversers.log"); config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); config.addEntryEnum( "append" ); -- GitLab From 1d4ec3ea18aa7d9250f074c4ec97044ed55ca0f6 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 24 Jan 2019 13:50:41 +0100 Subject: [PATCH 063/130] Added build parameter --with-profiling. --- CMakeLists.txt | 11 +++++++++-- build | 4 ++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2c1adce6b..8dc619e72 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,6 +26,7 @@ set(WITH_CUDA_ARCH "auto" CACHE STRING "Build for these CUDA architectures") option(WITH_OPENMP "Build with OpenMP support" ON) option(WITH_GMP "Build with GMP support" OFF) option(WITH_TESTS "Build tests" ON) +option(WITH_PROFILING "Enable code profiling compiler flags" OFF ) option(WITH_COVERAGE "Enable code coverage reports from unit tests" OFF) option(WITH_EXAMPLES "Compile the 'examples' directory" ON) option(WITH_TOOLS "Compile the 'src/Tools' directory" ON) @@ -78,7 +79,7 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") endif() # set Debug/Release options -set( CMAKE_CXX_FLAGS "-std=c++11 -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable -g" ) +set( CMAKE_CXX_FLAGS "-std=c++11 -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable" ) set( CMAKE_CXX_FLAGS_DEBUG "-g" ) set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG" ) #set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native 
-mtune=native -DNDEBUG -ftree-vectorizer-verbose=1 -ftree-vectorize -fopt-info-vec-missed -funroll-loops" ) @@ -233,7 +234,7 @@ if( ${WITH_CUDA} ) endif() endif() endif() - set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; ${CUDA_ARCH} -D_FORCE_INLINES --generate-line-info) + set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; ${CUDA_ARCH} -D_FORCE_INLINES ) # TODO: this is necessary only due to a bug in cmake set( CUDA_ADD_LIBRARY_OPTIONS -shared ) endif() @@ -247,6 +248,11 @@ if( OPENMP_FOUND AND ${WITH_OPENMP} ) set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_OPENMP ${OpenMP_CXX_FLAGS}" ) endif() +if( ${WITH_PROFILING} ) + set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g" ) + set( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-line-info") +endif() + find_package( DCMTK ) if( DCMTK_FOUND ) set( HAVE_DCMTK_H "#define HAVE_DCMTK_H 1" ) @@ -464,6 +470,7 @@ message( " WITH_CUDA_ARCH = ${WITH_CUDA_ARCH}" ) message( " WITH_OPENMP = ${WITH_OPENMP}" ) message( " WITH_GMP = ${WITH_GMP}" ) message( " WITH_TESTS = ${WITH_TESTS}" ) +message( " WITH_PROFILING = ${WITH_PROFILING}" ) message( " WITH_COVERAGE = ${WITH_COVERAGE}" ) message( " WITH_EXAMPLES = ${WITH_EXAMPLES}" ) message( " WITH_TOOLS = ${WITH_TOOLS}" ) diff --git a/build b/build index f11dbffbc..c009a2608 100755 --- a/build +++ b/build @@ -22,6 +22,7 @@ WITH_CUDA_ARCH="auto" WITH_OPENMP="yes" WITH_GMP="no" WITH_TESTS="yes" +WITH_PROFILING="no" WITH_COVERAGE="no" WITH_EXAMPLES="yes" WITH_PYTHON="yes" @@ -57,6 +58,7 @@ do --with-openmp=* ) WITH_OPENMP="${option#*=}" ;; --with-gmp=* ) WITH_GMP="${option#*=}" ;; --with-tests=* ) WITH_TESTS="${option#*=}" ;; + --with-profiling=* ) WITH_PROFILING="${option#*=}" ;; --with-coverage=* ) WITH_COVERAGE="${option#*=}" ;; --with-examples=* ) WITH_EXAMPLES="${option#*=}" ;; --with-tools=* ) WITH_TOOLS="${option#*=}" ;; @@ -95,6 +97,7 @@ if [[ ${HELP} == "yes" ]]; then echo " --with-openmp=yes/no Enables OpenMP. 'yes' by default." 
echo " --with-gmp=yes/no Enables the wrapper for GNU Multiple Precision Arithmetic Library. 'no' by default." echo " --with-tests=yes/no Enables unit tests. 'yes' by default." + echo " --with-profiling=yes/no Enables code profiling compiler falgs. 'no' by default." echo " --with-coverage=yes/no Enables code coverage reports for unit tests. 'no' by default (lcov is required)." echo " --with-examples=yes/no Compile the 'examples' directory. 'yes' by default." echo " --with-tools=yes/no Compile the 'src/Tools' directory. 'yes' by default." @@ -165,6 +168,7 @@ cmake_command=( -DWITH_OPENMP=${WITH_OPENMP} -DWITH_GMP=${WITH_GMP} -DWITH_TESTS=${WITH_TESTS} + -DWITH_PROFILING=${WITH_PROFILING} -DWITH_COVERAGE=${WITH_COVERAGE} -DWITH_EXAMPLES=${WITH_EXAMPLES} -DWITH_TOOLS=${WITH_TOOLS} -- GitLab From 09696a32cc3e19c2851c4059b99561d0dd6e2b12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 17 Dec 2018 21:29:46 +0100 Subject: [PATCH 064/130] Created tnl-benchmark-traversers. 
--- src/Benchmarks/CMakeLists.txt | 1 + src/Benchmarks/Traversers/CMakeLists.txt | 9 ++ .../Traversers/tnl-benchmark-traversers.cpp | 11 ++ .../Traversers/tnl-benchmark-traversers.cu | 11 ++ .../Traversers/tnl-benchmark-traversers.h | 102 ++++++++++++++++++ src/Benchmarks/scripts/cuda-profiler.conf | 7 -- .../scripts/process-cuda-profile.pl | 42 -------- 7 files changed, 134 insertions(+), 49 deletions(-) create mode 100644 src/Benchmarks/Traversers/CMakeLists.txt create mode 100644 src/Benchmarks/Traversers/tnl-benchmark-traversers.cpp create mode 100644 src/Benchmarks/Traversers/tnl-benchmark-traversers.cu create mode 100644 src/Benchmarks/Traversers/tnl-benchmark-traversers.h delete mode 100644 src/Benchmarks/scripts/cuda-profiler.conf delete mode 100644 src/Benchmarks/scripts/process-cuda-profile.pl diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt index e0637205f..d4c2258c9 100644 --- a/src/Benchmarks/CMakeLists.txt +++ b/src/Benchmarks/CMakeLists.txt @@ -3,6 +3,7 @@ add_subdirectory( BLAS ) add_subdirectory( SpMV ) add_subdirectory( DistSpMV ) add_subdirectory( LinearSolvers ) +add_subdirectory( Traversers ) set( headers Benchmarks.h diff --git a/src/Benchmarks/Traversers/CMakeLists.txt b/src/Benchmarks/Traversers/CMakeLists.txt new file mode 100644 index 000000000..b58c7d66f --- /dev/null +++ b/src/Benchmarks/Traversers/CMakeLists.txt @@ -0,0 +1,9 @@ +if( BUILD_CUDA ) + CUDA_ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cu ) + TARGET_LINK_LIBRARIES( tnl-benchmark-traversers tnl ${CUDA_cusparse_LIBRARY} ) +else() + ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cpp ) + TARGET_LINK_LIBRARIES( tnl-benchmark-traversers tnl ) +endif() + +install( TARGETS tnl-benchmark-traversers RUNTIME DESTINATION bin ) diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.cpp b/src/Benchmarks/Traversers/tnl-benchmark-traversers.cpp new file mode 100644 index 000000000..cf69b41dd --- /dev/null +++ 
b/src/Benchmarks/Traversers/tnl-benchmark-traversers.cpp @@ -0,0 +1,11 @@ +/*************************************************************************** + tnl-benchmark-traversers.cpp - description + ------------------- + begin : Dec 17, 2018 + copyright : (C) 2018 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include "tnl-benchmark-traversers.h" \ No newline at end of file diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.cu b/src/Benchmarks/Traversers/tnl-benchmark-traversers.cu new file mode 100644 index 000000000..614b0d200 --- /dev/null +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.cu @@ -0,0 +1,11 @@ +/*************************************************************************** + tnl-benchmark-traversers.cu - description + ------------------- + begin : Dec 17, 2018 + copyright : (C) 2018 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include "tnl-benchmark-traversers.h" \ No newline at end of file diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h new file mode 100644 index 000000000..9d1af1ec9 --- /dev/null +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -0,0 +1,102 @@ +/*************************************************************************** + tnl-benchmark-traversers.h - description + ------------------- + begin : Dec 17, 2018 + copyright : (C) 2018 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include "../Benchmarks.h" + +#include +#include +#include +#include + +using namespace TNL; +using 
namespace TNL::Benchmarks; + +void setupConfig( Config::ConfigDescription& config ) +{ + config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log"); + config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); + config.addEntryEnum( "append" ); + config.addEntryEnum( "overwrite" ); + config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); + config.addEntryEnum( "float" ); + config.addEntryEnum( "double" ); + config.addEntryEnum( "all" ); + config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 ); + config.addEntry< std::size_t >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 ); + config.addEntry< std::size_t >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 ); + config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 ); + config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); + config.addEntry< int >( "verbose", "Verbose mode.", 1 ); + + config.addDelimiter( "Device settings:" ); + Devices::Host::configSetup( config ); + Devices::Cuda::configSetup( config ); +} + +int main( int argc, char* argv[] ) +{ + Config::ConfigDescription config; + Config::ParameterContainer parameters; + + setupConfig( config ); + if( ! parseCommandLine( argc, argv, config, parameters ) ) { + config.printUsage( argv[ 0 ] ); + return EXIT_FAILURE; + } + + if( ! Devices::Host::setup( parameters ) || + ! 
Devices::Cuda::setup( parameters ) ) + return EXIT_FAILURE; + + const String & logFileName = parameters.getParameter< String >( "log-file" ); + const String & outputMode = parameters.getParameter< String >( "output-mode" ); + const String & precision = parameters.getParameter< String >( "precision" ); + // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(), + // which have a default value. The workaround below works for int values, but it is not possible + // to pass 64-bit integer values + // const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" ); + // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); + const int dimension = parameters.getParameter< int >( "dimension" ); + const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" ); + const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); + const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" ); + const unsigned loops = parameters.getParameter< unsigned >( "loops" ); + const unsigned verbose = parameters.getParameter< unsigned >( "verbose" ); + + bool status( false ); + if( ! 
dimension ) + { + status = performBenchmark< 1 >( parameters ); + status |= performBenchmark< 2 >( parameters ); + status |= performBenchmark< 3 >( parameters ); + } + else + { + switch( dimension ) + { + case 1: + status = performBenchmark< 1 >( parameters ); + break; + case 2: + status = performBenchmark< 2 >( parameters ); + break; + case 3: + status = performBenchmark< 3 >( parameters ); + break; + } + } + if( status == false ) + return EXIT_FAILURE; + return EXIT_SUCCES; +} \ No newline at end of file diff --git a/src/Benchmarks/scripts/cuda-profiler.conf b/src/Benchmarks/scripts/cuda-profiler.conf deleted file mode 100644 index 8ff91fe3b..000000000 --- a/src/Benchmarks/scripts/cuda-profiler.conf +++ /dev/null @@ -1,7 +0,0 @@ -== cuda-kernel.conf == -timestamp -threadblocksize -l1_global_load_hit -l1_global_load_miss -gld_incoherent -gst_incoherent \ No newline at end of file diff --git a/src/Benchmarks/scripts/process-cuda-profile.pl b/src/Benchmarks/scripts/process-cuda-profile.pl deleted file mode 100644 index 187623da9..000000000 --- a/src/Benchmarks/scripts/process-cuda-profile.pl +++ /dev/null @@ -1,42 +0,0 @@ -open( INPUT, "$ARGV[0]" ) - or die "Can not open file $ARGV[ 0 ]"; -$blockSize = 0; -$testNumber = 0; -while( $line = ) -{ - if( $line =~ m/.*sparseCSRMatrixVectorProductKernel.*threadblocksize=\[ (.*), 1, 1 \] occupancy=\[ (.*) \] tex_cache_hit=\[ (.*) \] tex_cache_miss=\[ (.*) \] gld_incoherent=\[ (.*) \] gst_incoherent=\[ (.*) \].*/ ) - { - if( $blockSize != $1 ) - { - $blockSize = $1; - $occupancy{$testNumber} = $2; - $texCacheHit{$testNumber} = $3; - $texCacheMiss{$testNumber} = $4; - $gldIncoherent{$testNumber} = $5; - $gstIncoherent{$testNumber} = $6; - $testNumber = $testNumber + 1; - } - } -} -close( INPUT ); - -print "There were $testNumber tests."; - -open( LOG, ">>$ARGV[1]" ) - or die "Can not open file $ARGV[1]"; -printf LOG "| %97s |", $ARGV[ 0 ]; -$testOutput = 0; -while( $testOutput < $testNumber ) -{ - printf LOG "%10.3f |", 
$occupancy{$testOutput}; - printf LOG "%10.3f |", $texCahceHit{$testOutput}; - printf LOG "%10.3f |", $texCacheMiss{$testOutput}; - printf LOG "%10.3f |", $gldIncoherent{$testOutput}; - printf LOG "%10.3f |", $gstIncoherent{$testOutput}; - $testOutput = $testOutput + 1; -} -print LOG "\n"; -close( LOG ); - - - -- GitLab From 7ce8d125d0a005cd2e4ac7f6a809a4069934b0cf Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Wed, 19 Dec 2018 14:29:28 +0100 Subject: [PATCH 065/130] Implementation of the traversers benchmark. --- .../Traversers/tnl-benchmark-traversers.h | 72 +++++++++++++------ 1 file changed, 50 insertions(+), 22 deletions(-) diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 9d1af1ec9..7e5189bfb 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -44,6 +44,48 @@ void setupConfig( Config::ConfigDescription& config ) Devices::Cuda::configSetup( config ); } +template< int Dimension > +bool runBenchmark( const Config::ParameterContainer& parameters, + Benchmark& benchmark, + Benchmark::MetadataMap& metadat ) +{ + // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(), + // which have a default value. 
The workaround below works for int values, but it is not possible + // to pass 64-bit integer values + // const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" ); + // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); + const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" ); + const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); + +} + +template< int Dimension > +bool setupBenchmark( const Config::ParameterContainer& parameters ) +{ + const String & logFileName = parameters.getParameter< String >( "log-file" ); + const String & outputMode = parameters.getParameter< String >( "output-mode" ); + const String & precision = parameters.getParameter< String >( "precision" ); + const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" ); + const unsigned loops = parameters.getParameter< unsigned >( "loops" ); + const unsigned verbose = parameters.getParameter< unsigned >( "verbose" ); + + Benchmark benchmark( loops, verbose ); + Benchmark::MetadataMap metadata = getHardwareMetadata(); + runBenchmark< Dimension >( parameters, benchmark, metadata ); + + auto mode = std::ios::out; + if( outputMode == "append" ) + mode |= std::ios::app; + std::ofstream logFile( logFileName.getString(), mode ); + + if( ! benchmark.save( logFile ) ) + { + std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl; + return false; + } + return true; +} + int main( int argc, char* argv[] ) { Config::ConfigDescription config; @@ -59,44 +101,30 @@ int main( int argc, char* argv[] ) ! 
Devices::Cuda::setup( parameters ) ) return EXIT_FAILURE; - const String & logFileName = parameters.getParameter< String >( "log-file" ); - const String & outputMode = parameters.getParameter< String >( "output-mode" ); - const String & precision = parameters.getParameter< String >( "precision" ); - // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(), - // which have a default value. The workaround below works for int values, but it is not possible - // to pass 64-bit integer values - // const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" ); - // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); const int dimension = parameters.getParameter< int >( "dimension" ); - const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" ); - const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); - const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" ); - const unsigned loops = parameters.getParameter< unsigned >( "loops" ); - const unsigned verbose = parameters.getParameter< unsigned >( "verbose" ); - bool status( false ); if( ! 
dimension ) { - status = performBenchmark< 1 >( parameters ); - status |= performBenchmark< 2 >( parameters ); - status |= performBenchmark< 3 >( parameters ); + status = setupBenchmark< 1 >( parameters ); + status |= setupBenchmark< 2 >( parameters ); + status |= setupBenchmark< 3 >( parameters ); } else { switch( dimension ) { case 1: - status = performBenchmark< 1 >( parameters ); + status = setupBenchmark< 1 >( parameters ); break; case 2: - status = performBenchmark< 2 >( parameters ); + status = setupBenchmark< 2 >( parameters ); break; case 3: - status = performBenchmark< 3 >( parameters ); + status = setupBenchmark< 3 >( parameters ); break; } } if( status == false ) return EXIT_FAILURE; - return EXIT_SUCCES; -} \ No newline at end of file + return EXIT_SUCCESS; +} -- GitLab From 10d7f72179c7711971375c37ded0e9a33f9c3d35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 19 Dec 2018 21:12:19 +0100 Subject: [PATCH 066/130] Fixed typo in vector operations benchmark comment. --- src/Benchmarks/BLAS/vector-operations.h | 2 +- src/Benchmarks/Traversers/WriteOne.h | 88 +++++++++++++++++++++ src/Benchmarks/Traversers/grid-traversing.h | 54 +++++++++++++ 3 files changed, 143 insertions(+), 1 deletion(-) create mode 100644 src/Benchmarks/Traversers/WriteOne.h create mode 100644 src/Benchmarks/Traversers/grid-traversing.h diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h index b9a68d618..8dd63de85 100644 --- a/src/Benchmarks/BLAS/vector-operations.h +++ b/src/Benchmarks/BLAS/vector-operations.h @@ -64,7 +64,7 @@ benchmarkVectorOperations( Benchmark & benchmark, deviceVector.setValue( 1.0 ); #endif // A relatively harmless call to keep the compiler from realizing we - // don't actually do any useful work with the result of the reduciton. + // don't actually do any useful work with the result of the reduction. 
srand48(resultHost); resultHost = resultDevice = 0.0; }; diff --git a/src/Benchmarks/Traversers/WriteOne.h b/src/Benchmarks/Traversers/WriteOne.h new file mode 100644 index 000000000..73bf0bfec --- /dev/null +++ b/src/Benchmarks/Traversers/WriteOne.h @@ -0,0 +1,88 @@ +/*************************************************************************** + WriteOne.h - description + ------------------- + begin : Dec 19, 2018 + copyright : (C) 2018 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include +#include +#include +#include + +namespace TNL { + namespace Benchmarks { + + +template< int Dimenions, + typename Device, + typename Real, + typename Index > +class WriteOne{}; + +template< typename Device, + typename Real, + typename Index > +class WriteOne< 1, Device, Real, Index > +{ + public: + + using Vector = Containers::Vector< Real, Device, Index >; + + static void run( std::size_t size ) + { + Vector v( size ); + auto writeOne = []( Index i, Real* data ) + { + data[ i ] = 1.0; + }; + + + ParallelFor< Devices::Host >::exec( ( std::size_t ) 0, size, writeOne, v.getData() ); + } +}; + + +template< typename Device, + typename Real, + typename Index > +class WriteOne< 2, Device, Real, Index > +{ + public: + + using Vector = Containers::Vector< Real, Device, Index >; + + static void run( std::size_t size ) + { + + } +}; + +template< typename Device, + typename Real, + typename Index > +class WriteOne< 3, Device, Real, Index > +{ + public: + + using Vector = Containers::Vector< Real, Device, Index >; + + static void run( std::size_t size ) + { + + } +}; + + + } // namespace Benchmarks +} // namespace TNL + + + diff --git a/src/Benchmarks/Traversers/grid-traversing.h b/src/Benchmarks/Traversers/grid-traversing.h new file mode 100644 index 000000000..df45b1d7f --- /dev/null +++ 
b/src/Benchmarks/Traversers/grid-traversing.h @@ -0,0 +1,54 @@ +/*************************************************************************** + grid-traversing.h - description + ------------------- + begin : Dec 19, 2018 + copyright : (C) 2018 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include "../Benchmarks.h" +#include "WriteOne.h" + +#include + +namespace TNL { + namespace Benchmarks { + +template< int Dimension, + typename Real = double, + typename Index = int > +class benchmarkTraversingFullGrid +{ + public: + + static void run ( Benchmark& benchmark, std::size_t size ) + { + auto reset = [&]() + {}; + + auto testHost = [&] () + { + WriteOne< Dimension, Devices::Host, Real, Index >::run( size ); + }; + + auto testCuda = [&] () + { + WriteOne< Dimension, Devices::Cuda, Real, Index >::run( size ); + }; + + benchmark.setOperation( "writeOne", size * sizeof( Real ) ); + benchmark.time( reset, "CPU", testHost ); +#ifdef HAVE_CUDA + benchmark.time( reset, "GPU", testCuda ); +#endif + + } +}; + } // namespace Benchmarks +} // namespace TNL \ No newline at end of file -- GitLab From 3e8d53c5cec2fa65876c0c2273d1f1273506bb3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 19 Dec 2018 21:13:37 +0100 Subject: [PATCH 067/130] Implementation of grid traversers benchmarks. 
--- .../Traversers/tnl-benchmark-traversers.h | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 7e5189bfb..e227a258d 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -13,6 +13,7 @@ #pragma once #include "../Benchmarks.h" +#include "grid-traversing.h" #include #include @@ -33,8 +34,8 @@ void setupConfig( Config::ConfigDescription& config ) config.addEntryEnum( "double" ); config.addEntryEnum( "all" ); config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 ); - config.addEntry< std::size_t >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 ); - config.addEntry< std::size_t >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 ); + config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 ); + config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 ); config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 ); config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); config.addEntry< int >( "verbose", "Verbose mode.", 1 ); @@ -47,16 +48,26 @@ void setupConfig( Config::ConfigDescription& config ) template< int Dimension > bool runBenchmark( const Config::ParameterContainer& parameters, Benchmark& benchmark, - Benchmark::MetadataMap& metadat ) + Benchmark::MetadataMap& metadata ) { // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(), // which have a default value. 
The workaround below works for int values, but it is not possible // to pass 64-bit integer values // const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" ); // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); - const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" ); - const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); + const int minSize = parameters.getParameter< int >( "min-size" ); + const int maxSize = parameters.getParameter< int >( "max-size" ); + // Full grid traversing + benchmark.newBenchmark( String("Full grid traversing " + convertToString( Dimension ) + "D" ), metadata ); + for( std::size_t size = minSize; size <= maxSize; size *= 2 ) + { + benchmark.setMetadataColumns( Benchmark::MetadataColumns({ + {"size", convertToString( size ) }, + } )); + benchmarkTraversingFullGrid< Dimension >::run( benchmark, size ); + } + return true; } template< int Dimension > -- GitLab From 5a46ce238aab3892cb4b241790abb3cf5d879c15 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 20 Dec 2018 14:06:01 +0100 Subject: [PATCH 068/130] Fixing lambda function for CUDA in traverser benchmark. 
--- src/Benchmarks/Traversers/WriteOne.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Benchmarks/Traversers/WriteOne.h b/src/Benchmarks/Traversers/WriteOne.h index 73bf0bfec..9fd269f10 100644 --- a/src/Benchmarks/Traversers/WriteOne.h +++ b/src/Benchmarks/Traversers/WriteOne.h @@ -39,13 +39,13 @@ class WriteOne< 1, Device, Real, Index > static void run( std::size_t size ) { Vector v( size ); - auto writeOne = []( Index i, Real* data ) + auto writeOne = [] __cuda_callable__ ( Index i, Real* data ) { data[ i ] = 1.0; }; - ParallelFor< Devices::Host >::exec( ( std::size_t ) 0, size, writeOne, v.getData() ); + ParallelFor< Device >::exec( ( std::size_t ) 0, size, writeOne, v.getData() ); } }; -- GitLab From 3a2432a345127755675ab17ec6fcb9cc85d7cdfe Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 20 Dec 2018 15:17:18 +0100 Subject: [PATCH 069/130] Implemented write-one grid traverser benchmark in 2D and 3D. --- src/Benchmarks/Traversers/WriteOne.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/Benchmarks/Traversers/WriteOne.h b/src/Benchmarks/Traversers/WriteOne.h index 9fd269f10..4c39926aa 100644 --- a/src/Benchmarks/Traversers/WriteOne.h +++ b/src/Benchmarks/Traversers/WriteOne.h @@ -44,7 +44,6 @@ class WriteOne< 1, Device, Real, Index > data[ i ] = 1.0; }; - ParallelFor< Device >::exec( ( std::size_t ) 0, size, writeOne, v.getData() ); } }; @@ -61,7 +60,17 @@ class WriteOne< 2, Device, Real, Index > static void run( std::size_t size ) { + Vector v( size * size ); + auto writeOne = [=] __cuda_callable__ ( Index i, Index j, Real* data ) + { + data[ i * size + j ] = 1.0; + }; + ParallelFor2D< Device >::exec( ( std::size_t ) 0, + ( std::size_t ) 0, + size, + size, + writeOne, v.getData() ); } }; @@ -76,7 +85,19 @@ class WriteOne< 3, Device, Real, Index > static void run( std::size_t size ) { + Vector v( size * size * size ); + auto writeOne = [=] __cuda_callable__ ( Index i, Index j, 
Index k, Real* data ) + { + data[ ( i * size + j ) * size + k ] = 1.0; + }; + ParallelFor3D< Device >::exec( ( std::size_t ) 0, + ( std::size_t ) 0, + ( std::size_t ) 0, + size, + size, + size, + writeOne, v.getData() ); } }; -- GitLab From 56f0c67285c11196d2c274997d8d57d3056241d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Thu, 20 Dec 2018 22:02:02 +0100 Subject: [PATCH 070/130] Added computation minimal time, config setup and setup to Benchmark. --- src/Benchmarks/Benchmarks.h | 42 +++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 559e27ee2..39973d0ba 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -26,6 +26,7 @@ #include #include #include +#include #include namespace TNL { @@ -40,6 +41,7 @@ double timeFunction( ComputeFunction compute, ResetFunction reset, int loops, + int minTime, Monitor && monitor = Monitor() ) { // the timer is constructed zero-initialized and stopped @@ -52,7 +54,11 @@ timeFunction( ComputeFunction compute, reset(); compute(); - for(int i = 0; i < loops; ++i) { + int i; + for( i = 0; + i < loops && ( ! 
minTime || timer.getRealTime() < ( double ) minTime ); + ++i) + { // abuse the monitor's "time" for loops monitor.setTime( i + 1 ); @@ -71,7 +77,7 @@ timeFunction( ComputeFunction compute, timer.stop(); } - return timer.getRealTime() / loops; + return timer.getRealTime() / ( double ) i; } @@ -89,6 +95,12 @@ public: : verbose(verbose) {} + void + setVerbose( bool verbose) + { + this->verbose = verbose; + } + void writeTitle( const String & title ) { @@ -309,12 +321,25 @@ public: using Logging::MetadataElement; using Logging::MetadataMap; using Logging::MetadataColumns; - + Benchmark( int loops = 10, bool verbose = true ) : Logging(verbose), loops(loops) {} + + static void configSetup( Config::ConfigDescription& config ) + { + config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); + config.addEntry< int >( "min-time", "Minimal real time in seconds for every computation.", 1 ); + } + void setup( const Config::ParameterContainer& parameters ) + { + this->loops = parameters.getParameter< unsigned >( "loops" ); + this->minTime = parameters.getParameter< unsigned >( "min-time" ); + const unsigned verbose = parameters.getParameter< unsigned >( "verbose" ); + Logging::setVerbose( verbose ); + } // TODO: ensure that this is not called in the middle of the benchmark // (or just remove it completely?) 
void @@ -322,6 +347,11 @@ public: { this->loops = loops; } + + void setMinTime( int minTime ) + { + this->minTime = minTime; + } // Marks the start of a new benchmark void @@ -424,10 +454,10 @@ public: if( verbose ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); - result.time = timeFunction( compute, reset, loops, monitor ); + result.time = timeFunction( compute, reset, loops, minTime, monitor ); } else { - result.time = timeFunction( compute, reset, loops, monitor ); + result.time = timeFunction( compute, reset, minTime, loops, monitor ); } } catch ( const std::exception& e ) { @@ -477,7 +507,7 @@ public: } protected: - int loops; + int loops, minTime = 1; double datasetSize = 0.0; double baseTime = 0.0; Solvers::IterativeSolverMonitor< double, int > monitor; -- GitLab From 7dce286c528896f487b13788c27613a3b2b07d02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Thu, 20 Dec 2018 22:02:46 +0100 Subject: [PATCH 071/130] Fixed grid traversers benchmark. 
--- src/Benchmarks/Traversers/WriteOne.h | 109 ------------------ src/Benchmarks/Traversers/grid-traversing.h | 20 +--- .../Traversers/tnl-benchmark-traversers.h | 86 +++++++++----- src/Benchmarks/scripts/CMakeLists.txt | 15 +-- 4 files changed, 63 insertions(+), 167 deletions(-) delete mode 100644 src/Benchmarks/Traversers/WriteOne.h diff --git a/src/Benchmarks/Traversers/WriteOne.h b/src/Benchmarks/Traversers/WriteOne.h deleted file mode 100644 index 4c39926aa..000000000 --- a/src/Benchmarks/Traversers/WriteOne.h +++ /dev/null @@ -1,109 +0,0 @@ -/*************************************************************************** - WriteOne.h - description - ------------------- - begin : Dec 19, 2018 - copyright : (C) 2018 by oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Tomas Oberhuber - -#pragma once - -#include -#include -#include -#include - -namespace TNL { - namespace Benchmarks { - - -template< int Dimenions, - typename Device, - typename Real, - typename Index > -class WriteOne{}; - -template< typename Device, - typename Real, - typename Index > -class WriteOne< 1, Device, Real, Index > -{ - public: - - using Vector = Containers::Vector< Real, Device, Index >; - - static void run( std::size_t size ) - { - Vector v( size ); - auto writeOne = [] __cuda_callable__ ( Index i, Real* data ) - { - data[ i ] = 1.0; - }; - - ParallelFor< Device >::exec( ( std::size_t ) 0, size, writeOne, v.getData() ); - } -}; - - -template< typename Device, - typename Real, - typename Index > -class WriteOne< 2, Device, Real, Index > -{ - public: - - using Vector = Containers::Vector< Real, Device, Index >; - - static void run( std::size_t size ) - { - Vector v( size * size ); - auto writeOne = [=] __cuda_callable__ ( Index i, Index j, Real* data ) - { - data[ i * size + j ] = 1.0; - }; - - ParallelFor2D< Device >::exec( ( std::size_t ) 0, 
- ( std::size_t ) 0, - size, - size, - writeOne, v.getData() ); - } -}; - -template< typename Device, - typename Real, - typename Index > -class WriteOne< 3, Device, Real, Index > -{ - public: - - using Vector = Containers::Vector< Real, Device, Index >; - - static void run( std::size_t size ) - { - Vector v( size * size * size ); - auto writeOne = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) - { - data[ ( i * size + j ) * size + k ] = 1.0; - }; - - ParallelFor3D< Device >::exec( ( std::size_t ) 0, - ( std::size_t ) 0, - ( std::size_t ) 0, - size, - size, - size, - writeOne, v.getData() ); - } -}; - - - } // namespace Benchmarks -} // namespace TNL - - - diff --git a/src/Benchmarks/Traversers/grid-traversing.h b/src/Benchmarks/Traversers/grid-traversing.h index df45b1d7f..c977fea1c 100644 --- a/src/Benchmarks/Traversers/grid-traversing.h +++ b/src/Benchmarks/Traversers/grid-traversing.h @@ -13,7 +13,7 @@ #pragma once #include "../Benchmarks.h" -#include "WriteOne.h" + #include @@ -29,24 +29,6 @@ class benchmarkTraversingFullGrid static void run ( Benchmark& benchmark, std::size_t size ) { - auto reset = [&]() - {}; - - auto testHost = [&] () - { - WriteOne< Dimension, Devices::Host, Real, Index >::run( size ); - }; - - auto testCuda = [&] () - { - WriteOne< Dimension, Devices::Cuda, Real, Index >::run( size ); - }; - - benchmark.setOperation( "writeOne", size * sizeof( Real ) ); - benchmark.time( reset, "CPU", testHost ); -#ifdef HAVE_CUDA - benchmark.time( reset, "GPU", testCuda ); -#endif } }; diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index e227a258d..3e13d52dd 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -13,7 +13,8 @@ #pragma once #include "../Benchmarks.h" -#include "grid-traversing.h" +//#include "grid-traversing.h" +#include "GridTraversersBenchmark.h" #include #include @@ -23,29 
+24,10 @@ using namespace TNL; using namespace TNL::Benchmarks; -void setupConfig( Config::ConfigDescription& config ) -{ - config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log"); - config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); - config.addEntryEnum( "append" ); - config.addEntryEnum( "overwrite" ); - config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); - config.addEntryEnum( "float" ); - config.addEntryEnum( "double" ); - config.addEntryEnum( "all" ); - config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 ); - config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 ); - config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 ); - config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 ); - config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); - config.addEntry< int >( "verbose", "Verbose mode.", 1 ); - - config.addDelimiter( "Device settings:" ); - Devices::Host::configSetup( config ); - Devices::Cuda::configSetup( config ); -} -template< int Dimension > +template< int Dimension, + typename Real = float, + typename Index = int > bool runBenchmark( const Config::ParameterContainer& parameters, Benchmark& benchmark, Benchmark::MetadataMap& metadata ) @@ -62,14 +44,59 @@ bool runBenchmark( const Config::ParameterContainer& parameters, benchmark.newBenchmark( String("Full grid traversing " + convertToString( Dimension ) + "D" ), metadata ); for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { - benchmark.setMetadataColumns( Benchmark::MetadataColumns({ - {"size", convertToString( size ) }, - } )); - benchmarkTraversingFullGrid< Dimension 
>::run( benchmark, size ); + + GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size ); + GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size ); + + auto reset = [&]() {}; + + benchmark.setMetadataColumns( + Benchmark::MetadataColumns( + { {"size", convertToString( size ) }, } ) ); + + auto hostWriteOne = [&] () + { + hostTraverserBenchmark.writeOne(); + }; + + auto cudaWriteOne = [&] () + { + cudaTraverserBenchmark.writeOne(); + }; + + benchmark.setOperation( "writeOne", size * sizeof( Real ) ); + benchmark.time( reset, "CPU", hostWriteOne ); +#ifdef HAVE_CUDA + benchmark.time( reset, "GPU", cudaWriteOne ); +#endif + } return true; } +void setupConfig( Config::ConfigDescription& config ) +{ + config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log"); + config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); + config.addEntryEnum( "append" ); + config.addEntryEnum( "overwrite" ); + config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); + config.addEntryEnum( "float" ); + config.addEntryEnum( "double" ); + config.addEntryEnum( "all" ); + config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 ); + config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 ); + config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 ); + config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. 
First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 ); + config.addEntry< bool >( "verbose", "Verbose mode.", true ); + + Benchmark::configSetup( config ); + + config.addDelimiter( "Device settings:" ); + Devices::Host::configSetup( config ); + Devices::Cuda::configSetup( config ); +} + template< int Dimension > bool setupBenchmark( const Config::ParameterContainer& parameters ) { @@ -77,10 +104,9 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) const String & outputMode = parameters.getParameter< String >( "output-mode" ); const String & precision = parameters.getParameter< String >( "precision" ); const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" ); - const unsigned loops = parameters.getParameter< unsigned >( "loops" ); - const unsigned verbose = parameters.getParameter< unsigned >( "verbose" ); + - Benchmark benchmark( loops, verbose ); + Benchmark benchmark; //( loops, verbose ); Benchmark::MetadataMap metadata = getHardwareMetadata(); runBenchmark< Dimension >( parameters, benchmark, metadata ); diff --git a/src/Benchmarks/scripts/CMakeLists.txt b/src/Benchmarks/scripts/CMakeLists.txt index 1388c7984..31acdeb7d 100644 --- a/src/Benchmarks/scripts/CMakeLists.txt +++ b/src/Benchmarks/scripts/CMakeLists.txt @@ -1,16 +1,13 @@ -INSTALL( FILES matrix-market - florida-matrix-market - get-matrices - convert-matrices - draw-matrices +INSTALL( FILES tnl-run-heat-equation-benchmark + run-tnl-benchmark-spmv + run-tnl-benchmark-traversers run-matrix-solvers-benchmark run-tnl-benchmark-spmv run-tnl-benchmark-linear-solvers - tnl-run-heat-equation-benchmark - cuda-profiler.conf - process-cuda-profile.pl + DESTINATION ${TNL_TARGET_DATA_DIRECTORY}/benchmark-scripts ) -INSTALL( FILES tnl-run-spmv-benchmark +INSTALL( FILES run-tnl-benchmark-spmv + run-tnl-benchmark-traversers DESTINATION bin PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ 
WORLD_EXECUTE ) -- GitLab From 524483f7ab1a57a8f3a99c9ed52fb741c3ca4641 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 21 Dec 2018 19:57:15 +0100 Subject: [PATCH 072/130] Added script for running traversers benchmark. Fixing traversers benchmark. --- src/Benchmarks/Benchmarks.h | 2 +- .../Traversers/GridTraversersBenchmark.h | 137 ++++++++++++++++++ .../Traversers/tnl-benchmark-traversers.h | 1 + .../scripts/run-tnl-benchmark-traversers | 5 + 4 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 src/Benchmarks/Traversers/GridTraversersBenchmark.h create mode 100644 src/Benchmarks/scripts/run-tnl-benchmark-traversers diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 39973d0ba..13ba3a6d1 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -56,7 +56,7 @@ timeFunction( ComputeFunction compute, int i; for( i = 0; - i < loops && ( ! minTime || timer.getRealTime() < ( double ) minTime ); + i < loops || timer.getRealTime() < ( double ) minTime; ++i) { // abuse the monitor's "time" for loops diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h new file mode 100644 index 000000000..3302c4cb9 --- /dev/null +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -0,0 +1,137 @@ +/*************************************************************************** + WriteOne.h - description + ------------------- + begin : Dec 19, 2018 + copyright : (C) 2018 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include +#include +#include +#include + +namespace TNL { + namespace Benchmarks { + + +template< int Dimension, + typename Device, + typename Real, + typename Index > +class GridTraversersBenchmark{}; + +template< typename Device, + 
typename Real, + typename Index > +class GridTraversersBenchmark< 1, Device, Real, Index > +{ + public: + + using Vector = Containers::Vector< Real, Device, Index >; + + GridTraversersBenchmark( Index size ) + :v( size ), size( size ) + {} + + void writeOne() + { + + auto f = [] __cuda_callable__ ( Index i, Real* data ) + { + data[ i ] = i; + }; + + ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); + } + + protected: + + Index size; + Vector v; +}; + + +template< typename Device, + typename Real, + typename Index > +class GridTraversersBenchmark< 2, Device, Real, Index > +{ + public: + + using Vector = Containers::Vector< Real, Device, Index >; + + GridTraversersBenchmark( Index size ) + :size( size ), v( size * size ) { } + + void writeOne() + { + Index _size = this->size; + auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) + { + data[ i * _size + j ] = i + j; + }; + + ParallelFor2D< Device >::exec( ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + f, v.getData() ); + } + + protected: + + Index size; + + Vector v; + +}; + +template< typename Device, + typename Real, + typename Index > +class GridTraversersBenchmark< 3, Device, Real, Index > +{ + public: + + using Vector = Containers::Vector< Real, Device, Index >; + + GridTraversersBenchmark( Index size ) + : size( size ), v( size * size * size ) {} + + void writeOne() + { + Index _size = this->size; + auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) + { + data[ ( i * _size + j ) * _size + k ] = i + j + k; + }; + + ParallelFor3D< Device >::exec( ( Index ) 0, + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + this->size, + f, v.getData() ); + } + + protected: + + Index size; + Vector v; + +}; + + + } // namespace Benchmarks +} // namespace TNL + + + diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 3e13d52dd..9b69a3163 100644 --- 
a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -107,6 +107,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) Benchmark benchmark; //( loops, verbose ); + benchmark.setup( parameters ); Benchmark::MetadataMap metadata = getHardwareMetadata(); runBenchmark< Dimension >( parameters, benchmark, metadata ); diff --git a/src/Benchmarks/scripts/run-tnl-benchmark-traversers b/src/Benchmarks/scripts/run-tnl-benchmark-traversers new file mode 100644 index 000000000..00cd1e1ac --- /dev/null +++ b/src/Benchmarks/scripts/run-tnl-benchmark-traversers @@ -0,0 +1,5 @@ +#!/bin/bash + +tnl-benchmark-traversers --dimension 1 --loops 1 --min-size 16 --max-size 100000 --min-time 1 +tnl-benchmark-traversers --dimension 2 --loops 1 --min-size 16 --max-size 10000 --min-time 1 --output-mode append +tnl-benchmark-traversers --dimension 3 --loops 1 --min-size 16 --max-size 1000 --min-time 1 --output-mode append -- GitLab From 2c19ec9a67b7e7ec6602323685b5b5411448c96c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 21 Dec 2018 21:46:04 +0100 Subject: [PATCH 073/130] Added constructors with dimensions to grids. --- src/TNL/Meshes/GridDetails/Grid1D.h | 2 ++ src/TNL/Meshes/GridDetails/Grid1D_impl.h | 11 +++++++++++ src/TNL/Meshes/GridDetails/Grid2D.h | 2 ++ src/TNL/Meshes/GridDetails/Grid2D_impl.h | 14 ++++++++++++++ src/TNL/Meshes/GridDetails/Grid3D.h | 2 ++ src/TNL/Meshes/GridDetails/Grid3D_impl.h | 22 ++++++++++++++++++++++ 6 files changed, 53 insertions(+) diff --git a/src/TNL/Meshes/GridDetails/Grid1D.h b/src/TNL/Meshes/GridDetails/Grid1D.h index 426428ae4..9a8f14600 100644 --- a/src/TNL/Meshes/GridDetails/Grid1D.h +++ b/src/TNL/Meshes/GridDetails/Grid1D.h @@ -60,6 +60,8 @@ class Grid< 1, Real, Device, Index > : public Object * \brief Basic constructor. 
*/ Grid(); + + Grid( const Index xSize ); /** * \brief Returns type of grid Real (value), Device type and the type of Index. diff --git a/src/TNL/Meshes/GridDetails/Grid1D_impl.h b/src/TNL/Meshes/GridDetails/Grid1D_impl.h index 1754edc58..995fa6dab 100644 --- a/src/TNL/Meshes/GridDetails/Grid1D_impl.h +++ b/src/TNL/Meshes/GridDetails/Grid1D_impl.h @@ -33,6 +33,17 @@ Grid< 1, Real, Device, Index >::Grid() { } +template< typename Real, + typename Device, + typename Index > +Grid< 1, Real, Device, Index >::Grid( const Index xSize ) +: numberOfCells( 0 ), + numberOfVertices( 0 ), + distGrid(nullptr) +{ + this->setDimensions( xSize ); +} + template< typename Real, typename Device, typename Index > diff --git a/src/TNL/Meshes/GridDetails/Grid2D.h b/src/TNL/Meshes/GridDetails/Grid2D.h index 84c6b4f33..896b61548 100644 --- a/src/TNL/Meshes/GridDetails/Grid2D.h +++ b/src/TNL/Meshes/GridDetails/Grid2D.h @@ -61,6 +61,8 @@ class Grid< 2, Real, Device, Index > : public Object /** * \brief See Grid1D::getType(). 
*/ + Grid( const Index xSize, const Index ySize ); + static String getType(); /** diff --git a/src/TNL/Meshes/GridDetails/Grid2D_impl.h b/src/TNL/Meshes/GridDetails/Grid2D_impl.h index b315d5d08..49ad91035 100644 --- a/src/TNL/Meshes/GridDetails/Grid2D_impl.h +++ b/src/TNL/Meshes/GridDetails/Grid2D_impl.h @@ -36,6 +36,20 @@ Grid< 2, Real, Device, Index > :: Grid() { } +template< typename Real, + typename Device, + typename Index > +Grid< 2, Real, Device, Index >::Grid( const Index xSize, const Index ySize ) +: numberOfCells( 0 ), + numberOfNxFaces( 0 ), + numberOfNyFaces( 0 ), + numberOfFaces( 0 ), + numberOfVertices( 0 ), + distGrid(nullptr) +{ + this->setDimensions( xSize, ySize ); +} + template< typename Real, typename Device, typename Index > diff --git a/src/TNL/Meshes/GridDetails/Grid3D.h b/src/TNL/Meshes/GridDetails/Grid3D.h index 565198077..3ddd44735 100644 --- a/src/TNL/Meshes/GridDetails/Grid3D.h +++ b/src/TNL/Meshes/GridDetails/Grid3D.h @@ -57,6 +57,8 @@ class Grid< 3, Real, Device, Index > : public Object * \brief See Grid1D::Grid(). */ Grid(); + + Grid( const Index xSize, const Index ySize, const Index zSize ); /** * \brief See Grid1D::getType(). 
diff --git a/src/TNL/Meshes/GridDetails/Grid3D_impl.h b/src/TNL/Meshes/GridDetails/Grid3D_impl.h index cc6805ac0..edbee0c00 100644 --- a/src/TNL/Meshes/GridDetails/Grid3D_impl.h +++ b/src/TNL/Meshes/GridDetails/Grid3D_impl.h @@ -43,6 +43,28 @@ Grid< 3, Real, Device, Index > :: Grid() { } +template< typename Real, + typename Device, + typename Index > +Grid< 3, Real, Device, Index >::Grid( const Index xSize, const Index ySize, const Index zSize ) +: numberOfCells( 0 ), + numberOfNxFaces( 0 ), + numberOfNyFaces( 0 ), + numberOfNzFaces( 0 ), + numberOfNxAndNyFaces( 0 ), + numberOfFaces( 0 ), + numberOfDxEdges( 0 ), + numberOfDyEdges( 0 ), + numberOfDzEdges( 0 ), + numberOfDxAndDyEdges( 0 ), + numberOfEdges( 0 ), + numberOfVertices( 0 ), + distGrid(nullptr) +{ + this->setDimensions( xSize, ySize, zSize ); +} + + template< typename Real, typename Device, typename Index > -- GitLab From 920d7c1820fe783fd0465abd9bfd4948bbbf1e2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 21 Dec 2018 21:47:40 +0100 Subject: [PATCH 074/130] Fixed memory bandwidth in traversers benchmark. 
--- .../Traversers/GridTraversersBenchmark.h | 63 +++++++++++++++---- .../Traversers/tnl-benchmark-traversers.h | 14 ++--- src/TNL/Meshes/GridDetails/Grid2D.h | 4 +- src/TNL/Meshes/GridDetails/Grid2D_impl.h | 2 +- src/TNL/Meshes/GridDetails/Grid3D.h | 4 +- 5 files changed, 64 insertions(+), 23 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index 3302c4cb9..6f1019deb 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -16,6 +16,10 @@ #include #include #include +#include +#include +#include +#include namespace TNL { namespace Benchmarks { @@ -35,26 +39,52 @@ class GridTraversersBenchmark< 1, Device, Real, Index > public: using Vector = Containers::Vector< Real, Device, Index >; + using Grid = Meshes::Grid< 1, Real, Device, Index >; + using Coordinates = typename Grid::CoordinatesType; + using MeshFunction = Functions::MeshFunction< Grid >; + using Cell = typename Grid::EntityType< 1, Meshes::GridEntityNoStencilStorage >; + using Traverser = Meshes::Traverser< Grid, Cell >; GridTraversersBenchmark( Index size ) - :v( size ), size( size ) - {} + :v( size ), size( size ), grid( size ) + { + } - void writeOne() + void writeOneUsingParallelFor() { auto f = [] __cuda_callable__ ( Index i, Real* data ) { - data[ i ] = i; + data[ i ] = 1.0; }; ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } + void writeOneUsingTraverser() + { + class EntitiesProcessor + { + + }; + + class UserData + { + + }; + + Traverser traverser; + /*traverser.template processAllEntities< UserData, EntitiesProcessor > + ( meshPointer, + userData );*/ + + } + protected: Index size; Vector v; + Grid grid; }; @@ -66,16 +96,20 @@ class GridTraversersBenchmark< 2, Device, Real, Index > public: using Vector = Containers::Vector< Real, Device, Index >; + using Grid = Meshes::Grid< 2, Real, Device, Index >; + using Coordinates = typename 
Grid::CoordinatesType; GridTraversersBenchmark( Index size ) - :size( size ), v( size * size ) { } + :size( size ), v( size * size ), grid( size, size ) + { + } - void writeOne() + void writeOneUsingParallelFor() { Index _size = this->size; auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) { - data[ i * _size + j ] = i + j; + data[ i * _size + j ] = 1.0; }; ParallelFor2D< Device >::exec( ( Index ) 0, @@ -88,8 +122,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index > protected: Index size; - Vector v; + Grid grid; }; @@ -101,16 +135,22 @@ class GridTraversersBenchmark< 3, Device, Real, Index > public: using Vector = Containers::Vector< Real, Device, Index >; + using Grid = Meshes::Grid< 3, Real, Device, Index >; + using Coordinates = typename Grid::CoordinatesType; GridTraversersBenchmark( Index size ) - : size( size ), v( size * size * size ) {} + : size( size ), + v( size * size * size ), + grid( size, size, size ) + { + } - void writeOne() + void writeOneUsingParallelFor() { Index _size = this->size; auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) { - data[ ( i * _size + j ) * _size + k ] = i + j + k; + data[ ( i * _size + j ) * _size + k ] = 1.0; }; ParallelFor3D< Device >::exec( ( Index ) 0, @@ -126,6 +166,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > Index size; Vector v; + Grid grid; }; diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 9b69a3163..c6349f596 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -54,20 +54,20 @@ bool runBenchmark( const Config::ParameterContainer& parameters, Benchmark::MetadataColumns( { {"size", convertToString( size ) }, } ) ); - auto hostWriteOne = [&] () + auto hostWriteOneUsingParallelFor = [&] () { - hostTraverserBenchmark.writeOne(); + hostTraverserBenchmark.writeOneUsingParallelFor(); }; - auto cudaWriteOne = 
[&] () + auto cudaWriteOneUsingParallelFor = [&] () { - cudaTraverserBenchmark.writeOne(); + cudaTraverserBenchmark.writeOneUsingParallelFor(); }; - benchmark.setOperation( "writeOne", size * sizeof( Real ) ); - benchmark.time( reset, "CPU", hostWriteOne ); + benchmark.setOperation( "write 1 using parallel for", size * sizeof( Real ) / oneGB ); + benchmark.time( reset, "CPU", hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA - benchmark.time( reset, "GPU", cudaWriteOne ); + benchmark.time( reset, "GPU", cudaWriteOneUsingParallelFor ); #endif } diff --git a/src/TNL/Meshes/GridDetails/Grid2D.h b/src/TNL/Meshes/GridDetails/Grid2D.h index 896b61548..f2dbebc5c 100644 --- a/src/TNL/Meshes/GridDetails/Grid2D.h +++ b/src/TNL/Meshes/GridDetails/Grid2D.h @@ -82,8 +82,8 @@ class Grid< 2, Real, Device, Index > : public Object /** * \brief Sets the size of dimensions. - * \param xSize Size of dimesion x. - * \param ySize Size of dimesion y. + * \param xSize Size of dimension x. + * \param ySize Size of dimension y. */ void setDimensions( const Index xSize, const Index ySize ); diff --git a/src/TNL/Meshes/GridDetails/Grid2D_impl.h b/src/TNL/Meshes/GridDetails/Grid2D_impl.h index 49ad91035..41e05d8b5 100644 --- a/src/TNL/Meshes/GridDetails/Grid2D_impl.h +++ b/src/TNL/Meshes/GridDetails/Grid2D_impl.h @@ -43,7 +43,7 @@ Grid< 2, Real, Device, Index >::Grid( const Index xSize, const Index ySize ) : numberOfCells( 0 ), numberOfNxFaces( 0 ), numberOfNyFaces( 0 ), - numberOfFaces( 0 ), + numberOfFaces( 0 ), numberOfVertices( 0 ), distGrid(nullptr) { diff --git a/src/TNL/Meshes/GridDetails/Grid3D.h b/src/TNL/Meshes/GridDetails/Grid3D.h index 3ddd44735..617efe7f3 100644 --- a/src/TNL/Meshes/GridDetails/Grid3D.h +++ b/src/TNL/Meshes/GridDetails/Grid3D.h @@ -57,8 +57,8 @@ class Grid< 3, Real, Device, Index > : public Object * \brief See Grid1D::Grid(). 
*/ Grid(); - - Grid( const Index xSize, const Index ySize, const Index zSize ); + + Grid( const Index xSize, const Index ySize, const Index zSize ); /** * \brief See Grid1D::getType(). -- GitLab From de8f034c210691e4dc8a5725159f3897cd01c315 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 21 Dec 2018 22:20:28 +0100 Subject: [PATCH 075/130] Changing minTime in Benchmark from int to double. --- src/Benchmarks/Benchmarks.h | 13 +++++++------ src/Benchmarks/Traversers/GridTraversersBenchmark.h | 12 +++++------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 13ba3a6d1..61452d074 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -41,7 +41,7 @@ double timeFunction( ComputeFunction compute, ResetFunction reset, int loops, - int minTime, + const double& minTime, Monitor && monitor = Monitor() ) { // the timer is constructed zero-initialized and stopped @@ -56,7 +56,7 @@ timeFunction( ComputeFunction compute, int i; for( i = 0; - i < loops || timer.getRealTime() < ( double ) minTime; + i < loops || timer.getRealTime() < minTime; ++i) { // abuse the monitor's "time" for loops @@ -330,13 +330,13 @@ public: static void configSetup( Config::ConfigDescription& config ) { config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); - config.addEntry< int >( "min-time", "Minimal real time in seconds for every computation.", 1 ); + config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 1 ); } void setup( const Config::ParameterContainer& parameters ) { this->loops = parameters.getParameter< unsigned >( "loops" ); - this->minTime = parameters.getParameter< unsigned >( "min-time" ); + this->minTime = parameters.getParameter< double >( "min-time" ); const unsigned verbose = parameters.getParameter< unsigned >( "verbose" ); Logging::setVerbose( verbose ); } @@ -348,7 +348,7 @@ public: 
this->loops = loops; } - void setMinTime( int minTime ) + void setMinTime( const double& minTime ) { this->minTime = minTime; } @@ -507,7 +507,8 @@ public: } protected: - int loops, minTime = 1; + int loops = 1; + double minTime = 1; double datasetSize = 0.0; double baseTime = 0.0; Solvers::IterativeSolverMonitor< double, int > monitor; diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index 6f1019deb..dcb6f5fdd 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -65,23 +65,21 @@ class GridTraversersBenchmark< 1, Device, Real, Index > { class EntitiesProcessor { - }; - + class UserData { - }; - + Traverser traverser; /*traverser.template processAllEntities< UserData, EntitiesProcessor > ( meshPointer, userData );*/ - + } - + protected: - + Index size; Vector v; Grid grid; -- GitLab From f983e9d78fd09e51d8b029382c435e64a2054f4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 22 Dec 2018 09:28:11 +0100 Subject: [PATCH 076/130] Fixed indexing and data set size in traversers benchmark. 
--- src/Benchmarks/Traversers/GridTraversersBenchmark.h | 4 ++-- src/Benchmarks/Traversers/tnl-benchmark-traversers.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index dcb6f5fdd..735d0a241 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -105,7 +105,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > void writeOneUsingParallelFor() { Index _size = this->size; - auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) + auto f = [=] __cuda_callable__ ( Index j, Index i, Real* data ) { data[ i * _size + j ] = 1.0; }; @@ -146,7 +146,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > void writeOneUsingParallelFor() { Index _size = this->size; - auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) + auto f = [=] __cuda_callable__ ( Index k, Index j, Index i, Real* data ) { data[ ( i * _size + j ) * _size + k ] = 1.0; }; diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index c6349f596..6f9a4575a 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -64,7 +64,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.writeOneUsingParallelFor(); }; - benchmark.setOperation( "write 1 using parallel for", size * sizeof( Real ) / oneGB ); + benchmark.setOperation( "write 1 using parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time( reset, "CPU", hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA benchmark.time( reset, "GPU", cudaWriteOneUsingParallelFor ); -- GitLab From 36ebcef0cb38a9e949c13f2dc54618c0c6da6c7b Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Sun, 23 Dec 2018 13:34:55 +0100 Subject: 
[PATCH 077/130] Fixed traversers benchmark test using traverser. --- .../Traversers/GridTraversersBenchmark.h | 107 ++++++++++++++---- .../Traversers/tnl-benchmark-traversers.h | 25 +++- 2 files changed, 110 insertions(+), 22 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index 735d0a241..0190532c3 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -20,11 +20,43 @@ #include #include #include +#include namespace TNL { namespace Benchmarks { +template< typename TraverserUserData > +class WriteOneEntitiesProcessor +{ + public: + + using MeshType = typename TraverserUserData::MeshType; + using DeviceType = typename MeshType::DeviceType; + + template< typename GridEntity > + __cuda_callable__ + static inline void processEntity( const MeshType& mesh, + TraverserUserData& userData, + const GridEntity& entity ) + { + auto& u = userData.u.template modifyData< DeviceType >(); + u( entity ) = 1.0; + } +}; + +template< typename MeshFunctionPointer > +class WriteOneUserData +{ + public: + + using MeshType = typename MeshFunctionPointer::ObjectType::MeshType; + + MeshFunctionPointer u; + +}; + + template< int Dimension, typename Device, typename Real, @@ -40,14 +72,19 @@ class GridTraversersBenchmark< 1, Device, Real, Index > using Vector = Containers::Vector< Real, Device, Index >; using Grid = Meshes::Grid< 1, Real, Device, Index >; + using GridPointer = Pointers::SharedPointer< Grid >; using Coordinates = typename Grid::CoordinatesType; using MeshFunction = Functions::MeshFunction< Grid >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; using Cell = typename Grid::EntityType< 1, Meshes::GridEntityNoStencilStorage >; using Traverser = Meshes::Traverser< Grid, Cell >; + using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; + using WriteOneEntitiesProcessorType = 
WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; GridTraversersBenchmark( Index size ) - :v( size ), size( size ), grid( size ) + :v( size ), size( size ), grid( size ), u( grid ) { + userData.u = this->u; } void writeOneUsingParallelFor() @@ -63,26 +100,18 @@ class GridTraversersBenchmark< 1, Device, Real, Index > void writeOneUsingTraverser() { - class EntitiesProcessor - { - }; - - class UserData - { - }; - - Traverser traverser; - /*traverser.template processAllEntities< UserData, EntitiesProcessor > - ( meshPointer, - userData );*/ - + traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + ( grid, userData ); } protected: Index size; Vector v; - Grid grid; + GridPointer grid; + MeshFunctionPointer u; + Traverser traverser; + WriteOneTraverserUserDataType userData; }; @@ -95,11 +124,20 @@ class GridTraversersBenchmark< 2, Device, Real, Index > using Vector = Containers::Vector< Real, Device, Index >; using Grid = Meshes::Grid< 2, Real, Device, Index >; + using GridPointer = Pointers::SharedPointer< Grid >; using Coordinates = typename Grid::CoordinatesType; + using MeshFunction = Functions::MeshFunction< Grid >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using Cell = typename Grid::EntityType< 2, Meshes::GridEntityNoStencilStorage >; + using Traverser = Meshes::Traverser< Grid, Cell >; + using TraverserUserData = WriteOneUserData< MeshFunctionPointer >; + using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; + using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; GridTraversersBenchmark( Index size ) - :size( size ), v( size * size ), grid( size, size ) + :size( size ), v( size * size ), grid( size, size ), u( grid ) { + userData.u = this->u; } void writeOneUsingParallelFor() @@ -116,13 +154,22 @@ class GridTraversersBenchmark< 2, Device, Real, Index > this->size, f, v.getData() ); } + + void 
writeOneUsingTraverser() + { + traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + ( grid, userData ); + } + protected: Index size; Vector v; - Grid grid; - + GridPointer grid; + MeshFunctionPointer u; + Traverser traverser; + WriteOneTraverserUserDataType userData; }; template< typename Device, @@ -134,13 +181,23 @@ class GridTraversersBenchmark< 3, Device, Real, Index > using Vector = Containers::Vector< Real, Device, Index >; using Grid = Meshes::Grid< 3, Real, Device, Index >; + using GridPointer = Pointers::SharedPointer< Grid >; using Coordinates = typename Grid::CoordinatesType; + using MeshFunction = Functions::MeshFunction< Grid >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using Cell = typename Grid::EntityType< 3, Meshes::GridEntityNoStencilStorage >; + using Traverser = Meshes::Traverser< Grid, Cell >; + using TraverserUserData = WriteOneUserData< MeshFunctionPointer >; + using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; + using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; GridTraversersBenchmark( Index size ) : size( size ), v( size * size * size ), - grid( size, size, size ) + grid( size, size, size ), + u( grid ) { + userData.u = this->u; } void writeOneUsingParallelFor() @@ -159,13 +216,21 @@ class GridTraversersBenchmark< 3, Device, Real, Index > this->size, f, v.getData() ); } + + void writeOneUsingTraverser() + { + traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + ( grid, userData ); + } protected: Index size; Vector v; - Grid grid; - + GridPointer grid; + MeshFunctionPointer u; + Traverser traverser; + WriteOneTraverserUserDataType userData; }; diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 6f9a4575a..4f839faf7 100644 --- 
a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -54,6 +54,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters, Benchmark::MetadataColumns( { {"size", convertToString( size ) }, } ) ); + /**** + * Write one using parallel for + */ auto hostWriteOneUsingParallelFor = [&] () { hostTraverserBenchmark.writeOneUsingParallelFor(); @@ -69,6 +72,26 @@ bool runBenchmark( const Config::ParameterContainer& parameters, #ifdef HAVE_CUDA benchmark.time( reset, "GPU", cudaWriteOneUsingParallelFor ); #endif + + /**** + * Write one using traverser + */ + auto hostWriteOneUsingTraverser = [&] () + { + hostTraverserBenchmark.writeOneUsingTraverser(); + }; + + auto cudaWriteOneUsingTraverser = [&] () + { + cudaTraverserBenchmark.writeOneUsingTraverser(); + }; + + benchmark.setOperation( "write 1 using traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time( reset, "CPU", hostWriteOneUsingTraverser ); +#ifdef HAVE_CUDA + benchmark.time( reset, "GPU", cudaWriteOneUsingTraverser ); +#endif + } return true; @@ -76,7 +99,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, void setupConfig( Config::ConfigDescription& config ) { - config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log"); + config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-traversers.log"); config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); config.addEntryEnum( "append" ); config.addEntryEnum( "overwrite" ); -- GitLab From e3225772a3dd703c11bbb450550b2f723f57bd3e Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Mon, 24 Dec 2018 15:42:57 +0100 Subject: [PATCH 078/130] Changing verbose from bool to int to have three levels of verbosity in Benchmark. 
--- src/Benchmarks/Benchmarks.h | 16 +++++++++------- .../Traversers/tnl-benchmark-traversers.h | 1 - 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 61452d074..7a6b12676 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -48,12 +48,13 @@ timeFunction( ComputeFunction compute, Timer timer; // set timer to the monitor - monitor.setTimer( timer ); + //monitor.setTimer( timer ); // warm up reset(); compute(); + //timer.start(); int i; for( i = 0; i < loops || timer.getRealTime() < minTime; @@ -91,12 +92,12 @@ public: using HeaderElements = std::vector< String >; using RowElements = std::vector< double >; - Logging( bool verbose = true ) + Logging( int verbose = true ) : verbose(verbose) {} void - setVerbose( bool verbose) + setVerbose( int verbose) { this->verbose = verbose; } @@ -286,7 +287,7 @@ protected: std::string header_indent; std::string body_indent; - bool verbose; + int verbose; MetadataColumns metadataColumns; bool header_changed = true; std::vector< std::pair< String, int > > horizontalGroups; @@ -331,13 +332,14 @@ public: { config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 1 ); + config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 ); } void setup( const Config::ParameterContainer& parameters ) { this->loops = parameters.getParameter< unsigned >( "loops" ); this->minTime = parameters.getParameter< double >( "min-time" ); - const unsigned verbose = parameters.getParameter< unsigned >( "verbose" ); + const int verbose = parameters.getParameter< unsigned >( "verbose" ); Logging::setVerbose( verbose ); } // TODO: ensure that this is not called in the middle of the benchmark @@ -451,13 +453,13 @@ public: { result.time = std::numeric_limits::quiet_NaN(); try { - if( verbose ) { + if( verbose 
> 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); result.time = timeFunction( compute, reset, loops, minTime, monitor ); } else { - result.time = timeFunction( compute, reset, minTime, loops, monitor ); + result.time = timeFunction( compute, reset, loops, minTime, monitor ); } } catch ( const std::exception& e ) { diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 4f839faf7..d9958e29c 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -111,7 +111,6 @@ void setupConfig( Config::ConfigDescription& config ) config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 ); config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 ); config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 ); - config.addEntry< bool >( "verbose", "Verbose mode.", true ); Benchmark::configSetup( config ); -- GitLab From 61a560fa7d4ebb96b4d8b5df62041ee7dfee6fbc Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Tue, 25 Dec 2018 13:11:48 +0100 Subject: [PATCH 079/130] Added pure-C test to traversers benchmark. 
--- src/Benchmarks/Benchmarks.h | 2 +- .../Traversers/GridTraversersBenchmark.h | 174 ++++++++++++++++-- .../Traversers/tnl-benchmark-traversers.h | 66 +++++-- 3 files changed, 208 insertions(+), 34 deletions(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 7a6b12676..c371e2dfb 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -48,7 +48,7 @@ timeFunction( ComputeFunction compute, Timer timer; // set timer to the monitor - //monitor.setTimer( timer ); + monitor.setTimer( timer ); // warm up reset(); diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index 0190532c3..ee18adfa6 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -53,9 +53,37 @@ class WriteOneUserData using MeshType = typename MeshFunctionPointer::ObjectType::MeshType; MeshFunctionPointer u; - }; - + +template< typename Real, + typename Index > +__global__ void simpleCudaKernel1D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( threadIdx_x < size ) + v_data[ threadIdx_x ] = 1.0; +} + +template< typename Real, + typename Index > +__global__ void simpleCudaKernel2D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + if( threadIdx_x < size && threadIdx_y < size ) + v_data[ threadIdx_y * size + threadIdx_x ] = 1.0; +} + +template< typename Real, + typename Index > +__global__ void simpleCudaKernel3D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * 
blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; + if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size ) + v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0; +} template< int Dimension, typename Device, @@ -85,19 +113,55 @@ class GridTraversersBenchmark< 1, Device, Real, Index > :v( size ), size( size ), grid( size ), u( grid ) { userData.u = this->u; + v_data = v.getData(); + } + + void reset() + { + v.setValue( 0.0 ); + u->getData().setValue( 0.0 ); + }; + + void writeOneUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + for( int i = 0; i < size; i++ ) + v_data[ i ] = 1.0; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size ); + dim3 gridIdx; + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + simpleCudaKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } } void writeOneUsingParallelFor() { - auto f = [] __cuda_callable__ ( Index i, Real* data ) { data[ i ] = 1.0; }; - ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } - + void writeOneUsingTraverser() { traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > @@ -108,6 +172,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > Index size; Vector v; + Real* v_data; GridPointer grid; MeshFunctionPointer u; Traverser traverser; @@ -133,11 +198,52 @@ class GridTraversersBenchmark< 2, Device, Real, Index > using TraverserUserData = WriteOneUserData< MeshFunctionPointer >; using WriteOneTraverserUserDataType 
= WriteOneUserData< MeshFunctionPointer >; using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; - + GridTraversersBenchmark( Index size ) :size( size ), v( size * size ), grid( size, size ), u( grid ) { userData.u = this->u; + v_data = v.getData(); + } + + void reset() + { + v.setValue( 0.0 ); + u->getData().setValue( 0.0 ); + }; + + void writeOneUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + for( int i = 0; i < size; i++ ) + for( int j = 0; j < size; j++ ) + v_data[ i * size + j ] = 1.0; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size, + size ); + dim3 gridIdx; + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + simpleCudaKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } } void writeOneUsingParallelFor() @@ -154,18 +260,18 @@ class GridTraversersBenchmark< 2, Device, Real, Index > this->size, f, v.getData() ); } - + void writeOneUsingTraverser() { traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > ( grid, userData ); } - protected: Index size; Vector v; + Real* v_data; GridPointer grid; MeshFunctionPointer u; Traverser traverser; @@ -178,7 +284,7 @@ template< typename Device, class GridTraversersBenchmark< 3, Device, Real, Index > { public: - + using Vector = Containers::Vector< Real, Device, Index >; using Grid = Meshes::Grid< 3, Real, Device, Index >; using GridPointer = Pointers::SharedPointer< Grid >; @@ -198,6 +304,50 @@ class GridTraversersBenchmark< 3, Device, Real, Index > u( grid ) { userData.u = this->u; + v_data = v.getData(); + } + + void reset() + { + v.setValue( 0.0 ); + 
u->getData().setValue( 0.0 ); + }; + + void writeOneUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + for( int i = 0; i < size; i++ ) + for( int j = 0; j < size; j++ ) + for( int k = 0; k < size; k++ ) + v_data[ ( i * size + j ) * size + k ] = 1.0; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size, + size, + size ); + dim3 gridIdx; + for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ ) + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + simpleCudaKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } } void writeOneUsingParallelFor() @@ -227,6 +377,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > Index size; Vector v; + Real* v_data; GridPointer grid; MeshFunctionPointer u; Traverser traverser; @@ -235,7 +386,4 @@ class GridTraversersBenchmark< 3, Device, Real, Index > } // namespace Benchmarks -} // namespace TNL - - - +} // namespace TNL \ No newline at end of file diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index d9958e29c..f1c4efeed 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -39,21 +39,50 @@ bool runBenchmark( const Config::ParameterContainer& parameters, // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); const int minSize = parameters.getParameter< int >( "min-size" ); const int maxSize = parameters.getParameter< int >( "max-size" ); - + // Full grid traversing - benchmark.newBenchmark( String("Full grid traversing " + convertToString( Dimension ) + "D" ), metadata ); + 
benchmark.newBenchmark( String("Full grid traversing - write 1" + convertToString( Dimension ) + "D" ), metadata ); for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size ); GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size ); - auto reset = [&]() {}; + auto noReset = []() {}; + + auto hostReset = [&]() + { + hostTraverserBenchmark.reset(); + }; + + auto cudaReset = [&]() + { + cudaTraverserBenchmark.reset(); + }; benchmark.setMetadataColumns( Benchmark::MetadataColumns( { {"size", convertToString( size ) }, } ) ); + /**** + * Write one using C for + */ + auto hostWriteOneUsingPureC = [&] () + { + hostTraverserBenchmark.writeOneUsingPureC(); + }; + + auto cudaWriteOneUsingPureC = [&] () + { + cudaTraverserBenchmark.writeOneUsingPureC(); + }; + + benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time( hostReset, "CPU", hostWriteOneUsingPureC ); +#ifdef HAVE_CUDA + benchmark.time( cudaReset, "GPU", cudaWriteOneUsingPureC ); +#endif + /**** * Write one using parallel for */ @@ -67,10 +96,10 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.writeOneUsingParallelFor(); }; - benchmark.setOperation( "write 1 using parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time( reset, "CPU", hostWriteOneUsingParallelFor ); + benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time( hostReset, "CPU", hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA - benchmark.time( reset, "GPU", cudaWriteOneUsingParallelFor ); + benchmark.time( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); #endif /**** @@ -84,16 +113,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters, auto cudaWriteOneUsingTraverser = 
[&] () { cudaTraverserBenchmark.writeOneUsingTraverser(); - }; - - benchmark.setOperation( "write 1 using traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time( reset, "CPU", hostWriteOneUsingTraverser ); + } + + benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time( hostReset, "CPU", hostWriteOneUsingTraverser ); #ifdef HAVE_CUDA - benchmark.time( reset, "GPU", cudaWriteOneUsingTraverser ); + benchmark.time( cudaReset, "GPU", cudaWriteOneUsingTraverser ); #endif - - - } + } return true; } @@ -107,16 +134,16 @@ void setupConfig( Config::ConfigDescription& config ) config.addEntryEnum( "float" ); config.addEntryEnum( "double" ); config.addEntryEnum( "all" ); - config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 ); + config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 ); config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 ); config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 ); config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. 
First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 ); Benchmark::configSetup( config ); - + config.addDelimiter( "Device settings:" ); Devices::Host::configSetup( config ); - Devices::Cuda::configSetup( config ); + Devices::Cuda::configSetup( config ); } template< int Dimension > @@ -126,18 +153,17 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) const String & outputMode = parameters.getParameter< String >( "output-mode" ); const String & precision = parameters.getParameter< String >( "precision" ); const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" ); - Benchmark benchmark; //( loops, verbose ); benchmark.setup( parameters ); Benchmark::MetadataMap metadata = getHardwareMetadata(); runBenchmark< Dimension >( parameters, benchmark, metadata ); - + auto mode = std::ios::out; if( outputMode == "append" ) mode |= std::ios::app; std::ofstream logFile( logFileName.getString(), mode ); - + if( ! benchmark.save( logFile ) ) { std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl; -- GitLab From 769a0dbb38586b45cacdf5979e5803d34dafbe7d Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Tue, 25 Dec 2018 17:42:34 +0100 Subject: [PATCH 080/130] CUDA device synchronization is performed only for CUDA benchmarks. 
--- src/Benchmarks/BLAS/array-operations.h | 24 ++-- src/Benchmarks/BLAS/spmv.h | 4 +- src/Benchmarks/BLAS/vector-operations.h | 58 +++++----- src/Benchmarks/Benchmarks.h | 103 ++++++++++-------- .../DistSpMV/tnl-benchmark-distributed-spmv.h | 4 +- src/Benchmarks/LinearSolvers/benchmarks.h | 4 +- .../Traversers/tnl-benchmark-traversers.h | 16 +-- 7 files changed, 111 insertions(+), 102 deletions(-) diff --git a/src/Benchmarks/BLAS/array-operations.h b/src/Benchmarks/BLAS/array-operations.h index 9ee6ff8a0..b5cf9ff58 100644 --- a/src/Benchmarks/BLAS/array-operations.h +++ b/src/Benchmarks/BLAS/array-operations.h @@ -72,9 +72,9 @@ benchmarkArrayOperations( Benchmark & benchmark, resultDevice = (int) deviceArray == deviceArray2; }; benchmark.setOperation( "comparison (operator==)", 2 * datasetSize ); - benchmark.time( reset1, "CPU", compareHost ); + benchmark.time< Devices::Host >( reset1, "CPU", compareHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", compareCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", compareCuda ); #endif @@ -87,9 +87,9 @@ benchmarkArrayOperations( Benchmark & benchmark, benchmark.setOperation( "copy (operator=)", 2 * datasetSize ); // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will // complain when compiling without CUDA - const double copyBasetime = benchmark.time( reset1, "CPU", copyAssignHostHost ); + const double copyBasetime = benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", copyAssignCudaCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", copyAssignCudaCuda ); #endif @@ -101,8 +101,8 @@ benchmarkArrayOperations( Benchmark & benchmark, }; #ifdef HAVE_CUDA benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime ); - benchmark.time( reset1, "CPU->GPU", copyAssignHostCuda ); - benchmark.time( reset1, "GPU->CPU", copyAssignCudaHost ); + benchmark.time< Devices::Cuda >( reset1, "CPU->GPU", copyAssignHostCuda ); + 
benchmark.time< Devices::Cuda >( reset1, "GPU->CPU", copyAssignCudaHost ); #endif @@ -113,9 +113,9 @@ benchmarkArrayOperations( Benchmark & benchmark, deviceArray.setValue( 3.0 ); }; benchmark.setOperation( "setValue", datasetSize ); - benchmark.time( reset1, "CPU", setValueHost ); + benchmark.time< Devices::Host >( reset1, "CPU", setValueHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", setValueCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", setValueCuda ); #endif @@ -132,9 +132,9 @@ benchmarkArrayOperations( Benchmark & benchmark, #endif }; benchmark.setOperation( "allocation (setSize)", datasetSize ); - benchmark.time( resetSize1, "CPU", setSizeHost ); + benchmark.time< Devices::Host >( resetSize1, "CPU", setSizeHost ); #ifdef HAVE_CUDA - benchmark.time( resetSize1, "GPU", setSizeCuda ); + benchmark.time< Devices::Cuda >( resetSize1, "GPU", setSizeCuda ); #endif @@ -151,9 +151,9 @@ benchmarkArrayOperations( Benchmark & benchmark, #endif }; benchmark.setOperation( "deallocation (reset)", datasetSize ); - benchmark.time( setSize1, "CPU", resetSizeHost ); + benchmark.time< Devices::Host >( setSize1, "CPU", resetSizeHost ); #ifdef HAVE_CUDA - benchmark.time( setSize1, "GPU", resetSizeCuda ); + benchmark.time< Devices::Cuda >( setSize1, "GPU", resetSizeCuda ); #endif return true; diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h index 5c3813b0a..966a4ec06 100644 --- a/src/Benchmarks/BLAS/spmv.h +++ b/src/Benchmarks/BLAS/spmv.h @@ -161,9 +161,9 @@ benchmarkSpMV( Benchmark & benchmark, }; benchmark.setOperation( datasetSize ); - benchmark.time( reset, "CPU", spmvHost ); + benchmark.time< Devices::Host >( reset, "CPU", spmvHost ); #ifdef HAVE_CUDA - benchmark.time( reset, "GPU", spmvCuda ); + benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda ); #endif return true; diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h index 8dd63de85..e191b8fbb 100644 --- 
a/src/Benchmarks/BLAS/vector-operations.h +++ b/src/Benchmarks/BLAS/vector-operations.h @@ -90,9 +90,9 @@ benchmarkVectorOperations( Benchmark & benchmark, resultDevice = deviceVector.max(); }; benchmark.setOperation( "max", datasetSize ); - benchmark.time( reset1, "CPU", maxHost ); + benchmark.time< Devices::Host >( reset1, "CPU", maxHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", maxCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", maxCuda ); #endif @@ -103,9 +103,9 @@ benchmarkVectorOperations( Benchmark & benchmark, resultDevice = deviceVector.min(); }; benchmark.setOperation( "min", datasetSize ); - benchmark.time( reset1, "CPU", minHost ); + benchmark.time< Devices::Host >( reset1, "CPU", minHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", minCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", minCuda ); #endif @@ -125,10 +125,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "absMax", datasetSize ); - benchmark.time( reset1, "CPU", absMaxHost ); + benchmark.time< Devices::Host >( reset1, "CPU", absMaxHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", absMaxCuda ); - benchmark.time( reset1, "cuBLAS", absMaxCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", absMaxCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMaxCublas ); #endif @@ -148,10 +148,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "absMin", datasetSize ); - benchmark.time( reset1, "CPU", absMinHost ); + benchmark.time< Devices::Host >( reset1, "CPU", absMinHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", absMinCuda ); - benchmark.time( reset1, "cuBLAS", absMinCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", absMinCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMinCublas ); #endif @@ -162,9 +162,9 @@ benchmarkVectorOperations( Benchmark & benchmark, resultDevice = deviceVector.sum(); }; benchmark.setOperation( "sum", 
datasetSize ); - benchmark.time( reset1, "CPU", sumHost ); + benchmark.time< Devices::Host >( reset1, "CPU", sumHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", sumCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", sumCuda ); #endif @@ -182,10 +182,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "l1 norm", datasetSize ); - benchmark.time( reset1, "CPU", l1normHost ); + benchmark.time< Devices::Host >( reset1, "CPU", l1normHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", l1normCuda ); - benchmark.time( reset1, "cuBLAS", l1normCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", l1normCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l1normCublas ); #endif @@ -203,10 +203,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "l2 norm", datasetSize ); - benchmark.time( reset1, "CPU", l2normHost ); + benchmark.time< Devices::Host >( reset1, "CPU", l2normHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", l2normCuda ); - benchmark.time( reset1, "cuBLAS", l2normCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", l2normCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l2normCublas ); #endif @@ -217,9 +217,9 @@ benchmarkVectorOperations( Benchmark & benchmark, resultDevice = deviceVector.lpNorm( 3.0 ); }; benchmark.setOperation( "l3 norm", datasetSize ); - benchmark.time( reset1, "CPU", l3normHost ); + benchmark.time< Devices::Host >( reset1, "CPU", l3normHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", l3normCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", l3normCuda ); #endif @@ -238,10 +238,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "scalar product", 2 * datasetSize ); - benchmark.time( reset1, "CPU", scalarProductHost ); + benchmark.time< Devices::Host >( reset1, "CPU", scalarProductHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", scalarProductCuda ); - 
benchmark.time( reset1, "cuBLAS", scalarProductCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", scalarProductCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", scalarProductCublas ); #endif /* @@ -289,10 +289,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "scalar multiplication", 2 * datasetSize ); - benchmark.time( reset1, "CPU", multiplyHost ); + benchmark.time< Devices::Host >( reset1, "CPU", multiplyHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", multiplyCuda ); - benchmark.time( reset1, "cuBLAS", multiplyCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", multiplyCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", multiplyCublas ); #endif @@ -312,10 +312,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "vector addition", 3 * datasetSize ); - benchmark.time( reset1, "CPU", addVectorHost ); + benchmark.time< Devices::Host >( reset1, "CPU", addVectorHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", addVectorCuda ); - benchmark.time( reset1, "cuBLAS", addVectorCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", addVectorCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", addVectorCublas ); #endif diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index c371e2dfb..435e70373 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -34,53 +34,60 @@ namespace Benchmarks { const double oneGB = 1024.0 * 1024.0 * 1024.0; -template< typename ComputeFunction, - typename ResetFunction, - typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > -double -timeFunction( ComputeFunction compute, - ResetFunction reset, - int loops, - const double& minTime, - Monitor && monitor = Monitor() ) +template< typename Device > +class FunctionTimer { - // the timer is constructed zero-initialized and stopped - Timer timer; - - // set timer to the monitor - monitor.setTimer( 
timer ); - - // warm up - reset(); - compute(); - - //timer.start(); - int i; - for( i = 0; - i < loops || timer.getRealTime() < minTime; - ++i) - { - // abuse the monitor's "time" for loops - monitor.setTime( i + 1 ); - - reset(); - - // Explicit synchronization of the CUDA device - // TODO: not necessary for host computations -#ifdef HAVE_CUDA - cudaDeviceSynchronize(); + public: + using DeviceType = Device; + + template< typename ComputeFunction, + typename ResetFunction, + typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > + static double + timeFunction( ComputeFunction compute, + ResetFunction reset, + int loops, + const double& minTime, + Monitor && monitor = Monitor() ) + { + // the timer is constructed zero-initialized and stopped + Timer timer; + + // set timer to the monitor + monitor.setTimer( timer ); + + // warm up + reset(); + compute(); + + //timer.start(); + int i; + for( i = 0; + i < loops || timer.getRealTime() < minTime; + ++i) + { + // abuse the monitor's "time" for loops + monitor.setTime( i + 1 ); + + reset(); + + // Explicit synchronization of the CUDA device +#ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); #endif - timer.start(); - compute(); + timer.start(); + compute(); #ifdef HAVE_CUDA - cudaDeviceSynchronize(); + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); #endif - timer.stop(); - } - - return timer.getRealTime() / ( double ) i; -} + timer.stop(); + } + return timer.getRealTime() / ( double ) i; + } +}; class Logging { @@ -443,7 +450,8 @@ public: // "speedup" columns. // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation) // Also terminates the recursion of the following variadic template. 
- template< typename ResetFunction, + template< typename Device, + typename ResetFunction, typename ComputeFunction > double time( ResetFunction reset, @@ -456,10 +464,10 @@ public: if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); - result.time = timeFunction( compute, reset, loops, minTime, monitor ); + result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, monitor ); } else { - result.time = timeFunction( compute, reset, loops, minTime, monitor ); + result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, monitor ); } } catch ( const std::exception& e ) { @@ -477,7 +485,8 @@ public: return this->baseTime; } - template< typename ResetFunction, + template< typename Device, + typename ResetFunction, typename ComputeFunction, typename... NextComputations > inline double @@ -486,7 +495,7 @@ public: ComputeFunction & compute ) { BenchmarkResult result; - return time( reset, performer, compute, result ); + return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result ); } // Adds an error message to the log. 
Should be called in places where the diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h index 73001e958..23f081527 100644 --- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h +++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h @@ -62,7 +62,7 @@ benchmarkSpmv( Benchmark& benchmark, matrix.vectorProduct( x, y ); }; - benchmark.time( reset, performer, compute ); + benchmark.time< typename Matrix::DeviceType >( reset, performer, compute ); } template< typename Matrix, typename Vector > @@ -114,7 +114,7 @@ benchmarkDistributedSpmv( Benchmark& benchmark, Matrix::CommunicatorType::Barrier( matrix.getCommunicationGroup() ); }; - benchmark.time( reset, performer, compute ); + benchmark.time< typename Matrix::DeviceType >( reset, performer, compute ); } template< typename Matrix, typename Vector > diff --git a/src/Benchmarks/LinearSolvers/benchmarks.h b/src/Benchmarks/LinearSolvers/benchmarks.h index a82ec2dc2..c6278a76b 100644 --- a/src/Benchmarks/LinearSolvers/benchmarks.h +++ b/src/Benchmarks/LinearSolvers/benchmarks.h @@ -73,7 +73,7 @@ benchmarkPreconditionerUpdate( Benchmark& benchmark, barrier( matrix ); }; - benchmark.time( reset, performer, compute ); + benchmark.time< typename Matrix::DeviceType >( reset, performer, compute ); } template< template class Solver, template class Preconditioner, typename Matrix, typename Vector > @@ -166,7 +166,7 @@ benchmarkSolver( Benchmark& benchmark, }; MyBenchmarkResult benchmarkResult( solver, matrix, x, b ); - benchmark.time( reset, performer, compute, benchmarkResult ); + benchmark.time< typename Matrix::DeviceType >( reset, performer, compute, benchmarkResult ); } #ifdef HAVE_ARMADILLO diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index f1c4efeed..9e80b0d06 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ 
b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -41,7 +41,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, const int maxSize = parameters.getParameter< int >( "max-size" ); // Full grid traversing - benchmark.newBenchmark( String("Full grid traversing - write 1" + convertToString( Dimension ) + "D" ), metadata ); + benchmark.newBenchmark( String("Full grid traversing - write 1 " + convertToString( Dimension ) + "D" ), metadata ); for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { @@ -78,9 +78,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time( hostReset, "CPU", hostWriteOneUsingPureC ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); #ifdef HAVE_CUDA - benchmark.time( cudaReset, "GPU", cudaWriteOneUsingPureC ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); #endif /**** @@ -97,9 +97,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time( hostReset, "CPU", hostWriteOneUsingParallelFor ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA - benchmark.time( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); #endif /**** @@ -113,12 +113,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters, auto cudaWriteOneUsingTraverser = [&] () { cudaTraverserBenchmark.writeOneUsingTraverser(); - } + }; benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time( hostReset, "CPU", hostWriteOneUsingTraverser ); + benchmark.time< Devices::Host >( hostReset, "CPU", 
hostWriteOneUsingTraverser ); #ifdef HAVE_CUDA - benchmark.time( cudaReset, "GPU", cudaWriteOneUsingTraverser ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser ); #endif } return true; -- GitLab From 95e61d26c9b71499798457e90740c6c2be540968 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Tue, 25 Dec 2018 20:59:18 +0100 Subject: [PATCH 081/130] Added benchmark function timing without reset function to measure CPU cache effect. --- src/Benchmarks/Benchmarks.h | 129 +++++++++++++++--- .../Traversers/tnl-benchmark-traversers.h | 9 +- 2 files changed, 114 insertions(+), 24 deletions(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 435e70373..6ca7c3830 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -46,46 +46,84 @@ class FunctionTimer static double timeFunction( ComputeFunction compute, ResetFunction reset, - int loops, - const double& minTime, - Monitor && monitor = Monitor() ) + int maxLoops, + const double& minTime, + int verbose = 1, + Monitor && monitor = Monitor(), + bool performReset = true ) { // the timer is constructed zero-initialized and stopped Timer timer; // set timer to the monitor - monitor.setTimer( timer ); + if( verbose > 1 ) + monitor.setTimer( timer ); // warm up reset(); compute(); - //timer.start(); - int i; - for( i = 0; - i < loops || timer.getRealTime() < minTime; - ++i) + int loops; + // If we do not perform reset function and don't need + // the monitor, the timer is not interrupted after each loop. + if( ! 
performReset && verbose < 2 ) { - // abuse the monitor's "time" for loops - monitor.setTime( i + 1 ); - - reset(); - + timer.start(); // Explicit synchronization of the CUDA device #ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); -#endif - timer.start(); - compute(); + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); +#endif + for( loops = 0; + loops < maxLoops || timer.getRealTime() < minTime; + ++loops) + compute(); + // Explicit synchronization of the CUDA device #ifdef HAVE_CUDA if( std::is_same< Device, Devices::Cuda >::value ) cudaDeviceSynchronize(); #endif timer.stop(); } + else + { + for( loops = 0; + loops < maxLoops || timer.getRealTime() < minTime; + ++loops) + { + // abuse the monitor's "time" for loops + monitor.setTime( loops + 1 ); + + reset(); + + // Explicit synchronization of the CUDA device +#ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); +#endif + timer.start(); + compute(); +#ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); +#endif + timer.stop(); + } + } + return timer.getRealTime() / ( double ) loops; + } - return timer.getRealTime() / ( double ) i; + template< typename ComputeFunction, + typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > + static double + timeFunction( ComputeFunction compute, + int maxLoops, + const double& minTime, + int verbose = 1, + Monitor && monitor = Monitor() ) + { + auto noReset = [] () {}; + return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor, false ); } }; @@ -464,10 +502,10 @@ public: if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); - result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, monitor ); + result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); } else { 
- result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, monitor ); + result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); } } catch ( const std::exception& e ) { @@ -497,6 +535,53 @@ public: BenchmarkResult result; return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result ); } + + /**** + * The same methods as above but without reset function + */ + template< typename Device, + typename ComputeFunction > + double + time( const String & performer, + ComputeFunction & compute, + BenchmarkResult & result ) + { + result.time = std::numeric_limits::quiet_NaN(); + try { + if( verbose > 1 ) { + // run the monitor main loop + Solvers::SolverMonitorThread monitor_thread( monitor ); + result.time = FunctionTimer< Device >::timeFunction( compute, loops, minTime, verbose, monitor ); + } + else { + result.time = FunctionTimer< Device >::timeFunction( compute, loops, minTime, verbose, monitor ); + } + } + catch ( const std::exception& e ) { + std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl; + } + + result.bandwidth = datasetSize / result.time; + result.speedup = this->baseTime / result.time; + if( this->baseTime == 0.0 ) + this->baseTime = result.time; + + writeTableHeader( performer, result.getTableHeader() ); + writeTableRow( performer, result.getRowElements() ); + + return this->baseTime; + } + + template< typename Device, + typename ComputeFunction, + typename... NextComputations > + inline double + time( const String & performer, + ComputeFunction & compute ) + { + BenchmarkResult result; + return time< Device, ComputeFunction >( performer, compute, result ); + } // Adds an error message to the log. Should be called in places where the // "time" method could not be called (e.g. due to failed allocation). 
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 9e80b0d06..6d2ed7cea 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -48,8 +48,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters, GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size ); GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size ); - auto noReset = []() {}; - auto hostReset = [&]() { hostTraverserBenchmark.reset(); @@ -78,10 +76,17 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingPureC ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC ); +#endif + + benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); #endif + /**** * Write one using parallel for -- GitLab From a37731df660ba5e8602a1c86cb56bce7ecf1ceee Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Tue, 25 Dec 2018 21:58:19 +0100 Subject: [PATCH 082/130] Added traversers benchmark tests without reseting. 
--- .../Traversers/tnl-benchmark-traversers.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 6d2ed7cea..53b29b92a 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -102,6 +102,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelFor ); +#endif + + benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); @@ -113,7 +119,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, auto hostWriteOneUsingTraverser = [&] () { hostTraverserBenchmark.writeOneUsingTraverser(); - }; + }; auto cudaWriteOneUsingTraverser = [&] () { @@ -125,6 +131,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters, #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser ); #endif + + benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingTraverser ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingTraverser ); +#endif + } return true; } -- GitLab From d48aa3a2d4495f37d6b9be1a3836dc461d5e6e5a Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Tue, 25 Dec 2018 23:20:53 +0100 Subject: [PATCH 083/130] Splitting Benchmarks.h into 
Benchmarks, Logging and FunctionTimer. --- src/Benchmarks/Benchmarks.h | 313 +-------------------------------- src/Benchmarks/CMakeLists.txt | 2 + src/Benchmarks/FunctionTimer.h | 119 +++++++++++++ src/Benchmarks/Logging.h | 240 +++++++++++++++++++++++++ 4 files changed, 366 insertions(+), 308 deletions(-) create mode 100644 src/Benchmarks/FunctionTimer.h create mode 100644 src/Benchmarks/Logging.h diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 6ca7c3830..0770680d2 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -8,20 +8,20 @@ /* See Copyright Notice in tnl/Copyright */ -// Implemented by: Jakub Klinkovsky +// Implemented by: Jakub Klinkovsky, +// Tomas Oberhuber #pragma once +#include "FunctionTimer.h" +#include "Logging.h" + #include #include -#include -#include #include #include -#include #include -#include #include #include @@ -34,309 +34,6 @@ namespace Benchmarks { const double oneGB = 1024.0 * 1024.0 * 1024.0; -template< typename Device > -class FunctionTimer -{ - public: - using DeviceType = Device; - - template< typename ComputeFunction, - typename ResetFunction, - typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > - static double - timeFunction( ComputeFunction compute, - ResetFunction reset, - int maxLoops, - const double& minTime, - int verbose = 1, - Monitor && monitor = Monitor(), - bool performReset = true ) - { - // the timer is constructed zero-initialized and stopped - Timer timer; - - // set timer to the monitor - if( verbose > 1 ) - monitor.setTimer( timer ); - - // warm up - reset(); - compute(); - - int loops; - // If we do not perform reset function and don't need - // the monitor, the timer is not interrupted after each loop. - if( ! 
performReset && verbose < 2 ) - { - timer.start(); - // Explicit synchronization of the CUDA device -#ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); -#endif - for( loops = 0; - loops < maxLoops || timer.getRealTime() < minTime; - ++loops) - compute(); - // Explicit synchronization of the CUDA device -#ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); -#endif - timer.stop(); - } - else - { - for( loops = 0; - loops < maxLoops || timer.getRealTime() < minTime; - ++loops) - { - // abuse the monitor's "time" for loops - monitor.setTime( loops + 1 ); - - reset(); - - // Explicit synchronization of the CUDA device -#ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); -#endif - timer.start(); - compute(); -#ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); -#endif - timer.stop(); - } - } - return timer.getRealTime() / ( double ) loops; - } - - template< typename ComputeFunction, - typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > - static double - timeFunction( ComputeFunction compute, - int maxLoops, - const double& minTime, - int verbose = 1, - Monitor && monitor = Monitor() ) - { - auto noReset = [] () {}; - return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor, false ); - } -}; - -class Logging -{ -public: - using MetadataElement = std::pair< const char*, String >; - using MetadataMap = std::map< const char*, String >; - using MetadataColumns = std::vector; - - using HeaderElements = std::vector< String >; - using RowElements = std::vector< double >; - - Logging( int verbose = true ) - : verbose(verbose) - {} - - void - setVerbose( int verbose) - { - this->verbose = verbose; - } - - void - writeTitle( const String & title ) - { - if( verbose ) - std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl; - log << ": title = " 
<< title << std::endl; - } - - void - writeMetadata( const MetadataMap & metadata ) - { - if( verbose ) - std::cout << "properties:" << std::endl; - - for( auto & it : metadata ) { - if( verbose ) - std::cout << " " << it.first << " = " << it.second << std::endl; - log << ": " << it.first << " = " << it.second << std::endl; - } - if( verbose ) - std::cout << std::endl; - } - - void - writeTableHeader( const String & spanningElement, - const HeaderElements & subElements ) - { - if( verbose && header_changed ) { - for( auto & it : metadataColumns ) { - std::cout << std::setw( 20 ) << it.first; - } - - // spanning element is printed as usual column to stdout, - // but is excluded from header - std::cout << std::setw( 15 ) << ""; - - for( auto & it : subElements ) { - std::cout << std::setw( 15 ) << it; - } - std::cout << std::endl; - - header_changed = false; - } - - // initial indent string - header_indent = "!"; - log << std::endl; - for( auto & it : metadataColumns ) { - log << header_indent << " " << it.first << std::endl; - } - - // dump stacked spanning columns - if( horizontalGroups.size() > 0 ) - while( horizontalGroups.back().second <= 0 ) { - horizontalGroups.pop_back(); - header_indent.pop_back(); - } - for( size_t i = 0; i < horizontalGroups.size(); i++ ) { - if( horizontalGroups[ i ].second > 0 ) { - log << header_indent << " " << horizontalGroups[ i ].first << std::endl; - header_indent += "!"; - } - } - - log << header_indent << " " << spanningElement << std::endl; - for( auto & it : subElements ) { - log << header_indent << "! 
" << it << std::endl; - } - - if( horizontalGroups.size() > 0 ) { - horizontalGroups.back().second--; - header_indent.pop_back(); - } - } - - void - writeTableRow( const String & spanningElement, - const RowElements & subElements ) - { - if( verbose ) { - for( auto & it : metadataColumns ) { - std::cout << std::setw( 20 ) << it.second; - } - // spanning element is printed as usual column to stdout - std::cout << std::setw( 15 ) << spanningElement; - for( auto & it : subElements ) { - std::cout << std::setw( 15 ); - if( it != 0.0 )std::cout << it; - else std::cout << "N/A"; - } - std::cout << std::endl; - } - - // only when changed (the header has been already adjusted) - // print each element on separate line - for( auto & it : metadataColumns ) { - log << it.second << std::endl; - } - - // benchmark data are indented - const String indent = " "; - for( auto & it : subElements ) { - if( it != 0.0 ) log << indent << it << std::endl; - else log << indent << "N/A" << std::endl; - } - } - - void - writeErrorMessage( const char* msg, - int colspan = 1 ) - { - // initial indent string - header_indent = "!"; - log << std::endl; - for( auto & it : metadataColumns ) { - log << header_indent << " " << it.first << std::endl; - } - - // make sure there is a header column for the message - if( horizontalGroups.size() == 0 ) - horizontalGroups.push_back( {"", 1} ); - - // dump stacked spanning columns - while( horizontalGroups.back().second <= 0 ) { - horizontalGroups.pop_back(); - header_indent.pop_back(); - } - for( size_t i = 0; i < horizontalGroups.size(); i++ ) { - if( horizontalGroups[ i ].second > 0 ) { - log << header_indent << " " << horizontalGroups[ i ].first << std::endl; - header_indent += "!"; - } - } - if( horizontalGroups.size() > 0 ) { - horizontalGroups.back().second -= colspan; - header_indent.pop_back(); - } - - // only when changed (the header has been already adjusted) - // print each element on separate line - for( auto & it : metadataColumns ) { - log << 
it.second << std::endl; - } - log << msg << std::endl; - } - - void - closeTable() - { - log << std::endl; - header_indent = body_indent = ""; - header_changed = true; - horizontalGroups.clear(); - } - - bool save( std::ostream & logFile ) - { - closeTable(); - logFile << log.str(); - if( logFile.good() ) { - log.str() = ""; - return true; - } - return false; - } - -protected: - - // manual double -> String conversion with fixed precision - static String - _to_string( double num, int precision = 0, bool fixed = false ) - { - std::stringstream str; - if( fixed ) - str << std::fixed; - if( precision ) - str << std::setprecision( precision ); - str << num; - return String( str.str().data() ); - } - - std::stringstream log; - std::string header_indent; - std::string body_indent; - - int verbose; - MetadataColumns metadataColumns; - bool header_changed = true; - std::vector< std::pair< String, int > > horizontalGroups; -}; struct BenchmarkResult diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt index d4c2258c9..556dc1604 100644 --- a/src/Benchmarks/CMakeLists.txt +++ b/src/Benchmarks/CMakeLists.txt @@ -7,6 +7,8 @@ add_subdirectory( Traversers ) set( headers Benchmarks.h + FunctionTimer.h + Logging.h ) install( FILES ${headers} DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY}/Benchmarks ) diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h new file mode 100644 index 000000000..091eb4a2a --- /dev/null +++ b/src/Benchmarks/FunctionTimer.h @@ -0,0 +1,119 @@ +/*************************************************************************** + FunctionTimer.h - description + ------------------- + begin : Dec 25, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky, +// Tomas Oberhuber + +#pragma once + +#include + +#include +#include + +namespace TNL { + namespace Benchmarks { + + +template< typename Device > +class FunctionTimer +{ + public: + using DeviceType = Device; + + template< typename ComputeFunction, + typename ResetFunction, + typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > + static double + timeFunction( ComputeFunction compute, + ResetFunction reset, + int maxLoops, + const double& minTime, + int verbose = 1, + Monitor && monitor = Monitor(), + bool performReset = true ) + { + // the timer is constructed zero-initialized and stopped + Timer timer; + + // set timer to the monitor + if( verbose > 1 ) + monitor.setTimer( timer ); + + // warm up + reset(); + compute(); + + int loops; + // If we do not perform reset function and don't need + // the monitor, the timer is not interrupted after each loop. + if( ! 
performReset && verbose < 2 ) + { + timer.start(); + // Explicit synchronization of the CUDA device +#ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); +#endif + for( loops = 0; + loops < maxLoops || timer.getRealTime() < minTime; + ++loops) + compute(); + // Explicit synchronization of the CUDA device +#ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); +#endif + timer.stop(); + } + else + { + for( loops = 0; + loops < maxLoops || timer.getRealTime() < minTime; + ++loops) + { + // abuse the monitor's "time" for loops + monitor.setTime( loops + 1 ); + + reset(); + + // Explicit synchronization of the CUDA device +#ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); +#endif + timer.start(); + compute(); +#ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); +#endif + timer.stop(); + } + } + return timer.getRealTime() / ( double ) loops; + } + + template< typename ComputeFunction, + typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > + static double + timeFunction( ComputeFunction compute, + int maxLoops, + const double& minTime, + int verbose = 1, + Monitor && monitor = Monitor() ) + { + auto noReset = [] () {}; + return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor, false ); + } +}; + + } // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Logging.h b/src/Benchmarks/Logging.h new file mode 100644 index 000000000..b10ab7199 --- /dev/null +++ b/src/Benchmarks/Logging.h @@ -0,0 +1,240 @@ +/*************************************************************************** + Logging.h - description + ------------------- + begin : Dec 25, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky, +// Tomas Oberhuber + +#pragma once + +#include +#include +#include +#include +#include + +namespace TNL { + namespace Benchmarks { + +class Logging +{ + public: + using MetadataElement = std::pair< const char*, String >; + using MetadataMap = std::map< const char*, String >; + using MetadataColumns = std::vector; + + using HeaderElements = std::vector< String >; + using RowElements = std::vector< double >; + + Logging( int verbose = true ) + : verbose(verbose) + {} + + void + setVerbose( int verbose) + { + this->verbose = verbose; + } + + void + writeTitle( const String & title ) + { + if( verbose ) + std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl; + log << ": title = " << title << std::endl; + } + + void + writeMetadata( const MetadataMap & metadata ) + { + if( verbose ) + std::cout << "properties:" << std::endl; + + for( auto & it : metadata ) { + if( verbose ) + std::cout << " " << it.first << " = " << it.second << std::endl; + log << ": " << it.first << " = " << it.second << std::endl; + } + if( verbose ) + std::cout << std::endl; + } + + void + writeTableHeader( const String & spanningElement, + const HeaderElements & subElements ) + { + if( verbose && header_changed ) { + for( auto & it : metadataColumns ) { + std::cout << std::setw( 20 ) << it.first; + } + + // spanning element is printed as usual column to stdout, + // but is excluded from header + std::cout << std::setw( 15 ) << ""; + + for( auto & it : subElements ) { + std::cout << std::setw( 15 ) << it; + } + std::cout << std::endl; + + header_changed = false; + } + + // initial indent string + header_indent = "!"; + log << std::endl; + for( auto & it : metadataColumns ) { + log << header_indent << " " << it.first << std::endl; + } + + // dump stacked spanning columns + 
if( horizontalGroups.size() > 0 ) + while( horizontalGroups.back().second <= 0 ) { + horizontalGroups.pop_back(); + header_indent.pop_back(); + } + for( size_t i = 0; i < horizontalGroups.size(); i++ ) { + if( horizontalGroups[ i ].second > 0 ) { + log << header_indent << " " << horizontalGroups[ i ].first << std::endl; + header_indent += "!"; + } + } + + log << header_indent << " " << spanningElement << std::endl; + for( auto & it : subElements ) { + log << header_indent << "! " << it << std::endl; + } + + if( horizontalGroups.size() > 0 ) { + horizontalGroups.back().second--; + header_indent.pop_back(); + } + } + + void + writeTableRow( const String & spanningElement, + const RowElements & subElements ) + { + if( verbose ) { + for( auto & it : metadataColumns ) { + std::cout << std::setw( 20 ) << it.second; + } + // spanning element is printed as usual column to stdout + std::cout << std::setw( 15 ) << spanningElement; + for( auto & it : subElements ) { + std::cout << std::setw( 15 ); + if( it != 0.0 )std::cout << it; + else std::cout << "N/A"; + } + std::cout << std::endl; + } + + // only when changed (the header has been already adjusted) + // print each element on separate line + for( auto & it : metadataColumns ) { + log << it.second << std::endl; + } + + // benchmark data are indented + const String indent = " "; + for( auto & it : subElements ) { + if( it != 0.0 ) log << indent << it << std::endl; + else log << indent << "N/A" << std::endl; + } + } + + void + writeErrorMessage( const char* msg, + int colspan = 1 ) + { + // initial indent string + header_indent = "!"; + log << std::endl; + for( auto & it : metadataColumns ) { + log << header_indent << " " << it.first << std::endl; + } + + // make sure there is a header column for the message + if( horizontalGroups.size() == 0 ) + horizontalGroups.push_back( {"", 1} ); + + // dump stacked spanning columns + while( horizontalGroups.back().second <= 0 ) { + horizontalGroups.pop_back(); + 
header_indent.pop_back(); + } + for( size_t i = 0; i < horizontalGroups.size(); i++ ) { + if( horizontalGroups[ i ].second > 0 ) { + log << header_indent << " " << horizontalGroups[ i ].first << std::endl; + header_indent += "!"; + } + } + if( horizontalGroups.size() > 0 ) { + horizontalGroups.back().second -= colspan; + header_indent.pop_back(); + } + + // only when changed (the header has been already adjusted) + // print each element on separate line + for( auto & it : metadataColumns ) { + log << it.second << std::endl; + } + log << msg << std::endl; + } + + void + closeTable() + { + log << std::endl; + header_indent = body_indent = ""; + header_changed = true; + horizontalGroups.clear(); + } + + bool save( std::ostream & logFile ) + { + closeTable(); + logFile << log.str(); + if( logFile.good() ) { + log.str() = ""; + return true; + } + return false; + } + + protected: + + // manual double -> String conversion with fixed precision + static String + _to_string( double num, int precision = 0, bool fixed = false ) + { + std::stringstream str; + if( fixed ) + str << std::fixed; + if( precision ) + str << std::setprecision( precision ); + str << num; + return String( str.str().data() ); + } + + std::stringstream log; + std::string header_indent; + std::string body_indent; + + int verbose; + MetadataColumns metadataColumns; + bool header_changed = true; + std::vector< std::pair< String, int > > horizontalGroups; +}; + + + } // namespace Benchmarks +} // namespace TNL + + -- GitLab From 8fc2b437d1e8dc0a4192477c64ad5ce325606087 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Thu, 27 Dec 2018 16:48:39 +0100 Subject: [PATCH 084/130] Added traversers benchmarks with boundaries. 
--- .../Traversers/GridTraversersBenchmark.h | 247 +++++++++++++++--- src/Benchmarks/Traversers/cuda-kernels.h | 128 +++++++++ src/Benchmarks/Traversers/grid-traversing.h | 36 --- .../Traversers/tnl-benchmark-traversers.h | 114 +++++++- 4 files changed, 439 insertions(+), 86 deletions(-) create mode 100644 src/Benchmarks/Traversers/cuda-kernels.h delete mode 100644 src/Benchmarks/Traversers/grid-traversing.h diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index ee18adfa6..2f439f988 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -1,5 +1,5 @@ /*************************************************************************** - WriteOne.h - description + GridTraversersBenchmark.h - description ------------------- begin : Dec 19, 2018 copyright : (C) 2018 by oberhuber @@ -21,10 +21,11 @@ #include #include #include +#include "cuda-kernels.h" namespace TNL { namespace Benchmarks { - + namespace Traversers { template< typename TraverserUserData > class WriteOneEntitiesProcessor @@ -55,35 +56,6 @@ class WriteOneUserData MeshFunctionPointer u; }; -template< typename Real, - typename Index > -__global__ void simpleCudaKernel1D( const Index size, const dim3 gridIdx, Real* v_data ) -{ - const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - if( threadIdx_x < size ) - v_data[ threadIdx_x ] = 1.0; -} - -template< typename Real, - typename Index > -__global__ void simpleCudaKernel2D( const Index size, const dim3 gridIdx, Real* v_data ) -{ - const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; - if( threadIdx_x < size && threadIdx_y < size ) - v_data[ threadIdx_y * size + threadIdx_x ] = 1.0; -} - 
-template< typename Real, - typename Index > -__global__ void simpleCudaKernel3D( const Index size, const dim3 gridIdx, Real* v_data ) -{ - const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; - const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; - if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size ) - v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0; -} template< int Dimension, typename Device, @@ -147,12 +119,12 @@ class GridTraversersBenchmark< 1, Device, Real, Index > gridsCount, gridIdx, gridSize ); - simpleCudaKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + fullGridTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); } #endif } } - + void writeOneUsingParallelFor() { auto f = [] __cuda_callable__ ( Index i, Real* data ) @@ -168,6 +140,56 @@ class GridTraversersBenchmark< 1, Device, Real, Index > ( grid, userData ); } + void traverseUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + v_data[ 0 ] = 2; + for( int i = 1; i < size - 1; i++ ) + v_data[ i ] = 1.0; + v_data[ size - 1 ] = 2; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size ); + dim3 gridIdx; + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + boundariesTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + interiorTraverseKernel1D<<< 
gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } + } + + void traverseUsingTraverser() + { + // TODO !!!!!!!!!!!!!!!!!!!!!! + traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + ( grid, userData ); + } + protected: Index size; @@ -240,7 +262,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > gridsCount, gridIdx, gridSize ); - simpleCudaKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + fullGridTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); } #endif } @@ -267,6 +289,69 @@ class GridTraversersBenchmark< 2, Device, Real, Index > ( grid, userData ); } + void traverseUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + for( int i = 0; i < size; i++ ) + { + v_data[ i * size ] = 2.0; + v_data[ i * size + size - 1 ] = 2.0; + } + for( int j = 1; j < size - 1; j++ ) + { + v_data[ j ] = 2.0; + v_data[ ( size - 1 ) * size + j ] = 2.0; + } + + for( int i = 1; i < size - 1; i++ ) + for( int j = 1; j < size - 1; j++ ) + v_data[ i * size + j ] = 1.0; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size, + size ); + dim3 gridIdx; + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + boundariesTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + interiorTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } + } + + void traversingUsingTraverser() + { + // TODO 
!!!!!!!!!!!!!!!!!!!!!! + traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + ( grid, userData ); + } + protected: Index size; @@ -344,12 +429,12 @@ class GridTraversersBenchmark< 3, Device, Real, Index > gridsCount, gridIdx, gridSize ); - simpleCudaKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + fullGridTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); } #endif } } - + void writeOneUsingParallelFor() { Index _size = this->size; @@ -358,20 +443,96 @@ class GridTraversersBenchmark< 3, Device, Real, Index > data[ ( i * _size + j ) * _size + k ] = 1.0; }; - ParallelFor3D< Device >::exec( ( Index ) 0, - ( Index ) 0, - ( Index ) 0, + ParallelFor3D< Device >::exec( ( Index ) 0, + ( Index ) 0, + ( Index ) 0, this->size, this->size, this->size, - f, v.getData() ); + f, v.getData() ); } - + void writeOneUsingTraverser() { traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > ( grid, userData ); - } + } + + void traverseUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + for( int i = 0; i < size; i++ ) + for( int j = 0; j < size; j++ ) + { + v_data[ ( i * size + j ) * size ] = 2.0; + v_data[ ( i * size + j ) * size + size - 1 ] = 2.0; + } + for( int j = 0; j < size; j++ ) + for( int k = 1; k < size - 1; k++ ) + { + v_data[ j * size + k ] = 1.0; + v_data[ ( ( size - 1) * size + j ) * size + k ] = 1.0; + } + + for( int i = 1; i < size -1; i++ ) + for( int k = 1; k < size - 1; k++ ) + { + v_data[ ( i * size ) * size + k ] = 2.0; + v_data[ ( i * size + size - 1 ) * size + k ] = 2.0; + } + + for( int i = 1; i < size -1; i++ ) + for( int j = 1; j < size -1; j++ ) + for( int k = 1; k < size - 1; k++ ) + v_data[ ( i * size + j ) * size + k ] = 1.0; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + 
gridsCount, + size, + size, + size ); + dim3 gridIdx; + for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ ) + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + boundariesTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } + for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ ) + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + interiorTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } + } + + void traverseUsingTraverser() + { + // TODO !!!!!!!!!!!!!!!!!!!!!! + traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + ( grid, userData ); + } protected: @@ -384,6 +545,6 @@ class GridTraversersBenchmark< 3, Device, Real, Index > WriteOneTraverserUserDataType userData; }; - + } // namespace Traversers } // namespace Benchmarks } // namespace TNL \ No newline at end of file diff --git a/src/Benchmarks/Traversers/cuda-kernels.h b/src/Benchmarks/Traversers/cuda-kernels.h new file mode 100644 index 000000000..2cd8b1b56 --- /dev/null +++ b/src/Benchmarks/Traversers/cuda-kernels.h @@ -0,0 +1,128 @@ +/*************************************************************************** + cuda-kernels.h - description + ------------------- + begin : Dec 19, 2018 + copyright : (C) 2018 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +#ifdef HAVE_CUDA + +/**** + * Full grid 
traversing + */ +template< typename Real, + typename Index > +__global__ void fullGridTraverseKernel1D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( threadIdx_x < size ) + v_data[ threadIdx_x ] = 1.0; +} + +template< typename Real, + typename Index > +__global__ void fullGridTraverseKernel2D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + if( threadIdx_x < size && threadIdx_y < size ) + v_data[ threadIdx_y * size + threadIdx_x ] = 1.0; +} + +template< typename Real, + typename Index > +__global__ void fullGridTraverseKernel3D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; + if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size ) + v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0; +} + +/**** + * Traversing interior cells + */ +template< typename Real, + typename Index > +__global__ void interiorTraverseKernel1D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( threadIdx_x > 0 && threadIdx_x < size - 1 ) + v_data[ threadIdx_x ] = 1.0; +} + +template< typename Real, + typename Index > +__global__ void interiorTraverseKernel2D( const Index size, const dim3 gridIdx, Real* v_data 
) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + if( threadIdx_x > 0 && threadIdx_y > 0 && + threadIdx_x < size - 1 && threadIdx_y < size - 1 ) + v_data[ threadIdx_y * size + threadIdx_x ] = 1.0; +} + +template< typename Real, + typename Index > +__global__ void interiorTraverseKernel3D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; + if( threadIdx_x > 0 && threadIdx_y > 0 && threadIdx_z > 0 && + threadIdx_x < size - 1 && threadIdx_y < size - 1 && threadIdx_z < size - 1 ) + v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0; +} + +/**** + * Grid boundaries traversing + */ +template< typename Real, + typename Index > +__global__ void boundariesTraverseKernel1D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( threadIdx_x == 0 || threadIdx_x == size - 1 ) + v_data[ threadIdx_x ] = 2.0; +} + +template< typename Real, + typename Index > +__global__ void boundariesTraverseKernel2D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + if( threadIdx_x > 0 && threadIdx_y > 0 && + threadIdx_x < size - 1 && threadIdx_y < size - 1 ) + v_data[ 
threadIdx_y * size + threadIdx_x ] = 2.0; +} + +template< typename Real, + typename Index > +__global__ void boundariesTraverseKernel3D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; + if( threadIdx_x == 0 || threadIdx_y == 0 || threadIdx_z == 0 || + threadIdx_x == size - 1 || threadIdx_y == size - 1 || threadIdx_z == size - 1 ) + v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 2.0; +} + +#endif + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL + diff --git a/src/Benchmarks/Traversers/grid-traversing.h b/src/Benchmarks/Traversers/grid-traversing.h deleted file mode 100644 index c977fea1c..000000000 --- a/src/Benchmarks/Traversers/grid-traversing.h +++ /dev/null @@ -1,36 +0,0 @@ -/*************************************************************************** - grid-traversing.h - description - ------------------- - begin : Dec 19, 2018 - copyright : (C) 2018 by oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Tomas Oberhuber - -#pragma once - -#include "../Benchmarks.h" - - -#include - -namespace TNL { - namespace Benchmarks { - -template< int Dimension, - typename Real = double, - typename Index = int > -class benchmarkTraversingFullGrid -{ - public: - - static void run ( Benchmark& benchmark, std::size_t size ) - { - - } -}; - } // namespace Benchmarks -} // namespace TNL \ No newline at end of file diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h 
index 53b29b92a..276497f51 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -23,6 +23,7 @@ using namespace TNL; using namespace TNL::Benchmarks; +using namespace TNL::Benchmarks::Traversers; template< int Dimension, @@ -40,13 +41,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters, const int minSize = parameters.getParameter< int >( "min-size" ); const int maxSize = parameters.getParameter< int >( "max-size" ); - // Full grid traversing - benchmark.newBenchmark( String("Full grid traversing - write 1 " + convertToString( Dimension ) + "D" ), metadata ); + /**** + * Full grid traversing + */ + benchmark.newBenchmark( String("Traversing without boundary conditions" + convertToString( Dimension ) + "D" ), metadata ); for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { - GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size ); - GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size ); + GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size ); auto hostReset = [&]() { @@ -86,7 +88,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters, #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); #endif - /**** * Write one using parallel for @@ -94,12 +95,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters, auto hostWriteOneUsingParallelFor = [&] () { hostTraverserBenchmark.writeOneUsingParallelFor(); - }; + }; auto cudaWriteOneUsingParallelFor = [&] () { cudaTraverserBenchmark.writeOneUsingParallelFor(); - }; + }; benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor ); @@ -137,8 +138,107 @@ bool runBenchmark( const Config::ParameterContainer& parameters, 
#ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingTraverser ); #endif + } + + /**** + * Full grid traversing + */ + benchmark.newBenchmark( String("Traversing with boundary conditions" + convertToString( Dimension ) + "D" ), metadata ); + for( std::size_t size = minSize; size <= maxSize; size *= 2 ) + { + GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size ); + GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size ); + + auto hostReset = [&]() + { + hostTraverserBenchmark.reset(); + }; + + auto cudaReset = [&]() + { + cudaTraverserBenchmark.reset(); + }; + + benchmark.setMetadataColumns( + Benchmark::MetadataColumns( + { {"size", convertToString( size ) }, } ) ); + + /**** + * Write one using C for + */ + auto hostTraverseUsingPureC = [&] () + { + hostTraverserBenchmark.traverseUsingPureC(); + }; + + auto cudaTraverseUsingPureC = [&] () + { + cudaTraverserBenchmark.traverseUsingPureC(); + }; + + benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingPureC ); +#endif + + benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingPureC ); +#endif + + /**** + * Write one using parallel for + */ + auto hostTraverseUsingParallelFor = [&] () + { + hostTraverserBenchmark.writeOneUsingParallelFor(); + }; + + auto cudaTraverseUsingParallelFor = [&] () + { + cudaTraverserBenchmark.writeOneUsingParallelFor(); + }; + + benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( 
"CPU", hostTraverseUsingParallelFor ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor ); +#endif + + benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor ); +#endif + /**** + * Write one using traverser + */ + auto hostTraverseUsingTraverser = [&] () + { + hostTraverserBenchmark.writeOneUsingTraverser(); + }; + + auto cudaTraverseUsingTraverser = [&] () + { + cudaTraverserBenchmark.writeOneUsingTraverser(); + }; + + benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser ); +#endif + + benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser ); +#endif } + return true; } -- GitLab From 71c1c71c1b93d450a6d2acbf2ab0038702dd23f3 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Wed, 2 Jan 2019 20:06:49 +0100 Subject: [PATCH 085/130] Timing can be turned off in the becnhmark - for better profiling. 
--- src/Benchmarks/Benchmarks.h | 25 ++++++++++++++++++++----- src/Benchmarks/FunctionTimer.h | 24 ++++++++++++++++-------- 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 0770680d2..71f808ad8 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -74,6 +74,7 @@ public: { config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 1 ); + config.addEntry< bool >( "timing", "Turns off (or on) the timing (for the purpose of profiling).", true ); config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 ); } @@ -81,6 +82,7 @@ public: { this->loops = parameters.getParameter< unsigned >( "loops" ); this->minTime = parameters.getParameter< double >( "min-time" ); + this->timing = parameters.getParameter< bool >( "timing" ); const int verbose = parameters.getParameter< unsigned >( "verbose" ); Logging::setVerbose( verbose ); } @@ -199,10 +201,16 @@ public: if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); - result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + if( this->timing ) + result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + else + result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); } else { - result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + if( this->timing ) + result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + else + result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); } } catch ( const std::exception& e ) { @@ 
-232,7 +240,7 @@ public: BenchmarkResult result; return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result ); } - + /**** * The same methods as above but without reset function */ @@ -248,10 +256,16 @@ public: if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); - result.time = FunctionTimer< Device >::timeFunction( compute, loops, minTime, verbose, monitor ); + if( this->timing ) + result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor ); + else + result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor ); } else { - result.time = FunctionTimer< Device >::timeFunction( compute, loops, minTime, verbose, monitor ); + if( this->timing ) + result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor ); + else + result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor ); } } catch ( const std::exception& e ) { @@ -304,6 +318,7 @@ protected: double minTime = 1; double datasetSize = 0.0; double baseTime = 0.0; + bool timing = true; Solvers::IterativeSolverMonitor< double, int > monitor; }; diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h index 091eb4a2a..35dbb719f 100644 --- a/src/Benchmarks/FunctionTimer.h +++ b/src/Benchmarks/FunctionTimer.h @@ -22,7 +22,8 @@ namespace TNL { namespace Benchmarks { -template< typename Device > +template< typename Device, + bool timing > class FunctionTimer { public: @@ -56,14 +57,15 @@ class FunctionTimer // the monitor, the timer is not interrupted after each loop. if( ! 
performReset && verbose < 2 ) { - timer.start(); + if( timing ) + timer.start(); // Explicit synchronization of the CUDA device #ifdef HAVE_CUDA if( std::is_same< Device, Devices::Cuda >::value ) cudaDeviceSynchronize(); #endif for( loops = 0; - loops < maxLoops || timer.getRealTime() < minTime; + loops < maxLoops || ( timing && timer.getRealTime() < minTime ); ++loops) compute(); // Explicit synchronization of the CUDA device @@ -71,12 +73,13 @@ class FunctionTimer if( std::is_same< Device, Devices::Cuda >::value ) cudaDeviceSynchronize(); #endif - timer.stop(); + if( timing ) + timer.stop(); } else { for( loops = 0; - loops < maxLoops || timer.getRealTime() < minTime; + loops < maxLoops || ( timing && timer.getRealTime() < minTime ); ++loops) { // abuse the monitor's "time" for loops @@ -89,16 +92,21 @@ class FunctionTimer if( std::is_same< Device, Devices::Cuda >::value ) cudaDeviceSynchronize(); #endif - timer.start(); + if( timing ) + timer.start(); compute(); #ifdef HAVE_CUDA if( std::is_same< Device, Devices::Cuda >::value ) cudaDeviceSynchronize(); #endif - timer.stop(); + if( timing ) + timer.stop(); } } - return timer.getRealTime() / ( double ) loops; + if( timing ) + return timer.getRealTime() / ( double ) loops; + else + return std::numeric_limits::quiet_NaN(); } template< typename ComputeFunction, -- GitLab From 61c2c6155b18affca69b1d5a716f661fd19e438e Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Wed, 2 Jan 2019 20:07:46 +0100 Subject: [PATCH 086/130] Added flag -g to compilation of the traversers benchmark. 
--- src/Benchmarks/Traversers/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Benchmarks/Traversers/CMakeLists.txt b/src/Benchmarks/Traversers/CMakeLists.txt index b58c7d66f..a80487135 100644 --- a/src/Benchmarks/Traversers/CMakeLists.txt +++ b/src/Benchmarks/Traversers/CMakeLists.txt @@ -5,5 +5,6 @@ else() ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cpp ) TARGET_LINK_LIBRARIES( tnl-benchmark-traversers tnl ) endif() +SET_TARGET_PROPERTIES( tnl-benchmark-traversers PROPERTIES COMPILE_OPTIONS "-g" ) install( TARGETS tnl-benchmark-traversers RUNTIME DESTINATION bin ) -- GitLab From e447d94a62e6e559ab774f86ccfe53add30ce2cb Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Wed, 2 Jan 2019 20:08:36 +0100 Subject: [PATCH 087/130] Fixed cell type in traversers benchmark. --- src/Benchmarks/Traversers/GridTraversersBenchmark.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index 2f439f988..2ea81ed14 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -76,7 +76,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > using Coordinates = typename Grid::CoordinatesType; using MeshFunction = Functions::MeshFunction< Grid >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::EntityType< 1, Meshes::GridEntityNoStencilStorage >; + using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; using Traverser = Meshes::Traverser< Grid, Cell >; using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; @@ -215,7 +215,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > using Coordinates = typename Grid::CoordinatesType; using 
MeshFunction = Functions::MeshFunction< Grid >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::EntityType< 2, Meshes::GridEntityNoStencilStorage >; + using Cell = typename Grid::template EntityType< 2, Meshes::GridEntityNoStencilStorage >; using Traverser = Meshes::Traverser< Grid, Cell >; using TraverserUserData = WriteOneUserData< MeshFunctionPointer >; using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; @@ -376,7 +376,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > using Coordinates = typename Grid::CoordinatesType; using MeshFunction = Functions::MeshFunction< Grid >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::EntityType< 3, Meshes::GridEntityNoStencilStorage >; + using Cell = typename Grid::template EntityType< 3, Meshes::GridEntityNoStencilStorage >; using Traverser = Meshes::Traverser< Grid, Cell >; using TraverserUserData = WriteOneUserData< MeshFunctionPointer >; using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; -- GitLab From 741b5f1bb74ffd2e0a546ebd6c2dd94fef7190f5 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Wed, 2 Jan 2019 20:09:30 +0100 Subject: [PATCH 088/130] Traversers benchmark tests can be controlled from the command line.
--- .../Traversers/tnl-benchmark-traversers.h | 127 ++++++++++++------ 1 file changed, 87 insertions(+), 40 deletions(-) diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 276497f51..11899b369 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -33,6 +33,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, Benchmark& benchmark, Benchmark::MetadataMap& metadata ) { + const String tests = parameters.getParameter< String >( "tests" ); // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(), // which have a default value. The workaround below works for int values, but it is not possible // to pass 64-bit integer values @@ -72,22 +73,28 @@ bool runBenchmark( const Config::ParameterContainer& parameters, hostTraverserBenchmark.writeOneUsingPureC(); }; +#ifdef HAVE_CUDA auto cudaWriteOneUsingPureC = [&] () { cudaTraverserBenchmark.writeOneUsingPureC(); }; +#endif - benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingPureC ); + if( tests == "all" || tests == "no-bc-pure-c") + { + benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingPureC ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC ); + if( withCuda ) + benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC ); #endif - - benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); + benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< 
Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); + if( withCuda ) + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); #endif + } /**** * Write one using parallel for @@ -97,22 +104,29 @@ bool runBenchmark( const Config::ParameterContainer& parameters, hostTraverserBenchmark.writeOneUsingParallelFor(); }; +#ifdef HAVE_CUDA auto cudaWriteOneUsingParallelFor = [&] () { cudaTraverserBenchmark.writeOneUsingParallelFor(); }; +#endif - benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor ); + if( tests == "all" || tests == "no-bc-parallel-for" ) + { + benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelFor ); + if( withCuda ) + benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelFor ); #endif - - benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); + + benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); + if( withCuda ) + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); #endif + } /**** * Write one using traverser @@ -154,96 +168,129 @@ bool runBenchmark( const Config::ParameterContainer& parameters, hostTraverserBenchmark.reset(); }; +#ifdef HAVE_CUDA auto 
cudaReset = [&]() { cudaTraverserBenchmark.reset(); }; - +#endif + benchmark.setMetadataColumns( - Benchmark::MetadataColumns( + Benchmark::MetadataColumns( { {"size", convertToString( size ) }, } ) ); /**** - * Write one using C for + * Write one and two (as BC) using C for */ auto hostTraverseUsingPureC = [&] () { hostTraverserBenchmark.traverseUsingPureC(); }; +#ifdef HAVE_CUDA auto cudaTraverseUsingPureC = [&] () { cudaTraverserBenchmark.traverseUsingPureC(); }; +#endif - benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC ); + if( tests == "all" || tests == "bc-pure-c" ) + { + benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingPureC ); + if( withCuda ) + benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingPureC ); #endif - - benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC ); + + benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingPureC ); + if( withCuda ) + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingPureC ); #endif + } /**** - * Write one using parallel for + * Write one and two (as BC) using parallel for */ auto hostTraverseUsingParallelFor = [&] () { hostTraverserBenchmark.writeOneUsingParallelFor(); }; +#ifdef HAVE_CUDA auto cudaTraverseUsingParallelFor = [&] () { cudaTraverserBenchmark.writeOneUsingParallelFor(); }; +#endif - benchmark.setOperation( 
"parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor ); + if( tests == "all" || tests == "bc-parallel-for" ) + { + benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor ); + if( withCuda ) + benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor ); #endif - - benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor ); + + benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor ); + if( withCuda ) + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor ); #endif + } /**** - * Write one using traverser + * Write one and two (as BC) using traverser */ auto hostTraverseUsingTraverser = [&] () { hostTraverserBenchmark.writeOneUsingTraverser(); }; +#ifdef HAVE_CUDA auto cudaTraverseUsingTraverser = [&] () { cudaTraverserBenchmark.writeOneUsingTraverser(); }; +#endif - benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser ); + if( tests == "all" || tests == "bc-traverser" ) + { + benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser ); #ifdef HAVE_CUDA - 
benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser ); #endif - benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser ); + benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser ); + benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser ); #endif + } } - return true; } void setupConfig( Config::ConfigDescription& config ) { + config.addEntry< String >( "tests", "Tests to be performed.", "all" ); + config.addEntryEnum( "all" ); + config.addEntryEnum( "no-bc-pure-c" ); + config.addEntryEnum( "no-bc-parallel-for" ); + config.addEntryEnum( "no-bc-traverser" ); + config.addEntryEnum( "bc-pure-c" ); + config.addEntryEnum( "bc-parallel-for" ); + config.addEntryEnum( "bc-traverser" ); +#ifdef HAVE_CUDA + config.addEntry< bool >( "with-cuda", "Perform even the CUDA benchmarks.", true ); +#else + config.addEntry< bool >( "with-cuda", "Perform even the CUDA benchmarks.", false ); +#endif config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-traversers.log"); config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); config.addEntryEnum( "append" ); -- GitLab From cd1ac1c00ec4048acd9be74a9fe61904d4c9b639 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Wed, 2 Jan 2019 20:10:40 +0100 Subject: [PATCH 089/130] Additional fixes of the traversers benchmark tests. 
--- .../Traversers/tnl-benchmark-traversers.h | 34 ++++++++++++++----- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 11899b369..60f672b22 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -41,26 +41,35 @@ bool runBenchmark( const Config::ParameterContainer& parameters, // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); const int minSize = parameters.getParameter< int >( "min-size" ); const int maxSize = parameters.getParameter< int >( "max-size" ); +#ifdef HAVE_CUDA + const bool withCuda = parameters.getParameter< bool >( "with-cuda" ); +#else + const bool withCuda = false; +#endif /**** - * Full grid traversing + * Full grid traversing with no boundary conditions */ benchmark.newBenchmark( String("Traversing without boundary conditions" + convertToString( Dimension ) + "D" ), metadata ); for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size ); +#ifdef HAVE_CUDA GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size ); +#endif auto hostReset = [&]() { hostTraverserBenchmark.reset(); }; +#ifdef HAVE_CUDA auto cudaReset = [&]() { cudaTraverserBenchmark.reset(); }; - +#endif + benchmark.setMetadataColumns( Benchmark::MetadataColumns( { {"size", convertToString( size ) }, } ) ); @@ -136,26 +145,33 @@ bool runBenchmark( const Config::ParameterContainer& parameters, hostTraverserBenchmark.writeOneUsingTraverser(); }; +#ifdef HAVE_CUDA auto cudaWriteOneUsingTraverser = [&] () { cudaTraverserBenchmark.writeOneUsingTraverser(); }; +#endif - benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, 
"CPU", hostWriteOneUsingTraverser ); + if( tests == "all" || tests == "no-bc-traverser" ) + { + benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser ); + if( withCuda ) + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser ); #endif - benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingTraverser ); + benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingTraverser ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingTraverser ); + if( withCuda ) + benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingTraverser ); #endif + } } /**** - * Full grid traversing + * Full grid traversing including boundary conditions */ benchmark.newBenchmark( String("Traversing with boundary conditions" + convertToString( Dimension ) + "D" ), metadata ); for( std::size_t size = minSize; size <= maxSize; size *= 2 ) -- GitLab From e7ceacf788fb1fbbc39a96ce61ea8a6dc79fd625 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Wed, 2 Jan 2019 21:58:35 +0100 Subject: [PATCH 090/130] Fixing indexes ordering in parallel for in traversers benchmark. 
--- src/Benchmarks/Traversers/GridTraversersBenchmark.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index 2ea81ed14..5ae8c14b3 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -99,7 +99,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > if( std::is_same< Device, Devices::Host >::value ) { for( int i = 0; i < size; i++ ) - v_data[ i ] = 1.0; + v_data[ i ] += 1.0; } else // Device == Devices::Cuda { @@ -129,7 +129,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > { auto f = [] __cuda_callable__ ( Index i, Real* data ) { - data[ i ] = 1.0; + data[ i ] = +1.0; }; ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } @@ -271,7 +271,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > void writeOneUsingParallelFor() { Index _size = this->size; - auto f = [=] __cuda_callable__ ( Index j, Index i, Real* data ) + auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) { data[ i * _size + j ] = 1.0; }; @@ -438,7 +438,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > void writeOneUsingParallelFor() { Index _size = this->size; - auto f = [=] __cuda_callable__ ( Index k, Index j, Index i, Real* data ) + auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) { data[ ( i * _size + j ) * _size + k ] = 1.0; }; -- GitLab From acab7f7161f477d8bade5022be62d8bbb68ef1d4 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 3 Jan 2019 12:02:53 +0100 Subject: [PATCH 091/130] Added traversers benchmark test - parallel for with a grid entity. 
--- .../Traversers/GridTraversersBenchmark.h | 45 ++++++++- .../Traversers/tnl-benchmark-traversers.h | 91 ++++++++++++------- .../Meshes/GridDetails/GridTraverser_impl.h | 35 ++++++- 3 files changed, 134 insertions(+), 37 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index 5ae8c14b3..508a68eec 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -42,7 +42,7 @@ class WriteOneEntitiesProcessor const GridEntity& entity ) { auto& u = userData.u.template modifyData< DeviceType >(); - u( entity ) = 1.0; + u( entity ) += 1.0; } }; @@ -134,6 +134,15 @@ class GridTraversersBenchmark< 1, Device, Real, Index > ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } + void writeOneUsingParallelForAndGridEntity() + { + auto f = [] __cuda_callable__ ( Index i, Real* data ) + { + data[ i ] = +1.0; + }; + ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); + } + void writeOneUsingTraverser() { traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > @@ -267,7 +276,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > #endif } } - + void writeOneUsingParallelFor() { Index _size = this->size; @@ -283,6 +292,21 @@ class GridTraversersBenchmark< 2, Device, Real, Index > f, v.getData() ); } + void writeOneUsingParallelForAndGridEntity() + { + Index _size = this->size; + auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) + { + data[ i * _size + j ] = 1.0; + }; + + ParallelFor2D< Device >::exec( ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + f, v.getData() ); + } + void writeOneUsingTraverser() { traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > @@ -452,6 +476,23 @@ class GridTraversersBenchmark< 3, Device, Real, Index > f, v.getData() ); } + void 
writeOneUsingParallelForAndGridEntity() + { + Index _size = this->size; + auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) + { + data[ ( i * _size + j ) * _size + k ] = 1.0; + }; + + ParallelFor3D< Device >::exec( ( Index ) 0, + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + this->size, + f, v.getData() ); + } + void writeOneUsingTraverser() { traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 60f672b22..9f7920e3c 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -77,28 +77,27 @@ bool runBenchmark( const Config::ParameterContainer& parameters, /**** * Write one using C for */ - auto hostWriteOneUsingPureC = [&] () - { - hostTraverserBenchmark.writeOneUsingPureC(); - }; - -#ifdef HAVE_CUDA - auto cudaWriteOneUsingPureC = [&] () - { - cudaTraverserBenchmark.writeOneUsingPureC(); - }; -#endif - if( tests == "all" || tests == "no-bc-pure-c") { benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + + auto hostWriteOneUsingPureC = [&] () + { + hostTraverserBenchmark.writeOneUsingPureC(); + }; benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingPureC ); + #ifdef HAVE_CUDA + auto cudaWriteOneUsingPureC = [&] () + { + cudaTraverserBenchmark.writeOneUsingPureC(); + }; if( withCuda ) benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC ); #endif benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); + #ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); @@ -108,27 +107,24 @@ bool runBenchmark( const Config::ParameterContainer& 
parameters, /**** * Write one using parallel for */ - auto hostWriteOneUsingParallelFor = [&] () - { - hostTraverserBenchmark.writeOneUsingParallelFor(); - }; - -#ifdef HAVE_CUDA - auto cudaWriteOneUsingParallelFor = [&] () - { - cudaTraverserBenchmark.writeOneUsingParallelFor(); - }; -#endif - if( tests == "all" || tests == "no-bc-parallel-for" ) { benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + + auto hostWriteOneUsingParallelFor = [&] () + { + hostTraverserBenchmark.writeOneUsingParallelFor(); + }; benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor ); + #ifdef HAVE_CUDA + auto cudaWriteOneUsingParallelFor = [&] () + { + cudaTraverserBenchmark.writeOneUsingParallelFor(); + }; if( withCuda ) benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelFor ); #endif - benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA @@ -138,25 +134,51 @@ bool runBenchmark( const Config::ParameterContainer& parameters, } /**** - * Write one using traverser + * Write one using parallel for with grid entity */ - auto hostWriteOneUsingTraverser = [&] () + if( tests == "all" || tests == "no-bc-parallel-for-and-grid-entity" ) { - hostTraverserBenchmark.writeOneUsingTraverser(); - }; + auto hostWriteOneUsingParallelForAndGridEntity = [&] () + { + hostTraverserBenchmark.writeOneUsingParallelForAndGridEntity(); + }; + benchmark.setOperation( "par.for+grid ent.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelForAndGridEntity ); #ifdef HAVE_CUDA - auto cudaWriteOneUsingTraverser = [&] () - { - cudaTraverserBenchmark.writeOneUsingTraverser(); - }; + auto cudaWriteOneUsingParallelForAndGridEntity = [&] () + { + 
cudaTraverserBenchmark.writeOneUsingParallelForAndGridEntity(); + }; + if( withCuda ) + benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelForAndGridEntity ); +#endif + + benchmark.setOperation( "par.for+grid ent. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity ); +#ifdef HAVE_CUDA + if( withCuda ) + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity ); #endif + } + /**** + * Write one using traverser + */ if( tests == "all" || tests == "no-bc-traverser" ) { benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + auto hostWriteOneUsingTraverser = [&] () + { + hostTraverserBenchmark.writeOneUsingTraverser(); + }; benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser ); + #ifdef HAVE_CUDA + auto cudaWriteOneUsingTraverser = [&] () + { + cudaTraverserBenchmark.writeOneUsingTraverser(); + }; if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser ); #endif @@ -298,6 +320,7 @@ void setupConfig( Config::ConfigDescription& config ) config.addEntryEnum( "all" ); config.addEntryEnum( "no-bc-pure-c" ); config.addEntryEnum( "no-bc-parallel-for" ); + config.addEntryEnum( "no-bc-parallel-for-and-grid-entity" ); config.addEntryEnum( "no-bc-traverser" ); config.addEntryEnum( "bc-pure-c" ); config.addEntryEnum( "bc-parallel-for" ); @@ -343,7 +366,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) auto mode = std::ios::out; if( outputMode == "append" ) mode |= std::ios::app; - std::ofstream logFile( logFileName.getString(), mode ); + std::ofstream logFile( logFileName.getString(), mode ); if( ! 
benchmark.save( logFile ) ) { diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h index 258325a76..ba6ab7e9b 100644 --- a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h +++ b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h @@ -64,6 +64,39 @@ processEntities( EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); }*/ #ifdef HAVE_OPENMP + if( Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 ) + { +#pragma omp parallel firstprivate( begin, end ) + GridEntity entity( *gridPointer ); +#pragma omp for + for( IndexType x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } + else + { + GridEntity entity( *gridPointer ); + for( IndexType x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } +#else + GridEntity entity( *gridPointer ); + for( IndexType x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +#endif + +/* #pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() ) #endif { @@ -77,7 +110,7 @@ processEntities( entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); } - } + }*/ } } -- GitLab From a437ec9aa8ee526d67a4f6c0d7a1caaf8d75082b Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 3 Jan 2019 12:37:17 +0100 Subject: [PATCH 092/130] Implemented traversers benchmark test - parallel for with a grid entity. 
--- .../Traversers/GridTraversersBenchmark.h | 70 ++++++++++++++----- 1 file changed, 54 insertions(+), 16 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index 508a68eec..ef89bf969 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -86,6 +86,8 @@ class GridTraversersBenchmark< 1, Device, Real, Index > { userData.u = this->u; v_data = v.getData(); + hostGrid = &grid.template getData< Devices::Host >(); + cudaGrid = &grid.template getData< Devices::Cuda >(); } void reset() @@ -136,9 +138,17 @@ class GridTraversersBenchmark< 1, Device, Real, Index > void writeOneUsingParallelForAndGridEntity() { - auto f = [] __cuda_callable__ ( Index i, Real* data ) + const Grid* currentGrid; + if( std::is_same< Device, Devices::Host >::value ) + currentGrid = hostGrid; + else + currentGrid = cudaGrid; + auto f = [=] __cuda_callable__ ( Index i, Real* data ) { - data[ i ] = +1.0; + Cell entity( *currentGrid ); + entity.getCoordinates().x() = i; + entity.refresh(); + data[ entity.getIndex() ] = +1.0; }; ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } @@ -199,15 +209,17 @@ class GridTraversersBenchmark< 1, Device, Real, Index > ( grid, userData ); } - protected: + protected: - Index size; - Vector v; - Real* v_data; - GridPointer grid; - MeshFunctionPointer u; - Traverser traverser; - WriteOneTraverserUserDataType userData; + Index size; + Vector v; + Real* v_data; + GridPointer grid; + const Grid* hostGrid; + const Grid* cudaGrid; + MeshFunctionPointer u; + Traverser traverser; + WriteOneTraverserUserDataType userData; }; @@ -235,6 +247,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index > { userData.u = this->u; v_data = v.getData(); + hostGrid = &grid.template getData< Devices::Host >(); + cudaGrid = &grid.template getData< Devices::Cuda >(); } void reset() @@ -282,7 +296,7 @@ class 
GridTraversersBenchmark< 2, Device, Real, Index > Index _size = this->size; auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) { - data[ i * _size + j ] = 1.0; + data[ i * _size + j ] += 1.0; }; ParallelFor2D< Device >::exec( ( Index ) 0, @@ -294,10 +308,18 @@ class GridTraversersBenchmark< 2, Device, Real, Index > void writeOneUsingParallelForAndGridEntity() { - Index _size = this->size; + const Grid* currentGrid; + if( std::is_same< Device, Devices::Host >::value ) + currentGrid = hostGrid; + else + currentGrid = cudaGrid; auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) { - data[ i * _size + j ] = 1.0; + Cell entity( *currentGrid ); + entity.getCoordinates().y() = i; + entity.getCoordinates().x() = j; + entity.refresh(); + data[ entity.getIndex() ] += 1.0; }; ParallelFor2D< Device >::exec( ( Index ) 0, @@ -382,6 +404,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index > Vector v; Real* v_data; GridPointer grid; + const Grid* hostGrid; + const Grid* cudaGrid; MeshFunctionPointer u; Traverser traverser; WriteOneTraverserUserDataType userData; @@ -414,6 +438,8 @@ class GridTraversersBenchmark< 3, Device, Real, Index > { userData.u = this->u; v_data = v.getData(); + hostGrid = &grid.template getData< Devices::Host >(); + cudaGrid = &grid.template getData< Devices::Cuda >(); } void reset() @@ -429,7 +455,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > for( int i = 0; i < size; i++ ) for( int j = 0; j < size; j++ ) for( int k = 0; k < size; k++ ) - v_data[ ( i * size + j ) * size + k ] = 1.0; + v_data[ ( i * size + j ) * size + k ] += 1.0; } else // Device == Devices::Cuda { @@ -464,7 +490,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > Index _size = this->size; auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) { - data[ ( i * _size + j ) * _size + k ] = 1.0; + data[ ( i * _size + j ) * _size + k ] += 1.0; }; ParallelFor3D< Device >::exec( ( Index ) 0, @@ -478,10 +504,20 @@ class 
GridTraversersBenchmark< 3, Device, Real, Index > void writeOneUsingParallelForAndGridEntity() { + const Grid* currentGrid; + if( std::is_same< Device, Devices::Host >::value ) + currentGrid = hostGrid; + else + currentGrid = cudaGrid; Index _size = this->size; auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) { - data[ ( i * _size + j ) * _size + k ] = 1.0; + Cell entity( *currentGrid ); + entity.getCoordinates().z() = i; + entity.getCoordinates().y() = j; + entity.getCoordinates().x() = k; + entity.refresh(); + data[ entity.getIndex() ] += 1.0; }; ParallelFor3D< Device >::exec( ( Index ) 0, @@ -581,6 +617,8 @@ class GridTraversersBenchmark< 3, Device, Real, Index > Vector v; Real* v_data; GridPointer grid; + const Grid* hostGrid; + const Grid* cudaGrid; MeshFunctionPointer u; Traverser traverser; WriteOneTraverserUserDataType userData; -- GitLab From fce930fd51d766d461ff86ce6bccd0c570a78767 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 3 Jan 2019 12:43:42 +0100 Subject: [PATCH 093/130] GridTraversersBenchmark.h split into GridTraversersBenchmark_1D.h, GridTraversersBenchmark_2D.h and GridTraversersBenchmark_3D.h.
--- .../Traversers/GridTraversersBenchmark.h | 568 +----------------- 1 file changed, 5 insertions(+), 563 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index ef89bf969..c320dc591 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -56,574 +56,16 @@ class WriteOneUserData MeshFunctionPointer u; }; - template< int Dimension, typename Device, typename Real, typename Index > class GridTraversersBenchmark{}; -template< typename Device, - typename Real, - typename Index > -class GridTraversersBenchmark< 1, Device, Real, Index > -{ - public: - - using Vector = Containers::Vector< Real, Device, Index >; - using Grid = Meshes::Grid< 1, Real, Device, Index >; - using GridPointer = Pointers::SharedPointer< Grid >; - using Coordinates = typename Grid::CoordinatesType; - using MeshFunction = Functions::MeshFunction< Grid >; - using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; - using Traverser = Meshes::Traverser< Grid, Cell >; - using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; - using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; - - GridTraversersBenchmark( Index size ) - :v( size ), size( size ), grid( size ), u( grid ) - { - userData.u = this->u; - v_data = v.getData(); - hostGrid = &grid.template getData< Devices::Host >(); - cudaGrid = &grid.template getData< Devices::Cuda >(); - } - - void reset() - { - v.setValue( 0.0 ); - u->getData().setValue( 0.0 ); - }; - - void writeOneUsingPureC() - { - if( std::is_same< Device, Devices::Host >::value ) - { - for( int i = 0; i < size; i++ ) - v_data[ i ] += 1.0; - } - else // Device == Devices::Cuda - { -#ifdef HAVE_CUDA - dim3 blockSize( 256 ), blocksCount, gridsCount; - 
Devices::Cuda::setupThreads( - blockSize, - blocksCount, - gridsCount, - size ); - dim3 gridIdx; - for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) - { - dim3 gridSize; - Devices::Cuda::setupGrid( - blocksCount, - gridsCount, - gridIdx, - gridSize ); - fullGridTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); - } -#endif - } - } - - void writeOneUsingParallelFor() - { - auto f = [] __cuda_callable__ ( Index i, Real* data ) - { - data[ i ] = +1.0; - }; - ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); - } - - void writeOneUsingParallelForAndGridEntity() - { - const Grid* currentGrid; - if( std::is_same< Device, Devices::Host >::value ) - currentGrid = hostGrid; - else - currentGrid = cudaGrid; - auto f = [=] __cuda_callable__ ( Index i, Real* data ) - { - Cell entity( *currentGrid ); - entity.getCoordinates().x() = i; - entity.refresh(); - data[ entity.getIndex() ] = +1.0; - }; - ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); - } - - void writeOneUsingTraverser() - { - traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > - ( grid, userData ); - } - - void traverseUsingPureC() - { - if( std::is_same< Device, Devices::Host >::value ) - { - v_data[ 0 ] = 2; - for( int i = 1; i < size - 1; i++ ) - v_data[ i ] = 1.0; - v_data[ size - 1 ] = 2; - } - else // Device == Devices::Cuda - { -#ifdef HAVE_CUDA - dim3 blockSize( 256 ), blocksCount, gridsCount; - Devices::Cuda::setupThreads( - blockSize, - blocksCount, - gridsCount, - size ); - dim3 gridIdx; - for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) - { - dim3 gridSize; - Devices::Cuda::setupGrid( - blocksCount, - gridsCount, - gridIdx, - gridSize ); - boundariesTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); - } - for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) - { - dim3 gridSize; - Devices::Cuda::setupGrid( - blocksCount, - gridsCount, - gridIdx, - gridSize ); 
- interiorTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); - } -#endif - } - } - - void traverseUsingTraverser() - { - // TODO !!!!!!!!!!!!!!!!!!!!!! - traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > - ( grid, userData ); - } - - protected: - - Index size; - Vector v; - Real* v_data; - GridPointer grid; - const Grid* hostGrid; - const Grid* cudaGrid; - MeshFunctionPointer u; - Traverser traverser; - WriteOneTraverserUserDataType userData; -}; - - -template< typename Device, - typename Real, - typename Index > -class GridTraversersBenchmark< 2, Device, Real, Index > -{ - public: - - using Vector = Containers::Vector< Real, Device, Index >; - using Grid = Meshes::Grid< 2, Real, Device, Index >; - using GridPointer = Pointers::SharedPointer< Grid >; - using Coordinates = typename Grid::CoordinatesType; - using MeshFunction = Functions::MeshFunction< Grid >; - using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::template EntityType< 2, Meshes::GridEntityNoStencilStorage >; - using Traverser = Meshes::Traverser< Grid, Cell >; - using TraverserUserData = WriteOneUserData< MeshFunctionPointer >; - using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; - using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; - - GridTraversersBenchmark( Index size ) - :size( size ), v( size * size ), grid( size, size ), u( grid ) - { - userData.u = this->u; - v_data = v.getData(); - hostGrid = &grid.template getData< Devices::Host >(); - cudaGrid = &grid.template getData< Devices::Cuda >(); - } - - void reset() - { - v.setValue( 0.0 ); - u->getData().setValue( 0.0 ); - }; - - void writeOneUsingPureC() - { - if( std::is_same< Device, Devices::Host >::value ) - { - for( int i = 0; i < size; i++ ) - for( int j = 0; j < size; j++ ) - v_data[ i * size + j ] = 1.0; - } - else // Device == Devices::Cuda - { 
-#ifdef HAVE_CUDA - dim3 blockSize( 256 ), blocksCount, gridsCount; - Devices::Cuda::setupThreads( - blockSize, - blocksCount, - gridsCount, - size, - size ); - dim3 gridIdx; - for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) - for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) - { - dim3 gridSize; - Devices::Cuda::setupGrid( - blocksCount, - gridsCount, - gridIdx, - gridSize ); - fullGridTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); - } -#endif - } - } - - void writeOneUsingParallelFor() - { - Index _size = this->size; - auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) - { - data[ i * _size + j ] += 1.0; - }; - - ParallelFor2D< Device >::exec( ( Index ) 0, - ( Index ) 0, - this->size, - this->size, - f, v.getData() ); - } - - void writeOneUsingParallelForAndGridEntity() - { - const Grid* currentGrid; - if( std::is_same< Device, Devices::Host >::value ) - currentGrid = hostGrid; - else - currentGrid = cudaGrid; - auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) - { - Cell entity( *currentGrid ); - entity.getCoordinates().y() = i; - entity.getCoordinates().x() = j; - entity.refresh(); - data[ entity.getIndex() ] += 1.0; - }; - - ParallelFor2D< Device >::exec( ( Index ) 0, - ( Index ) 0, - this->size, - this->size, - f, v.getData() ); - } - - void writeOneUsingTraverser() - { - traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > - ( grid, userData ); - } - - void traverseUsingPureC() - { - if( std::is_same< Device, Devices::Host >::value ) - { - for( int i = 0; i < size; i++ ) - { - v_data[ i * size ] = 2.0; - v_data[ i * size + size - 1 ] = 2.0; - } - for( int j = 1; j < size - 1; j++ ) - { - v_data[ j ] = 2.0; - v_data[ ( size - 1 ) * size + j ] = 2.0; - } - - for( int i = 1; i < size - 1; i++ ) - for( int j = 1; j < size - 1; j++ ) - v_data[ i * size + j ] = 1.0; - } - else // Device == Devices::Cuda - { -#ifdef HAVE_CUDA - dim3 blockSize( 
256 ), blocksCount, gridsCount; - Devices::Cuda::setupThreads( - blockSize, - blocksCount, - gridsCount, - size, - size ); - dim3 gridIdx; - for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) - for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) - { - dim3 gridSize; - Devices::Cuda::setupGrid( - blocksCount, - gridsCount, - gridIdx, - gridSize ); - boundariesTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); - } - for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) - for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) - { - dim3 gridSize; - Devices::Cuda::setupGrid( - blocksCount, - gridsCount, - gridIdx, - gridSize ); - interiorTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); - } -#endif - } - } - - void traversingUsingTraverser() - { - // TODO !!!!!!!!!!!!!!!!!!!!!! - traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > - ( grid, userData ); - } - - protected: - - Index size; - Vector v; - Real* v_data; - GridPointer grid; - const Grid* hostGrid; - const Grid* cudaGrid; - MeshFunctionPointer u; - Traverser traverser; - WriteOneTraverserUserDataType userData; -}; - -template< typename Device, - typename Real, - typename Index > -class GridTraversersBenchmark< 3, Device, Real, Index > -{ - public: - - using Vector = Containers::Vector< Real, Device, Index >; - using Grid = Meshes::Grid< 3, Real, Device, Index >; - using GridPointer = Pointers::SharedPointer< Grid >; - using Coordinates = typename Grid::CoordinatesType; - using MeshFunction = Functions::MeshFunction< Grid >; - using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::template EntityType< 3, Meshes::GridEntityNoStencilStorage >; - using Traverser = Meshes::Traverser< Grid, Cell >; - using TraverserUserData = WriteOneUserData< MeshFunctionPointer >; - using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; - using 
WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; - - GridTraversersBenchmark( Index size ) - : size( size ), - v( size * size * size ), - grid( size, size, size ), - u( grid ) - { - userData.u = this->u; - v_data = v.getData(); - hostGrid = &grid.template getData< Devices::Host >(); - cudaGrid = &grid.template getData< Devices::Cuda >(); - } - - void reset() - { - v.setValue( 0.0 ); - u->getData().setValue( 0.0 ); - }; - - void writeOneUsingPureC() - { - if( std::is_same< Device, Devices::Host >::value ) - { - for( int i = 0; i < size; i++ ) - for( int j = 0; j < size; j++ ) - for( int k = 0; k < size; k++ ) - v_data[ ( i * size + j ) * size + k ] += 1.0; - } - else // Device == Devices::Cuda - { -#ifdef HAVE_CUDA - dim3 blockSize( 256 ), blocksCount, gridsCount; - Devices::Cuda::setupThreads( - blockSize, - blocksCount, - gridsCount, - size, - size, - size ); - dim3 gridIdx; - for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ ) - for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) - for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) - { - dim3 gridSize; - Devices::Cuda::setupGrid( - blocksCount, - gridsCount, - gridIdx, - gridSize ); - fullGridTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); - } -#endif - } - } - - void writeOneUsingParallelFor() - { - Index _size = this->size; - auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) - { - data[ ( i * _size + j ) * _size + k ] += 1.0; - }; - - ParallelFor3D< Device >::exec( ( Index ) 0, - ( Index ) 0, - ( Index ) 0, - this->size, - this->size, - this->size, - f, v.getData() ); - } - - void writeOneUsingParallelForAndGridEntity() - { - const Grid* currentGrid; - if( std::is_same< Device, Devices::Host >::value ) - currentGrid = hostGrid; - else - currentGrid = cudaGrid; - Index _size = this->size; - auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) - { - Cell entity( *currentGrid ); - 
entity.getCoordinates().z() = i; - entity.getCoordinates().y() = j; - entity.getCoordinates().x() = k; - entity.refresh(); - data[ entity.getIndex() ] += 1.0; - }; - - ParallelFor3D< Device >::exec( ( Index ) 0, - ( Index ) 0, - ( Index ) 0, - this->size, - this->size, - this->size, - f, v.getData() ); - } - - void writeOneUsingTraverser() - { - traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > - ( grid, userData ); - } - - void traverseUsingPureC() - { - if( std::is_same< Device, Devices::Host >::value ) - { - for( int i = 0; i < size; i++ ) - for( int j = 0; j < size; j++ ) - { - v_data[ ( i * size + j ) * size ] = 2.0; - v_data[ ( i * size + j ) * size + size - 1 ] = 2.0; - } - for( int j = 0; j < size; j++ ) - for( int k = 1; k < size - 1; k++ ) - { - v_data[ j * size + k ] = 1.0; - v_data[ ( ( size - 1) * size + j ) * size + k ] = 1.0; - } - - for( int i = 1; i < size -1; i++ ) - for( int k = 1; k < size - 1; k++ ) - { - v_data[ ( i * size ) * size + k ] = 2.0; - v_data[ ( i * size + size - 1 ) * size + k ] = 2.0; - } - - for( int i = 1; i < size -1; i++ ) - for( int j = 1; j < size -1; j++ ) - for( int k = 1; k < size - 1; k++ ) - v_data[ ( i * size + j ) * size + k ] = 1.0; - } - else // Device == Devices::Cuda - { -#ifdef HAVE_CUDA - dim3 blockSize( 256 ), blocksCount, gridsCount; - Devices::Cuda::setupThreads( - blockSize, - blocksCount, - gridsCount, - size, - size, - size ); - dim3 gridIdx; - for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ ) - for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) - for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) - { - dim3 gridSize; - Devices::Cuda::setupGrid( - blocksCount, - gridsCount, - gridIdx, - gridSize ); - boundariesTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); - } - for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ ) - for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) - for( gridIdx.x = 0; 
gridIdx.x < gridsCount.x; gridIdx.x++ ) - { - dim3 gridSize; - Devices::Cuda::setupGrid( - blocksCount, - gridsCount, - gridIdx, - gridSize ); - interiorTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); - } -#endif - } - } - - void traverseUsingTraverser() - { - // TODO !!!!!!!!!!!!!!!!!!!!!! - traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > - ( grid, userData ); - } - - protected: - - Index size; - Vector v; - Real* v_data; - GridPointer grid; - const Grid* hostGrid; - const Grid* cudaGrid; - MeshFunctionPointer u; - Traverser traverser; - WriteOneTraverserUserDataType userData; -}; - } // namespace Traversers } // namespace Benchmarks -} // namespace TNL \ No newline at end of file +} // namespace TNL + +#include "GridTraversersBenchmark_1D.h" +#include "GridTraversersBenchmark_2D.h" +#include "GridTraversersBenchmark_3D.h" \ No newline at end of file -- GitLab From 1e749ff17520c9ed8a67224ead24b16323e7b5ad Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 3 Jan 2019 12:52:39 +0100 Subject: [PATCH 094/130] GridTraversersBenchmark.h split into GridTraversersBenchmark_1D.h, GridTraversersBenchmark_2D.h and GridTraversersBenchmark_3D.h.
---
 .../Traversers/GridTraversersBenchmark_1D.h   | 208 ++++++++++++++
 .../Traversers/GridTraversersBenchmark_2D.h   | 220 ++++++++++++++++
 .../Traversers/GridTraversersBenchmark_3D.h   | 245 ++++++++++++++++++
 3 files changed, 673 insertions(+)
 create mode 100644 src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
 create mode 100644 src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
 create mode 100644 src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h

diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
new file mode 100644
index 000000000..c270080fc
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h
@@ -0,0 +1,208 @@
+/***************************************************************************
+                          GridTraversersBenchmark_1D.h  -  description
+                             -------------------
+    begin                : Jan 3, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cuda-kernels.h"
+
+namespace TNL {
+   namespace Benchmarks {
+      namespace Traversers {
+
+/**
+ * 1D specialization of GridTraversersBenchmark.
+ *
+ * The *UsingPureC and *UsingParallelFor* methods work on the plain vector `v`;
+ * the *UsingTraverser methods hand `grid` and `userData` (which refers to the
+ * mesh function `u`) to the grid traverser with WriteOneEntitiesProcessor.
+ */
+template< typename Device,
+          typename Real,
+          typename Index >
+class GridTraversersBenchmark< 1, Device, Real, Index >
+{
+   public:
+
+      using Vector = Containers::Vector< Real, Device, Index >;
+      using Grid = Meshes::Grid< 1, Real, Device, Index >;
+      using GridPointer = Pointers::SharedPointer< Grid >;
+      using Coordinates = typename Grid::CoordinatesType;
+      using MeshFunction = Functions::MeshFunction< Grid >;
+      using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >;
+      using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >;
+      using Traverser = Meshes::Traverser< Grid, Cell >;
+      using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
+      using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
+
+      // Initializers are listed in member declaration order (size, v, grid, u), which is the order they run in.
+      GridTraversersBenchmark( Index size )
+      : size( size ), v( size ), grid( size ), u( grid )
+      {
+         userData.u = this->u;
+         v_data = v.getData();
+         hostGrid = &grid.template getData< Devices::Host >();
+         cudaGrid = &grid.template getData< Devices::Cuda >();
+      }
+
+      // Sets both the plain vector `v` and the data of the mesh function `u` to zero.
+      void reset()
+      {
+         v.setValue( 0.0 );
+         u->getData().setValue( 0.0 );
+      };
+
+      // Host: adds 1.0 to v_data[ i ] for every i in [ 0, size ) with a plain loop.
+      // CUDA: launches fullGridTraverseKernel1D (defined elsewhere, presumably in cuda-kernels.h) once per CUDA grid.
+      void writeOneUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               v_data[ i ] += 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               fullGridTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+            }
+#endif
+         }
+      }
+
+      // Adds 1.0 to the elements of `v` through ParallelFor< Device >::exec called with the bounds 0 and size.
+      void writeOneUsingParallelFor()
+      {
+         auto f = [] __cuda_callable__ ( Index i, Real* data )
+         {
+            data[ i ] += 1.0;   // accumulate, as writeOneUsingPureC() does (was `= +1.0`)
+         };
+         ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
+      }
+
+      // Same update as writeOneUsingParallelFor(), but the element index is taken from a
+      // Cell entity that the lambda builds on the grid pointer matching Device.
+      void writeOneUsingParallelForAndGridEntity()
+      {
+         const Grid* currentGrid;
+         if( std::is_same< Device, Devices::Host >::value )
+            currentGrid = hostGrid;
+         else
+            currentGrid = cudaGrid;
+         auto f = [=] __cuda_callable__ ( Index i, Real* data )
+         {
+            Cell entity( *currentGrid );
+            entity.getCoordinates().x() = i;
+            entity.refresh();
+            data[ entity.getIndex() ] += 1.0;   // accumulate, as writeOneUsingPureC() does (was `= +1.0`)
+         };
+         ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
+      }
+
+      // Runs WriteOneEntitiesProcessorType over all entities of `grid` with `userData`.
+      void writeOneUsingTraverser()
+      {
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
+      }
+
+      // Host: writes 2 to the first and the last element of `v` and 1.0 to the elements in between.
+      // CUDA: launches boundariesTraverseKernel1D and then interiorTraverseKernel1D, each once per CUDA grid.
+      void traverseUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            v_data[ 0 ] = 2;
+            for( int i = 1; i < size - 1; i++ )
+               v_data[ i ] = 1.0;
+            v_data[ size - 1 ] = 2;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               boundariesTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+            }
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               interiorTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+            }
+#endif
+         }
+      }
+
+      void traverseUsingTraverser()
+      {
+         // TODO !!!!!!!!!!!!!!!!!!!!!!
+         traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType >
+            ( grid, userData );
+      }
+
+   protected:
+
+      Index size;
+      Vector v;
+      Real* v_data;
+      GridPointer grid;
+      const Grid* hostGrid;
+      const Grid* cudaGrid;
+      MeshFunctionPointer u;
+      Traverser traverser;
+      WriteOneTraverserUserDataType userData;
+};
+
+      } // namespace Traversers
+   } // namespace Benchmarks
+} // namespace TNL
\ No newline at end of file
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
new file mode 100644
index 000000000..d8823c335
--- /dev/null
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h
@@ -0,0 +1,220 @@
+/***************************************************************************
+                          GridTraversersBenchmark_2D.h  -  description
+                             -------------------
+    begin                : Jan 3, 2019
+    copyright            : (C) 2019 by oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda-kernels.h" + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +template< typename Device, + typename Real, + typename Index > +class GridTraversersBenchmark< 2, Device, Real, Index > +{ + public: + + using Vector = Containers::Vector< Real, Device, Index >; + using Grid = Meshes::Grid< 2, Real, Device, Index >; + using GridPointer = Pointers::SharedPointer< Grid >; + using Coordinates = typename Grid::CoordinatesType; + using MeshFunction = Functions::MeshFunction< Grid >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using Cell = typename Grid::template EntityType< 2, Meshes::GridEntityNoStencilStorage >; + using Traverser = Meshes::Traverser< Grid, Cell >; + using TraverserUserData = WriteOneUserData< MeshFunctionPointer >; + using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; + using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; + + GridTraversersBenchmark( Index size ) + :size( size ), v( size * size ), grid( size, size ), u( grid ) + { + userData.u = this->u; + v_data = v.getData(); + hostGrid = &grid.template getData< Devices::Host >(); + cudaGrid = &grid.template getData< Devices::Cuda >(); + } + + void reset() + { + v.setValue( 0.0 ); + u->getData().setValue( 0.0 ); + }; + + void writeOneUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + for( int i = 0; i < size; i++ ) + for( int j = 0; j < size; j++ ) + v_data[ i * size + j ] = 1.0; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size, + size ); + dim3 gridIdx; + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + 
for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + fullGridTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } + } + + void writeOneUsingParallelFor() + { + Index _size = this->size; + auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) + { + data[ i * _size + j ] += 1.0; + }; + + ParallelFor2D< Device >::exec( ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + f, v.getData() ); + } + + void writeOneUsingParallelForAndGridEntity() + { + const Grid* currentGrid; + if( std::is_same< Device, Devices::Host >::value ) + currentGrid = hostGrid; + else + currentGrid = cudaGrid; + auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) + { + Cell entity( *currentGrid ); + entity.getCoordinates().y() = i; + entity.getCoordinates().x() = j; + entity.refresh(); + data[ entity.getIndex() ] += 1.0; + }; + + ParallelFor2D< Device >::exec( ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + f, v.getData() ); + } + + void writeOneUsingTraverser() + { + traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + ( grid, userData ); + } + + void traverseUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + for( int i = 0; i < size; i++ ) + { + v_data[ i * size ] = 2.0; + v_data[ i * size + size - 1 ] = 2.0; + } + for( int j = 1; j < size - 1; j++ ) + { + v_data[ j ] = 2.0; + v_data[ ( size - 1 ) * size + j ] = 2.0; + } + + for( int i = 1; i < size - 1; i++ ) + for( int j = 1; j < size - 1; j++ ) + v_data[ i * size + j ] = 1.0; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size, + size ); + dim3 gridIdx; + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < 
gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + boundariesTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + interiorTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } + } + + void traversingUsingTraverser() + { + // TODO !!!!!!!!!!!!!!!!!!!!!! + traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + ( grid, userData ); + } + + protected: + + Index size; + Vector v; + Real* v_data; + GridPointer grid; + const Grid* hostGrid; + const Grid* cudaGrid; + MeshFunctionPointer u; + Traverser traverser; + WriteOneTraverserUserDataType userData; +}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL \ No newline at end of file diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h new file mode 100644 index 000000000..8f3a55e19 --- /dev/null +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h @@ -0,0 +1,245 @@ +/*************************************************************************** + GridTraversersBenchmark_3D.h - description + ------------------- + begin : Jan 3, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda-kernels.h" + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +template< typename Device, + typename Real, + 
typename Index > +class GridTraversersBenchmark< 3, Device, Real, Index > +{ + public: + + using Vector = Containers::Vector< Real, Device, Index >; + using Grid = Meshes::Grid< 3, Real, Device, Index >; + using GridPointer = Pointers::SharedPointer< Grid >; + using Coordinates = typename Grid::CoordinatesType; + using MeshFunction = Functions::MeshFunction< Grid >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using Cell = typename Grid::template EntityType< 3, Meshes::GridEntityNoStencilStorage >; + using Traverser = Meshes::Traverser< Grid, Cell >; + using TraverserUserData = WriteOneUserData< MeshFunctionPointer >; + using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; + using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; + + GridTraversersBenchmark( Index size ) + : size( size ), + v( size * size * size ), + grid( size, size, size ), + u( grid ) + { + userData.u = this->u; + v_data = v.getData(); + hostGrid = &grid.template getData< Devices::Host >(); + cudaGrid = &grid.template getData< Devices::Cuda >(); + } + + void reset() + { + v.setValue( 0.0 ); + u->getData().setValue( 0.0 ); + }; + + void writeOneUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + for( int i = 0; i < size; i++ ) + for( int j = 0; j < size; j++ ) + for( int k = 0; k < size; k++ ) + v_data[ ( i * size + j ) * size + k ] += 1.0; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size, + size, + size ); + dim3 gridIdx; + for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ ) + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + fullGridTraverseKernel3D<<< 
gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } + } + + void writeOneUsingParallelFor() + { + Index _size = this->size; + auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) + { + data[ ( i * _size + j ) * _size + k ] += 1.0; + }; + + ParallelFor3D< Device >::exec( ( Index ) 0, + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + this->size, + f, v.getData() ); + } + + void writeOneUsingParallelForAndGridEntity() + { + const Grid* currentGrid; + if( std::is_same< Device, Devices::Host >::value ) + currentGrid = hostGrid; + else + currentGrid = cudaGrid; + Index _size = this->size; + auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) + { + Cell entity( *currentGrid ); + entity.getCoordinates().z() = i; + entity.getCoordinates().y() = j; + entity.getCoordinates().x() = k; + entity.refresh(); + data[ entity.getIndex() ] += 1.0; + }; + + ParallelFor3D< Device >::exec( ( Index ) 0, + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + this->size, + f, v.getData() ); + } + + void writeOneUsingTraverser() + { + traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + ( grid, userData ); + } + + void traverseUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + for( int i = 0; i < size; i++ ) + for( int j = 0; j < size; j++ ) + { + v_data[ ( i * size + j ) * size ] = 2.0; + v_data[ ( i * size + j ) * size + size - 1 ] = 2.0; + } + for( int j = 0; j < size; j++ ) + for( int k = 1; k < size - 1; k++ ) + { + v_data[ j * size + k ] = 1.0; + v_data[ ( ( size - 1) * size + j ) * size + k ] = 1.0; + } + + for( int i = 1; i < size -1; i++ ) + for( int k = 1; k < size - 1; k++ ) + { + v_data[ ( i * size ) * size + k ] = 2.0; + v_data[ ( i * size + size - 1 ) * size + k ] = 2.0; + } + + for( int i = 1; i < size -1; i++ ) + for( int j = 1; j < size -1; j++ ) + for( int k = 1; k < size - 1; k++ ) + v_data[ ( i * size + j ) * size + k ] = 1.0; + 
} + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size, + size, + size ); + dim3 gridIdx; + for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ ) + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + boundariesTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } + for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ ) + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + interiorTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } + } + + void traverseUsingTraverser() + { + // TODO !!!!!!!!!!!!!!!!!!!!!! + traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + ( grid, userData ); + } + + protected: + + Index size; + Vector v; + Real* v_data; + GridPointer grid; + const Grid* hostGrid; + const Grid* cudaGrid; + MeshFunctionPointer u; + Traverser traverser; + WriteOneTraverserUserDataType userData; +}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL \ No newline at end of file -- GitLab From e9ff6904e96fbf25df1e70f050866d9b22ab1f73 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 3 Jan 2019 12:52:59 +0100 Subject: [PATCH 095/130] Deleting old code. 
--- .../Meshes/GridDetails/GridTraverser_impl.h | 28 +------------------ 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h index ba6ab7e9b..e8e96b42e 100644 --- a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h +++ b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h @@ -54,15 +54,6 @@ processEntities( } else { - //TODO: This does not work with gcc-5.4 and older, should work at gcc 6.x -/*#pragma omp parallel for firstprivate( entity, begin, end ) if( Devices::Host::isOMPEnabled() ) - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - }*/ #ifdef HAVE_OPENMP if( Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 ) { @@ -95,23 +86,6 @@ processEntities( EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); } #endif - -/* -#pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() ) -#endif - { - GridEntity entity( *gridPointer ); -#ifdef HAVE_OPENMP -#pragma omp for -#endif - for( IndexType x = begin.x(); x <= end.x(); x ++ ) - { - entity.getCoordinates().x() = x; - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - }*/ - } } @@ -385,7 +359,7 @@ processEntities( entity.getCoordinates().y() = y; entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } + } } } } -- GitLab From 69c8055b774bb10837d2f98a80a9905dc5b9a4bc Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 3 Jan 2019 13:24:28 +0100 Subject: [PATCH 096/130] Added traversers benchmark test with mesh function. 
--- .../Traversers/GridTraversersBenchmark_1D.h | 24 ++++++++----- .../Traversers/GridTraversersBenchmark_2D.h | 31 +++++++++++----- .../Traversers/GridTraversersBenchmark_3D.h | 35 +++++++++++++------ .../Traversers/tnl-benchmark-traversers.h | 29 +++++++++++++++ 4 files changed, 91 insertions(+), 28 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h index c270080fc..32cdc3229 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -50,8 +50,6 @@ class GridTraversersBenchmark< 1, Device, Real, Index > { userData.u = this->u; v_data = v.getData(); - hostGrid = &grid.template getData< Devices::Host >(); - cudaGrid = &grid.template getData< Devices::Cuda >(); } void reset() @@ -102,11 +100,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > void writeOneUsingParallelForAndGridEntity() { - const Grid* currentGrid; - if( std::is_same< Device, Devices::Host >::value ) - currentGrid = hostGrid; - else - currentGrid = cudaGrid; + const Grid* currentGrid = &grid.template getData< Device >(); auto f = [=] __cuda_callable__ ( Index i, Real* data ) { Cell entity( *currentGrid ); @@ -117,6 +111,20 @@ class GridTraversersBenchmark< 1, Device, Real, Index > ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } + void writeOneUsingParallelForAndMeshFunction() + { + const Grid* currentGrid = &grid.template getData< Device >(); + MeshFunction* _u = &u.template modifyData< Device >(); + auto f = [=] __cuda_callable__ ( Index i, Real* data ) + { + Cell entity( *currentGrid ); + entity.getCoordinates().x() = i; + entity.refresh(); + ( *_u )( entity ) = +1.0; + }; + ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); + } + void writeOneUsingTraverser() { traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > @@ -179,8 +187,6 @@ class 
GridTraversersBenchmark< 1, Device, Real, Index > Vector v; Real* v_data; GridPointer grid; - const Grid* hostGrid; - const Grid* cudaGrid; MeshFunctionPointer u; Traverser traverser; WriteOneTraverserUserDataType userData; diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h index d8823c335..cc360c349 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h @@ -51,8 +51,6 @@ class GridTraversersBenchmark< 2, Device, Real, Index > { userData.u = this->u; v_data = v.getData(); - hostGrid = &grid.template getData< Devices::Host >(); - cudaGrid = &grid.template getData< Devices::Cuda >(); } void reset() @@ -112,11 +110,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > void writeOneUsingParallelForAndGridEntity() { - const Grid* currentGrid; - if( std::is_same< Device, Devices::Host >::value ) - currentGrid = hostGrid; - else - currentGrid = cudaGrid; + const Grid* currentGrid = &grid.template getData< Device >(); auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) { Cell entity( *currentGrid ); @@ -133,6 +127,27 @@ class GridTraversersBenchmark< 2, Device, Real, Index > f, v.getData() ); } + void writeOneUsingParallelForAndMeshFunction() + { + const Grid* currentGrid = &grid.template getData< Device >(); + MeshFunction* _u = &u.template modifyData< Device >(); + auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) + { + Cell entity( *currentGrid ); + entity.getCoordinates().y() = i; + entity.getCoordinates().x() = j; + entity.refresh(); + ( *_u )( entity ) += 1.0; + }; + + ParallelFor2D< Device >::exec( ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + f, v.getData() ); + } + + void writeOneUsingTraverser() { traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > @@ -208,8 +223,6 @@ class GridTraversersBenchmark< 2, Device, Real, Index > 
Vector v; Real* v_data; GridPointer grid; - const Grid* hostGrid; - const Grid* cudaGrid; MeshFunctionPointer u; Traverser traverser; WriteOneTraverserUserDataType userData; diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h index 8f3a55e19..07ea6e5f8 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h @@ -54,8 +54,6 @@ class GridTraversersBenchmark< 3, Device, Real, Index > { userData.u = this->u; v_data = v.getData(); - hostGrid = &grid.template getData< Devices::Host >(); - cudaGrid = &grid.template getData< Devices::Cuda >(); } void reset() @@ -120,12 +118,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > void writeOneUsingParallelForAndGridEntity() { - const Grid* currentGrid; - if( std::is_same< Device, Devices::Host >::value ) - currentGrid = hostGrid; - else - currentGrid = cudaGrid; - Index _size = this->size; + const Grid* currentGrid = &grid.template getData< Device >(); auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) { Cell entity( *currentGrid ); @@ -145,6 +138,30 @@ class GridTraversersBenchmark< 3, Device, Real, Index > f, v.getData() ); } + void writeOneUsingParallelForAndMeshFunction() + { + const Grid* currentGrid = &grid.template getData< Device >(); + MeshFunction* _u = &u.template modifyData< Device >(); + auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) + { + Cell entity( *currentGrid ); + entity.getCoordinates().z() = i; + entity.getCoordinates().y() = j; + entity.getCoordinates().x() = k; + entity.refresh(); + ( *_u )( entity ) += 1.0; + }; + + ParallelFor3D< Device >::exec( ( Index ) 0, + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + this->size, + f, v.getData() ); + } + + void writeOneUsingTraverser() { traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > @@ -233,8 +250,6 
@@ class GridTraversersBenchmark< 3, Device, Real, Index > Vector v; Real* v_data; GridPointer grid; - const Grid* hostGrid; - const Grid* cudaGrid; MeshFunctionPointer u; Traverser traverser; WriteOneTraverserUserDataType userData; diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 9f7920e3c..56fbc151c 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -162,6 +162,35 @@ bool runBenchmark( const Config::ParameterContainer& parameters, #endif } + /**** + * Write one using parallel for with mesh function + */ + if( tests == "all" || tests == "no-bc-parallel-for-and-mesh-function" ) + { + auto hostWriteOneUsingParallelForAndMeshFunction = [&] () + { + hostTraverserBenchmark.writeOneUsingParallelForAndMeshFunction(); + }; + benchmark.setOperation( "par.for+mesh fc.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelForAndMeshFunction ); + +#ifdef HAVE_CUDA + auto cudaWriteOneUsingParallelForAndMeshFunction = [&] () + { + cudaTraverserBenchmark.writeOneUsingParallelForAndMeshFunction(); + }; + if( withCuda ) + benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelForAndGridMeshFunction ); +#endif + + benchmark.setOperation( "par.for+mesh fc. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction ); +#ifdef HAVE_CUDA + if( withCuda ) + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndMeshFunction ); +#endif + } + /**** * Write one using traverser */ -- GitLab From 6e91b1726961c57afb97a87035614900cb1b6986 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 3 Jan 2019 19:41:52 +0100 Subject: [PATCH 097/130] Added configuration parameter 'reset' to Benchmark. 
--- src/Benchmarks/Benchmarks.h | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 71f808ad8..f31e21f6c 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -73,6 +73,7 @@ public: static void configSetup( Config::ConfigDescription& config ) { config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); + config.addEntry< bool >( "reset", "Call reset function between loops.", true ); config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 1 ); config.addEntry< bool >( "timing", "Turns off (or on) the timing (for the purpose of profiling).", true ); config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 ); @@ -81,6 +82,7 @@ public: void setup( const Config::ParameterContainer& parameters ) { this->loops = parameters.getParameter< unsigned >( "loops" ); + this->reset = parameters.getParameter< bool >( "reset" ); this->minTime = parameters.getParameter< double >( "min-time" ); this->timing = parameters.getParameter< bool >( "timing" ); const int verbose = parameters.getParameter< unsigned >( "verbose" ); @@ -114,8 +116,11 @@ public: { closeTable(); writeTitle( title ); - // add loops to metadata + // add loops and reset flag to metadata metadata["loops"] = convertToString(loops); + metadata["reset"] = convertToString( reset ); + metadata["minimal test time"] = convertToString( minTime ); + metadata["timing"] = convertToString( timing ); writeMetadata( metadata ); } @@ -202,15 +207,27 @@ public: // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); if( this->timing ) - result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + if( this->reset ) + result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + 
else + result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor ); else - result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + if( this->reset ) + result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + else + result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor ); } else { if( this->timing ) - result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + if( this->reset ) + result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + else + result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor ); else - result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + if( this->reset ) + result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + else + result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor ); } } catch ( const std::exception& e ) { @@ -319,6 +336,7 @@ protected: double datasetSize = 0.0; double baseTime = 0.0; bool timing = true; + bool reset = true; Solvers::IterativeSolverMonitor< double, int > monitor; }; -- GitLab From bb7d26648cf5a89b5d75897afe0ccc9d23bc0f14 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 3 Jan 2019 21:46:05 +0100 Subject: [PATCH 098/130] Optimized conditional OpenMP traversing in 2D and 3D grid traversers - cells only. 
--- .../Meshes/GridDetails/GridTraverser_impl.h | 162 +++++++++++------- 1 file changed, 101 insertions(+), 61 deletions(-) diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h index e8e96b42e..33b5e22eb 100644 --- a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h +++ b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h @@ -58,30 +58,35 @@ processEntities( if( Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 ) { #pragma omp parallel firstprivate( begin, end ) - GridEntity entity( *gridPointer ); -#pragma omp for - for( IndexType x = begin.x(); x <= end.x(); x ++ ) { - entity.getCoordinates().x() = x; - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + GridEntity entity( *gridPointer ); +#pragma omp for + // TODO: g++ 5.5 crashes when coding this loop without auxiliary x as bellow + for( IndexType x = begin.x(); x <= end.x(); x++ ) + { + entity.getCoordinates().x() = x; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } } } else { GridEntity entity( *gridPointer ); - for( IndexType x = begin.x(); x <= end.x(); x ++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) { - entity.getCoordinates().x() = x; entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); } } #else GridEntity entity( *gridPointer ); - for( IndexType x = begin.x(); x <= end.x(); x ++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) { - entity.getCoordinates().x() = x; entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); } @@ -332,35 +337,51 @@ processEntities( } else { - //TODO: This does not work with gcc-5.4 and older, should work at gcc 6.x -/*#pragma omp parallel for firstprivate( entity, begin, end ) if( 
Devices::Host::isOMPEnabled() ) - for( entity.getCoordinates().y() = begin.y(); - entity.getCoordinates().y() <= end.y(); - entity.getCoordinates().y() ++ ) - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - }*/ #ifdef HAVE_OPENMP -#pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() ) -#endif + if( Devices::Host::isOMPEnabled() ) { - GridEntity entity( *gridPointer, begin, gridEntityParameters... ); -#ifdef HAVE_OPENMP -#pragma omp for -#endif - for( IndexType y = begin.y(); y <= end.y(); y ++ ) - for( IndexType x = begin.x(); x <= end.x(); x ++ ) - { - entity.getCoordinates().x() = x; - entity.getCoordinates().y() = y; - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } +#pragma omp parallel firstprivate( begin, end ) + { + GridEntity entity( *gridPointer ); +#pragma omp for + // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow + for( IndexType y = begin.y(); y <= end.y(); y ++ ) + for( IndexType x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.getCoordinates().y() = y; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } + } + else + { + GridEntity entity( *gridPointer ); + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } } +#else + GridEntity entity( *gridPointer ); + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + for( 
entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +#endif } } @@ -426,7 +447,7 @@ GridTraverser2DBoundaryAlongX( typename GridType::CoordinatesType coordinates; coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.y() = fixedY; + coordinates.y() = fixedY; if( coordinates.x() <= endX ) { @@ -436,7 +457,7 @@ GridTraverser2DBoundaryAlongX( ( *grid, userData, entity ); - } + } } // Boundary traverser using streams @@ -648,7 +669,7 @@ processEntities( if( processOnlyBoundaryEntities && ( GridEntity::getEntityDimension() == 2 || GridEntity::getEntityDimension() == 0 ) ) { -#ifdef GRID_TRAVERSER_USE_STREAMS +#ifdef GRID_TRAVERSER_USE_STREAMS dim3 cudaBlockSize( 256 ); dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX, cudaBlocksCountAlongY, cudaGridsCountAlongY; @@ -960,8 +981,45 @@ processEntities( } else { - // TODO: this does not work with gcc-5.4 and older, should work at gcc 6.x -/*#pragma omp parallel for firstprivate( entity, begin, end ) if( Devices::Host::isOMPEnabled() ) +#ifdef HAVE_OPENMP + if( Devices::Host::isOMPEnabled() ) + { +#pragma omp parallel firstprivate( begin, end ) + { + GridEntity entity( *gridPointer ); +#pragma omp for + // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow + for( IndexType z = begin.z(); z <= end.z(); z ++ ) + for( IndexType y = begin.y(); y <= end.y(); y ++ ) + for( IndexType x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.getCoordinates().y() = y; + entity.getCoordinates().z() = z; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } + } + else + { + GridEntity entity( *gridPointer ); + for( entity.getCoordinates().z() = begin.z(); + entity.getCoordinates().z() <= end.z(); + entity.getCoordinates().z() ++ ) + 
for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } +#else + GridEntity entity( *gridPointer ); for( entity.getCoordinates().z() = begin.z(); entity.getCoordinates().z() <= end.z(); entity.getCoordinates().z() ++ ) @@ -971,29 +1029,11 @@ processEntities( for( entity.getCoordinates().x() = begin.x(); entity.getCoordinates().x() <= end.x(); entity.getCoordinates().x() ++ ) - { - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - }*/ -#ifdef HAVE_OPENMP -#pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() ) -#endif - { - GridEntity entity( *gridPointer, begin, gridEntityParameters... ); -#ifdef HAVE_OPENMP -#pragma omp for -#endif - for( IndexType z = begin.z(); z <= end.z(); z ++ ) - for( IndexType y = begin.y(); y <= end.y(); y ++ ) - for( IndexType x = begin.x(); x <= end.x(); x ++ ) { - entity.getCoordinates().x() = x; - entity.getCoordinates().y() = y; - entity.getCoordinates().z() = z; entity.refresh(); EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - } + } +#endif } } -- GitLab From 7122a97826dc92a217b245ae656750e088dc40b2 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 3 Jan 2019 21:47:44 +0100 Subject: [PATCH 099/130] Analyzing grid traversers. 
--- .../Traversers/GridTraversersBenchmark_1D.h | 29 ++++++++++--- .../Traversers/GridTraversersBenchmark_2D.h | 21 +++++++++- .../Traversers/GridTraversersBenchmark_3D.h | 2 +- .../Traversers/tnl-benchmark-traversers.h | 41 ++++++++++--------- 4 files changed, 66 insertions(+), 27 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h index 32cdc3229..91097ecac 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -22,6 +22,7 @@ #include #include #include "cuda-kernels.h" +#include "GridTraversersBenchmark.h" namespace TNL { namespace Benchmarks { @@ -46,7 +47,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; GridTraversersBenchmark( Index size ) - :v( size ), size( size ), grid( size ), u( grid ) + :size( size ), v( size ), grid( size ), u( grid ) { userData.u = this->u; v_data = v.getData(); @@ -93,7 +94,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > { auto f = [] __cuda_callable__ ( Index i, Real* data ) { - data[ i ] = +1.0; + data[ i ] += 1.0; }; ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } @@ -106,7 +107,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > Cell entity( *currentGrid ); entity.getCoordinates().x() = i; entity.refresh(); - data[ entity.getIndex() ] = +1.0; + data[ entity.getIndex() ] += 1.0; }; ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } @@ -117,18 +118,36 @@ class GridTraversersBenchmark< 1, Device, Real, Index > MeshFunction* _u = &u.template modifyData< Device >(); auto f = [=] __cuda_callable__ ( Index i, Real* data ) { - Cell entity( *currentGrid ); + Cell entity( grid.template getData< Device >() ); entity.getCoordinates().x() = i; entity.refresh(); - ( *_u )( entity ) = +1.0; + //( *_u )( entity ) += 1.0; 
+ WriteOneEntitiesProcessorType::processEntity( *currentGrid, userData, entity ); }; ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } void writeOneUsingTraverser() { + using CoordinatesType = typename Grid::CoordinatesType; traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > ( grid, userData ); + + /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >( + grid, + CoordinatesType( 0 ), + grid->getDimensions() - CoordinatesType( 1 ), + userData );*/ + /*const CoordinatesType begin( 0 ); + const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); + MeshFunction* _u = &u.template modifyData< Device >(); + Cell entity( *grid ); + for( Index x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.refresh(); + WriteOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity ); + }*/ } void traverseUsingPureC() diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h index cc360c349..d62d56f91 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h @@ -65,7 +65,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > { for( int i = 0; i < size; i++ ) for( int j = 0; j < size; j++ ) - v_data[ i * size + j ] = 1.0; + v_data[ i * size + j ] += 1.0; } else // Device == Devices::Cuda { @@ -150,8 +150,27 @@ class GridTraversersBenchmark< 2, Device, Real, Index > void writeOneUsingTraverser() { + using CoordinatesType = typename Grid::CoordinatesType; traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > ( grid, userData ); + + /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >( + grid, + 
CoordinatesType( 0 ), + grid->getDimensions() - CoordinatesType( 1 ), + userData );*/ + /*const CoordinatesType begin( 0 ); + const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); + MeshFunction* _u = &u.template modifyData< Device >(); + Cell entity( *grid ); + for( Index y = begin.y(); y <= end.y(); y ++ ) + for( Index x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.getCoordinates().y() = y; + entity.refresh(); + WriteOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity ); + }*/ } void traverseUsingPureC() diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h index 07ea6e5f8..383640d39 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h @@ -252,7 +252,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > GridPointer grid; MeshFunctionPointer u; Traverser traverser; - WriteOneTraverserUserDataType userData; + WriteOneTraverserUserDataType userData; }; } // namespace Traversers diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 56fbc151c..96a131f48 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -39,8 +39,8 @@ bool runBenchmark( const Config::ParameterContainer& parameters, // to pass 64-bit integer values // const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" ); // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); - const int minSize = parameters.getParameter< int >( "min-size" ); - const int maxSize = parameters.getParameter< int >( "max-size" ); + const std::size_t minSize = parameters.getParameter< int >( "min-size" ); + const std::size_t maxSize = parameters.getParameter< int >( "max-size" ); #ifdef HAVE_CUDA const 
bool withCuda = parameters.getParameter< bool >( "with-cuda" ); #else @@ -85,7 +85,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, { hostTraverserBenchmark.writeOneUsingPureC(); }; - benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingPureC ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); #ifdef HAVE_CUDA auto cudaWriteOneUsingPureC = [&] () @@ -95,13 +95,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( withCuda ) benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC ); #endif - benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + /*benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); #ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); -#endif +#endif*/ } /**** @@ -115,7 +115,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, { hostTraverserBenchmark.writeOneUsingParallelFor(); }; - benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelFor ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA auto cudaWriteOneUsingParallelFor = [&] () @@ -123,14 +123,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.writeOneUsingParallelFor(); }; if( withCuda ) - benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelFor ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); #endif - benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + /*benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", 
hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); -#endif +#endif*/ } /**** @@ -143,7 +143,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, hostTraverserBenchmark.writeOneUsingParallelForAndGridEntity(); }; benchmark.setOperation( "par.for+grid ent.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelForAndGridEntity ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity ); #ifdef HAVE_CUDA auto cudaWriteOneUsingParallelForAndGridEntity = [&] () @@ -151,15 +151,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.writeOneUsingParallelForAndGridEntity(); }; if( withCuda ) - benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelForAndGridEntity ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity ); #endif - benchmark.setOperation( "par.for+grid ent. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + /*benchmark.setOperation( "par.for+grid ent. 
RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity ); #ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity ); -#endif +#endif*/ } /**** @@ -172,7 +172,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, hostTraverserBenchmark.writeOneUsingParallelForAndMeshFunction(); }; benchmark.setOperation( "par.for+mesh fc.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingParallelForAndMeshFunction ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction ); #ifdef HAVE_CUDA auto cudaWriteOneUsingParallelForAndMeshFunction = [&] () @@ -180,15 +180,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.writeOneUsingParallelForAndMeshFunction(); }; if( withCuda ) - benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingParallelForAndGridMeshFunction ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridMeshFunction ); #endif - benchmark.setOperation( "par.for+mesh fc. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + /*benchmark.setOperation( "par.for+mesh fc. 
RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction ); #ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndMeshFunction ); -#endif +#endif*/ } /**** @@ -211,14 +211,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser ); #endif - +/* benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostWriteOneUsingTraverser ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser ); #ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingTraverser ); -#endif +#endif*/ } + std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl; } /**** -- GitLab From 6b7abdc2b362554795fdd490fa0a93c9a4158901 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 3 Jan 2019 21:48:16 +0100 Subject: [PATCH 100/130] Refactoring. 
--- src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h b/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h index c0ddcf2da..448c7bc8b 100644 --- a/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h +++ b/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h @@ -146,7 +146,7 @@ processAllEntities( gridPointer, CoordinatesType( 0 ), gridPointer->getDimensions() - CoordinatesType( 1 ), - userData ); + userData ); } else //Distributed { -- GitLab From 3bd901a53cb5503bfe20f41500c967b304ae55b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 4 Jan 2019 12:00:24 +0100 Subject: [PATCH 101/130] Added method containsValue to List. --- src/TNL/Containers/List.h | 9 ++++++++- src/TNL/Containers/List_impl.h | 8 ++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/TNL/Containers/List.h b/src/TNL/Containers/List.h index 2c175bcce..0cf6f762d 100644 --- a/src/TNL/Containers/List.h +++ b/src/TNL/Containers/List.h @@ -109,6 +109,13 @@ template< class T > class List template< typename Array > void toArray( Array& array ); + /*** + * \brief Checks if there is an element with value \e v in given array. + * + * \param v Reference to a value. + */ + bool containsValue( const T& v ) const; + /// Erases data element at given position. /// /// \param ind Index of the data element one chooses to remove. @@ -146,7 +153,7 @@ template< class T > class List /// /// \param file Name of file. bool DeepLoad( File& file ); - + protected: /// Pointer to the first element. 
ListDataElement< T >* first; diff --git a/src/TNL/Containers/List_impl.h b/src/TNL/Containers/List_impl.h index e67be136c..36fd5dbdc 100644 --- a/src/TNL/Containers/List_impl.h +++ b/src/TNL/Containers/List_impl.h @@ -207,6 +207,14 @@ void List< T >::toArray( Array& array ) for( int i = 0; i < this->getSize(); i++ ) array[ i ] = ( *this )[ i ]; } +template< typename T > +bool List< T >::containsValue( const T& v ) const +{ + for( int i = 0; i < this->getSize(); i++ ) + if( ( *this )[ i ] == v ) + return true; + return false; +} template< typename T > void List< T >::Erase( const int& ind ) -- GitLab From 733c42e8d8ee868ae794a424ae1ef67197fdf54e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 4 Jan 2019 12:00:53 +0100 Subject: [PATCH 102/130] Traversers benchmark tests can be configures as list of tests. --- .../Traversers/tnl-benchmark-traversers.h | 56 ++++--------------- 1 file changed, 12 insertions(+), 44 deletions(-) diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 96a131f48..fd14ba25c 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -20,6 +20,7 @@ #include #include #include +#include using namespace TNL; using namespace TNL::Benchmarks; @@ -33,7 +34,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, Benchmark& benchmark, Benchmark::MetadataMap& metadata ) { - const String tests = parameters.getParameter< String >( "tests" ); + const Containers::List< String >& tests = parameters.getParameter< Containers::List< String > >( "tests" ); // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(), // which have a default value. 
The workaround below works for int values, but it is not possible // to pass 64-bit integer values @@ -77,7 +78,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, /**** * Write one using C for */ - if( tests == "all" || tests == "no-bc-pure-c") + if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-pure-c" ) ) { benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); @@ -95,19 +96,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( withCuda ) benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC ); #endif - /*benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); - -#ifdef HAVE_CUDA - if( withCuda ) - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); -#endif*/ } /**** * Write one using parallel for */ - if( tests == "all" || tests == "no-bc-parallel-for" ) + if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for" ) ) { benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); @@ -125,18 +119,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); #endif - /*benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); -#ifdef HAVE_CUDA - if( withCuda ) - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); -#endif*/ } /**** * Write one using parallel for with grid entity */ - if( tests == "all" || tests == "no-bc-parallel-for-and-grid-entity" ) + if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for-and-grid-entity" ) ) 
{ auto hostWriteOneUsingParallelForAndGridEntity = [&] () { @@ -153,19 +141,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity ); #endif - - /*benchmark.setOperation( "par.for+grid ent. RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity ); -#ifdef HAVE_CUDA - if( withCuda ) - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity ); -#endif*/ } /**** * Write one using parallel for with mesh function */ - if( tests == "all" || tests == "no-bc-parallel-for-and-mesh-function" ) + if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for-and-mesh-function" ) ) { auto hostWriteOneUsingParallelForAndMeshFunction = [&] () { @@ -180,21 +161,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.writeOneUsingParallelForAndMeshFunction(); }; if( withCuda ) - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridMeshFunction ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndMeshFunction ); #endif - /*benchmark.setOperation( "par.for+mesh fc. 
RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction ); -#ifdef HAVE_CUDA - if( withCuda ) - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndMeshFunction ); -#endif*/ } /**** * Write one using traverser */ - if( tests == "all" || tests == "no-bc-traverser" ) + if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-traverser" ) ) { benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); auto hostWriteOneUsingTraverser = [&] () @@ -211,13 +186,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser ); #endif -/* - benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser ); -#ifdef HAVE_CUDA - if( withCuda ) - benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingTraverser ); -#endif*/ } std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl; } @@ -262,7 +230,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; #endif - if( tests == "all" || tests == "bc-pure-c" ) + if( tests.containsValue( "all" ) || tests.containsValue( "bc-pure-c" ) ) { benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC ); @@ -294,7 +262,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; #endif - if( tests == "all" || tests == "bc-parallel-for" ) + if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) ) { benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) 
Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor ); @@ -326,7 +294,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; #endif - if( tests == "all" || tests == "bc-traverser" ) + if( tests.containsValue( "all" ) || tests.containsValue( "bc-traverser" ) ) { benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser ); @@ -346,7 +314,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, void setupConfig( Config::ConfigDescription& config ) { - config.addEntry< String >( "tests", "Tests to be performed.", "all" ); + config.addList< String >( "tests", "Tests to be performed.", "all" ); config.addEntryEnum( "all" ); config.addEntryEnum( "no-bc-pure-c" ); config.addEntryEnum( "no-bc-parallel-for" ); -- GitLab From be5a80021ac1d7cdc46b6bf06ccd4596ee519f49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 4 Jan 2019 12:21:40 +0100 Subject: [PATCH 103/130] Fixed CUDA travresers benchmark tests. 
--- src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h index 91097ecac..93ee77385 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -116,15 +116,15 @@ class GridTraversersBenchmark< 1, Device, Real, Index > { const Grid* currentGrid = &grid.template getData< Device >(); MeshFunction* _u = &u.template modifyData< Device >(); - auto f = [=] __cuda_callable__ ( Index i, Real* data ) + auto f = [=] __cuda_callable__ ( Index i ) { - Cell entity( grid.template getData< Device >() ); + Cell entity( *currentGrid ); entity.getCoordinates().x() = i; entity.refresh(); - //( *_u )( entity ) += 1.0; - WriteOneEntitiesProcessorType::processEntity( *currentGrid, userData, entity ); + ( *_u )( entity ) += 1.0; + //WriteOneEntitiesProcessorType::processEntity( *currentGrid, userData, entity ); }; - ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); + ParallelFor< Device >::exec( ( Index ) 0, size, f ); } void writeOneUsingTraverser() -- GitLab From c9182447939700c4df1a6655999b8641b798f386 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 4 Jan 2019 15:38:30 +0100 Subject: [PATCH 104/130] Fixing traversers benchmark kernels. 
--- src/Benchmarks/Traversers/cuda-kernels.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/Benchmarks/Traversers/cuda-kernels.h b/src/Benchmarks/Traversers/cuda-kernels.h index 2cd8b1b56..2802b73eb 100644 --- a/src/Benchmarks/Traversers/cuda-kernels.h +++ b/src/Benchmarks/Traversers/cuda-kernels.h @@ -27,7 +27,7 @@ __global__ void fullGridTraverseKernel1D( const Index size, const dim3 gridIdx, { const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( threadIdx_x < size ) - v_data[ threadIdx_x ] = 1.0; + v_data[ threadIdx_x ] += 1.0; } template< typename Real, @@ -37,7 +37,7 @@ __global__ void fullGridTraverseKernel2D( const Index size, const dim3 gridIdx, const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; if( threadIdx_x < size && threadIdx_y < size ) - v_data[ threadIdx_y * size + threadIdx_x ] = 1.0; + v_data[ threadIdx_y * size + threadIdx_x ] += 1.0; } template< typename Real, @@ -48,7 +48,7 @@ __global__ void fullGridTraverseKernel3D( const Index size, const dim3 gridIdx, const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size ) - v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0; + v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += 1.0; } /**** @@ -60,7 +60,7 @@ __global__ void interiorTraverseKernel1D( const Index size, const dim3 gridIdx, { const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( threadIdx_x > 0 && threadIdx_x < size - 1 
) - v_data[ threadIdx_x ] = 1.0; + v_data[ threadIdx_x ] += 1.0; } template< typename Real, @@ -71,7 +71,7 @@ __global__ void interiorTraverseKernel2D( const Index size, const dim3 gridIdx, const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; if( threadIdx_x > 0 && threadIdx_y > 0 && threadIdx_x < size - 1 && threadIdx_y < size - 1 ) - v_data[ threadIdx_y * size + threadIdx_x ] = 1.0; + v_data[ threadIdx_y * size + threadIdx_x ] += 1.0; } template< typename Real, @@ -83,7 +83,7 @@ __global__ void interiorTraverseKernel3D( const Index size, const dim3 gridIdx, const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; if( threadIdx_x > 0 && threadIdx_y > 0 && threadIdx_z > 0 && threadIdx_x < size - 1 && threadIdx_y < size - 1 && threadIdx_z < size - 1 ) - v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0; + v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += 1.0; } /**** @@ -95,7 +95,7 @@ __global__ void boundariesTraverseKernel1D( const Index size, const dim3 gridIdx { const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( threadIdx_x == 0 || threadIdx_x == size - 1 ) - v_data[ threadIdx_x ] = 2.0; + v_data[ threadIdx_x ] += 2.0; } template< typename Real, @@ -106,7 +106,7 @@ __global__ void boundariesTraverseKernel2D( const Index size, const dim3 gridIdx const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; if( threadIdx_x > 0 && threadIdx_y > 0 && threadIdx_x < size - 1 && threadIdx_y < size - 1 ) - v_data[ threadIdx_y * size + threadIdx_x ] = 2.0; + v_data[ threadIdx_y * size + threadIdx_x ] += 2.0; } template< typename Real, @@ -118,7 +118,7 @@ __global__ void boundariesTraverseKernel3D( const Index size, const dim3 gridIdx const Index threadIdx_z = ( gridIdx.z * 
Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; if( threadIdx_x == 0 || threadIdx_y == 0 || threadIdx_z == 0 || threadIdx_x == size - 1 || threadIdx_y == size - 1 || threadIdx_z == size - 1 ) - v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 2.0; + v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += 2.0; } #endif -- GitLab From b4a904e4ea5cb1ccc94efbbba2105549571d6c2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 4 Jan 2019 16:55:59 +0100 Subject: [PATCH 105/130] Fixed tnl-benchmark-traversers.h --- src/Benchmarks/Traversers/tnl-benchmark-traversers.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index fd14ba25c..9f70589c9 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -94,7 +94,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.writeOneUsingPureC(); }; if( withCuda ) - benchmark.time< Devices::Cuda >( "GPU", cudaWriteOneUsingPureC ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); #endif } @@ -297,15 +297,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( tests.containsValue( "all" ) || tests.containsValue( "bc-traverser" ) ) { benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser ); + benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser ); + benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser ); #endif benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * 
sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser ); #ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser ); #endif } } -- GitLab From be8f2ac40580637c9d23c6d126fce5a618c3f936 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 4 Jan 2019 18:32:06 +0100 Subject: [PATCH 106/130] GridTraverser_impl.h splitted into GridTraverser_1D.hpp, GridTraverser_2D.hpp and GridTraverser_3D.hpp. --- CMakeLists.txt | 4 +- .../Meshes/GridDetails/GridTraverser_1D.hpp | 290 ++++ .../Meshes/GridDetails/GridTraverser_2D.hpp | 648 ++++++++ .../Meshes/GridDetails/GridTraverser_3D.hpp | 551 +++++++ .../Meshes/GridDetails/GridTraverser_impl.h | 1436 ----------------- 5 files changed, 1491 insertions(+), 1438 deletions(-) create mode 100644 src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp create mode 100644 src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp create mode 100644 src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp delete mode 100644 src/TNL/Meshes/GridDetails/GridTraverser_impl.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 348ad4ac2..fe5519d12 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,7 +74,7 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") endif() # set Debug/Release options -set( CMAKE_CXX_FLAGS "-std=c++11 -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable" ) +set( CMAKE_CXX_FLAGS "-std=c++11 -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable -g" ) set( CMAKE_CXX_FLAGS_DEBUG "-g" ) set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG" ) #set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG -ftree-vectorizer-verbose=1 -ftree-vectorize -fopt-info-vec-missed -funroll-loops" ) @@ -229,7 +229,7 @@ if( ${WITH_CUDA} ) endif() endif() endif() 
- set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; ${CUDA_ARCH} -D_FORCE_INLINES ) + set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; ${CUDA_ARCH} -D_FORCE_INLINES --generate-line-info) # TODO: this is necessary only due to a bug in cmake set( CUDA_ADD_LIBRARY_OPTIONS -shared ) endif() diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp new file mode 100644 index 000000000..90148f8e8 --- /dev/null +++ b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp @@ -0,0 +1,290 @@ +/*************************************************************************** + GridTraverser_1D.hpp - description + ------------------- + begin : Jan 4, 2019 + copyright : (C) 2019 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber, +// Jakub Klinkovsky, +// Vit Hanousek + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace TNL { +namespace Meshes { + +/**** + * 1D traverser, host + */ +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities > +void +GridTraverser< Meshes::Grid< 1, Real, Devices::Host, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType begin, + const CoordinatesType end, + UserData& userData, + const int& stream ) +{ + GridEntity entity( *gridPointer ); + if( processOnlyBoundaryEntities ) + { + GridEntity entity( *gridPointer ); + + entity.getCoordinates() = begin; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + entity.getCoordinates() = end; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + else + { +#ifdef HAVE_OPENMP + if( Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 ) + { 
+#pragma omp parallel firstprivate( begin, end ) + { + GridEntity entity( *gridPointer ); +#pragma omp for + // TODO: g++ 5.5 crashes when coding this loop without auxiliary x as bellow + for( IndexType x = begin.x(); x <= end.x(); x++ ) + { + entity.getCoordinates().x() = x; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } + } + else + { + GridEntity entity( *gridPointer ); + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } +#else + GridEntity entity( *gridPointer ); + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +#endif + } +} + +/**** + * 1D traverser, CUDA + */ +#ifdef HAVE_CUDA +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor > +__global__ void +GridTraverser1D( + const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid, + UserData userData, + const typename GridEntity::CoordinatesType begin, + const typename GridEntity::CoordinatesType end, + const Index gridIdx ) +{ + typedef Real RealType; + typedef Index IndexType; + typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( coordinates <= end ) + { + GridEntity entity( *grid, coordinates ); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +} + +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor > +__global__ void +GridBoundaryTraverser1D( + 
const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid, + UserData userData, + const typename GridEntity::CoordinatesType begin, + const typename GridEntity::CoordinatesType end ) +{ + typedef Real RealType; + typedef Index IndexType; + typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + if( threadIdx.x == 0 ) + { + coordinates.x() = begin.x(); + GridEntity entity( *grid, coordinates ); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + if( threadIdx.x == 1 ) + { + coordinates.x() = end.x(); + GridEntity entity( *grid, coordinates ); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +} + +#endif + +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities > +void +GridTraverser< Meshes::Grid< 1, Real, Devices::Cuda, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType& begin, + const CoordinatesType& end, + UserData& userData, + const int& stream ) +{ +#ifdef HAVE_CUDA + auto& pool = CudaStreamPool::getInstance(); + const cudaStream_t& s = pool.getStream( stream ); + + Devices::Cuda::synchronizeDevice(); + if( processOnlyBoundaryEntities ) + { + dim3 cudaBlockSize( 2 ); + dim3 cudaBlocks( 1 ); + GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > + <<< cudaBlocks, cudaBlockSize, 0, s >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin, + end ); + } + else + { + dim3 cudaBlockSize( 256 ); + dim3 cudaBlocks; + cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); + const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); + + for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) + GridTraverser1D< Real, Index, GridEntity, UserData, 
EntitiesProcessor > + <<< cudaBlocks, cudaBlockSize, 0, s >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin, + end, + gridXIdx ); + } + + // only launches into the stream 0 are synchronized + /*if( stream == 0 ) + { + cudaStreamSynchronize( s ); + TNL_CHECK_CUDA_DEVICE; + }*/ +#else + throw Exceptions::CudaSupportMissing(); +#endif +} + +/**** + * 1D traverser, MIC + */ + +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities > +void +GridTraverser< Meshes::Grid< 1, Real, Devices::MIC, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType& begin, + const CoordinatesType& end, + UserData& userData, + const int& stream ) +{ + std::cout << "Not Implemented yet Grid Traverser <1, Real, Device::MIC>" << std::endl; +/* + auto& pool = CudaStreamPool::getInstance(); + const cudaStream_t& s = pool.getStream( stream ); + + Devices::Cuda::synchronizeDevice(); + if( processOnlyBoundaryEntities ) + { + dim3 cudaBlockSize( 2 ); + dim3 cudaBlocks( 1 ); + GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > + <<< cudaBlocks, cudaBlockSize, 0, s >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin, + end ); + } + else + { + dim3 cudaBlockSize( 256 ); + dim3 cudaBlocks; + cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); + const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); + + for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) + GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > + <<< cudaBlocks, cudaBlockSize, 0, s >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin, + end, + gridXIdx ); + } + + // only launches into the stream 0 are synchronized + if( stream == 0 ) + { + cudaStreamSynchronize( s ); + TNL_CHECK_CUDA_DEVICE; + } +*/ 
+} + + } // namespace Meshes +} // namespace TNL diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp new file mode 100644 index 000000000..84e496017 --- /dev/null +++ b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp @@ -0,0 +1,648 @@ +/*************************************************************************** + GridTraverser_2D.hpp - description + ------------------- + begin : Jan 4, 2019 + copyright : (C) 2019 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace TNL { +namespace Meshes { + +//#define GRID_TRAVERSER_USE_STREAMS + + +/**** + * 2D traverser, host + */ +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities, + int XOrthogonalBoundary, + int YOrthogonalBoundary, + typename... GridEntityParameters > +void +GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType begin, + const CoordinatesType end, + UserData& userData, + const int& stream, + const GridEntityParameters&... gridEntityParameters ) +{ + if( processOnlyBoundaryEntities ) + { + GridEntity entity( *gridPointer, begin, gridEntityParameters... 
); + + if( YOrthogonalBoundary ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.getCoordinates().y() = begin.y(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + entity.getCoordinates().y() = end.y(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + if( XOrthogonalBoundary ) + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + { + entity.getCoordinates().x() = begin.x(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + entity.getCoordinates().x() = end.x(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } + else + { +#ifdef HAVE_OPENMP + if( Devices::Host::isOMPEnabled() ) + { +#pragma omp parallel firstprivate( begin, end ) + { + GridEntity entity( *gridPointer ); +#pragma omp for + // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow + for( IndexType y = begin.y(); y <= end.y(); y ++ ) + for( IndexType x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.getCoordinates().y() = y; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } + } + else + { + GridEntity entity( *gridPointer ); + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } +#else + GridEntity entity( *gridPointer ); + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) 
+ for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +#endif + } +} + +/**** + * 2D traverser, CUDA + */ +#ifdef HAVE_CUDA +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor, + bool processOnlyBoundaryEntities, + typename... GridEntityParameters > +__global__ void +GridTraverser2D( + const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, + UserData userData, + const typename GridEntity::CoordinatesType begin, + const typename GridEntity::CoordinatesType end, + const dim3 gridIdx, + const GridEntityParameters... gridEntityParameters ) +{ + typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); + + if( coordinates <= end ) + { + GridEntity entity( *grid, coordinates, gridEntityParameters... ); + entity.refresh(); + if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() ) + { + EntitiesProcessor::processEntity + ( *grid, + userData, + entity ); + } + } +} + +// Boundary traverser using streams +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor, + bool processOnlyBoundaryEntities, + typename... GridEntityParameters > +__global__ void +GridTraverser2DBoundaryAlongX( + const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, + UserData userData, + const Index beginX, + const Index endX, + const Index fixedY, + const dim3 gridIdx, + const GridEntityParameters... 
gridEntityParameters ) +{ + typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.y() = fixedY; + + if( coordinates.x() <= endX ) + { + GridEntity entity( *grid, coordinates, gridEntityParameters... ); + entity.refresh(); + EntitiesProcessor::processEntity + ( *grid, + userData, + entity ); + } +} + +// Boundary traverser using streams +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor, + bool processOnlyBoundaryEntities, + typename... GridEntityParameters > +__global__ void +GridTraverser2DBoundaryAlongY( + const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, + UserData userData, + const Index beginY, + const Index endY, + const Index fixedX, + const dim3 gridIdx, + const GridEntityParameters... gridEntityParameters ) +{ + typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = fixedX; + coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + + if( coordinates.y() <= endY ) + { + GridEntity entity( *grid, coordinates, gridEntityParameters... ); + entity.refresh(); + EntitiesProcessor::processEntity + ( *grid, + userData, + entity ); + } +} + + +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor, + bool processOnlyBoundaryEntities, + typename... GridEntityParameters > +__global__ void +GridTraverser2DBoundary( + const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, + UserData userData, + const Index beginX, + const Index endX, + const Index beginY, + const Index endY, + const Index blocksPerFace, + const dim3 gridIdx, + const GridEntityParameters... 
gridEntityParameters ) +{ + using GridType = Meshes::Grid< 2, Real, Devices::Cuda, Index >; + using CoordinatesType = typename GridType::CoordinatesType; + + const Index faceIdx = blockIdx.x / blocksPerFace; + const Index faceBlockIdx = blockIdx.x % blocksPerFace; + const Index threadId = faceBlockIdx * blockDim. x + threadIdx.x; + if( faceIdx < 2 ) + { + const Index entitiesAlongX = endX - beginX + 1; + if( threadId < entitiesAlongX ) + { + GridEntity entity( *grid, + CoordinatesType( beginX + threadId, faceIdx == 0 ? beginY : endY ), + gridEntityParameters... ); + //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); + entity.refresh(); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + } + else + { + const Index entitiesAlongY = endY - beginY - 1; + if( threadId < entitiesAlongY ) + { + GridEntity entity( *grid, + CoordinatesType( faceIdx == 2 ? beginX : endX, beginY + threadId + 1 ), + gridEntityParameters... ); + //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); + entity.refresh(); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + } + + + + /*const Index aux = max( entitiesAlongX, entitiesAlongY ); + const Index& warpSize = Devices::Cuda::getWarpSize(); + const Index threadsPerAxis = warpSize * ( aux / warpSize + ( aux % warpSize != 0 ) ); + + Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + GridEntity entity( *grid, + CoordinatesType( 0, 0 ), + gridEntityParameters... 
); + CoordinatesType& coordinates = entity.getCoordinates(); + const Index axisIndex = threadId / threadsPerAxis; + //printf( "axisIndex %d, threadId %d thradsPerAxis %d \n", axisIndex, threadId, threadsPerAxis ); + threadId -= axisIndex * threadsPerAxis; + switch( axisIndex ) + { + case 1: + coordinates = CoordinatesType( beginX + threadId, beginY ); + if( threadId < entitiesAlongX ) + { + //printf( "X1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() ); + entity.refresh(); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + break; + case 2: + coordinates = CoordinatesType( beginX + threadId, endY ); + if( threadId < entitiesAlongX ) + { + //printf( "X2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() ); + entity.refresh(); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + break; + case 3: + coordinates = CoordinatesType( beginX, beginY + threadId + 1 ); + if( threadId < entitiesAlongY ) + { + //printf( "Y1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() ); + entity.refresh(); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + break; + case 4: + coordinates = CoordinatesType( endX, beginY + threadId + 1 ); + if( threadId < entitiesAlongY ) + { + //printf( "Y2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() ); + entity.refresh(); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + break; + }*/ + + /*if( threadId < entitiesAlongX ) + { + GridEntity entity( *grid, + CoordinatesType( beginX + threadId, beginY ), + gridEntityParameters... 
); + //printf( "X1: Thread %d -> %d %d x %d %d \n ", threadId, + // entity.getCoordinates().x(), entity.getCoordinates().y(), + // grid->getDimensions().x(), grid->getDimensions().y() ); + entity.refresh(); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + else if( ( threadId -= entitiesAlongX ) < entitiesAlongX && threadId >= 0 ) + { + GridEntity entity( *grid, + CoordinatesType( beginX + threadId, endY ), + gridEntityParameters... ); + entity.refresh(); + //printf( "X2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + else if( ( ( threadId -= entitiesAlongX ) < entitiesAlongY - 1 ) && threadId >= 0 ) + { + GridEntity entity( *grid, + CoordinatesType( beginX, beginY + threadId + 1 ), + gridEntityParameters... ); + entity.refresh(); + //printf( "Y1: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + else if( ( ( threadId -= entitiesAlongY - 1 ) < entitiesAlongY - 1 ) && threadId >= 0 ) + { + GridEntity entity( *grid, + CoordinatesType( endX, beginY + threadId + 1 ), + gridEntityParameters... ); + entity.refresh(); + //printf( "Y2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); + EntitiesProcessor::processEntity( *grid, userData, entity ); + }*/ +} + + +#endif // HAVE_CUDA + +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities, + int XOrthogonalBoundary, + int YOrthogonalBoundary, + typename... 
GridEntityParameters > +void +GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType& begin, + const CoordinatesType& end, + UserData& userData, + const int& stream, + const GridEntityParameters&... gridEntityParameters ) +{ +#ifdef HAVE_CUDA + if( processOnlyBoundaryEntities && + ( GridEntity::getEntityDimension() == 2 || GridEntity::getEntityDimension() == 0 ) ) + { +#ifdef GRID_TRAVERSER_USE_STREAMS + dim3 cudaBlockSize( 256 ); + dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX, + cudaBlocksCountAlongY, cudaGridsCountAlongY; + Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 ); + Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 ); + + auto& pool = CudaStreamPool::getInstance(); + Devices::Cuda::synchronizeDevice(); + + const cudaStream_t& s1 = pool.getStream( stream ); + const cudaStream_t& s2 = pool.getStream( stream + 1 ); + dim3 gridIdx, cudaGridSize; + for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongX.x; gridIdx.x++ ) + { + Devices::Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize ); + //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX ); + GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaGridSize, cudaBlockSize, 0, s1 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.x(), + end.x(), + begin.y(), + gridIdx, + gridEntityParameters... ); + GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... 
> + <<< cudaGridSize, cudaBlockSize, 0, s2 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.x(), + end.x(), + end.y(), + gridIdx, + gridEntityParameters... ); + } + const cudaStream_t& s3 = pool.getStream( stream + 2 ); + const cudaStream_t& s4 = pool.getStream( stream + 3 ); + for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongY.x; gridIdx.x++ ) + { + Devices::Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize ); + GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaGridSize, cudaBlockSize, 0, s3 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.y() + 1, + end.y() - 1, + begin.x(), + gridIdx, + gridEntityParameters... ); + GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaGridSize, cudaBlockSize, 0, s4 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.y() + 1, + end.y() - 1, + end.x(), + gridIdx, + gridEntityParameters... 
); + } + cudaStreamSynchronize( s1 ); + cudaStreamSynchronize( s2 ); + cudaStreamSynchronize( s3 ); + cudaStreamSynchronize( s4 ); +#else // not defined GRID_TRAVERSER_USE_STREAMS + dim3 cudaBlockSize( 256 ); + dim3 cudaBlocksCount, cudaGridsCount; + const IndexType entitiesAlongX = end.x() - begin.x() + 1; + const IndexType entitiesAlongY = end.y() - begin.y() - 1; /* Y faces: extent along y without the two corners, as in GridTraverser2DBoundary */ + const IndexType maxFaceSize = max( entitiesAlongX, entitiesAlongY ); + const IndexType blocksPerFace = maxFaceSize / cudaBlockSize.x + ( maxFaceSize % cudaBlockSize.x != 0 ); + IndexType cudaThreadsCount = 4 * cudaBlockSize.x * blocksPerFace; + Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount ); + //std::cerr << "blocksPerFace = " << blocksPerFace << "Threads count = " << cudaThreadsCount + // << "cudaBlockCount = " << cudaBlocksCount.x << std::endl; + dim3 gridIdx, cudaGridSize; + Devices::Cuda::synchronizeDevice(); + for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ ) + { + Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize ); + //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX ); + GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaGridSize, cudaBlockSize >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.x(), + end.x(), + begin.y(), + end.y(), + blocksPerFace, + gridIdx, + gridEntityParameters... 
); + } +#endif //GRID_TRAVERSER_USE_STREAMS + //getchar(); + TNL_CHECK_CUDA_DEVICE; + } + else + { + dim3 cudaBlockSize( 16, 16 ); + dim3 cudaBlocksCount, cudaGridsCount; + Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, + end.x() - begin.x() + 1, + end.y() - begin.y() + 1 ); + + auto& pool = CudaStreamPool::getInstance(); + const cudaStream_t& s = pool.getStream( stream ); + + Devices::Cuda::synchronizeDevice(); + dim3 gridIdx, cudaGridSize; + for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ ) + for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ ) + { + Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize ); + //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount ); + GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaGridSize, cudaBlockSize, 0, s >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin, + end, + gridIdx, + gridEntityParameters... ); + } + + // only launches into the stream 0 are synchronized + if( stream == 0 ) + { + cudaStreamSynchronize( s ); + TNL_CHECK_CUDA_DEVICE; + } + } +#else + throw Exceptions::CudaSupportMissing(); +#endif +} + + +/**** + * 2D traverser, MIC + */ +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities, + int XOrthogonalBoundary, + int YOrthogonalBoundary, + typename... GridEntityParameters > +void +GridTraverser< Meshes::Grid< 2, Real, Devices::MIC, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType& begin, + const CoordinatesType& end, + UserData& userData, + const int& stream, + const GridEntityParameters&... 
gridEntityParameters ) +{ + + +#ifdef HAVE_MIC + Devices::MIC::synchronizeDevice(); + + //TOHLE JE PRUSER -- nemim poslat vypustku -- + //GridEntity entity( gridPointer.template getData< Devices::MIC >(), begin, gridEntityParameters... ); + + + Devices::MICHider hMicGrid; + hMicGrid.pointer=& gridPointer.template getData< Devices::MIC >(); + Devices::MICHider hMicUserData; + hMicUserData.pointer=& userDataPointer.template modifyData(); + TNLMICSTRUCT(begin, const CoordinatesType); + TNLMICSTRUCT(end, const CoordinatesType); + + #pragma offload target(mic) in(sbegin,send,hMicUserData,hMicGrid) + { + + #pragma omp parallel firstprivate( sbegin, send ) + { + TNLMICSTRUCTUSE(begin, const CoordinatesType); + TNLMICSTRUCTUSE(end, const CoordinatesType); + GridEntity entity( *(hMicGrid.pointer), *(kernelbegin) ); + + if( processOnlyBoundaryEntities ) + { + if( YOrthogonalBoundary ) + #pragma omp for + for( auto k = kernelbegin->x(); + k <= kernelend->x(); + k ++ ) + { + entity.getCoordinates().x() = k; + entity.getCoordinates().y() = kernelbegin->y(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity ); + entity.getCoordinates().y() = kernelend->y(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity ); + } + if( XOrthogonalBoundary ) + #pragma omp for + for( auto k = kernelbegin->y(); + k <= kernelend->y(); + k ++ ) + { + entity.getCoordinates().y() = k; + entity.getCoordinates().x() = kernelbegin->x(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity ); + entity.getCoordinates().x() = kernelend->x(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity ); + } + } + else + { + #pragma omp for + for( IndexType y = kernelbegin->y(); y <= kernelend->y(); y ++ ) + for( IndexType x = kernelbegin->x(); x <= kernelend->x(); x ++ ) + { + // std::cerr << x << " " 
< +#include +#include +#include +#include +#include + +namespace TNL { +namespace Meshes { + + +/**** + * 3D traverser, host + */ +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities, + int XOrthogonalBoundary, + int YOrthogonalBoundary, + int ZOrthogonalBoundary, + typename... GridEntityParameters > +void +GridTraverser< Meshes::Grid< 3, Real, Devices::Host, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType begin, + const CoordinatesType end, + UserData& userData, + const int& stream, + const GridEntityParameters&... gridEntityParameters ) +{ + if( processOnlyBoundaryEntities ) + { + GridEntity entity( *gridPointer, begin, gridEntityParameters... ); + + if( ZOrthogonalBoundary ) + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.getCoordinates().z() = begin.z(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + entity.getCoordinates().z() = end.z(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + if( YOrthogonalBoundary ) + for( entity.getCoordinates().z() = begin.z(); + entity.getCoordinates().z() <= end.z(); + entity.getCoordinates().z() ++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.getCoordinates().y() = begin.y(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + entity.getCoordinates().y() = end.y(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + if( XOrthogonalBoundary ) + for( entity.getCoordinates().z() = 
begin.z(); + entity.getCoordinates().z() <= end.z(); + entity.getCoordinates().z() ++ ) + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + { + entity.getCoordinates().x() = begin.x(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + entity.getCoordinates().x() = end.x(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } + else + { +#ifdef HAVE_OPENMP + if( Devices::Host::isOMPEnabled() ) + { +#pragma omp parallel firstprivate( begin, end ) + { + GridEntity entity( *gridPointer ); +#pragma omp for + // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow + for( IndexType z = begin.z(); z <= end.z(); z ++ ) + for( IndexType y = begin.y(); y <= end.y(); y ++ ) + for( IndexType x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.getCoordinates().y() = y; + entity.getCoordinates().z() = z; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } + } + else + { + GridEntity entity( *gridPointer ); + for( entity.getCoordinates().z() = begin.z(); + entity.getCoordinates().z() <= end.z(); + entity.getCoordinates().z() ++ ) + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } +#else + GridEntity entity( *gridPointer ); + for( entity.getCoordinates().z() = begin.z(); + entity.getCoordinates().z() <= end.z(); + entity.getCoordinates().z() ++ ) + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + for( entity.getCoordinates().x() = 
begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +#endif + } +} + +/**** + * 3D traverser, CUDA + */ +#ifdef HAVE_CUDA +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor, + bool processOnlyBoundaryEntities, + typename... GridEntityParameters > +__global__ void +GridTraverser3D( + const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, + UserData userData, + const typename GridEntity::CoordinatesType begin, + const typename GridEntity::CoordinatesType end, + const dim3 gridIdx, + const GridEntityParameters... gridEntityParameters ) +{ + typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); + coordinates.z() = begin.z() + Devices::Cuda::getGlobalThreadIdx_z( gridIdx ); + + if( coordinates <= end ) + { + GridEntity entity( *grid, coordinates, gridEntityParameters... ); + entity.refresh(); + if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() ) + { + EntitiesProcessor::processEntity + ( *grid, + userData, + entity ); + } + } +} + +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor, + bool processOnlyBoundaryEntities, + typename... GridEntityParameters > +__global__ void +GridTraverser3DBoundaryAlongXY( + const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, + UserData userData, + const Index beginX, + const Index endX, + const Index beginY, + const Index endY, + const Index fixedZ, + const dim3 gridIdx, + const GridEntityParameters... 
gridEntityParameters ) +{ + typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); + coordinates.z() = fixedZ; + + if( coordinates.x() <= endX && coordinates.y() <= endY ) + { + GridEntity entity( *grid, coordinates, gridEntityParameters... ); + entity.refresh(); + EntitiesProcessor::processEntity + ( *grid, + userData, + entity ); + } +} + +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor, + bool processOnlyBoundaryEntities, + typename... GridEntityParameters > +__global__ void +GridTraverser3DBoundaryAlongXZ( + const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, + UserData userData, + const Index beginX, + const Index endX, + const Index beginZ, + const Index endZ, + const Index fixedY, + const dim3 gridIdx, + const GridEntityParameters... gridEntityParameters ) +{ + typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.y() = fixedY; + coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); + + if( coordinates.x() <= endX && coordinates.z() <= endZ ) + { + GridEntity entity( *grid, coordinates, gridEntityParameters... ); + entity.refresh(); + EntitiesProcessor::processEntity + ( *grid, + userData, + entity ); + } +} + +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor, + bool processOnlyBoundaryEntities, + typename... 
GridEntityParameters > +__global__ void +GridTraverser3DBoundaryAlongYZ( + const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, + UserData userData, + const Index beginY, + const Index endY, + const Index beginZ, + const Index endZ, + const Index fixedX, + const dim3 gridIdx, + const GridEntityParameters... gridEntityParameters ) +{ + typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = fixedX; + coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); + + if( coordinates.y() <= endY && coordinates.z() <= endZ ) + { + GridEntity entity( *grid, coordinates, gridEntityParameters... ); + entity.refresh(); + EntitiesProcessor::processEntity + ( *grid, + userData, + entity ); + } +} +#endif + +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities, + int XOrthogonalBoundary, + int YOrthogonalBoundary, + int ZOrthogonalBoundary, + typename... GridEntityParameters > +void +GridTraverser< Meshes::Grid< 3, Real, Devices::Cuda, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType& begin, + const CoordinatesType& end, + UserData& userData, + const int& stream, + const GridEntityParameters&... 
gridEntityParameters ) +{ +#ifdef HAVE_CUDA + if( processOnlyBoundaryEntities && + ( GridEntity::getEntityDimension() == 3 || GridEntity::getEntityDimension() == 0 ) ) + { + dim3 cudaBlockSize( 16, 16 ); + const IndexType entitiesAlongX = end.x() - begin.x() + 1; + const IndexType entitiesAlongY = end.y() - begin.y() + 1; + const IndexType entitiesAlongZ = end.z() - begin.z() + 1; + + dim3 cudaBlocksCountAlongXY, cudaBlocksCountAlongXZ, cudaBlocksCountAlongYZ, + cudaGridsCountAlongXY, cudaGridsCountAlongXZ, cudaGridsCountAlongYZ; + + Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXY, cudaGridsCountAlongXY, entitiesAlongX, entitiesAlongY ); + Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, entitiesAlongX, entitiesAlongZ - 2 ); + Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, entitiesAlongY - 2, entitiesAlongZ - 2 ); + + auto& pool = CudaStreamPool::getInstance(); + Devices::Cuda::synchronizeDevice(); + + const cudaStream_t& s1 = pool.getStream( stream ); + const cudaStream_t& s2 = pool.getStream( stream + 1 ); + const cudaStream_t& s3 = pool.getStream( stream + 2 ); + const cudaStream_t& s4 = pool.getStream( stream + 3 ); + const cudaStream_t& s5 = pool.getStream( stream + 4 ); + const cudaStream_t& s6 = pool.getStream( stream + 5 ); + + dim3 gridIdx, gridSize; + for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXY.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXY.x; gridIdx.x++ ) + { + Devices::Cuda::setupGrid( cudaBlocksCountAlongXY, cudaGridsCountAlongXY, gridIdx, gridSize ); + GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaBlocksCountAlongXY, cudaBlockSize, 0 , s1 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.x(), + end.x(), + begin.y(), + end.y(), + begin.z(), + gridIdx, + gridEntityParameters... 
); + GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaBlocksCountAlongXY, cudaBlockSize, 0, s2 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.x(), + end.x(), + begin.y(), + end.y(), + end.z(), + gridIdx, + gridEntityParameters... ); + } + for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXZ.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXZ.x; gridIdx.x++ ) + { + Devices::Cuda::setupGrid( cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, gridIdx, gridSize ); + GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s3 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.x(), + end.x(), + begin.z() + 1, + end.z() - 1, + begin.y(), + gridIdx, + gridEntityParameters... ); + GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s4 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.x(), + end.x(), + begin.z() + 1, + end.z() - 1, + end.y(), + gridIdx, + gridEntityParameters... ); + } + for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongYZ.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongYZ.x; gridIdx.x++ ) + { + Devices::Cuda::setupGrid( cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, gridIdx, gridSize ); + GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s5 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.y() + 1, + end.y() - 1, + begin.z() + 1, + end.z() - 1, + begin.x(), + gridIdx, + gridEntityParameters... 
); + GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s6 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.y() + 1, + end.y() - 1, + begin.z() + 1, + end.z() - 1, + end.x(), + gridIdx, + gridEntityParameters... ); + } + cudaStreamSynchronize( s1 ); + cudaStreamSynchronize( s2 ); + cudaStreamSynchronize( s3 ); + cudaStreamSynchronize( s4 ); + cudaStreamSynchronize( s5 ); + cudaStreamSynchronize( s6 ); + TNL_CHECK_CUDA_DEVICE; + } + else + { + dim3 cudaBlockSize( 8, 8, 8 ); + dim3 cudaBlocksCount, cudaGridsCount; + + Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, + end.x() - begin.x() + 1, + end.y() - begin.y() + 1, + end.z() - begin.z() + 1 ); + + auto& pool = CudaStreamPool::getInstance(); + const cudaStream_t& s = pool.getStream( stream ); + + Devices::Cuda::synchronizeDevice(); + dim3 gridIdx, gridSize; + for( gridIdx.z = 0; gridIdx.z < cudaGridsCount.z; gridIdx.z ++ ) + for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ ) + for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ ) + { + Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, gridSize ); + GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< gridSize, cudaBlockSize, 0, s >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin, + end, + gridIdx, + gridEntityParameters... 
); + } + + // only launches into the stream 0 are synchronized + if( stream == 0 ) + { + cudaStreamSynchronize( s ); + TNL_CHECK_CUDA_DEVICE; + } + } +#else + throw Exceptions::CudaSupportMissing(); +#endif +} + +/**** + * 3D traverser, MIC + */ +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities, + int XOrthogonalBoundary, + int YOrthogonalBoundary, + int ZOrthogonalBoundary, + typename... GridEntityParameters > +void +GridTraverser< Meshes::Grid< 3, Real, Devices::MIC, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType& begin, + const CoordinatesType& end, + UserData& userData, + const int& stream, + const GridEntityParameters&... gridEntityParameters ) +{ + std::cout << "Not Implemented yet Grid Traverser <3, Real, Device::MIC>" << std::endl; + +/* HAVE_CUDA + dim3 cudaBlockSize( 8, 8, 8 ); + dim3 cudaBlocks; + cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); + cudaBlocks.y = Devices::Cuda::getNumberOfBlocks( end.y() - begin.y() + 1, cudaBlockSize.y ); + cudaBlocks.z = Devices::Cuda::getNumberOfBlocks( end.z() - begin.z() + 1, cudaBlockSize.z ); + const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); + const IndexType cudaYGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.y ); + const IndexType cudaZGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.z ); + + auto& pool = CudaStreamPool::getInstance(); + const cudaStream_t& s = pool.getStream( stream ); + + Devices::Cuda::synchronizeDevice(); + for( IndexType gridZIdx = 0; gridZIdx < cudaZGrids; gridZIdx ++ ) + for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ ) + for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) + GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... 
> + <<< cudaBlocks, cudaBlockSize, 0, s >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin, + end, + gridXIdx, + gridYIdx, + gridZIdx, + gridEntityParameters... ); + + // only launches into the stream 0 are synchronized + if( stream == 0 ) + { + cudaStreamSynchronize( s ); + TNL_CHECK_CUDA_DEVICE; + } + */ +} + } // namespace Meshes +} // namespace TNL diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h deleted file mode 100644 index 33b5e22eb..000000000 --- a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h +++ /dev/null @@ -1,1436 +0,0 @@ -/*************************************************************************** - GridTraverser_impl.h - description - ------------------- - begin : Jan 2, 2016 - copyright : (C) 2016 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#include - -#pragma once - -//#define GRID_TRAVERSER_USE_STREAMS - -#include "GridTraverser.h" - -#include - -namespace TNL { -namespace Meshes { - -/**** - * 1D traverser, host - */ -template< typename Real, - typename Index > - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities > -void -GridTraverser< Meshes::Grid< 1, Real, Devices::Host, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType begin, - const CoordinatesType end, - UserData& userData, - const int& stream ) -{ - GridEntity entity( *gridPointer ); - if( processOnlyBoundaryEntities ) - { - GridEntity entity( *gridPointer ); - - entity.getCoordinates() = begin; - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - entity.getCoordinates() = end; - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - else - { -#ifdef HAVE_OPENMP - if( 
Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 ) - { -#pragma omp parallel firstprivate( begin, end ) - { - GridEntity entity( *gridPointer ); -#pragma omp for - // TODO: g++ 5.5 crashes when coding this loop without auxiliary x as bellow - for( IndexType x = begin.x(); x <= end.x(); x++ ) - { - entity.getCoordinates().x() = x; - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - } - } - else - { - GridEntity entity( *gridPointer ); - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - } -#else - GridEntity entity( *gridPointer ); - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } -#endif - } -} - -/**** - * 1D traverser, CUDA - */ -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor > -__global__ void -GridTraverser1D( - const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid, - UserData userData, - const typename GridEntity::CoordinatesType begin, - const typename GridEntity::CoordinatesType end, - const Index gridIdx ) -{ - typedef Real RealType; - typedef Index IndexType; - typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - if( coordinates <= end ) - { - GridEntity entity( *grid, coordinates ); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } -} - -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename 
EntitiesProcessor > -__global__ void -GridBoundaryTraverser1D( - const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid, - UserData userData, - const typename GridEntity::CoordinatesType begin, - const typename GridEntity::CoordinatesType end ) -{ - typedef Real RealType; - typedef Index IndexType; - typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - if( threadIdx.x == 0 ) - { - coordinates.x() = begin.x(); - GridEntity entity( *grid, coordinates ); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - if( threadIdx.x == 1 ) - { - coordinates.x() = end.x(); - GridEntity entity( *grid, coordinates ); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } -} - -#endif - -template< typename Real, - typename Index > - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities > -void -GridTraverser< Meshes::Grid< 1, Real, Devices::Cuda, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType& begin, - const CoordinatesType& end, - UserData& userData, - const int& stream ) -{ -#ifdef HAVE_CUDA - auto& pool = CudaStreamPool::getInstance(); - const cudaStream_t& s = pool.getStream( stream ); - - Devices::Cuda::synchronizeDevice(); - if( processOnlyBoundaryEntities ) - { - dim3 cudaBlockSize( 2 ); - dim3 cudaBlocks( 1 ); - GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > - <<< cudaBlocks, cudaBlockSize, 0, s >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin, - end ); - } - else - { - dim3 cudaBlockSize( 256 ); - dim3 cudaBlocks; - cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); - const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); - - for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) 
- GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > - <<< cudaBlocks, cudaBlockSize, 0, s >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin, - end, - gridXIdx ); - } - - // only launches into the stream 0 are synchronized - if( stream == 0 ) - { - cudaStreamSynchronize( s ); - TNL_CHECK_CUDA_DEVICE; - } -#else - throw Exceptions::CudaSupportMissing(); -#endif -} - -/**** - * 1D traverser, MIC - */ - -template< typename Real, - typename Index > - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities > -void -GridTraverser< Meshes::Grid< 1, Real, Devices::MIC, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType& begin, - const CoordinatesType& end, - UserData& userData, - const int& stream ) -{ - std::cout << "Not Implemented yet Grid Traverser <1, Real, Device::MIC>" << std::endl; -/* - auto& pool = CudaStreamPool::getInstance(); - const cudaStream_t& s = pool.getStream( stream ); - - Devices::Cuda::synchronizeDevice(); - if( processOnlyBoundaryEntities ) - { - dim3 cudaBlockSize( 2 ); - dim3 cudaBlocks( 1 ); - GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > - <<< cudaBlocks, cudaBlockSize, 0, s >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin, - end ); - } - else - { - dim3 cudaBlockSize( 256 ); - dim3 cudaBlocks; - cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); - const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); - - for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) - GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > - <<< cudaBlocks, cudaBlockSize, 0, s >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin, - end, - gridXIdx ); - } - - // only launches into the stream 0 are synchronized - if( stream == 0 ) - { - 
cudaStreamSynchronize( s ); - TNL_CHECK_CUDA_DEVICE; - } -*/ -} - -/**** - * 2D traverser, host - */ -template< typename Real, - typename Index > - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities, - int XOrthogonalBoundary, - int YOrthogonalBoundary, - typename... GridEntityParameters > -void -GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType begin, - const CoordinatesType end, - UserData& userData, - const int& stream, - const GridEntityParameters&... gridEntityParameters ) -{ - if( processOnlyBoundaryEntities ) - { - GridEntity entity( *gridPointer, begin, gridEntityParameters... ); - - if( YOrthogonalBoundary ) - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.getCoordinates().y() = begin.y(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - entity.getCoordinates().y() = end.y(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - if( XOrthogonalBoundary ) - for( entity.getCoordinates().y() = begin.y(); - entity.getCoordinates().y() <= end.y(); - entity.getCoordinates().y() ++ ) - { - entity.getCoordinates().x() = begin.x(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - entity.getCoordinates().x() = end.x(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - } - else - { -#ifdef HAVE_OPENMP - if( Devices::Host::isOMPEnabled() ) - { -#pragma omp parallel firstprivate( begin, end ) - { - GridEntity entity( *gridPointer ); -#pragma omp for - // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow - for( IndexType y = begin.y(); y <= end.y(); y ++ ) - for( IndexType x = begin.x(); x <= end.x(); x ++ ) - 
{ - entity.getCoordinates().x() = x; - entity.getCoordinates().y() = y; - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - } - } - else - { - GridEntity entity( *gridPointer ); - for( entity.getCoordinates().y() = begin.y(); - entity.getCoordinates().y() <= end.y(); - entity.getCoordinates().y() ++ ) - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - } -#else - GridEntity entity( *gridPointer ); - for( entity.getCoordinates().y() = begin.y(); - entity.getCoordinates().y() <= end.y(); - entity.getCoordinates().y() ++ ) - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } -#endif - } -} - -/**** - * 2D traverser, CUDA - */ -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor, - bool processOnlyBoundaryEntities, - typename... GridEntityParameters > -__global__ void -GridTraverser2D( - const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, - UserData userData, - const typename GridEntity::CoordinatesType begin, - const typename GridEntity::CoordinatesType end, - const dim3 gridIdx, - const GridEntityParameters... gridEntityParameters ) -{ - typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); - - if( coordinates <= end ) - { - GridEntity entity( *grid, coordinates, gridEntityParameters... ); - entity.refresh(); - if( ! 
processOnlyBoundaryEntities || entity.isBoundaryEntity() ) - { - EntitiesProcessor::processEntity - ( *grid, - userData, - entity ); - } - } -} - -// Boundary traverser using streams -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor, - bool processOnlyBoundaryEntities, - typename... GridEntityParameters > -__global__ void -GridTraverser2DBoundaryAlongX( - const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, - UserData userData, - const Index beginX, - const Index endX, - const Index fixedY, - const dim3 gridIdx, - const GridEntityParameters... gridEntityParameters ) -{ - typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.y() = fixedY; - - if( coordinates.x() <= endX ) - { - GridEntity entity( *grid, coordinates, gridEntityParameters... ); - entity.refresh(); - EntitiesProcessor::processEntity - ( *grid, - userData, - entity ); - } -} - -// Boundary traverser using streams -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor, - bool processOnlyBoundaryEntities, - typename... GridEntityParameters > -__global__ void -GridTraverser2DBoundaryAlongY( - const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, - UserData userData, - const Index beginY, - const Index endY, - const Index fixedX, - const dim3 gridIdx, - const GridEntityParameters... gridEntityParameters ) -{ - typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - coordinates.x() = fixedX; - coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - - if( coordinates.y() <= endY ) - { - GridEntity entity( *grid, coordinates, gridEntityParameters... 
); - entity.refresh(); - EntitiesProcessor::processEntity - ( *grid, - userData, - entity ); - } -} - - -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor, - bool processOnlyBoundaryEntities, - typename... GridEntityParameters > -__global__ void -GridTraverser2DBoundary( - const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, - UserData userData, - const Index beginX, - const Index endX, - const Index beginY, - const Index endY, - const Index blocksPerFace, - const dim3 gridIdx, - const GridEntityParameters... gridEntityParameters ) -{ - using GridType = Meshes::Grid< 2, Real, Devices::Cuda, Index >; - using CoordinatesType = typename GridType::CoordinatesType; - - const Index faceIdx = blockIdx.x / blocksPerFace; - const Index faceBlockIdx = blockIdx.x % blocksPerFace; - const Index threadId = faceBlockIdx * blockDim. x + threadIdx.x; - if( faceIdx < 2 ) - { - const Index entitiesAlongX = endX - beginX + 1; - if( threadId < entitiesAlongX ) - { - GridEntity entity( *grid, - CoordinatesType( beginX + threadId, faceIdx == 0 ? beginY : endY ), - gridEntityParameters... ); - //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); - entity.refresh(); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - } - else - { - const Index entitiesAlongY = endY - beginY - 1; - if( threadId < entitiesAlongY ) - { - GridEntity entity( *grid, - CoordinatesType( faceIdx == 2 ? beginX : endX, beginY + threadId + 1 ), - gridEntityParameters... 
); - //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); - entity.refresh(); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - } - - - - /*const Index aux = max( entitiesAlongX, entitiesAlongY ); - const Index& warpSize = Devices::Cuda::getWarpSize(); - const Index threadsPerAxis = warpSize * ( aux / warpSize + ( aux % warpSize != 0 ) ); - - Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - GridEntity entity( *grid, - CoordinatesType( 0, 0 ), - gridEntityParameters... ); - CoordinatesType& coordinates = entity.getCoordinates(); - const Index axisIndex = threadId / threadsPerAxis; - //printf( "axisIndex %d, threadId %d thradsPerAxis %d \n", axisIndex, threadId, threadsPerAxis ); - threadId -= axisIndex * threadsPerAxis; - switch( axisIndex ) - { - case 1: - coordinates = CoordinatesType( beginX + threadId, beginY ); - if( threadId < entitiesAlongX ) - { - //printf( "X1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() ); - entity.refresh(); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - break; - case 2: - coordinates = CoordinatesType( beginX + threadId, endY ); - if( threadId < entitiesAlongX ) - { - //printf( "X2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() ); - entity.refresh(); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - break; - case 3: - coordinates = CoordinatesType( beginX, beginY + threadId + 1 ); - if( threadId < entitiesAlongY ) - { - //printf( "Y1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() ); - entity.refresh(); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - break; - case 4: - coordinates = CoordinatesType( endX, beginY + threadId + 1 ); - if( threadId < entitiesAlongY ) - { - //printf( "Y2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() ); - entity.refresh(); - 
EntitiesProcessor::processEntity( *grid, userData, entity ); - } - break; - }*/ - - /*if( threadId < entitiesAlongX ) - { - GridEntity entity( *grid, - CoordinatesType( beginX + threadId, beginY ), - gridEntityParameters... ); - //printf( "X1: Thread %d -> %d %d x %d %d \n ", threadId, - // entity.getCoordinates().x(), entity.getCoordinates().y(), - // grid->getDimensions().x(), grid->getDimensions().y() ); - entity.refresh(); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - else if( ( threadId -= entitiesAlongX ) < entitiesAlongX && threadId >= 0 ) - { - GridEntity entity( *grid, - CoordinatesType( beginX + threadId, endY ), - gridEntityParameters... ); - entity.refresh(); - //printf( "X2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - else if( ( ( threadId -= entitiesAlongX ) < entitiesAlongY - 1 ) && threadId >= 0 ) - { - GridEntity entity( *grid, - CoordinatesType( beginX, beginY + threadId + 1 ), - gridEntityParameters... ); - entity.refresh(); - //printf( "Y1: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - else if( ( ( threadId -= entitiesAlongY - 1 ) < entitiesAlongY - 1 ) && threadId >= 0 ) - { - GridEntity entity( *grid, - CoordinatesType( endX, beginY + threadId + 1 ), - gridEntityParameters... ); - entity.refresh(); - //printf( "Y2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); - EntitiesProcessor::processEntity( *grid, userData, entity ); - }*/ -} - - -#endif // HAVE_CUDA - -template< typename Real, - typename Index > - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities, - int XOrthogonalBoundary, - int YOrthogonalBoundary, - typename... 
GridEntityParameters > -void -GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType& begin, - const CoordinatesType& end, - UserData& userData, - const int& stream, - const GridEntityParameters&... gridEntityParameters ) -{ -#ifdef HAVE_CUDA - if( processOnlyBoundaryEntities && - ( GridEntity::getEntityDimension() == 2 || GridEntity::getEntityDimension() == 0 ) ) - { -#ifdef GRID_TRAVERSER_USE_STREAMS - dim3 cudaBlockSize( 256 ); - dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX, - cudaBlocksCountAlongY, cudaGridsCountAlongY; - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 ); - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 ); - - auto& pool = CudaStreamPool::getInstance(); - Devices::Cuda::synchronizeDevice(); - - const cudaStream_t& s1 = pool.getStream( stream ); - const cudaStream_t& s2 = pool.getStream( stream + 1 ); - dim3 gridIdx, cudaGridSize; - for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongX.x; gridIdx.x++ ) - { - Devices::Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize ); - //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX ); - GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaGridSize, cudaBlockSize, 0, s1 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.x(), - end.x(), - begin.y(), - gridIdx, - gridEntityParameters... ); - GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... 
> - <<< cudaGridSize, cudaBlockSize, 0, s2 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.x(), - end.x(), - end.y(), - gridIdx, - gridEntityParameters... ); - } - const cudaStream_t& s3 = pool.getStream( stream + 2 ); - const cudaStream_t& s4 = pool.getStream( stream + 3 ); - for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongY.x; gridIdx.x++ ) - { - Devices::Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize ); - GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaGridSize, cudaBlockSize, 0, s3 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.y() + 1, - end.y() - 1, - begin.x(), - gridIdx, - gridEntityParameters... ); - GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaGridSize, cudaBlockSize, 0, s4 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.y() + 1, - end.y() - 1, - end.x(), - gridIdx, - gridEntityParameters... 
); - } - cudaStreamSynchronize( s1 ); - cudaStreamSynchronize( s2 ); - cudaStreamSynchronize( s3 ); - cudaStreamSynchronize( s4 ); -#else // not defined GRID_TRAVERSER_USE_STREAMS - dim3 cudaBlockSize( 256 ); - dim3 cudaBlocksCount, cudaGridsCount; - const IndexType entitiesAlongX = end.x() - begin.x() + 1; - const IndexType entitiesAlongY = end.x() - begin.x() - 1; - const IndexType maxFaceSize = max( entitiesAlongX, entitiesAlongY ); - const IndexType blocksPerFace = maxFaceSize / cudaBlockSize.x + ( maxFaceSize % cudaBlockSize.x != 0 ); - IndexType cudaThreadsCount = 4 * cudaBlockSize.x * blocksPerFace; - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount ); - //std::cerr << "blocksPerFace = " << blocksPerFace << "Threads count = " << cudaThreadsCount - // << "cudaBlockCount = " << cudaBlocksCount.x << std::endl; - dim3 gridIdx, cudaGridSize; - Devices::Cuda::synchronizeDevice(); - for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ ) - { - Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize ); - //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX ); - GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaGridSize, cudaBlockSize >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.x(), - end.x(), - begin.y(), - end.y(), - blocksPerFace, - gridIdx, - gridEntityParameters... 
); - } -#endif //GRID_TRAVERSER_USE_STREAMS - //getchar(); - TNL_CHECK_CUDA_DEVICE; - } - else - { - dim3 cudaBlockSize( 16, 16 ); - dim3 cudaBlocksCount, cudaGridsCount; - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, - end.x() - begin.x() + 1, - end.y() - begin.y() + 1 ); - - auto& pool = CudaStreamPool::getInstance(); - const cudaStream_t& s = pool.getStream( stream ); - - Devices::Cuda::synchronizeDevice(); - dim3 gridIdx, cudaGridSize; - for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ ) - for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ ) - { - Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize ); - //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount ); - GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaGridSize, cudaBlockSize, 0, s >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin, - end, - gridIdx, - gridEntityParameters... ); - } - - // only launches into the stream 0 are synchronized - if( stream == 0 ) - { - cudaStreamSynchronize( s ); - TNL_CHECK_CUDA_DEVICE; - } - } -#else - throw Exceptions::CudaSupportMissing(); -#endif -} - - -/**** - * 2D traverser, MIC - */ -template< typename Real, - typename Index > - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities, - int XOrthogonalBoundary, - int YOrthogonalBoundary, - typename... GridEntityParameters > -void -GridTraverser< Meshes::Grid< 2, Real, Devices::MIC, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType& begin, - const CoordinatesType& end, - UserData& userData, - const int& stream, - const GridEntityParameters&... 
gridEntityParameters ) -{ - - -#ifdef HAVE_MIC - Devices::MIC::synchronizeDevice(); - - //TOHLE JE PRUSER -- nemim poslat vypustku -- - //GridEntity entity( gridPointer.template getData< Devices::MIC >(), begin, gridEntityParameters... ); - - - Devices::MICHider hMicGrid; - hMicGrid.pointer=& gridPointer.template getData< Devices::MIC >(); - Devices::MICHider hMicUserData; - hMicUserData.pointer=& userDataPointer.template modifyData(); - TNLMICSTRUCT(begin, const CoordinatesType); - TNLMICSTRUCT(end, const CoordinatesType); - - #pragma offload target(mic) in(sbegin,send,hMicUserData,hMicGrid) - { - - #pragma omp parallel firstprivate( sbegin, send ) - { - TNLMICSTRUCTUSE(begin, const CoordinatesType); - TNLMICSTRUCTUSE(end, const CoordinatesType); - GridEntity entity( *(hMicGrid.pointer), *(kernelbegin) ); - - if( processOnlyBoundaryEntities ) - { - if( YOrthogonalBoundary ) - #pragma omp for - for( auto k = kernelbegin->x(); - k <= kernelend->x(); - k ++ ) - { - entity.getCoordinates().x() = k; - entity.getCoordinates().y() = kernelbegin->y(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity ); - entity.getCoordinates().y() = kernelend->y(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity ); - } - if( XOrthogonalBoundary ) - #pragma omp for - for( auto k = kernelbegin->y(); - k <= kernelend->y(); - k ++ ) - { - entity.getCoordinates().y() = k; - entity.getCoordinates().x() = kernelbegin->x(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity ); - entity.getCoordinates().x() = kernelend->x(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity ); - } - } - else - { - #pragma omp for - for( IndexType y = kernelbegin->y(); y <= kernelend->y(); y ++ ) - for( IndexType x = kernelbegin->x(); x <= kernelend->x(); x ++ ) - { - // std::cerr << x << " " 
< - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities, - int XOrthogonalBoundary, - int YOrthogonalBoundary, - int ZOrthogonalBoundary, - typename... GridEntityParameters > -void -GridTraverser< Meshes::Grid< 3, Real, Devices::Host, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType begin, - const CoordinatesType end, - UserData& userData, - const int& stream, - const GridEntityParameters&... gridEntityParameters ) -{ - if( processOnlyBoundaryEntities ) - { - GridEntity entity( *gridPointer, begin, gridEntityParameters... ); - - if( ZOrthogonalBoundary ) - for( entity.getCoordinates().y() = begin.y(); - entity.getCoordinates().y() <= end.y(); - entity.getCoordinates().y() ++ ) - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.getCoordinates().z() = begin.z(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - entity.getCoordinates().z() = end.z(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - if( YOrthogonalBoundary ) - for( entity.getCoordinates().z() = begin.z(); - entity.getCoordinates().z() <= end.z(); - entity.getCoordinates().z() ++ ) - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.getCoordinates().y() = begin.y(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - entity.getCoordinates().y() = end.y(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - if( XOrthogonalBoundary ) - for( entity.getCoordinates().z() = begin.z(); - entity.getCoordinates().z() <= end.z(); - entity.getCoordinates().z() ++ ) - for( entity.getCoordinates().y() = begin.y(); - entity.getCoordinates().y() <= end.y(); - 
entity.getCoordinates().y() ++ ) - { - entity.getCoordinates().x() = begin.x(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - entity.getCoordinates().x() = end.x(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - } - else - { -#ifdef HAVE_OPENMP - if( Devices::Host::isOMPEnabled() ) - { -#pragma omp parallel firstprivate( begin, end ) - { - GridEntity entity( *gridPointer ); -#pragma omp for - // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow - for( IndexType z = begin.z(); z <= end.z(); z ++ ) - for( IndexType y = begin.y(); y <= end.y(); y ++ ) - for( IndexType x = begin.x(); x <= end.x(); x ++ ) - { - entity.getCoordinates().x() = x; - entity.getCoordinates().y() = y; - entity.getCoordinates().z() = z; - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - } - } - else - { - GridEntity entity( *gridPointer ); - for( entity.getCoordinates().z() = begin.z(); - entity.getCoordinates().z() <= end.z(); - entity.getCoordinates().z() ++ ) - for( entity.getCoordinates().y() = begin.y(); - entity.getCoordinates().y() <= end.y(); - entity.getCoordinates().y() ++ ) - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - } -#else - GridEntity entity( *gridPointer ); - for( entity.getCoordinates().z() = begin.z(); - entity.getCoordinates().z() <= end.z(); - entity.getCoordinates().z() ++ ) - for( entity.getCoordinates().y() = begin.y(); - entity.getCoordinates().y() <= end.y(); - entity.getCoordinates().y() ++ ) - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity 
); - } -#endif - } -} - -/**** - * 3D traverser, CUDA - */ -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor, - bool processOnlyBoundaryEntities, - typename... GridEntityParameters > -__global__ void -GridTraverser3D( - const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, - UserData userData, - const typename GridEntity::CoordinatesType begin, - const typename GridEntity::CoordinatesType end, - const dim3 gridIdx, - const GridEntityParameters... gridEntityParameters ) -{ - typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); - coordinates.z() = begin.z() + Devices::Cuda::getGlobalThreadIdx_z( gridIdx ); - - if( coordinates <= end ) - { - GridEntity entity( *grid, coordinates, gridEntityParameters... ); - entity.refresh(); - if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() ) - { - EntitiesProcessor::processEntity - ( *grid, - userData, - entity ); - } - } -} - -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor, - bool processOnlyBoundaryEntities, - typename... GridEntityParameters > -__global__ void -GridTraverser3DBoundaryAlongXY( - const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, - UserData userData, - const Index beginX, - const Index endX, - const Index beginY, - const Index endY, - const Index fixedZ, - const dim3 gridIdx, - const GridEntityParameters... 
gridEntityParameters ) -{ - typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); - coordinates.z() = fixedZ; - - if( coordinates.x() <= endX && coordinates.y() <= endY ) - { - GridEntity entity( *grid, coordinates, gridEntityParameters... ); - entity.refresh(); - EntitiesProcessor::processEntity - ( *grid, - userData, - entity ); - } -} - -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor, - bool processOnlyBoundaryEntities, - typename... GridEntityParameters > -__global__ void -GridTraverser3DBoundaryAlongXZ( - const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, - UserData userData, - const Index beginX, - const Index endX, - const Index beginZ, - const Index endZ, - const Index fixedY, - const dim3 gridIdx, - const GridEntityParameters... gridEntityParameters ) -{ - typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.y() = fixedY; - coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); - - if( coordinates.x() <= endX && coordinates.z() <= endZ ) - { - GridEntity entity( *grid, coordinates, gridEntityParameters... ); - entity.refresh(); - EntitiesProcessor::processEntity - ( *grid, - userData, - entity ); - } -} - -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor, - bool processOnlyBoundaryEntities, - typename... 
GridEntityParameters > -__global__ void -GridTraverser3DBoundaryAlongYZ( - const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, - UserData userData, - const Index beginY, - const Index endY, - const Index beginZ, - const Index endZ, - const Index fixedX, - const dim3 gridIdx, - const GridEntityParameters... gridEntityParameters ) -{ - typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - coordinates.x() = fixedX; - coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); - - if( coordinates.y() <= endY && coordinates.z() <= endZ ) - { - GridEntity entity( *grid, coordinates, gridEntityParameters... ); - entity.refresh(); - EntitiesProcessor::processEntity - ( *grid, - userData, - entity ); - } -} -#endif - -template< typename Real, - typename Index > - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities, - int XOrthogonalBoundary, - int YOrthogonalBoundary, - int ZOrthogonalBoundary, - typename... GridEntityParameters > -void -GridTraverser< Meshes::Grid< 3, Real, Devices::Cuda, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType& begin, - const CoordinatesType& end, - UserData& userData, - const int& stream, - const GridEntityParameters&... 
gridEntityParameters ) -{ -#ifdef HAVE_CUDA - if( processOnlyBoundaryEntities && - ( GridEntity::getEntityDimension() == 3 || GridEntity::getEntityDimension() == 0 ) ) - { - dim3 cudaBlockSize( 16, 16 ); - const IndexType entitiesAlongX = end.x() - begin.x() + 1; - const IndexType entitiesAlongY = end.y() - begin.y() + 1; - const IndexType entitiesAlongZ = end.z() - begin.z() + 1; - - dim3 cudaBlocksCountAlongXY, cudaBlocksCountAlongXZ, cudaBlocksCountAlongYZ, - cudaGridsCountAlongXY, cudaGridsCountAlongXZ, cudaGridsCountAlongYZ; - - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXY, cudaGridsCountAlongXY, entitiesAlongX, entitiesAlongY ); - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, entitiesAlongX, entitiesAlongZ - 2 ); - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, entitiesAlongY - 2, entitiesAlongZ - 2 ); - - auto& pool = CudaStreamPool::getInstance(); - Devices::Cuda::synchronizeDevice(); - - const cudaStream_t& s1 = pool.getStream( stream ); - const cudaStream_t& s2 = pool.getStream( stream + 1 ); - const cudaStream_t& s3 = pool.getStream( stream + 2 ); - const cudaStream_t& s4 = pool.getStream( stream + 3 ); - const cudaStream_t& s5 = pool.getStream( stream + 4 ); - const cudaStream_t& s6 = pool.getStream( stream + 5 ); - - dim3 gridIdx, gridSize; - for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXY.y; gridIdx.y++ ) - for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXY.x; gridIdx.x++ ) - { - Devices::Cuda::setupGrid( cudaBlocksCountAlongXY, cudaGridsCountAlongXY, gridIdx, gridSize ); - GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaBlocksCountAlongXY, cudaBlockSize, 0 , s1 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.x(), - end.x(), - begin.y(), - end.y(), - begin.z(), - gridIdx, - gridEntityParameters... 
); - GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaBlocksCountAlongXY, cudaBlockSize, 0, s2 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.x(), - end.x(), - begin.y(), - end.y(), - end.z(), - gridIdx, - gridEntityParameters... ); - } - for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXZ.y; gridIdx.y++ ) - for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXZ.x; gridIdx.x++ ) - { - Devices::Cuda::setupGrid( cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, gridIdx, gridSize ); - GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s3 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.x(), - end.x(), - begin.z() + 1, - end.z() - 1, - begin.y(), - gridIdx, - gridEntityParameters... ); - GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s4 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.x(), - end.x(), - begin.z() + 1, - end.z() - 1, - end.y(), - gridIdx, - gridEntityParameters... ); - } - for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongYZ.y; gridIdx.y++ ) - for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongYZ.x; gridIdx.x++ ) - { - Devices::Cuda::setupGrid( cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, gridIdx, gridSize ); - GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s5 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.y() + 1, - end.y() - 1, - begin.z() + 1, - end.z() - 1, - begin.x(), - gridIdx, - gridEntityParameters... 
); - GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s6 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.y() + 1, - end.y() - 1, - begin.z() + 1, - end.z() - 1, - end.x(), - gridIdx, - gridEntityParameters... ); - } - cudaStreamSynchronize( s1 ); - cudaStreamSynchronize( s2 ); - cudaStreamSynchronize( s3 ); - cudaStreamSynchronize( s4 ); - cudaStreamSynchronize( s5 ); - cudaStreamSynchronize( s6 ); - TNL_CHECK_CUDA_DEVICE; - } - else - { - dim3 cudaBlockSize( 8, 8, 8 ); - dim3 cudaBlocksCount, cudaGridsCount; - - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, - end.x() - begin.x() + 1, - end.y() - begin.y() + 1, - end.z() - begin.z() + 1 ); - - auto& pool = CudaStreamPool::getInstance(); - const cudaStream_t& s = pool.getStream( stream ); - - Devices::Cuda::synchronizeDevice(); - dim3 gridIdx, gridSize; - for( gridIdx.z = 0; gridIdx.z < cudaGridsCount.z; gridIdx.z ++ ) - for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ ) - for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ ) - { - Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, gridSize ); - GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< gridSize, cudaBlockSize, 0, s >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin, - end, - gridIdx, - gridEntityParameters... 
); - } - - // only launches into the stream 0 are synchronized - if( stream == 0 ) - { - cudaStreamSynchronize( s ); - TNL_CHECK_CUDA_DEVICE; - } - } -#else - throw Exceptions::CudaSupportMissing(); -#endif -} - -/**** - * 3D traverser, MIC - */ -template< typename Real, - typename Index > - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities, - int XOrthogonalBoundary, - int YOrthogonalBoundary, - int ZOrthogonalBoundary, - typename... GridEntityParameters > -void -GridTraverser< Meshes::Grid< 3, Real, Devices::MIC, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType& begin, - const CoordinatesType& end, - UserData& userData, - const int& stream, - const GridEntityParameters&... gridEntityParameters ) -{ - std::cout << "Not Implemented yet Grid Traverser <3, Real, Device::MIC>" << std::endl; - -/* HAVE_CUDA - dim3 cudaBlockSize( 8, 8, 8 ); - dim3 cudaBlocks; - cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); - cudaBlocks.y = Devices::Cuda::getNumberOfBlocks( end.y() - begin.y() + 1, cudaBlockSize.y ); - cudaBlocks.z = Devices::Cuda::getNumberOfBlocks( end.z() - begin.z() + 1, cudaBlockSize.z ); - const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); - const IndexType cudaYGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.y ); - const IndexType cudaZGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.z ); - - auto& pool = CudaStreamPool::getInstance(); - const cudaStream_t& s = pool.getStream( stream ); - - Devices::Cuda::synchronizeDevice(); - for( IndexType gridZIdx = 0; gridZIdx < cudaZGrids; gridZIdx ++ ) - for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ ) - for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) - GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... 
> - <<< cudaBlocks, cudaBlockSize, 0, s >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin, - end, - gridXIdx, - gridYIdx, - gridZIdx, - gridEntityParameters... ); - - // only launches into the stream 0 are synchronized - if( stream == 0 ) - { - cudaStreamSynchronize( s ); - TNL_CHECK_CUDA_DEVICE; - } - */ -} - -} // namespace Meshes -} // namespace TNL -- GitLab From 317b5bfd5c37c1deb1058c4b851f291650579a6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 4 Jan 2019 18:34:03 +0100 Subject: [PATCH 107/130] GridTraverser_impl.h splitted into GridTraverser_1D.hpp, GridTraverser_2D.hpp and GridTraverser_3D.hpp. --- src/Benchmarks/FunctionTimer.h | 9 +++++---- src/TNL/Meshes/GridDetails/CMakeLists.txt | 4 +++- src/TNL/Meshes/GridDetails/GridTraverser.h | 4 +++- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h index 35dbb719f..601cfc16c 100644 --- a/src/Benchmarks/FunctionTimer.h +++ b/src/Benchmarks/FunctionTimer.h @@ -57,13 +57,14 @@ class FunctionTimer // the monitor, the timer is not interrupted after each loop. if( ! 
performReset && verbose < 2 ) { - if( timing ) - timer.start(); // Explicit synchronization of the CUDA device -#ifdef HAVE_CUDA +#ifdef HAVE_CUDA if( std::is_same< Device, Devices::Cuda >::value ) cudaDeviceSynchronize(); -#endif +#endif + if( timing ) + timer.start(); + for( loops = 0; loops < maxLoops || ( timing && timer.getRealTime() < minTime ); ++loops) diff --git a/src/TNL/Meshes/GridDetails/CMakeLists.txt b/src/TNL/Meshes/GridDetails/CMakeLists.txt index 0da067f14..3386ec242 100644 --- a/src/TNL/Meshes/GridDetails/CMakeLists.txt +++ b/src/TNL/Meshes/GridDetails/CMakeLists.txt @@ -14,7 +14,9 @@ SET( headers BoundaryGridEntityChecker.h GridEntityMeasureGetter.h GridEntityTopology.h GridTraverser.h - GridTraverser_impl.h + GridTraverser_1D.hpp + GridTraverser_2D.hpp + GridTraverser_3D.hpp NeighborGridEntitiesStorage.h NeighborGridEntityGetter1D_impl.h NeighborGridEntityGetter2D_impl.h diff --git a/src/TNL/Meshes/GridDetails/GridTraverser.h b/src/TNL/Meshes/GridDetails/GridTraverser.h index 3a74c085b..881367d3f 100644 --- a/src/TNL/Meshes/GridDetails/GridTraverser.h +++ b/src/TNL/Meshes/GridDetails/GridTraverser.h @@ -351,5 +351,7 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::MIC, Index > > } // namespace Meshes } // namespace TNL -#include +#include +#include +#include -- GitLab From 7a151198661359738c02533e86abbed76a4bff85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 4 Jan 2019 18:35:04 +0100 Subject: [PATCH 108/130] Fixes in traversers benchmark. 
--- src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h | 6 +++--- src/Benchmarks/Traversers/tnl-benchmark-traversers.h | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h index 93ee77385..1683cc868 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -154,10 +154,10 @@ class GridTraversersBenchmark< 1, Device, Real, Index > { if( std::is_same< Device, Devices::Host >::value ) { - v_data[ 0 ] = 2; + v_data[ 0 ] = +2; for( int i = 1; i < size - 1; i++ ) - v_data[ i ] = 1.0; - v_data[ size - 1 ] = 2; + v_data[ i ] = +1.0; + v_data[ size - 1 ] = +2; } else // Device == Devices::Cuda { diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 9f70589c9..6adc0d8e3 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -189,6 +189,8 @@ bool runBenchmark( const Config::ParameterContainer& parameters, } std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl; } + return true; + /**** * Full grid traversing including boundary conditions -- GitLab From 57dc814cfb3120f87a2047e62e1618f7ec287057 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 4 Jan 2019 23:03:17 +0100 Subject: [PATCH 109/130] Fixed order of indices in the traverser benchmarks --- .../Traversers/GridTraversersBenchmark_2D.h | 12 ++++++------ .../Traversers/GridTraversersBenchmark_3D.h | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h index d62d56f91..48f11bfb9 100644 --- 
a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h @@ -98,7 +98,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > Index _size = this->size; auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) { - data[ i * _size + j ] += 1.0; + data[ j * _size + i ] += 1.0; }; ParallelFor2D< Device >::exec( ( Index ) 0, @@ -114,8 +114,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index > auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) { Cell entity( *currentGrid ); - entity.getCoordinates().y() = i; - entity.getCoordinates().x() = j; + entity.getCoordinates().x() = i; + entity.getCoordinates().y() = j; entity.refresh(); data[ entity.getIndex() ] += 1.0; }; @@ -134,8 +134,8 @@ class GridTraversersBenchmark< 2, Device, Real, Index > auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) { Cell entity( *currentGrid ); - entity.getCoordinates().y() = i; - entity.getCoordinates().x() = j; + entity.getCoordinates().x() = i; + entity.getCoordinates().y() = j; entity.refresh(); ( *_u )( entity ) += 1.0; }; @@ -249,4 +249,4 @@ class GridTraversersBenchmark< 2, Device, Real, Index > } // namespace Traversers } // namespace Benchmarks -} // namespace TNL \ No newline at end of file +} // namespace TNL diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h index 383640d39..cceffa328 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h @@ -104,7 +104,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > Index _size = this->size; auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) { - data[ ( i * _size + j ) * _size + k ] += 1.0; + data[ ( k * _size + j ) * _size + i ] += 1.0; }; ParallelFor3D< Device >::exec( ( Index ) 0, @@ -122,9 +122,9 @@ class GridTraversersBenchmark< 3, Device, Real, Index > auto f 
= [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) { Cell entity( *currentGrid ); - entity.getCoordinates().z() = i; + entity.getCoordinates().x() = i; entity.getCoordinates().y() = j; - entity.getCoordinates().x() = k; + entity.getCoordinates().z() = k; entity.refresh(); data[ entity.getIndex() ] += 1.0; }; @@ -145,9 +145,9 @@ class GridTraversersBenchmark< 3, Device, Real, Index > auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) { Cell entity( *currentGrid ); - entity.getCoordinates().z() = i; + entity.getCoordinates().x() = i; entity.getCoordinates().y() = j; - entity.getCoordinates().x() = k; + entity.getCoordinates().z() = k; entity.refresh(); ( *_u )( entity ) += 1.0; }; @@ -257,4 +257,4 @@ class GridTraversersBenchmark< 3, Device, Real, Index > } // namespace Traversers } // namespace Benchmarks -} // namespace TNL \ No newline at end of file +} // namespace TNL -- GitLab From 7f7bff4c23f211128c26efd1bef6dadf7c2bf552 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 4 Jan 2019 23:05:34 +0100 Subject: [PATCH 110/130] Traverser benchmarks: added explicit cast to Real Because constants 1.0 and 2.0 have type double. 
--- .../Traversers/GridTraversersBenchmark.h | 4 ++-- .../Traversers/GridTraversersBenchmark_1D.h | 16 +++++++------- .../Traversers/GridTraversersBenchmark_2D.h | 18 +++++++-------- .../Traversers/GridTraversersBenchmark_3D.h | 22 +++++++++---------- src/Benchmarks/Traversers/cuda-kernels.h | 18 +++++++-------- 5 files changed, 39 insertions(+), 39 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index c320dc591..bd748ed09 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -42,7 +42,7 @@ class WriteOneEntitiesProcessor const GridEntity& entity ) { auto& u = userData.u.template modifyData< DeviceType >(); - u( entity ) += 1.0; + u( entity ) += (typename MeshType::RealType) 1.0; } }; @@ -68,4 +68,4 @@ class GridTraversersBenchmark{}; #include "GridTraversersBenchmark_1D.h" #include "GridTraversersBenchmark_2D.h" -#include "GridTraversersBenchmark_3D.h" \ No newline at end of file +#include "GridTraversersBenchmark_3D.h" diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h index 1683cc868..e626b17e3 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -64,7 +64,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > if( std::is_same< Device, Devices::Host >::value ) { for( int i = 0; i < size; i++ ) - v_data[ i ] += 1.0; + v_data[ i ] += (Real) 1.0; } else // Device == Devices::Cuda { @@ -94,7 +94,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > { auto f = [] __cuda_callable__ ( Index i, Real* data ) { - data[ i ] += 1.0; + data[ i ] += (Real) 1.0; }; ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } @@ -107,7 +107,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > Cell entity( *currentGrid ); 
entity.getCoordinates().x() = i; entity.refresh(); - data[ entity.getIndex() ] += 1.0; + data[ entity.getIndex() ] += (Real) 1.0; }; ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } @@ -121,7 +121,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > Cell entity( *currentGrid ); entity.getCoordinates().x() = i; entity.refresh(); - ( *_u )( entity ) += 1.0; + ( *_u )( entity ) += (Real) 1.0; //WriteOneEntitiesProcessorType::processEntity( *currentGrid, userData, entity ); }; ParallelFor< Device >::exec( ( Index ) 0, size, f ); @@ -154,10 +154,10 @@ class GridTraversersBenchmark< 1, Device, Real, Index > { if( std::is_same< Device, Devices::Host >::value ) { - v_data[ 0 ] = +2; + v_data[ 0 ] += (Real) 2; for( int i = 1; i < size - 1; i++ ) - v_data[ i ] = +1.0; - v_data[ size - 1 ] = +2; + v_data[ i ] += (Real) 1.0; + v_data[ size - 1 ] += (Real) 2; } else // Device == Devices::Cuda { @@ -213,4 +213,4 @@ class GridTraversersBenchmark< 1, Device, Real, Index > } // namespace Traversers } // namespace Benchmarks -} // namespace TNL \ No newline at end of file +} // namespace TNL diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h index 48f11bfb9..1296a9a46 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h @@ -65,7 +65,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > { for( int i = 0; i < size; i++ ) for( int j = 0; j < size; j++ ) - v_data[ i * size + j ] += 1.0; + v_data[ i * size + j ] += (Real) 1.0; } else // Device == Devices::Cuda { @@ -98,7 +98,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > Index _size = this->size; auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) { - data[ j * _size + i ] += 1.0; + data[ j * _size + i ] += (Real) 1.0; }; ParallelFor2D< Device >::exec( ( Index ) 0, @@ -117,7 +117,7 @@ class GridTraversersBenchmark< 2, Device, Real, 
Index > entity.getCoordinates().x() = i; entity.getCoordinates().y() = j; entity.refresh(); - data[ entity.getIndex() ] += 1.0; + data[ entity.getIndex() ] += (Real) 1.0; }; ParallelFor2D< Device >::exec( ( Index ) 0, @@ -137,7 +137,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > entity.getCoordinates().x() = i; entity.getCoordinates().y() = j; entity.refresh(); - ( *_u )( entity ) += 1.0; + ( *_u )( entity ) += (Real) 1.0; }; ParallelFor2D< Device >::exec( ( Index ) 0, @@ -179,18 +179,18 @@ class GridTraversersBenchmark< 2, Device, Real, Index > { for( int i = 0; i < size; i++ ) { - v_data[ i * size ] = 2.0; - v_data[ i * size + size - 1 ] = 2.0; + v_data[ i * size ] += (Real) 2.0; + v_data[ i * size + size - 1 ] += (Real) 2.0; } for( int j = 1; j < size - 1; j++ ) { - v_data[ j ] = 2.0; - v_data[ ( size - 1 ) * size + j ] = 2.0; + v_data[ j ] += (Real) 2.0; + v_data[ ( size - 1 ) * size + j ] += (Real) 2.0; } for( int i = 1; i < size - 1; i++ ) for( int j = 1; j < size - 1; j++ ) - v_data[ i * size + j ] = 1.0; + v_data[ i * size + j ] += (Real) 1.0; } else // Device == Devices::Cuda { diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h index cceffa328..35863a3c9 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h @@ -69,7 +69,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > for( int i = 0; i < size; i++ ) for( int j = 0; j < size; j++ ) for( int k = 0; k < size; k++ ) - v_data[ ( i * size + j ) * size + k ] += 1.0; + v_data[ ( i * size + j ) * size + k ] += (Real) 1.0; } else // Device == Devices::Cuda { @@ -104,7 +104,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > Index _size = this->size; auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) { - data[ ( k * _size + j ) * _size + i ] += 1.0; + data[ ( k * _size + j ) * _size + i ] += (Real) 1.0; }; 
ParallelFor3D< Device >::exec( ( Index ) 0, @@ -126,7 +126,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > entity.getCoordinates().y() = j; entity.getCoordinates().z() = k; entity.refresh(); - data[ entity.getIndex() ] += 1.0; + data[ entity.getIndex() ] += (Real) 1.0; }; ParallelFor3D< Device >::exec( ( Index ) 0, @@ -149,7 +149,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > entity.getCoordinates().y() = j; entity.getCoordinates().z() = k; entity.refresh(); - ( *_u )( entity ) += 1.0; + ( *_u )( entity ) += (Real) 1.0; }; ParallelFor3D< Device >::exec( ( Index ) 0, @@ -175,27 +175,27 @@ class GridTraversersBenchmark< 3, Device, Real, Index > for( int i = 0; i < size; i++ ) for( int j = 0; j < size; j++ ) { - v_data[ ( i * size + j ) * size ] = 2.0; - v_data[ ( i * size + j ) * size + size - 1 ] = 2.0; + v_data[ ( i * size + j ) * size ] += (Real) 2.0; + v_data[ ( i * size + j ) * size + size - 1 ] += (Real) 2.0; } for( int j = 0; j < size; j++ ) for( int k = 1; k < size - 1; k++ ) { - v_data[ j * size + k ] = 1.0; - v_data[ ( ( size - 1) * size + j ) * size + k ] = 1.0; + v_data[ j * size + k ] += (Real) 1.0; + v_data[ ( ( size - 1) * size + j ) * size + k ] += (Real) 1.0; } for( int i = 1; i < size -1; i++ ) for( int k = 1; k < size - 1; k++ ) { - v_data[ ( i * size ) * size + k ] = 2.0; - v_data[ ( i * size + size - 1 ) * size + k ] = 2.0; + v_data[ ( i * size ) * size + k ] += (Real) 2.0; + v_data[ ( i * size + size - 1 ) * size + k ] += (Real) 2.0; } for( int i = 1; i < size -1; i++ ) for( int j = 1; j < size -1; j++ ) for( int k = 1; k < size - 1; k++ ) - v_data[ ( i * size + j ) * size + k ] = 1.0; + v_data[ ( i * size + j ) * size + k ] += (Real) 1.0; } else // Device == Devices::Cuda { diff --git a/src/Benchmarks/Traversers/cuda-kernels.h b/src/Benchmarks/Traversers/cuda-kernels.h index 2802b73eb..a90baf5b0 100644 --- a/src/Benchmarks/Traversers/cuda-kernels.h +++ b/src/Benchmarks/Traversers/cuda-kernels.h @@ -27,7 +27,7 @@ 
__global__ void fullGridTraverseKernel1D( const Index size, const dim3 gridIdx, { const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( threadIdx_x < size ) - v_data[ threadIdx_x ] += 1.0; + v_data[ threadIdx_x ] += (Real) 1.0; } template< typename Real, @@ -37,7 +37,7 @@ __global__ void fullGridTraverseKernel2D( const Index size, const dim3 gridIdx, const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; if( threadIdx_x < size && threadIdx_y < size ) - v_data[ threadIdx_y * size + threadIdx_x ] += 1.0; + v_data[ threadIdx_y * size + threadIdx_x ] += (Real) 1.0; } template< typename Real, @@ -48,7 +48,7 @@ __global__ void fullGridTraverseKernel3D( const Index size, const dim3 gridIdx, const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size ) - v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += 1.0; + v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += (Real) 1.0; } /**** @@ -60,7 +60,7 @@ __global__ void interiorTraverseKernel1D( const Index size, const dim3 gridIdx, { const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( threadIdx_x > 0 && threadIdx_x < size - 1 ) - v_data[ threadIdx_x ] += 1.0; + v_data[ threadIdx_x ] += (Real) 1.0; } template< typename Real, @@ -71,7 +71,7 @@ __global__ void interiorTraverseKernel2D( const Index size, const dim3 gridIdx, const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; if( threadIdx_x > 0 && 
threadIdx_y > 0 && threadIdx_x < size - 1 && threadIdx_y < size - 1 ) - v_data[ threadIdx_y * size + threadIdx_x ] += 1.0; + v_data[ threadIdx_y * size + threadIdx_x ] += (Real) 1.0; } template< typename Real, @@ -83,7 +83,7 @@ __global__ void interiorTraverseKernel3D( const Index size, const dim3 gridIdx, const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; if( threadIdx_x > 0 && threadIdx_y > 0 && threadIdx_z > 0 && threadIdx_x < size - 1 && threadIdx_y < size - 1 && threadIdx_z < size - 1 ) - v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += 1.0; + v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += (Real) 1.0; } /**** @@ -95,7 +95,7 @@ __global__ void boundariesTraverseKernel1D( const Index size, const dim3 gridIdx { const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( threadIdx_x == 0 || threadIdx_x == size - 1 ) - v_data[ threadIdx_x ] += 2.0; + v_data[ threadIdx_x ] += (Real) 2.0; } template< typename Real, @@ -106,7 +106,7 @@ __global__ void boundariesTraverseKernel2D( const Index size, const dim3 gridIdx const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; if( threadIdx_x > 0 && threadIdx_y > 0 && threadIdx_x < size - 1 && threadIdx_y < size - 1 ) - v_data[ threadIdx_y * size + threadIdx_x ] += 2.0; + v_data[ threadIdx_y * size + threadIdx_x ] += (Real) 2.0; } template< typename Real, @@ -118,7 +118,7 @@ __global__ void boundariesTraverseKernel3D( const Index size, const dim3 gridIdx const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; if( threadIdx_x == 0 || threadIdx_y == 0 || threadIdx_z == 0 || threadIdx_x == size - 1 || threadIdx_y == size - 1 || threadIdx_z == size - 1 ) - v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += 2.0; + v_data[ ( 
threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += (Real) 2.0; } #endif -- GitLab From d78e659f8a6258e3f83f718e5a942d4e0fb87999 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 4 Jan 2019 23:09:18 +0100 Subject: [PATCH 111/130] Fixed calculation of bandwidth in the traverser benchmarks --- .../Traversers/tnl-benchmark-traversers.h | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 6adc0d8e3..ff6d25624 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -80,7 +80,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, */ if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-pure-c" ) ) { - benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); auto hostWriteOneUsingPureC = [&] () { @@ -103,7 +103,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, */ if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for" ) ) { - benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); auto hostWriteOneUsingParallelFor = [&] () { @@ -130,7 +130,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, { hostTraverserBenchmark.writeOneUsingParallelForAndGridEntity(); }; - benchmark.setOperation( "par.for+grid ent.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.setOperation( "par.for+grid ent.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); 
benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity ); #ifdef HAVE_CUDA @@ -152,7 +152,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, { hostTraverserBenchmark.writeOneUsingParallelForAndMeshFunction(); }; - benchmark.setOperation( "par.for+mesh fc.", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.setOperation( "par.for+mesh fc.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction ); #ifdef HAVE_CUDA @@ -171,7 +171,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, */ if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-traverser" ) ) { - benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); auto hostWriteOneUsingTraverser = [&] () { hostTraverserBenchmark.writeOneUsingTraverser(); @@ -234,14 +234,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( tests.containsValue( "all" ) || tests.containsValue( "bc-pure-c" ) ) { - benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC ); #ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingPureC ); #endif - benchmark.setOperation( "Pure C RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.setOperation( "Pure C RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC ); #ifdef HAVE_CUDA if( 
withCuda ) @@ -266,14 +266,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) ) { - benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor ); #ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor ); #endif - benchmark.setOperation( "parallel for RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.setOperation( "parallel for RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor ); #ifdef HAVE_CUDA if( withCuda ) @@ -298,13 +298,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( tests.containsValue( "all" ) || tests.containsValue( "bc-traverser" ) ) { - benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser ); #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser ); #endif - benchmark.setOperation( "traverser RST", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.setOperation( "traverser RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser ); #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser ); -- GitLab From a33d9e7014ae9a4a13492eb43da3397651df7f1f Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 5 Jan 2019 08:30:26 +0100 Subject: [PATCH 112/130] Added synchronous/asynchronous modes for grid traversers. --- src/TNL/Meshes/GridDetails/GridTraverser.h | 29 ++++++++++++---- .../Meshes/GridDetails/GridTraverser_1D.hpp | 18 +++++++--- .../Meshes/GridDetails/GridTraverser_2D.hpp | 20 +++++++---- .../Meshes/GridDetails/GridTraverser_3D.hpp | 13 ++++--- .../GridDetails/Traverser_Grid1D_impl.h | 30 ++++++++++------ .../GridDetails/Traverser_Grid2D_impl.h | 18 ++++++++++ .../GridDetails/Traverser_Grid3D_impl.h | 34 ++++++++++++++++++- 7 files changed, 129 insertions(+), 33 deletions(-) diff --git a/src/TNL/Meshes/GridDetails/GridTraverser.h b/src/TNL/Meshes/GridDetails/GridTraverser.h index 881367d3f..fb6b34da1 100644 --- a/src/TNL/Meshes/GridDetails/GridTraverser.h +++ b/src/TNL/Meshes/GridDetails/GridTraverser.h @@ -25,6 +25,8 @@ class GridTraverser { }; +enum GridTraverserMode { synchronousMode, asynchronousMode }; + /**** * 1D grid, Devices::Host */ @@ -52,6 +54,7 @@ class GridTraverser< Meshes::Grid< 1, Real, Devices::Host, Index > > const CoordinatesType begin, const CoordinatesType end, UserData& userData, + GridTraverserMode mode = synchronousMode, const int& stream = 0 ); }; @@ -82,6 +85,7 @@ class GridTraverser< Meshes::Grid< 1, Real, Devices::Cuda, Index > > const CoordinatesType& begin, const CoordinatesType& end, UserData& userData, + GridTraverserMode mode = synchronousMode, const int& stream = 0 ); }; @@ -112,6 +116,7 @@ class GridTraverser< Meshes::Grid< 1, Real, Devices::MIC, Index > > const CoordinatesType& begin, const CoordinatesType& end, UserData& userData, + GridTraverserMode mode = synchronousMode, const int& stream = 0 ); }; @@ -148,7 +153,9 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > > const CoordinatesType end, UserData& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) -// const int& stream = 0, +
//GridTraverserMode mode = synchronousMode, + GridTraverserMode mode, + // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. orientation and basis for faces) @@ -186,7 +193,9 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > > const CoordinatesType& end, UserData& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) -// const int& stream = 0, + //GridTraverserMode mode = synchronousMode, + GridTraverserMode mode, + // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. orientation and basis for faces) @@ -224,7 +233,9 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::MIC, Index > > const CoordinatesType& end, UserData& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) -// const int& stream = 0, + //GridTraverserMode mode = synchronousMode, + GridTraverserMode mode, + // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. orientation and basis for faces) @@ -263,7 +274,9 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::Host, Index > > const CoordinatesType end, UserData& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) -// const int& stream = 0, + //GridTraverserMode mode = synchronousMode, + GridTraverserMode mode, + // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. 
orientation and basis for faces and edges) @@ -302,7 +315,9 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::Cuda, Index > > const CoordinatesType& end, UserData& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) -// const int& stream = 0, + //GridTraverserMode mode = synchronousMode, + GridTraverserMode mode, + // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. orientation and basis for faces and edges) @@ -341,7 +356,9 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::MIC, Index > > const CoordinatesType& end, UserData& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) -// const int& stream = 0, + //GridTraverserMode mode = synchronousMode, + GridTraverserMode mode, + // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. orientation and basis for faces and edges) diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp index 90148f8e8..505f9c3d7 100644 --- a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp +++ b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp @@ -41,6 +41,7 @@ processEntities( const CoordinatesType begin, const CoordinatesType end, UserData& userData, + GridTraverserMode mode, const int& stream ) { GridEntity entity( *gridPointer ); @@ -177,13 +178,14 @@ processEntities( const CoordinatesType& begin, const CoordinatesType& end, UserData& userData, + GridTraverserMode mode, const int& stream ) { #ifdef HAVE_CUDA auto& pool = CudaStreamPool::getInstance(); const cudaStream_t& s = pool.getStream( stream ); - Devices::Cuda::synchronizeDevice(); + //Devices::Cuda::synchronizeDevice(); if( processOnlyBoundaryEntities ) { dim3 cudaBlockSize( 2 ); @@ -209,15 +211,20 @@ processEntities( userData, begin, end, - gridXIdx ); + gridXIdx );*/ } - // only launches into the stream 
0 are synchronized - /*if( stream == 0 ) +#ifdef NDEBUG + if( mode == synchronousMode ) { cudaStreamSynchronize( s ); TNL_CHECK_CUDA_DEVICE; - }*/ + } +#else + cudaStreamSynchronize( s ); + TNL_CHECK_CUDA_DEVICE; +#endif + #else throw Exceptions::CudaSupportMissing(); #endif @@ -241,6 +248,7 @@ processEntities( const CoordinatesType& begin, const CoordinatesType& end, UserData& userData, + GridTraverserMode mode, const int& stream ) { std::cout << "Not Implemented yet Grid Traverser <1, Real, Device::MIC>" << std::endl; diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp index 84e496017..50b30c019 100644 --- a/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp +++ b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp @@ -43,6 +43,7 @@ processEntities( const CoordinatesType begin, const CoordinatesType end, UserData& userData, + GridTraverserMode mode, const int& stream, const GridEntityParameters&... gridEntityParameters ) { @@ -402,6 +403,7 @@ processEntities( const CoordinatesType& begin, const CoordinatesType& end, UserData& userData, + GridTraverserMode mode, const int& stream, const GridEntityParameters&... gridEntityParameters ) { @@ -534,13 +536,18 @@ processEntities( gridEntityParameters... ); } - // only launches into the stream 0 are synchronized - if( stream == 0 ) - { - cudaStreamSynchronize( s ); - TNL_CHECK_CUDA_DEVICE; - } +#ifdef NDEBUG + if( mode == synchronousMode ) + { + cudaStreamSynchronize( s ); + TNL_CHECK_CUDA_DEVICE; } +#else + cudaStreamSynchronize( s ); + TNL_CHECK_CUDA_DEVICE; +#endif + } + #else throw Exceptions::CudaSupportMissing(); #endif @@ -567,6 +574,7 @@ processEntities( const CoordinatesType& begin, const CoordinatesType& end, UserData& userData, + GridTraverserMode mode, const int& stream, const GridEntityParameters&... 
gridEntityParameters ) { diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp index d63b81f46..9259da9bf 100644 --- a/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp +++ b/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp @@ -42,6 +42,7 @@ processEntities( const CoordinatesType begin, const CoordinatesType end, UserData& userData, + GridTraverserMode mode, const int& stream, const GridEntityParameters&... gridEntityParameters ) { @@ -324,6 +325,7 @@ processEntities( const CoordinatesType& begin, const CoordinatesType& end, UserData& userData, + GridTraverserMode mode, const int& stream, const GridEntityParameters&... gridEntityParameters ) { @@ -390,7 +392,7 @@ processEntities( ( &gridPointer.template getData< Devices::Cuda >(), userData, begin.x(), - end.x(), + end.x(), begin.z() + 1, end.z() - 1, begin.y(), @@ -401,7 +403,7 @@ processEntities( ( &gridPointer.template getData< Devices::Cuda >(), userData, begin.x(), - end.x(), + end.x(), begin.z() + 1, end.z() - 1, end.y(), @@ -417,7 +419,7 @@ processEntities( ( &gridPointer.template getData< Devices::Cuda >(), userData, begin.y() + 1, - end.y() - 1, + end.y() - 1, begin.z() + 1, end.z() - 1, begin.x(), @@ -428,7 +430,7 @@ processEntities( ( &gridPointer.template getData< Devices::Cuda >(), userData, begin.y() + 1, - end.y() - 1, + end.y() - 1, begin.z() + 1, end.z() - 1, end.x(), @@ -440,7 +442,7 @@ processEntities( cudaStreamSynchronize( s3 ); cudaStreamSynchronize( s4 ); cudaStreamSynchronize( s5 ); - cudaStreamSynchronize( s6 ); + cudaStreamSynchronize( s6 ); TNL_CHECK_CUDA_DEVICE; } else @@ -506,6 +508,7 @@ processEntities( const CoordinatesType& begin, const CoordinatesType& end, UserData& userData, + GridTraverserMode mode, const int& stream, const GridEntityParameters&... 
gridEntityParameters ) { diff --git a/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h b/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h index 448c7bc8b..741331538 100644 --- a/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h +++ b/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h @@ -43,7 +43,8 @@ processBoundaryEntities( const GridPointer& gridPointer, gridPointer, CoordinatesType( 0 ), gridPointer->getDimensions() - CoordinatesType( 1 ), - userData ); + userData, + asynchronousMode ); } else //Distributed { @@ -54,7 +55,8 @@ processBoundaryEntities( const GridPointer& gridPointer, gridPointer, CoordinatesType( 0 ) + distributedGrid->getLowerOverlap(), CoordinatesType( 0 ) + distributedGrid->getLowerOverlap(), - userData ); + userData, + asynchronousMode ); } if( neighbors[ Meshes::DistributedMeshes::ZzYzXp ] == -1 ) @@ -63,7 +65,8 @@ processBoundaryEntities( const GridPointer& gridPointer, gridPointer, gridPointer->getDimensions() - CoordinatesType( 1 ) - distributedGrid->getUpperOverlap(), gridPointer->getDimensions() - CoordinatesType( 1 ) - distributedGrid->getUpperOverlap(), - userData ); + userData, + asynchronousMode ); } } @@ -92,7 +95,8 @@ processInteriorEntities( const GridPointer& gridPointer, gridPointer, CoordinatesType( 1 ), gridPointer->getDimensions() - CoordinatesType( 2 ), - userData ); + userData, + asynchronousMode ); } else //Distributed { @@ -117,7 +121,8 @@ processInteriorEntities( const GridPointer& gridPointer, gridPointer, begin, end, - userData ); + userData, + asynchronousMode ); } } @@ -146,7 +151,8 @@ processAllEntities( gridPointer, CoordinatesType( 0 ), gridPointer->getDimensions() - CoordinatesType( 1 ), - userData ); + userData, + asynchronousMode ); } else //Distributed { @@ -157,7 +163,8 @@ processAllEntities( gridPointer, begin, end, - userData ); + userData, + asynchronousMode ); } } @@ -185,7 +192,8 @@ processBoundaryEntities( const GridPointer& gridPointer, gridPointer, CoordinatesType( 0 ), 
gridPointer->getDimensions(), - userData ); + userData, + asynchronousMode ); } template< typename Real, @@ -208,7 +216,8 @@ processInteriorEntities( const GridPointer& gridPointer, gridPointer, CoordinatesType( 1 ), gridPointer->getDimensions() - CoordinatesType( 1 ), - userData ); + userData, + asynchronousMode ); } template< typename Real, @@ -232,7 +241,8 @@ processAllEntities( gridPointer, CoordinatesType( 0 ), gridPointer->getDimensions(), - userData ); + userData, + asynchronousMode ); } } // namespace Meshes diff --git a/src/TNL/Meshes/GridDetails/Traverser_Grid2D_impl.h b/src/TNL/Meshes/GridDetails/Traverser_Grid2D_impl.h index 41e161256..7809c9739 100644 --- a/src/TNL/Meshes/GridDetails/Traverser_Grid2D_impl.h +++ b/src/TNL/Meshes/GridDetails/Traverser_Grid2D_impl.h @@ -42,6 +42,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1 ), userData, + asynchronousMode, 0 ); } else //Distributed @@ -57,6 +58,7 @@ processBoundaryEntities( const GridPointer& gridPointer, begin, CoordinatesType( begin.x(), end.y() ), userData, + asynchronousMode, 0 ); } @@ -67,6 +69,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( end.x(), begin.y() ), end, userData, + asynchronousMode, 0 ); } @@ -77,6 +80,7 @@ processBoundaryEntities( const GridPointer& gridPointer, begin, CoordinatesType( end.x(), begin.y() ), userData, + asynchronousMode, 0 ); } @@ -87,6 +91,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( begin.x(), end.y() ), end, userData, + asynchronousMode, 0 ); } } @@ -116,6 +121,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 1, 1 ), gridPointer->getDimensions() - CoordinatesType( 2, 2 ), userData, + asynchronousMode, 0 ); } else // distributed @@ -141,6 +147,7 @@ processInteriorEntities( const GridPointer& gridPointer, begin, end, userData, + asynchronousMode, 0); } } @@ -169,6 +176,7 @@ 
processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1 ), userData, + asynchronousMode, 0 ); } else @@ -182,6 +190,7 @@ processAllEntities( const GridPointer& gridPointer, begin, end, userData, + asynchronousMode, 0); } } @@ -210,6 +219,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 0, 1 ), userData, + asynchronousMode, 1, CoordinatesType( 1, 0 ), CoordinatesType( 0, 1 ) ); @@ -219,6 +229,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 0 ), userData, + asynchronousMode, 0, CoordinatesType( 0, 1 ), CoordinatesType( 1, 0 ) ); @@ -245,6 +256,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 1, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1 ), userData, + asynchronousMode, 1, CoordinatesType( 1, 0 ), CoordinatesType( 0, 1 ) ); @@ -254,6 +266,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 0, 1 ), gridPointer->getDimensions() - CoordinatesType( 1, 1 ), userData, + asynchronousMode, 0, CoordinatesType( 0, 1 ), CoordinatesType( 1, 0 ) ); @@ -280,6 +293,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 0, 1 ), userData, + asynchronousMode, 1, CoordinatesType( 1, 0 ), CoordinatesType( 0, 1 ) ); @@ -289,6 +303,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 0 ), userData, + asynchronousMode, 0, CoordinatesType( 0, 1 ), CoordinatesType( 1, 0 ) ); @@ -315,6 +330,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0 ), gridPointer->getDimensions(), userData, + asynchronousMode, 0 ); } @@ -339,6 +355,7 @@ processInteriorEntities( const GridPointer& gridPointer, 
CoordinatesType( 1, 1 ), gridPointer->getDimensions() - CoordinatesType( 1, 1 ), userData, + asynchronousMode, 0 ); } @@ -363,6 +380,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0 ), gridPointer->getDimensions(), userData, + asynchronousMode, 0 ); } diff --git a/src/TNL/Meshes/GridDetails/Traverser_Grid3D_impl.h b/src/TNL/Meshes/GridDetails/Traverser_Grid3D_impl.h index e32c5a12e..ec242e367 100644 --- a/src/TNL/Meshes/GridDetails/Traverser_Grid3D_impl.h +++ b/src/TNL/Meshes/GridDetails/Traverser_Grid3D_impl.h @@ -44,6 +44,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ), userData, + asynchronousMode, 0 ); } else // distributed @@ -59,6 +60,7 @@ processBoundaryEntities( const GridPointer& gridPointer, begin, CoordinatesType( begin.x(), end.y(), end.z() ), userData, + asynchronousMode, 0 ); } @@ -69,6 +71,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( end.x() , begin.y(), begin.z() ), end, userData, + asynchronousMode, 0 ); } @@ -79,6 +82,7 @@ processBoundaryEntities( const GridPointer& gridPointer, begin, CoordinatesType( end.x(), begin.y(), end.z() ), userData, + asynchronousMode, 0 ); } @@ -89,6 +93,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( begin.x(), end.y(), begin.z() ), end, userData, + asynchronousMode, 0 ); } @@ -99,6 +104,7 @@ processBoundaryEntities( const GridPointer& gridPointer, begin, CoordinatesType( end.x(), end.y(), begin.z() ), userData, + asynchronousMode, 0 ); } @@ -109,6 +115,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( begin.x(), begin.y(), end.z() ), end, userData, + asynchronousMode, 0 ); } } @@ -138,6 +145,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 1, 1, 1 ), gridPointer->getDimensions() - CoordinatesType( 2, 2, 2 ), userData, + asynchronousMode, 0 ); } else @@ -169,7 +177,8 @@ 
processInteriorEntities( const GridPointer& gridPointer, begin, end, userData, - 0); + asynchronousMode, + 0 ); } } @@ -197,6 +206,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ), userData, + asynchronousMode, 0 ); } else @@ -209,6 +219,7 @@ processAllEntities( const GridPointer& gridPointer, begin, end, userData, + asynchronousMode, 0 ); } } @@ -237,6 +248,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 0, 1, 1 ), userData, + asynchronousMode, 2, CoordinatesType( 1, 0, 0 ), CoordinatesType( 0, 1, 1 ) ); @@ -246,6 +258,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 0, 1 ), userData, + asynchronousMode, 1, CoordinatesType( 0, 1, 0 ), CoordinatesType( 1, 0, 1 ) ); @@ -255,6 +268,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 0 ), userData, + asynchronousMode, 0, CoordinatesType( 0, 0, 1 ), CoordinatesType( 1, 1, 0 ) ); @@ -281,6 +295,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 1, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ), userData, + asynchronousMode, 2, CoordinatesType( 1, 0, 0 ), CoordinatesType( 0, 1, 1 ) ); @@ -290,6 +305,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 0, 1, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ), userData, + asynchronousMode, 1, CoordinatesType( 0, 1, 0 ), CoordinatesType( 1, 0, 1 ) ); @@ -299,6 +315,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 1 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ), userData, + asynchronousMode, 0, CoordinatesType( 0, 0, 1 ), CoordinatesType( 1, 1, 0 ) ); @@ -324,6 +341,7 @@ 
processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 0, 1, 1 ), userData, + asynchronousMode, 2, CoordinatesType( 1, 0, 0 ), CoordinatesType( 0, 1, 1 ) ); @@ -333,6 +351,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 0, 1 ), userData, + asynchronousMode, 1, CoordinatesType( 0, 1, 0 ), CoordinatesType( 1, 0, 1 ) ); @@ -342,6 +361,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 0 ), userData, + asynchronousMode, 0, CoordinatesType( 0, 0, 1 ), CoordinatesType( 1, 1, 0 ) ); @@ -371,6 +391,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 0, 0 ), userData, + asynchronousMode, 2, CoordinatesType( 0, 1, 1 ), CoordinatesType( 1, 0, 0 ) ); @@ -380,6 +401,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 0, 1, 0 ), userData, + asynchronousMode, 1, CoordinatesType( 1, 0, 1 ), CoordinatesType( 0, 1, 0 ) ); @@ -389,6 +411,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 0, 0, 1 ), userData, + asynchronousMode, 0, CoordinatesType( 1, 1, 0 ), CoordinatesType( 0, 0, 1 ) ); @@ -415,6 +438,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 0, 1, 1 ), gridPointer->getDimensions() - CoordinatesType( 0, 1, 1 ), userData, + asynchronousMode, 2, CoordinatesType( 0, 1, 1 ), CoordinatesType( 1, 0, 0 ) ); @@ -424,6 +448,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 1, 0, 1 ), gridPointer->getDimensions() - CoordinatesType( 1, 0, 1 ), userData, + asynchronousMode, 1, CoordinatesType( 1, 0, 1 ), CoordinatesType( 0, 1, 0 
) ); @@ -433,6 +458,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 1, 1, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 0 ), userData, + asynchronousMode, 0, CoordinatesType( 1, 1, 0 ), CoordinatesType( 0, 0, 1 ) ); @@ -458,6 +484,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 0, 0 ), userData, + asynchronousMode, 2, CoordinatesType( 0, 1, 1 ), CoordinatesType( 1, 0, 0 ) ); @@ -467,6 +494,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 0, 1, 0 ), userData, + asynchronousMode, 1, CoordinatesType( 1, 0, 1 ), CoordinatesType( 0, 1, 0 ) ); @@ -476,6 +504,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 0, 0, 1 ), userData, + asynchronousMode, 0, CoordinatesType( 1, 1, 0 ), CoordinatesType( 0, 0, 1 ) ); @@ -505,6 +534,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions(), userData, + asynchronousMode, 0 ); } @@ -529,6 +559,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 1, 1, 1 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ), userData, + asynchronousMode, 0 ); } @@ -553,6 +584,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions(), userData, + asynchronousMode, 0 ); } -- GitLab From 7349216c682ffd6d8bf14bbcf6f168b35fdcd2d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 5 Jan 2019 08:33:50 +0100 Subject: [PATCH 113/130] Added GridTraverserBenchmarkHelper. 
--- .../Traversers/GridTraversersBenchmark_1D.h | 122 +++++++++++++++--- 1 file changed, 104 insertions(+), 18 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h index e626b17e3..22f1d6899 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -28,13 +28,110 @@ namespace TNL { namespace Benchmarks { namespace Traversers { +template< typename Grid, + typename Device = typename Grid::DeviceType > +class GridTraverserBenchmarkHelper{}; + +template< typename Grid > +class GridTraverserBenchmarkHelper< Grid, Devices::Host > +{ + public: + + using GridType = Grid; + using GridPointer = Pointers::SharedPointer< Grid >; + using RealType = typename GridType::RealType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename Grid::CoordinatesType; + using MeshFunction = Functions::MeshFunction< Grid >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; + using Traverser = Meshes::Traverser< Grid, Cell >; + using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; + using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; + + + static void noBCTraverserTest( const GridPointer& grid, + WriteOneTraverserUserDataType& userData, + std::size_t size ) + { + /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >( + grid, + CoordinatesType( 0 ), + grid->getDimensions() - CoordinatesType( 1 ), + userData );*/ + + const CoordinatesType begin( 0 ); + const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); + //MeshFunction* _u = &u.template modifyData< Device >(); + Cell entity( *grid ); + for( IndexType x = begin.x(); x <= 
end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.refresh(); + WriteOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity ); + } + + } +}; + +template< typename Grid > +class GridTraverserBenchmarkHelper< Grid, Devices::Cuda > +{ + public: + + using GridType = Grid; + using GridPointer = Pointers::SharedPointer< Grid >; + using RealType = typename GridType::RealType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename Grid::CoordinatesType; + using MeshFunction = Functions::MeshFunction< Grid >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; + using Traverser = Meshes::Traverser< Grid, Cell >; + using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; + using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; + + + static void noBCTraverserTest( const GridPointer& grid, + WriteOneTraverserUserDataType& userData, + std::size_t size ) + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size ); + dim3 gridIdx; + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + Meshes::GridTraverser1D< RealType, IndexType, Cell, WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + <<< blocksCount, blockSize >>> + ( &grid.template getData< Devices::Cuda >(), + userData, + CoordinatesType( 0 ), + CoordinatesType( size ) - CoordinatesType( 1 ), + gridIdx.x ); + + } +#endif + } +}; + template< typename Device, typename Real, typename Index > class GridTraversersBenchmark< 1, Device, Real, Index > { public: - + using Vector = Containers::Vector< Real, Device, Index >; using Grid = Meshes::Grid< 1, Real, Device, Index >; using 
GridPointer = Pointers::SharedPointer< Grid >; @@ -130,24 +227,13 @@ class GridTraversersBenchmark< 1, Device, Real, Index > void writeOneUsingTraverser() { using CoordinatesType = typename Grid::CoordinatesType; - traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > - ( grid, userData ); + //traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + // ( grid, userData ); - /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >( - grid, - CoordinatesType( 0 ), - grid->getDimensions() - CoordinatesType( 1 ), - userData );*/ - /*const CoordinatesType begin( 0 ); - const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); - MeshFunction* _u = &u.template modifyData< Device >(); - Cell entity( *grid ); - for( Index x = begin.x(); x <= end.x(); x ++ ) - { - entity.getCoordinates().x() = x; - entity.refresh(); - WriteOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity ); - }*/ + GridTraverserBenchmarkHelper< Grid >::noBCTraverserTest( + grid, + userData, + size ); } void traverseUsingPureC() -- GitLab From 0348875812f14bd4f7bb90ac576932e2a4074bd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 5 Jan 2019 08:34:20 +0100 Subject: [PATCH 114/130] Refactoring of Grid 1D traverser. 
--- .../Meshes/GridDetails/GridTraverser_1D.hpp | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp index 505f9c3d7..5b35d5be9 100644 --- a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp +++ b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp @@ -199,7 +199,31 @@ processEntities( } else { - dim3 cudaBlockSize( 256 ); + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + end.x() - begin.x() + 1 ); + dim3 gridIdx; + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > + <<< blocksCount, blockSize, 0, s >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin, + end, + gridIdx.x ); + } + + /*dim3 cudaBlockSize( 256 ); dim3 cudaBlocks; cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); -- GitLab From 1a78c9e5997855b884edc102d52b327b9ad0f9e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 5 Jan 2019 09:47:43 +0100 Subject: [PATCH 115/130] Traversers benchmark refactoring. --- .../Traversers/AddOneEntitiesProcessor.h | 43 +++++ .../Traversers/BenchmarkTraverserUserData.h | 32 ++++ .../Traversers/GridTraverserBenchmarkHelper.h | 152 ++++++++++++++++++ .../Traversers/GridTraversersBenchmark.h | 30 +--- .../Traversers/GridTraversersBenchmark_1D.h | 116 ++----------- .../Traversers/GridTraversersBenchmark_2D.h | 23 ++- .../Traversers/GridTraversersBenchmark_3D.h | 28 ++-- .../Traversers/tnl-benchmark-traversers.h | 28 ++-- 8 files changed, 280 insertions(+), 172 deletions(-) create mode 100644 
src/Benchmarks/Traversers/AddOneEntitiesProcessor.h create mode 100644 src/Benchmarks/Traversers/BenchmarkTraverserUserData.h create mode 100644 src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h diff --git a/src/Benchmarks/Traversers/AddOneEntitiesProcessor.h b/src/Benchmarks/Traversers/AddOneEntitiesProcessor.h new file mode 100644 index 000000000..6b136d074 --- /dev/null +++ b/src/Benchmarks/Traversers/AddOneEntitiesProcessor.h @@ -0,0 +1,43 @@ +/*************************************************************************** + AddOneEntitiesProcessor.h - description + ------------------- + begin : Jan 5, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +template< typename TraverserUserData > +class AddOneEntitiesProcessor +{ + public: + + using MeshType = typename TraverserUserData::MeshType; + using DeviceType = typename MeshType::DeviceType; + using RealType = typename MeshType::RealType; + + template< typename GridEntity > + __cuda_callable__ + static inline void processEntity( const MeshType& mesh, + TraverserUserData& userData, + const GridEntity& entity ) + { + auto& u = *userData.u; + u( entity ) += ( RealType ) 1.0; + } +}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h b/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h new file mode 100644 index 000000000..5a2f179fa --- /dev/null +++ b/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h @@ -0,0 +1,32 @@ +/*************************************************************************** + BenchmarkTraverserUserData.h - description + ------------------- + begin : Jan 5, 2019 + copyright : (C) 2019 by oberhuber + 
email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +template< typename MeshFunction > +class BenchmarkTraverserUserData +{ + public: + + using MeshType = typename MeshFunction::MeshType; + + MeshFunction* u; +}; + + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h new file mode 100644 index 000000000..df43f93cd --- /dev/null +++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h @@ -0,0 +1,152 @@ +/*************************************************************************** + GridTraverserBenchmarkHelper.h - description + ------------------- + begin : Jan 5, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include "AddOneEntitiesProcessor.h" +#include "BenchmarkTraverserUserData.h" + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +#ifdef HAVE_CUDA +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor > +__global__ void +_GridTraverser1D( + const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid, + UserData userData, + const typename GridEntity::CoordinatesType begin, + const typename GridEntity::CoordinatesType end, + const Index gridIdx ) +{ + typedef Real RealType; + typedef Index IndexType; + typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = begin.x() + ( gridIdx * 
Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( coordinates <= end ) + { + GridEntity entity( *grid, coordinates ); + entity.refresh(); + ( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0; + //( *userData.u )( entity) += 1.0; + //EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +} +#endif + +template< typename Grid, + typename Device = typename Grid::DeviceType > +class GridTraverserBenchmarkHelper{}; + +template< typename Grid > +class GridTraverserBenchmarkHelper< Grid, Devices::Host > +{ + public: + + using GridType = Grid; + using GridPointer = Pointers::SharedPointer< Grid >; + using RealType = typename GridType::RealType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename Grid::CoordinatesType; + using MeshFunction = Functions::MeshFunction< Grid >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; + using Traverser = Meshes::Traverser< Grid, Cell >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + + static void noBCTraverserTest( const GridPointer& grid, + UserDataType& userData, + std::size_t size ) + { + /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >( + grid, + CoordinatesType( 0 ), + grid->getDimensions() - CoordinatesType( 1 ), + userData );*/ + + const CoordinatesType begin( 0 ); + const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); + //MeshFunction* _u = &u.template modifyData< Device >(); + Cell entity( *grid ); + for( IndexType x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.refresh(); + AddOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity ); + } + + } +}; + 
+template< typename Grid > +class GridTraverserBenchmarkHelper< Grid, Devices::Cuda > +{ + public: + + using GridType = Grid; + using GridPointer = Pointers::SharedPointer< Grid >; + using RealType = typename GridType::RealType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename Grid::CoordinatesType; + using MeshFunction = Functions::MeshFunction< Grid >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; + using Traverser = Meshes::Traverser< Grid, Cell >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + + static void noBCTraverserTest( const GridPointer& grid, + UserDataType& userData, + std::size_t size ) + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size ); + dim3 gridIdx; + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + _GridTraverser1D< RealType, IndexType, Cell, UserDataType, AddOneEntitiesProcessorType > + <<< blocksCount, blockSize >>> + ( &grid.template getData< Devices::Cuda >(), + userData, + CoordinatesType( 0 ), + CoordinatesType( size ) - CoordinatesType( 1 ), + gridIdx.x ); + + } +#endif + } +}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL + + diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index bd748ed09..be4f41d31 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -21,40 +21,16 @@ #include #include #include + +#include "GridTraverserBenchmarkHelper.h" +#include "BenchmarkTraverserUserData.h" #include 
"cuda-kernels.h" namespace TNL { namespace Benchmarks { namespace Traversers { -template< typename TraverserUserData > -class WriteOneEntitiesProcessor -{ - public: - - using MeshType = typename TraverserUserData::MeshType; - using DeviceType = typename MeshType::DeviceType; - - template< typename GridEntity > - __cuda_callable__ - static inline void processEntity( const MeshType& mesh, - TraverserUserData& userData, - const GridEntity& entity ) - { - auto& u = userData.u.template modifyData< DeviceType >(); - u( entity ) += (typename MeshType::RealType) 1.0; - } -}; -template< typename MeshFunctionPointer > -class WriteOneUserData -{ - public: - - using MeshType = typename MeshFunctionPointer::ObjectType::MeshType; - - MeshFunctionPointer u; -}; template< int Dimension, typename Device, diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h index 22f1d6899..bdce2d746 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -28,102 +28,6 @@ namespace TNL { namespace Benchmarks { namespace Traversers { -template< typename Grid, - typename Device = typename Grid::DeviceType > -class GridTraverserBenchmarkHelper{}; - -template< typename Grid > -class GridTraverserBenchmarkHelper< Grid, Devices::Host > -{ - public: - - using GridType = Grid; - using GridPointer = Pointers::SharedPointer< Grid >; - using RealType = typename GridType::RealType; - using IndexType = typename GridType::IndexType; - using CoordinatesType = typename Grid::CoordinatesType; - using MeshFunction = Functions::MeshFunction< Grid >; - using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; - using Traverser = Meshes::Traverser< Grid, Cell >; - using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; - using 
WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; - - - static void noBCTraverserTest( const GridPointer& grid, - WriteOneTraverserUserDataType& userData, - std::size_t size ) - { - /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >( - grid, - CoordinatesType( 0 ), - grid->getDimensions() - CoordinatesType( 1 ), - userData );*/ - - const CoordinatesType begin( 0 ); - const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); - //MeshFunction* _u = &u.template modifyData< Device >(); - Cell entity( *grid ); - for( IndexType x = begin.x(); x <= end.x(); x ++ ) - { - entity.getCoordinates().x() = x; - entity.refresh(); - WriteOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity ); - } - - } -}; - -template< typename Grid > -class GridTraverserBenchmarkHelper< Grid, Devices::Cuda > -{ - public: - - using GridType = Grid; - using GridPointer = Pointers::SharedPointer< Grid >; - using RealType = typename GridType::RealType; - using IndexType = typename GridType::IndexType; - using CoordinatesType = typename Grid::CoordinatesType; - using MeshFunction = Functions::MeshFunction< Grid >; - using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; - using Traverser = Meshes::Traverser< Grid, Cell >; - using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; - using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; - - - static void noBCTraverserTest( const GridPointer& grid, - WriteOneTraverserUserDataType& userData, - std::size_t size ) - { -#ifdef HAVE_CUDA - dim3 blockSize( 256 ), blocksCount, gridsCount; - Devices::Cuda::setupThreads( - blockSize, - blocksCount, - gridsCount, - size ); - dim3 gridIdx; - for( gridIdx.x = 0; gridIdx.x < 
gridsCount.x; gridIdx.x++ ) - { - dim3 gridSize; - Devices::Cuda::setupGrid( - blocksCount, - gridsCount, - gridIdx, - gridSize ); - Meshes::GridTraverser1D< RealType, IndexType, Cell, WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > - <<< blocksCount, blockSize >>> - ( &grid.template getData< Devices::Cuda >(), - userData, - CoordinatesType( 0 ), - CoordinatesType( size ) - CoordinatesType( 1 ), - gridIdx.x ); - - } -#endif - } -}; template< typename Device, typename Real, @@ -140,13 +44,13 @@ class GridTraversersBenchmark< 1, Device, Real, Index > using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; using Traverser = Meshes::Traverser< Grid, Cell >; - using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; - using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; GridTraversersBenchmark( Index size ) :size( size ), v( size ), grid( size ), u( grid ) { - userData.u = this->u; + userData.u = &this->u.template modifyData< Device >(); v_data = v.getData(); } @@ -156,7 +60,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > u->getData().setValue( 0.0 ); }; - void writeOneUsingPureC() + void addOneUsingPureC() { if( std::is_same< Device, Devices::Host >::value ) { @@ -187,7 +91,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > } } - void writeOneUsingParallelFor() + void addOneUsingParallelFor() { auto f = [] __cuda_callable__ ( Index i, Real* data ) { @@ -196,7 +100,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } - void writeOneUsingParallelForAndGridEntity() + void addOneUsingParallelForAndGridEntity() { const Grid* currentGrid = 
&grid.template getData< Device >(); auto f = [=] __cuda_callable__ ( Index i, Real* data ) @@ -209,7 +113,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); } - void writeOneUsingParallelForAndMeshFunction() + void addOneUsingParallelForAndMeshFunction() { const Grid* currentGrid = &grid.template getData< Device >(); MeshFunction* _u = &u.template modifyData< Device >(); @@ -224,7 +128,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > ParallelFor< Device >::exec( ( Index ) 0, size, f ); } - void writeOneUsingTraverser() + void addOneUsingTraverser() { using CoordinatesType = typename Grid::CoordinatesType; //traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > @@ -282,7 +186,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > void traverseUsingTraverser() { // TODO !!!!!!!!!!!!!!!!!!!!!! - traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > ( grid, userData ); } @@ -294,7 +198,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > GridPointer grid; MeshFunctionPointer u; Traverser traverser; - WriteOneTraverserUserDataType userData; + UserDataType userData; }; } // namespace Traversers diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h index 1296a9a46..6fb0e52d4 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h @@ -42,14 +42,13 @@ class GridTraversersBenchmark< 2, Device, Real, Index > using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; using Cell = typename Grid::template EntityType< 2, Meshes::GridEntityNoStencilStorage >; using Traverser = Meshes::Traverser< Grid, Cell >; - using TraverserUserData = 
WriteOneUserData< MeshFunctionPointer >; - using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; - using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; GridTraversersBenchmark( Index size ) :size( size ), v( size * size ), grid( size, size ), u( grid ) { - userData.u = this->u; + userData.u = &this->u.template modifyData< Device >(); v_data = v.getData(); } @@ -59,7 +58,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > u->getData().setValue( 0.0 ); }; - void writeOneUsingPureC() + void addOneUsingPureC() { if( std::is_same< Device, Devices::Host >::value ) { @@ -93,7 +92,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > } } - void writeOneUsingParallelFor() + void addOneUsingParallelFor() { Index _size = this->size; auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) @@ -108,7 +107,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > f, v.getData() ); } - void writeOneUsingParallelForAndGridEntity() + void addOneUsingParallelForAndGridEntity() { const Grid* currentGrid = &grid.template getData< Device >(); auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) @@ -127,7 +126,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > f, v.getData() ); } - void writeOneUsingParallelForAndMeshFunction() + void addOneUsingParallelForAndMeshFunction() { const Grid* currentGrid = &grid.template getData< Device >(); MeshFunction* _u = &u.template modifyData< Device >(); @@ -148,10 +147,10 @@ class GridTraversersBenchmark< 2, Device, Real, Index > } - void writeOneUsingTraverser() + void addOneUsingTraverser() { using CoordinatesType = typename Grid::CoordinatesType; - traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + traverser.template processAllEntities< 
UserDataType, AddOneEntitiesProcessorType > ( grid, userData ); /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >( @@ -232,7 +231,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > void traversingUsingTraverser() { // TODO !!!!!!!!!!!!!!!!!!!!!! - traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > ( grid, userData ); } @@ -244,7 +243,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > GridPointer grid; MeshFunctionPointer u; Traverser traverser; - WriteOneTraverserUserDataType userData; + UserDataType userData; }; } // namespace Traversers diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h index 35863a3c9..977809563 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h @@ -21,7 +21,10 @@ #include #include #include + #include "cuda-kernels.h" +#include "AddOneEntitiesProcessor.h" +#include "BenchmarkTraverserUserData.h" namespace TNL { namespace Benchmarks { @@ -42,17 +45,16 @@ class GridTraversersBenchmark< 3, Device, Real, Index > using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; using Cell = typename Grid::template EntityType< 3, Meshes::GridEntityNoStencilStorage >; using Traverser = Meshes::Traverser< Grid, Cell >; - using TraverserUserData = WriteOneUserData< MeshFunctionPointer >; - using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >; - using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >; - + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + GridTraversersBenchmark( Index size ) : size( size ), 
v( size * size * size ), grid( size, size, size ), u( grid ) { - userData.u = this->u; + userData.u = &this->u.template modifyData< Device >(); v_data = v.getData(); } @@ -62,7 +64,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > u->getData().setValue( 0.0 ); }; - void writeOneUsingPureC() + void addOneUsingPureC() { if( std::is_same< Device, Devices::Host >::value ) { @@ -99,7 +101,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > } } - void writeOneUsingParallelFor() + void addOneUsingParallelFor() { Index _size = this->size; auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) @@ -116,7 +118,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > f, v.getData() ); } - void writeOneUsingParallelForAndGridEntity() + void addOneUsingParallelForAndGridEntity() { const Grid* currentGrid = &grid.template getData< Device >(); auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) @@ -138,7 +140,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > f, v.getData() ); } - void writeOneUsingParallelForAndMeshFunction() + void addOneUsingParallelForAndMeshFunction() { const Grid* currentGrid = &grid.template getData< Device >(); MeshFunction* _u = &u.template modifyData< Device >(); @@ -162,9 +164,9 @@ class GridTraversersBenchmark< 3, Device, Real, Index > } - void writeOneUsingTraverser() + void addOneUsingTraverser() { - traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > ( grid, userData ); } @@ -240,7 +242,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > void traverseUsingTraverser() { // TODO !!!!!!!!!!!!!!!!!!!!!! 
- traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > + traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > ( grid, userData ); } @@ -252,7 +254,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > GridPointer grid; MeshFunctionPointer u; Traverser traverser; - WriteOneTraverserUserDataType userData; + UserDataType userData; }; } // namespace Traversers diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index ff6d25624..c6423e452 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -84,14 +84,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters, auto hostWriteOneUsingPureC = [&] () { - hostTraverserBenchmark.writeOneUsingPureC(); + hostTraverserBenchmark.addOneUsingPureC(); }; benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); #ifdef HAVE_CUDA auto cudaWriteOneUsingPureC = [&] () { - cudaTraverserBenchmark.writeOneUsingPureC(); + cudaTraverserBenchmark.addOneUsingPureC(); }; if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); @@ -107,14 +107,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters, auto hostWriteOneUsingParallelFor = [&] () { - hostTraverserBenchmark.writeOneUsingParallelFor(); + hostTraverserBenchmark.addOneUsingParallelFor(); }; benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA auto cudaWriteOneUsingParallelFor = [&] () { - cudaTraverserBenchmark.writeOneUsingParallelFor(); + cudaTraverserBenchmark.addOneUsingParallelFor(); }; if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); @@ -128,7 +128,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, { auto hostWriteOneUsingParallelForAndGridEntity = [&] () { - 
hostTraverserBenchmark.writeOneUsingParallelForAndGridEntity(); + hostTraverserBenchmark.addOneUsingParallelForAndGridEntity(); }; benchmark.setOperation( "par.for+grid ent.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity ); @@ -136,7 +136,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, #ifdef HAVE_CUDA auto cudaWriteOneUsingParallelForAndGridEntity = [&] () { - cudaTraverserBenchmark.writeOneUsingParallelForAndGridEntity(); + cudaTraverserBenchmark.addOneUsingParallelForAndGridEntity(); }; if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity ); @@ -150,7 +150,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, { auto hostWriteOneUsingParallelForAndMeshFunction = [&] () { - hostTraverserBenchmark.writeOneUsingParallelForAndMeshFunction(); + hostTraverserBenchmark.addOneUsingParallelForAndMeshFunction(); }; benchmark.setOperation( "par.for+mesh fc.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction ); @@ -158,7 +158,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, #ifdef HAVE_CUDA auto cudaWriteOneUsingParallelForAndMeshFunction = [&] () { - cudaTraverserBenchmark.writeOneUsingParallelForAndMeshFunction(); + cudaTraverserBenchmark.addOneUsingParallelForAndMeshFunction(); }; if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndMeshFunction ); @@ -174,14 +174,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters, benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); auto hostWriteOneUsingTraverser = [&] () { - hostTraverserBenchmark.writeOneUsingTraverser(); + 
hostTraverserBenchmark.addOneUsingTraverser(); }; benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser ); #ifdef HAVE_CUDA auto cudaWriteOneUsingTraverser = [&] () { - cudaTraverserBenchmark.writeOneUsingTraverser(); + cudaTraverserBenchmark.addOneUsingTraverser(); }; if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser ); @@ -254,13 +254,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters, */ auto hostTraverseUsingParallelFor = [&] () { - hostTraverserBenchmark.writeOneUsingParallelFor(); + hostTraverserBenchmark.addOneUsingParallelFor(); }; #ifdef HAVE_CUDA auto cudaTraverseUsingParallelFor = [&] () { - cudaTraverserBenchmark.writeOneUsingParallelFor(); + cudaTraverserBenchmark.addOneUsingParallelFor(); }; #endif @@ -286,13 +286,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters, */ auto hostTraverseUsingTraverser = [&] () { - hostTraverserBenchmark.writeOneUsingTraverser(); + hostTraverserBenchmark.addOneUsingTraverser(); }; #ifdef HAVE_CUDA auto cudaTraverseUsingTraverser = [&] () { - cudaTraverserBenchmark.writeOneUsingTraverser(); + cudaTraverserBenchmark.addOneUsingTraverser(); }; #endif -- GitLab From 31303f1a37c797b65f16faae428990d321f55cf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 5 Jan 2019 13:06:30 +0100 Subject: [PATCH 116/130] Analyzing grid entity efficiency. 
--- src/Benchmarks/Benchmarks.h | 2 +- .../Traversers/BenchmarkTraverserUserData.h | 9 ++- .../Traversers/GridTraverserBenchmarkHelper.h | 30 ++++++---- .../Traversers/GridTraversersBenchmark_1D.h | 4 +- .../Traversers/GridTraversersBenchmark_2D.h | 4 +- .../Traversers/GridTraversersBenchmark_3D.h | 4 +- .../Traversers/tnl-benchmark-traversers.h | 57 ++++++++++++------- 7 files changed, 68 insertions(+), 42 deletions(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index f31e21f6c..355fb4671 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -74,7 +74,7 @@ public: { config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); config.addEntry< bool >( "reset", "Call reset function between loops.", true ); - config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 1 ); + config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 0.0 ); config.addEntry< bool >( "timing", "Turns off (or on) the timing (for the purpose of profiling).", true ); config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 ); } diff --git a/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h b/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h index 5a2f179fa..2ae00ec69 100644 --- a/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h +++ b/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h @@ -20,10 +20,17 @@ template< typename MeshFunction > class BenchmarkTraverserUserData { public: - + using MeshType = typename MeshFunction::MeshType; + using RealType = typename MeshType::RealType; + using DeviceType = typename MeshType::DeviceType; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + BenchmarkTraverserUserData( MeshFunctionPointer& f ) + : u( &f.template modifyData< DeviceType >() ), data( f->getData().getData() ){} + MeshFunction* u; + RealType* data; }; diff --git 
a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h index df43f93cd..8b00e060a 100644 --- a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h +++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h @@ -14,6 +14,7 @@ #include "AddOneEntitiesProcessor.h" #include "BenchmarkTraverserUserData.h" +#include "SimpleCell.h" namespace TNL { namespace Benchmarks { @@ -38,13 +39,16 @@ _GridTraverser1D( typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; typename GridType::CoordinatesType coordinates; + GridEntity entity;//( *grid, ); + //entity.getCoordinates().x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( coordinates <= end ) - { - GridEntity entity( *grid, coordinates ); - entity.refresh(); - ( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0; - //( *userData.u )( entity) += 1.0; + { + //entity.refresh(); + //( userData.u->getData() )[ entity.getIndex( coordinates ) ] += ( RealType ) 1.0; + //( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0; + userData.data[ coordinates.x() ] += ( RealType ) 1.0; + //( *userData.u )( entity ) += ( RealType ) 1.0; //EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); } } @@ -66,8 +70,9 @@ class GridTraverserBenchmarkHelper< Grid, Devices::Host > using CoordinatesType = typename Grid::CoordinatesType; using MeshFunction = Functions::MeshFunction< Grid >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; - using Traverser = Meshes::Traverser< Grid, Cell >; + using CellType = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = 
Meshes::Traverser< Grid, CellType >; using UserDataType = BenchmarkTraverserUserData< MeshFunction >; using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; @@ -84,13 +89,13 @@ class GridTraverserBenchmarkHelper< Grid, Devices::Host > const CoordinatesType begin( 0 ); const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); //MeshFunction* _u = &u.template modifyData< Device >(); - Cell entity( *grid ); + /*SimpleCellType entity( *grid ); for( IndexType x = begin.x(); x <= end.x(); x ++ ) { entity.getCoordinates().x() = x; entity.refresh(); AddOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity ); - } + }*/ } }; @@ -107,8 +112,9 @@ class GridTraverserBenchmarkHelper< Grid, Devices::Cuda > using CoordinatesType = typename Grid::CoordinatesType; using MeshFunction = Functions::MeshFunction< Grid >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; - using Traverser = Meshes::Traverser< Grid, Cell >; + using CellType = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< Grid, CellType >; using UserDataType = BenchmarkTraverserUserData< MeshFunction >; using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; @@ -132,7 +138,7 @@ class GridTraverserBenchmarkHelper< Grid, Devices::Cuda > gridsCount, gridIdx, gridSize ); - _GridTraverser1D< RealType, IndexType, Cell, UserDataType, AddOneEntitiesProcessorType > + _GridTraverser1D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType > <<< blocksCount, blockSize >>> ( &grid.template getData< Devices::Cuda >(), userData, diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h index bdce2d746..006b0316f 100644 --- 
a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -48,9 +48,9 @@ class GridTraversersBenchmark< 1, Device, Real, Index > using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; GridTraversersBenchmark( Index size ) - :size( size ), v( size ), grid( size ), u( grid ) + :size( size ), v( size ), grid( size ), u( grid ), + userData( this->u ) { - userData.u = &this->u.template modifyData< Device >(); v_data = v.getData(); } diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h index 6fb0e52d4..7c90a5064 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h @@ -46,9 +46,9 @@ class GridTraversersBenchmark< 2, Device, Real, Index > using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; GridTraversersBenchmark( Index size ) - :size( size ), v( size * size ), grid( size, size ), u( grid ) + :size( size ), v( size * size ), grid( size, size ), u( grid ), + userData( u ) { - userData.u = &this->u.template modifyData< Device >(); v_data = v.getData(); } diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h index 977809563..2a32184ea 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h @@ -52,9 +52,9 @@ class GridTraversersBenchmark< 3, Device, Real, Index > : size( size ), v( size * size * size ), grid( size, size, size ), - u( grid ) + u( grid ), + userData( u ) { - userData.u = &this->u.template modifyData< Device >(); v_data = v.getData(); } diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index c6423e452..2963bb792 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ 
b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -42,6 +42,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); const std::size_t minSize = parameters.getParameter< int >( "min-size" ); const std::size_t maxSize = parameters.getParameter< int >( "max-size" ); + const bool withHost = parameters.getParameter< bool >( "with-host" ); #ifdef HAVE_CUDA const bool withCuda = parameters.getParameter< bool >( "with-cuda" ); #else @@ -78,7 +79,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, /**** * Write one using C for */ - if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-pure-c" ) ) + if( tests.containsValue( "all" ) || tests.containsValue( "add-one-pure-c" ) ) { benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); @@ -86,7 +87,8 @@ bool runBenchmark( const Config::ParameterContainer& parameters, { hostTraverserBenchmark.addOneUsingPureC(); }; - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); + if( withHost ) + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); #ifdef HAVE_CUDA auto cudaWriteOneUsingPureC = [&] () @@ -101,7 +103,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, /**** * Write one using parallel for */ - if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for" ) ) + if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for" ) ) { benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); @@ -109,7 +111,8 @@ bool runBenchmark( const Config::ParameterContainer& parameters, { hostTraverserBenchmark.addOneUsingParallelFor(); }; - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); + if( withHost ) + benchmark.time< Devices::Host >( hostReset, "CPU", 
hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA auto cudaWriteOneUsingParallelFor = [&] () @@ -124,14 +127,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters, /**** * Write one using parallel for with grid entity */ - if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for-and-grid-entity" ) ) + if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for-and-grid-entity" ) ) { auto hostWriteOneUsingParallelForAndGridEntity = [&] () { hostTraverserBenchmark.addOneUsingParallelForAndGridEntity(); }; benchmark.setOperation( "par.for+grid ent.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity ); + if( withHost ) + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity ); #ifdef HAVE_CUDA auto cudaWriteOneUsingParallelForAndGridEntity = [&] () @@ -146,14 +150,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters, /**** * Write one using parallel for with mesh function */ - if( tests.containsValue( "all" ) || tests.containsValue( "no-bc-parallel-for-and-mesh-function" ) ) + if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for-and-mesh-function" ) ) { auto hostWriteOneUsingParallelForAndMeshFunction = [&] () { hostTraverserBenchmark.addOneUsingParallelForAndMeshFunction(); }; benchmark.setOperation( "par.for+mesh fc.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction ); + if( withHost ) + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction ); #ifdef HAVE_CUDA auto cudaWriteOneUsingParallelForAndMeshFunction = [&] () @@ -169,14 +174,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters, /**** * Write one using traverser */ - if( 
tests.containsValue( "all" ) || tests.containsValue( "no-bc-traverser" ) ) + if( tests.containsValue( "all" ) || tests.containsValue( "add-one-traverser" ) ) { benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); auto hostWriteOneUsingTraverser = [&] () { hostTraverserBenchmark.addOneUsingTraverser(); }; - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser ); + if( withHost ) + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser ); #ifdef HAVE_CUDA auto cudaWriteOneUsingTraverser = [&] () @@ -235,14 +241,16 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( tests.containsValue( "all" ) || tests.containsValue( "bc-pure-c" ) ) { benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC ); + if( withHost ) + benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC ); #ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingPureC ); #endif benchmark.setOperation( "Pure C RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC ); + if( withHost ) + benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC ); #ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingPureC ); @@ -267,14 +275,16 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) ) { benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor ); + if( withHost ) + benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor ); #ifdef 
HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor ); #endif benchmark.setOperation( "parallel for RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor ); + if( withHost ) + benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor ); #ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor ); @@ -299,13 +309,15 @@ bool runBenchmark( const Config::ParameterContainer& parameters, if( tests.containsValue( "all" ) || tests.containsValue( "bc-traverser" ) ) { benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser ); + if( withHost ) + benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser ); #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser ); #endif benchmark.setOperation( "traverser RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser ); + if( withHost ) + benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser ); #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser ); #endif @@ -318,17 +330,18 @@ void setupConfig( Config::ConfigDescription& config ) { config.addList< String >( "tests", "Tests to be performed.", "all" ); config.addEntryEnum( "all" ); - config.addEntryEnum( "no-bc-pure-c" ); - config.addEntryEnum( "no-bc-parallel-for" ); - config.addEntryEnum( "no-bc-parallel-for-and-grid-entity" ); - config.addEntryEnum( "no-bc-traverser" ); + config.addEntryEnum( "add-one-pure-c" ); + config.addEntryEnum( "add-one-parallel-for" ); + config.addEntryEnum( "add-one-parallel-for-and-grid-entity" ); + 
config.addEntryEnum( "add-one-traverser" ); config.addEntryEnum( "bc-pure-c" ); config.addEntryEnum( "bc-parallel-for" ); config.addEntryEnum( "bc-traverser" ); + config.addEntry< bool >( "with-host", "Perform CPU benchmarks.", true ); #ifdef HAVE_CUDA - config.addEntry< bool >( "with-cuda", "Perform even the CUDA benchmarks.", true ); + config.addEntry< bool >( "with-cuda", "Perform CUDA benchmarks.", true ); #else - config.addEntry< bool >( "with-cuda", "Perform even the CUDA benchmarks.", false ); + config.addEntry< bool >( "with-cuda", "Perform CUDA benchmarks.", false ); #endif config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-traversers.log"); config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); -- GitLab From 60f9f4b1b8b67872c7dcc20b0d52e9600c38ef4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 5 Jan 2019 20:17:09 +0100 Subject: [PATCH 117/130] Implemented SimpleCell traverser benchmark test. 
--- .../Traversers/GridTraverserBenchmarkHelper.h | 32 ++++--- src/Benchmarks/Traversers/SimpleCell.h | 95 +++++++++++++++++++ 2 files changed, 113 insertions(+), 14 deletions(-) create mode 100644 src/Benchmarks/Traversers/SimpleCell.h diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h index 8b00e060a..c13ec3ab7 100644 --- a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h +++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h @@ -37,18 +37,19 @@ _GridTraverser1D( typedef Real RealType; typedef Index IndexType; typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; + //typename GridType::CoordinatesType coordinates; - GridEntity entity;//( *grid, ); - //entity.getCoordinates().x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - if( coordinates <= end ) + GridEntity entity( *grid ); + entity.getCoordinates().x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + //coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( entity.getCoordinates() <= end ) { - //entity.refresh(); + entity.refresh(); //( userData.u->getData() )[ entity.getIndex( coordinates ) ] += ( RealType ) 1.0; //( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0; - userData.data[ coordinates.x() ] += ( RealType ) 1.0; - //( *userData.u )( entity ) += ( RealType ) 1.0; + //userData.data[ entity.getIndex() ] += ( RealType ) 1.0; + //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0; + ( *userData.u )( entity ) += ( RealType ) 1.0; //EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); } } @@ -80,22 +81,25 @@ class 
GridTraverserBenchmarkHelper< Grid, Devices::Host > UserDataType& userData, std::size_t size ) { - /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >( + /*Meshes::GridTraverser< Grid >::template processEntities< CellType, AddOneEntitiesProcessorType, UserDataType, false >( grid, CoordinatesType( 0 ), grid->getDimensions() - CoordinatesType( 1 ), - userData );*/ - + userData ); + */ + const CoordinatesType begin( 0 ); const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); //MeshFunction* _u = &u.template modifyData< Device >(); - /*SimpleCellType entity( *grid ); + SimpleCellType entity( *grid ); for( IndexType x = begin.x(); x <= end.x(); x ++ ) { entity.getCoordinates().x() = x; entity.refresh(); - AddOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity ); - }*/ + //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0; + ( *userData.u )( entity ) += ( RealType ) 1.0; + //AddOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity ); + } } }; diff --git a/src/Benchmarks/Traversers/SimpleCell.h b/src/Benchmarks/Traversers/SimpleCell.h new file mode 100644 index 000000000..c70f64fda --- /dev/null +++ b/src/Benchmarks/Traversers/SimpleCell.h @@ -0,0 +1,95 @@ +/*************************************************************************** + SimpleCell.h - description + ------------------- + begin : Jan 5, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include +#include + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +template< typename Grid > +class SimpleCell{}; + +template< typename Real, + typename Device, + typename Index > +class SimpleCell< Meshes::Grid< 1, Real, Device, Index 
> > +{ + public: + using GridType = Meshes::Grid< 1, Real, Device, Index >; + using RealType = typename GridType::RealType; + using DeviceType = typename GridType::DeviceType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + + constexpr static int getEntityDimension() { return 1; }; + + __cuda_callable__ + SimpleCell( const GridType& grid ) : + grid( grid ){}; + + __cuda_callable__ + const GridType& getMesh() const { return this->grid;}; + + __cuda_callable__ + CoordinatesType& getCoordinates() { return this->coordinates; }; + + __cuda_callable__ + void refresh() {index = coordinates.x();}; + + __cuda_callable__ + const IndexType& getIndex() const { return this->index; }; + + protected: + const GridType& grid; + CoordinatesType coordinates; + IndexType index; +}; + +template< typename Real, + typename Device, + typename Index > +class SimpleCell< Meshes::Grid< 2, Real, Device, Index > > +{ + public: + using GridType = Meshes::Grid< 1, Real, Device, Index >; + using RealType = typename GridType::RealType; + using DeviceType = typename GridType::DeviceType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + + constexpr static int getEntityDimension() { return 2; }; + +}; + +template< typename Real, + typename Device, + typename Index > +class SimpleCell< Meshes::Grid< 3, Real, Device, Index > > +{ + public: + using GridType = Meshes::Grid< 1, Real, Device, Index >; + using RealType = typename GridType::RealType; + using DeviceType = typename GridType::DeviceType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + + constexpr static int getEntityDimension() { return 3; }; + +}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL -- GitLab From 579d847032f2d93d51970928b8431dc0d37df172 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= 
Date: Sat, 5 Jan 2019 20:17:33 +0100 Subject: [PATCH 118/130] MeshFunction refactoring. --- src/TNL/Functions/MeshFunction.h | 103 +++++++++++++------------- src/TNL/Functions/MeshFunction_impl.h | 7 +- 2 files changed, 52 insertions(+), 58 deletions(-) diff --git a/src/TNL/Functions/MeshFunction.h b/src/TNL/Functions/MeshFunction.h index 4ccdab9f3..32d54ec21 100644 --- a/src/TNL/Functions/MeshFunction.h +++ b/src/TNL/Functions/MeshFunction.h @@ -20,7 +20,7 @@ namespace TNL { -namespace Functions { +namespace Functions { template< typename Mesh, int MeshEntityDimension = Mesh::getMeshDimension(), @@ -32,155 +32,152 @@ class MeshFunction : //static_assert( Mesh::DeviceType::DeviceType == Vector::DeviceType::DeviceType, // "Both mesh and vector of a mesh function must reside on the same device."); public: - + using MeshType = Mesh; using DeviceType = typename MeshType::DeviceType; using IndexType = typename MeshType::GlobalIndexType; - using MeshPointer = Pointers::SharedPointer< MeshType >; + using MeshPointer = Pointers::SharedPointer< MeshType >; using RealType = Real; using VectorType = Containers::Vector< RealType, DeviceType, IndexType >; using ThisType = Functions::MeshFunction< MeshType, MeshEntityDimension, RealType >; using DistributedMeshType = Meshes::DistributedMeshes::DistributedMesh; using DistributedMeshSynchronizerType = Meshes::DistributedMeshes::DistributedMeshSynchronizer; - + static constexpr int getEntitiesDimension() { return MeshEntityDimension; } - + static constexpr int getMeshDimension() { return MeshType::getMeshDimension(); } - + MeshFunction(); - - MeshFunction( const MeshPointer& meshPointer ); - + + MeshFunction( const MeshPointer& meshPointer ); + MeshFunction( const ThisType& meshFunction ); - + template< typename Vector > MeshFunction( const MeshPointer& meshPointer, Vector& data, - const IndexType& offset = 0 ); - - + const IndexType& offset = 0 ); + template< typename Vector > MeshFunction( const MeshPointer& meshPointer, 
Pointers::SharedPointer< Vector >& data, - const IndexType& offset = 0 ); - + const IndexType& offset = 0 ); + static String getType(); - + String getTypeVirtual() const; - + static String getSerializationType(); virtual String getSerializationTypeVirtual() const; - + static void configSetup( Config::ConfigDescription& config, const String& prefix = "" ); bool setup( const MeshPointer& meshPointer, const Config::ParameterContainer& parameters, const String& prefix = "" ); - + void bind( ThisType& meshFunction ); - + template< typename Vector > void bind( const Vector& data, const IndexType& offset = 0 ); - + template< typename Vector > void bind( const MeshPointer& meshPointer, const Vector& data, const IndexType& offset = 0 ); - + template< typename Vector > void bind( const MeshPointer& meshPointer, const Pointers::SharedPointer< Vector >& dataPtr, const IndexType& offset = 0 ); - + void setMesh( const MeshPointer& meshPointer ); - + template< typename Device = Devices::Host > __cuda_callable__ const MeshType& getMesh() const; - + const MeshPointer& getMeshPointer() const; - + static IndexType getDofs( const MeshPointer& meshPointer ); - - __cuda_callable__ const VectorType& getData() const; - + + __cuda_callable__ const VectorType& getData() const; + __cuda_callable__ VectorType& getData(); - + bool refresh( const RealType& time = 0.0 ) const; - + bool deepRefresh( const RealType& time = 0.0 ) const; - + template< typename EntityType > RealType getValue( const EntityType& meshEntity ) const; - + template< typename EntityType > void setValue( const EntityType& meshEntity, const RealType& value ); - + template< typename EntityType > __cuda_callable__ RealType& operator()( const EntityType& meshEntity, - const RealType& time = 0.0 ); - + const RealType& time = 0 ); + template< typename EntityType > __cuda_callable__ const RealType& operator()( const EntityType& meshEntity, - const RealType& time = 0.0 ) const; - + const RealType& time = 0 ) const; + 
__cuda_callable__ RealType& operator[]( const IndexType& meshEntityIndex ); - __cuda_callable__ const RealType& operator[]( const IndexType& meshEntityIndex ) const; template< typename Function > ThisType& operator = ( const Function& f ); - + template< typename Function > ThisType& operator -= ( const Function& f ); template< typename Function > ThisType& operator += ( const Function& f ); - + RealType getLpNorm( const RealType& p ) const; - + RealType getMaxNorm() const; - + bool save( File& file ) const; bool load( File& file ); - + bool boundLoad( File& file ); - + bool write( const String& fileName, const String& format = "vtk", const double& scale = 1.0 ) const; - + using Object::save; - + using Object::load; - + using Object::boundLoad; template< typename CommunicatorType, typename PeriodicBoundariesMaskType = MeshFunction< Mesh, MeshEntityDimension, bool > > void synchronize( bool withPeriodicBoundaryConditions = false, const Pointers::SharedPointer< PeriodicBoundariesMaskType, DeviceType >& mask = - Pointers::SharedPointer< PeriodicBoundariesMaskType, DeviceType >( nullptr ) ); + Pointers::SharedPointer< PeriodicBoundariesMaskType, DeviceType >( nullptr ) ); - protected: //DistributedMeshSynchronizerType synchronizer; Meshes::DistributedMeshes::DistributedMeshSynchronizer< Functions::MeshFunction< MeshType, MeshEntityDimension, RealType > > synchronizer; - + MeshPointer meshPointer; - + VectorType data; - + template< typename, typename > friend class MeshFunctionEvaluator; private: diff --git a/src/TNL/Functions/MeshFunction_impl.h b/src/TNL/Functions/MeshFunction_impl.h index 49b75d52f..16d17914d 100644 --- a/src/TNL/Functions/MeshFunction_impl.h +++ b/src/TNL/Functions/MeshFunction_impl.h @@ -19,7 +19,7 @@ #pragma once namespace TNL { -namespace Functions { + namespace Functions { template< typename Mesh, int MeshEntityDimension, @@ -48,7 +48,6 @@ template< typename Mesh, MeshFunction< Mesh, MeshEntityDimension, Real >:: MeshFunction( const ThisType& 
meshFunction ) { - setupSynchronizer(meshFunction.meshPointer->getDistributedMesh()); this->meshPointer=meshFunction.meshPointer; @@ -241,7 +240,6 @@ bind( const MeshPointer& meshPointer, this->data.bind( *data, offset, getMesh().template getEntitiesCount< typename Mesh::template EntityType< MeshEntityDimension > >() ); } - template< typename Mesh, int MeshEntityDimension, typename Real > @@ -578,7 +576,6 @@ operator << ( std::ostream& str, const MeshFunction< Mesh, MeshEntityDimension, return str; } - -} // namespace Functions + } // namespace Functions } // namespace TNL -- GitLab From 14432a825a7c9c6a1eeb247e39800383bd2de826 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 5 Jan 2019 22:21:57 +0100 Subject: [PATCH 119/130] Added asynchronous mode to ParallelFor. --- src/TNL/ParallelFor.h | 46 ++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/src/TNL/ParallelFor.h b/src/TNL/ParallelFor.h index 9989954b5..7bffa7dda 100644 --- a/src/TNL/ParallelFor.h +++ b/src/TNL/ParallelFor.h @@ -15,7 +15,7 @@ #include #include -/* +/**** * The implementation of ParallelFor is not meant to provide maximum performance * at every cost, but maximum flexibility for operating with data stored on the * device. 
@@ -28,7 +28,10 @@ namespace TNL { -template< typename Device = Devices::Host > +enum ParallelForMode { SynchronousMode, AsynchronousMode }; + +template< typename Device = Devices::Host, + ParallelForMode Mode = SynchronousMode > struct ParallelFor { template< typename Index, @@ -55,7 +58,8 @@ struct ParallelFor } }; -template< typename Device = Devices::Host > +template< typename Device = Devices::Host, + ParallelForMode Mode = SynchronousMode > struct ParallelFor2D { template< typename Index, @@ -86,7 +90,8 @@ struct ParallelFor2D } }; -template< typename Device = Devices::Host > +template< typename Device = Devices::Host, + ParallelForMode Mode = SynchronousMode > struct ParallelFor3D { template< typename Index, @@ -185,8 +190,8 @@ ParallelFor3DKernel( Index startX, Index startY, Index startZ, Index endX, Index } #endif -template<> -struct ParallelFor< Devices::Cuda > +template< ParallelForMode Mode > +struct ParallelFor< Devices::Cuda, Mode > { template< typename Index, typename Function, @@ -208,8 +213,11 @@ struct ParallelFor< Devices::Cuda > ParallelForKernel< true ><<< gridSize, blockSize >>>( start, end, f, args... ); } - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; + if( Mode == SynchronousMode ) + { + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + } } #else throw Exceptions::CudaSupportMissing(); @@ -217,8 +225,8 @@ struct ParallelFor< Devices::Cuda > } }; -template<> -struct ParallelFor2D< Devices::Cuda > +template< ParallelForMode Mode > +struct ParallelFor2D< Devices::Cuda, Mode > { template< typename Index, typename Function, @@ -264,8 +272,11 @@ struct ParallelFor2D< Devices::Cuda > ParallelFor2DKernel< true, true ><<< gridSize, blockSize >>> ( startX, startY, endX, endY, f, args... 
); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; + if( Mode == SynchronousMode ) + { + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + } } #else throw Exceptions::CudaSupportMissing(); @@ -273,8 +284,8 @@ struct ParallelFor2D< Devices::Cuda > } }; -template<> -struct ParallelFor3D< Devices::Cuda > +template< ParallelForMode Mode > +struct ParallelFor3D< Devices::Cuda, Mode > { template< typename Index, typename Function, @@ -359,8 +370,11 @@ struct ParallelFor3D< Devices::Cuda > ParallelFor3DKernel< true, true, true ><<< gridSize, blockSize >>> ( startX, startY, startZ, endX, endY, endZ, f, args... ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; + if( Mode == SynchronousMode ) + { + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + } } #else throw Exceptions::CudaSupportMissing(); -- GitLab From f04a3b2cd8d0514eb853a0e5e70637a0c6e957fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 5 Jan 2019 22:22:16 +0100 Subject: [PATCH 120/130] Traversers benchmark is using asynchronous parallel for. 
--- .../Traversers/GridTraversersBenchmark_1D.h | 6 +-- .../Traversers/GridTraversersBenchmark_2D.h | 33 +++++++------- .../Traversers/GridTraversersBenchmark_3D.h | 45 ++++++++++--------- 3 files changed, 45 insertions(+), 39 deletions(-) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h index 006b0316f..41391d625 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -97,7 +97,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > { data[ i ] += (Real) 1.0; }; - ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); + ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() ); } void addOneUsingParallelForAndGridEntity() @@ -110,7 +110,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > entity.refresh(); data[ entity.getIndex() ] += (Real) 1.0; }; - ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() ); + ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() ); } void addOneUsingParallelForAndMeshFunction() @@ -125,7 +125,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > ( *_u )( entity ) += (Real) 1.0; //WriteOneEntitiesProcessorType::processEntity( *currentGrid, userData, entity ); }; - ParallelFor< Device >::exec( ( Index ) 0, size, f ); + ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f ); } void addOneUsingTraverser() diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h index 7c90a5064..1da182a54 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h @@ -100,11 +100,12 @@ class GridTraversersBenchmark< 2, Device, Real, Index > data[ j * _size + i ] += (Real) 1.0; }; - ParallelFor2D< Device >::exec( ( Index ) 0, - ( Index ) 0, - this->size, - 
this->size, - f, v.getData() ); + ParallelFor2D< Device, AsynchronousMode >::exec( + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + f, v.getData() ); } void addOneUsingParallelForAndGridEntity() @@ -119,11 +120,12 @@ class GridTraversersBenchmark< 2, Device, Real, Index > data[ entity.getIndex() ] += (Real) 1.0; }; - ParallelFor2D< Device >::exec( ( Index ) 0, - ( Index ) 0, - this->size, - this->size, - f, v.getData() ); + ParallelFor2D< Device, AsynchronousMode >::exec( + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + f, v.getData() ); } void addOneUsingParallelForAndMeshFunction() @@ -139,11 +141,12 @@ class GridTraversersBenchmark< 2, Device, Real, Index > ( *_u )( entity ) += (Real) 1.0; }; - ParallelFor2D< Device >::exec( ( Index ) 0, - ( Index ) 0, - this->size, - this->size, - f, v.getData() ); + ParallelFor2D< Device, AsynchronousMode >::exec( + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + f, v.getData() ); } diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h index 2a32184ea..858a4d1db 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h @@ -109,13 +109,14 @@ class GridTraversersBenchmark< 3, Device, Real, Index > data[ ( k * _size + j ) * _size + i ] += (Real) 1.0; }; - ParallelFor3D< Device >::exec( ( Index ) 0, - ( Index ) 0, - ( Index ) 0, - this->size, - this->size, - this->size, - f, v.getData() ); + ParallelFor3D< Device, AsynchronousMode >::exec( + ( Index ) 0, + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + this->size, + f, v.getData() ); } void addOneUsingParallelForAndGridEntity() @@ -131,13 +132,14 @@ class GridTraversersBenchmark< 3, Device, Real, Index > data[ entity.getIndex() ] += (Real) 1.0; }; - ParallelFor3D< Device >::exec( ( Index ) 0, - ( Index ) 0, - ( Index ) 0, - this->size, - this->size, - this->size, - f, v.getData() ); + ParallelFor3D< 
Device, AsynchronousMode >::exec( + ( Index ) 0, + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + this->size, + f, v.getData() ); } void addOneUsingParallelForAndMeshFunction() @@ -154,13 +156,14 @@ class GridTraversersBenchmark< 3, Device, Real, Index > ( *_u )( entity ) += (Real) 1.0; }; - ParallelFor3D< Device >::exec( ( Index ) 0, - ( Index ) 0, - ( Index ) 0, - this->size, - this->size, - this->size, - f, v.getData() ); + ParallelFor3D< Device, AsynchronousMode >::exec( + ( Index ) 0, + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + this->size, + f, v.getData() ); } -- GitLab From b5d9ebb1aa600a2806db964583949cb8172d9543 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sun, 6 Jan 2019 15:50:51 +0100 Subject: [PATCH 121/130] Added simple cell test. --- .../Traversers/GridTraverserBenchmarkHelper.h | 136 +-------------- .../GridTraverserBenchmarkHelper_1D.h | 154 +++++++++++++++++ .../GridTraverserBenchmarkHelper_2D.h | 152 +++++++++++++++++ .../GridTraverserBenchmarkHelper_3D.h | 156 ++++++++++++++++++ .../Traversers/GridTraversersBenchmark_1D.h | 44 ++--- .../Traversers/GridTraversersBenchmark_2D.h | 38 +++-- .../Traversers/GridTraversersBenchmark_3D.h | 36 ++-- src/Benchmarks/Traversers/SimpleCell.h | 57 ++++++- .../Traversers/tnl-benchmark-traversers.h | 24 +-- 9 files changed, 602 insertions(+), 195 deletions(-) create mode 100644 src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h create mode 100644 src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h create mode 100644 src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h index c13ec3ab7..6da7ec09b 100644 --- a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h +++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h @@ -20,143 +20,15 @@ namespace TNL { namespace Benchmarks { namespace Traversers 
{ -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor > -__global__ void -_GridTraverser1D( - const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid, - UserData userData, - const typename GridEntity::CoordinatesType begin, - const typename GridEntity::CoordinatesType end, - const Index gridIdx ) -{ - typedef Real RealType; - typedef Index IndexType; - typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; - //typename GridType::CoordinatesType coordinates; - - GridEntity entity( *grid ); - entity.getCoordinates().x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - //coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - if( entity.getCoordinates() <= end ) - { - entity.refresh(); - //( userData.u->getData() )[ entity.getIndex( coordinates ) ] += ( RealType ) 1.0; - //( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0; - //userData.data[ entity.getIndex() ] += ( RealType ) 1.0; - //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0; - ( *userData.u )( entity ) += ( RealType ) 1.0; - //EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } -} -#endif - -template< typename Grid, - typename Device = typename Grid::DeviceType > -class GridTraverserBenchmarkHelper{}; - template< typename Grid > -class GridTraverserBenchmarkHelper< Grid, Devices::Host > -{ - public: - - using GridType = Grid; - using GridPointer = Pointers::SharedPointer< Grid >; - using RealType = typename GridType::RealType; - using IndexType = typename GridType::IndexType; - using CoordinatesType = typename Grid::CoordinatesType; - using MeshFunction = Functions::MeshFunction< Grid >; - using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using CellType = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage 
>; - using SimpleCellType = SimpleCell< GridType >; - using Traverser = Meshes::Traverser< Grid, CellType >; - using UserDataType = BenchmarkTraverserUserData< MeshFunction >; - using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; - - static void noBCTraverserTest( const GridPointer& grid, - UserDataType& userData, - std::size_t size ) - { - /*Meshes::GridTraverser< Grid >::template processEntities< CellType, AddOneEntitiesProcessorType, UserDataType, false >( - grid, - CoordinatesType( 0 ), - grid->getDimensions() - CoordinatesType( 1 ), - userData ); - */ - - const CoordinatesType begin( 0 ); - const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); - //MeshFunction* _u = &u.template modifyData< Device >(); - SimpleCellType entity( *grid ); - for( IndexType x = begin.x(); x <= end.x(); x ++ ) - { - entity.getCoordinates().x() = x; - entity.refresh(); - //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0; - ( *userData.u )( entity ) += ( RealType ) 1.0; - //AddOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity ); - } - - } -}; - -template< typename Grid > -class GridTraverserBenchmarkHelper< Grid, Devices::Cuda > -{ - public: - - using GridType = Grid; - using GridPointer = Pointers::SharedPointer< Grid >; - using RealType = typename GridType::RealType; - using IndexType = typename GridType::IndexType; - using CoordinatesType = typename Grid::CoordinatesType; - using MeshFunction = Functions::MeshFunction< Grid >; - using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using CellType = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; - using SimpleCellType = SimpleCell< GridType >; - using Traverser = Meshes::Traverser< Grid, CellType >; - using UserDataType = BenchmarkTraverserUserData< MeshFunction >; - using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; - - static void noBCTraverserTest( const GridPointer& grid, 
- UserDataType& userData, - std::size_t size ) - { -#ifdef HAVE_CUDA - dim3 blockSize( 256 ), blocksCount, gridsCount; - Devices::Cuda::setupThreads( - blockSize, - blocksCount, - gridsCount, - size ); - dim3 gridIdx; - for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) - { - dim3 gridSize; - Devices::Cuda::setupGrid( - blocksCount, - gridsCount, - gridIdx, - gridSize ); - _GridTraverser1D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType > - <<< blocksCount, blockSize >>> - ( &grid.template getData< Devices::Cuda >(), - userData, - CoordinatesType( 0 ), - CoordinatesType( size ) - CoordinatesType( 1 ), - gridIdx.x ); +class GridTraverserBenchmarkHelper{}; - } -#endif - } -}; } // namespace Traversers } // namespace Benchmarks } // namespace TNL +#include "GridTraverserBenchmarkHelper_1D.h" +#include "GridTraverserBenchmarkHelper_2D.h" +#include "GridTraverserBenchmarkHelper_3D.h" diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h new file mode 100644 index 000000000..e460a8bca --- /dev/null +++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h @@ -0,0 +1,154 @@ +/*************************************************************************** + GridTraversersBenchmarkHelper_1D.h - description + ------------------- + begin : Jan 6, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include "GridTraverserBenchmarkHelper.h" +#include "AddOneEntitiesProcessor.h" +#include "BenchmarkTraverserUserData.h" +#include "SimpleCell.h" + + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +#ifdef HAVE_CUDA +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor 
> +__global__ void +_GridTraverser1D( + const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid, + UserData userData, + const typename GridEntity::CoordinatesType begin, + const typename GridEntity::CoordinatesType end, + const Index gridIdx ) +{ + typedef Real RealType; + typedef Index IndexType; + typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; + //typename GridType::CoordinatesType coordinates; + + GridEntity entity( *grid ); + entity.getCoordinates().x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + //coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( entity.getCoordinates() <= end ) + { + entity.refresh(); + //( userData.u->getData() )[ entity.getIndex( coordinates ) ] += ( RealType ) 1.0; + //( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0; + //userData.data[ entity.getIndex() ] += ( RealType ) 1.0; + //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0; + ( *userData.u )( entity ) += ( RealType ) 1.0; + //EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +} +#endif + +template< typename Real, + typename Index > +class GridTraverserBenchmarkHelper< Meshes::Grid< 1, Real, Devices::Host, Index > > +{ + public: + + constexpr static int Dimension = 1; + using GridType = Meshes::Grid< Dimension, Real, Devices::Host, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using RealType = typename GridType::RealType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< 
GridType, CellType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + + static void simpleCellTest( const GridPointer& grid, + UserDataType& userData, + std::size_t size ) + { + const CoordinatesType begin( 0 ); + const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); + SimpleCellType entity( *grid ); + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0; + ( *userData.u )( entity ) += ( RealType ) 1.0; + } + + } +}; + +template< typename Real, + typename Index > +class GridTraverserBenchmarkHelper< Meshes::Grid< 1, Real, Devices::Cuda, Index > > +{ + public: + + constexpr static int Dimension = 1; + using GridType = Meshes::Grid< Dimension, Real, Devices::Cuda, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using RealType = typename GridType::RealType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + + static void simpleCellTest( const GridPointer& grid, + UserDataType& userData, + std::size_t size ) + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size ); + dim3 gridIdx; + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; 
gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + _GridTraverser1D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType > + <<< blocksCount, blockSize >>> + ( &grid.template getData< Devices::Cuda >(), + userData, + CoordinatesType( 0 ), + CoordinatesType( size ) - CoordinatesType( 1 ), + gridIdx.x ); + + } +#endif + } +}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h new file mode 100644 index 000000000..eca6c7fee --- /dev/null +++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h @@ -0,0 +1,152 @@ +/*************************************************************************** + GridTraversersBenchmarkHelper_2D.h - description + ------------------- + begin : Jan 6, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include "GridTraverserBenchmarkHelper.h" +#include "AddOneEntitiesProcessor.h" +#include "BenchmarkTraverserUserData.h" +#include "SimpleCell.h" + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +#ifdef HAVE_CUDA +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor > +__global__ void +_GridTraverser2D( + const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, + UserData userData, + const typename GridEntity::CoordinatesType begin, + const typename GridEntity::CoordinatesType end, + const dim3 gridIdx ) +{ + typedef Real RealType; + typedef Index IndexType; + typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; + + GridEntity entity( *grid ); + entity.getCoordinates().x() = 
begin.x() + ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + entity.getCoordinates().y() = begin.y() + ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + if( entity.getCoordinates() <= end ) + { + entity.refresh(); + ( *userData.u )( entity ) += ( RealType ) 1.0; + } +} +#endif + +template< typename Real, + typename Index > +class GridTraverserBenchmarkHelper< Meshes::Grid< 2, Real, Devices::Host, Index > > +{ + public: + + constexpr static int Dimension = 2; + using GridType = Meshes::Grid< Dimension, Real, Devices::Host, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using RealType = typename GridType::RealType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + + static void simpleCellTest( const GridPointer& grid, + UserDataType& userData, + std::size_t size ) + { + const CoordinatesType begin( 0 ); + const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); + SimpleCellType entity( *grid ); + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y()++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0; + ( *userData.u )( entity ) += ( RealType ) 1.0; + } + + } +}; + 
+template< typename Real, + typename Index > +class GridTraverserBenchmarkHelper< Meshes::Grid< 2, Real, Devices::Cuda, Index > > +{ + public: + + constexpr static int Dimension = 2; + using GridType = Meshes::Grid< Dimension, Real, Devices::Cuda, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using RealType = typename GridType::RealType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + + static void simpleCellTest( const GridPointer& grid, + UserDataType& userData, + std::size_t size ) + { +#ifdef HAVE_CUDA + dim3 blockSize( 16, 16 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size, + size ); + dim3 gridIdx; + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + _GridTraverser2D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType > + <<< blocksCount, blockSize >>> + ( &grid.template getData< Devices::Cuda >(), + userData, + CoordinatesType( 0 ), + CoordinatesType( size ) - CoordinatesType( 1 ), + gridIdx.x ); + } +#endif + } +}; + + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h 
b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h new file mode 100644 index 000000000..4a5da6fd4 --- /dev/null +++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h @@ -0,0 +1,156 @@ +/*************************************************************************** + GridTraversersBenchmarkHelper_3D.h - description + ------------------- + begin : Jan 6, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include "GridTraverserBenchmarkHelper.h" +#include "AddOneEntitiesProcessor.h" +#include "BenchmarkTraverserUserData.h" +#include "SimpleCell.h" + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +#ifdef HAVE_CUDA +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor > +__global__ void +_GridTraverser3D( + const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, + UserData userData, + const typename GridEntity::CoordinatesType begin, + const typename GridEntity::CoordinatesType end, + const dim3 gridIdx ) +{ + typedef Real RealType; + typedef Index IndexType; + typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; + + GridEntity entity( *grid ); + entity.getCoordinates().x() = begin.x() + ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + entity.getCoordinates().y() = begin.y() + ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + entity.getCoordinates().z() = begin.z() + ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; + + if( entity.getCoordinates() <= end ) + { + entity.refresh(); + ( *userData.u )( entity ) += ( RealType ) 1.0; + } +} +#endif + +template< typename Real, + typename Index > +class 
GridTraverserBenchmarkHelper< Meshes::Grid< 3, Real, Devices::Host, Index > > +{ + public: + + constexpr static int Dimension = 3; + using GridType = Meshes::Grid< Dimension, Real, Devices::Host, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using RealType = typename GridType::RealType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + + static void simpleCellTest( const GridPointer& grid, + UserDataType& userData, + std::size_t size ) + { + const CoordinatesType begin( 0 ); + const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); + SimpleCellType entity( *grid ); + for( entity.getCoordinates().z() = begin.z(); + entity.getCoordinates().z() <= end.z(); + entity.getCoordinates().z()++ ) + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y()++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + ( *userData.u )( entity ) += ( RealType ) 1.0; + } + } +}; + +template< typename Real, + typename Index > +class GridTraverserBenchmarkHelper< Meshes::Grid< 3, Real, Devices::Cuda, Index > > +{ + public: + + constexpr static int Dimension = 3; + using GridType = Meshes::Grid< Dimension, Real, Devices::Cuda, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using RealType = typename GridType::RealType; 
+ using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + + static void simpleCellTest( const GridPointer& grid, + UserDataType& userData, + std::size_t size ) + { +#ifdef HAVE_CUDA + dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size, + size, + size ); + dim3 gridIdx; + for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ ) + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + _GridTraverser3D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType > + <<< blocksCount, blockSize >>> + ( &grid.template getData< Devices::Cuda >(), + userData, + CoordinatesType( 0 ), + CoordinatesType( size ) - CoordinatesType( 1 ), + gridIdx.x ); + } +#endif + } +}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h index 41391d625..145f42ca9 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -23,6 +23,7 @@ #include #include "cuda-kernels.h" #include "GridTraversersBenchmark.h" +#include "SimpleCell.h" namespace TNL { namespace 
Benchmarks { @@ -37,13 +38,14 @@ class GridTraversersBenchmark< 1, Device, Real, Index > public: using Vector = Containers::Vector< Real, Device, Index >; - using Grid = Meshes::Grid< 1, Real, Device, Index >; - using GridPointer = Pointers::SharedPointer< Grid >; - using Coordinates = typename Grid::CoordinatesType; - using MeshFunction = Functions::MeshFunction< Grid >; + using GridType = Meshes::Grid< 1, Real, Device, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using Coordinates = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; - using Traverser = Meshes::Traverser< Grid, Cell >; + using CellType = typename GridType::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; using UserDataType = BenchmarkTraverserUserData< MeshFunction >; using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; @@ -100,44 +102,48 @@ class GridTraversersBenchmark< 1, Device, Real, Index > ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() ); } - void addOneUsingParallelForAndGridEntity() + void addOneUsingSimpleCell() { - const Grid* currentGrid = &grid.template getData< Device >(); + /*const GridType* currentGrid = &grid.template getData< Device >(); auto f = [=] __cuda_callable__ ( Index i, Real* data ) { - Cell entity( *currentGrid ); + SimpleCellType entity( *currentGrid ); entity.getCoordinates().x() = i; entity.refresh(); data[ entity.getIndex() ] += (Real) 1.0; }; - ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() ); + ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() );*/ + GridTraverserBenchmarkHelper< GridType 
>::simpleCellTest( + grid, + userData, + size ); } void addOneUsingParallelForAndMeshFunction() { - const Grid* currentGrid = &grid.template getData< Device >(); + const GridType* currentGrid = &grid.template getData< Device >(); MeshFunction* _u = &u.template modifyData< Device >(); auto f = [=] __cuda_callable__ ( Index i ) { - Cell entity( *currentGrid ); + SimpleCellType entity( *currentGrid ); entity.getCoordinates().x() = i; entity.refresh(); - ( *_u )( entity ) += (Real) 1.0; - //WriteOneEntitiesProcessorType::processEntity( *currentGrid, userData, entity ); + _u->getData().getData()[ entity.getIndex() ] += (Real) 1.0; + // ( *_u )( entity ) += (Real) 1.0; }; ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f ); } void addOneUsingTraverser() { - using CoordinatesType = typename Grid::CoordinatesType; - //traverser.template processAllEntities< WriteOneTraverserUserDataType, WriteOneEntitiesProcessorType > - // ( grid, userData ); + using CoordinatesType = typename GridType::CoordinatesType; + traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > + ( grid, userData ); - GridTraverserBenchmarkHelper< Grid >::noBCTraverserTest( + /*GridTraverserBenchmarkHelper< GridType >::noBCTraverserTest( grid, userData, - size ); + size );*/ } void traverseUsingPureC() diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h index 1da182a54..66462eb1a 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h @@ -22,6 +22,7 @@ #include #include #include "cuda-kernels.h" +#include "SimpleCell.h" namespace TNL { namespace Benchmarks { @@ -35,13 +36,14 @@ class GridTraversersBenchmark< 2, Device, Real, Index > public: using Vector = Containers::Vector< Real, Device, Index >; - using Grid = Meshes::Grid< 2, Real, Device, Index >; - using GridPointer = Pointers::SharedPointer< Grid >; - using 
Coordinates = typename Grid::CoordinatesType; - using MeshFunction = Functions::MeshFunction< Grid >; + using GridType = Meshes::Grid< 2, Real, Device, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using Coordinates = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::template EntityType< 2, Meshes::GridEntityNoStencilStorage >; - using Traverser = Meshes::Traverser< Grid, Cell >; + using CellType = typename GridType::template EntityType< 2, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; using UserDataType = BenchmarkTraverserUserData< MeshFunction >; using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; @@ -108,12 +110,12 @@ class GridTraversersBenchmark< 2, Device, Real, Index > f, v.getData() ); } - void addOneUsingParallelForAndGridEntity() + void addOneUsingSimpleCell() { - const Grid* currentGrid = &grid.template getData< Device >(); + /*const GridType* currentGrid = &grid.template getData< Device >(); auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) { - Cell entity( *currentGrid ); + SimpleCellType entity( *currentGrid ); entity.getCoordinates().x() = i; entity.getCoordinates().y() = j; entity.refresh(); @@ -125,20 +127,26 @@ class GridTraversersBenchmark< 2, Device, Real, Index > ( Index ) 0, this->size, this->size, - f, v.getData() ); + f, v.getData() );*/ + GridTraverserBenchmarkHelper< GridType >::simpleCellTest( + grid, + userData, + size ); + } void addOneUsingParallelForAndMeshFunction() { - const Grid* currentGrid = &grid.template getData< Device >(); + const GridType* currentGrid = &grid.template getData< Device >(); MeshFunction* _u = &u.template modifyData< Device >(); auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) { - Cell 
entity( *currentGrid ); + SimpleCellType entity( *currentGrid ); entity.getCoordinates().x() = i; entity.getCoordinates().y() = j; entity.refresh(); - ( *_u )( entity ) += (Real) 1.0; + //( *_u )( entity ) += (Real) 1.0; + _u->getData().getData()[ entity.getIndex() ] += (Real) 1.0; }; ParallelFor2D< Device, AsynchronousMode >::exec( @@ -152,7 +160,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > void addOneUsingTraverser() { - using CoordinatesType = typename Grid::CoordinatesType; + using CoordinatesType = typename GridType::CoordinatesType; traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > ( grid, userData ); @@ -197,7 +205,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > else // Device == Devices::Cuda { #ifdef HAVE_CUDA - dim3 blockSize( 256 ), blocksCount, gridsCount; + dim3 blockSize( 32, 8 ), blocksCount, gridsCount; Devices::Cuda::setupThreads( blockSize, blocksCount, diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h index 858a4d1db..b6f9bd4e1 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h @@ -25,6 +25,7 @@ #include "cuda-kernels.h" #include "AddOneEntitiesProcessor.h" #include "BenchmarkTraverserUserData.h" +#include "SimpleCell.h" namespace TNL { namespace Benchmarks { @@ -38,13 +39,14 @@ class GridTraversersBenchmark< 3, Device, Real, Index > public: using Vector = Containers::Vector< Real, Device, Index >; - using Grid = Meshes::Grid< 3, Real, Device, Index >; - using GridPointer = Pointers::SharedPointer< Grid >; - using Coordinates = typename Grid::CoordinatesType; - using MeshFunction = Functions::MeshFunction< Grid >; + using GridType = Meshes::Grid< 3, Real, Device, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using Coordinates = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< 
GridType >; using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; - using Cell = typename Grid::template EntityType< 3, Meshes::GridEntityNoStencilStorage >; - using Traverser = Meshes::Traverser< Grid, Cell >; + using CellType = typename GridType::template EntityType< 3, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; using UserDataType = BenchmarkTraverserUserData< MeshFunction >; using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; @@ -119,12 +121,12 @@ class GridTraversersBenchmark< 3, Device, Real, Index > f, v.getData() ); } - void addOneUsingParallelForAndGridEntity() + void addOneUsingSimpleCell() { - const Grid* currentGrid = &grid.template getData< Device >(); + /*const GridType* currentGrid = &grid.template getData< Device >(); auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) { - Cell entity( *currentGrid ); + SimpleCellType entity( *currentGrid ); entity.getCoordinates().x() = i; entity.getCoordinates().y() = j; entity.getCoordinates().z() = k; @@ -139,21 +141,27 @@ class GridTraversersBenchmark< 3, Device, Real, Index > this->size, this->size, this->size, - f, v.getData() ); + f, v.getData() );*/ + GridTraverserBenchmarkHelper< GridType >::simpleCellTest( + grid, + userData, + size ); + } void addOneUsingParallelForAndMeshFunction() { - const Grid* currentGrid = &grid.template getData< Device >(); + const GridType* currentGrid = &grid.template getData< Device >(); MeshFunction* _u = &u.template modifyData< Device >(); auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) { - Cell entity( *currentGrid ); + SimpleCellType entity( *currentGrid ); entity.getCoordinates().x() = i; entity.getCoordinates().y() = j; entity.getCoordinates().z() = k; entity.refresh(); - ( *_u )( entity ) += (Real) 1.0; + //( *_u )( entity ) += (Real) 1.0; + _u->getData().getData()[ entity.getIndex() 
] += (Real) 1.0; }; ParallelFor3D< Device, AsynchronousMode >::exec( @@ -205,7 +213,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > else // Device == Devices::Cuda { #ifdef HAVE_CUDA - dim3 blockSize( 256 ), blocksCount, gridsCount; + dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount; Devices::Cuda::setupThreads( blockSize, blocksCount, diff --git a/src/Benchmarks/Traversers/SimpleCell.h b/src/Benchmarks/Traversers/SimpleCell.h index c70f64fda..9776ef26c 100644 --- a/src/Benchmarks/Traversers/SimpleCell.h +++ b/src/Benchmarks/Traversers/SimpleCell.h @@ -47,7 +47,10 @@ class SimpleCell< Meshes::Grid< 1, Real, Device, Index > > CoordinatesType& getCoordinates() { return this->coordinates; }; __cuda_callable__ - void refresh() {index = coordinates.x();}; + const CoordinatesType& getCoordinates() const { return this->coordinates; }; + + __cuda_callable__ + void refresh() {index = this->grid.getEntityIndex( *this );}; __cuda_callable__ const IndexType& getIndex() const { return this->index; }; @@ -64,7 +67,7 @@ template< typename Real, class SimpleCell< Meshes::Grid< 2, Real, Device, Index > > { public: - using GridType = Meshes::Grid< 1, Real, Device, Index >; + using GridType = Meshes::Grid< 2, Real, Device, Index >; using RealType = typename GridType::RealType; using DeviceType = typename GridType::DeviceType; using IndexType = typename GridType::IndexType; @@ -72,6 +75,30 @@ class SimpleCell< Meshes::Grid< 2, Real, Device, Index > > constexpr static int getEntityDimension() { return 2; }; + __cuda_callable__ + SimpleCell( const GridType& grid ) : + grid( grid ){}; + + __cuda_callable__ + const GridType& getMesh() const { return this->grid;}; + + __cuda_callable__ + CoordinatesType& getCoordinates() { return this->coordinates; }; + + __cuda_callable__ + const CoordinatesType& getCoordinates() const { return this->coordinates; }; + + __cuda_callable__ + void refresh() {index = this->grid.getEntityIndex( *this );}; + + __cuda_callable__ + const 
IndexType& getIndex() const { return this->index; }; + + protected: + const GridType& grid; + CoordinatesType coordinates; + IndexType index; + }; template< typename Real, @@ -80,7 +107,7 @@ template< typename Real, class SimpleCell< Meshes::Grid< 3, Real, Device, Index > > { public: - using GridType = Meshes::Grid< 1, Real, Device, Index >; + using GridType = Meshes::Grid< 3, Real, Device, Index >; using RealType = typename GridType::RealType; using DeviceType = typename GridType::DeviceType; using IndexType = typename GridType::IndexType; @@ -88,6 +115,30 @@ class SimpleCell< Meshes::Grid< 3, Real, Device, Index > > constexpr static int getEntityDimension() { return 3; }; + __cuda_callable__ + SimpleCell( const GridType& grid ) : + grid( grid ){}; + + __cuda_callable__ + const GridType& getMesh() const { return this->grid;}; + + __cuda_callable__ + CoordinatesType& getCoordinates() { return this->coordinates; }; + + __cuda_callable__ + const CoordinatesType& getCoordinates() const { return this->coordinates; }; + + __cuda_callable__ + void refresh() { index = this->grid.getEntityIndex( *this ); }; + + __cuda_callable__ + const IndexType& getIndex() const { return this->index; }; + + protected: + const GridType& grid; + CoordinatesType coordinates; + IndexType index; + }; } // namespace Traversers diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 2963bb792..f329d5640 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -127,23 +127,23 @@ bool runBenchmark( const Config::ParameterContainer& parameters, /**** * Write one using parallel for with grid entity */ - if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for-and-grid-entity" ) ) + if( tests.containsValue( "all" ) || tests.containsValue( "add-one-simple-cell" ) ) { - auto hostWriteOneUsingParallelForAndGridEntity = [&] () + auto 
hostAddOneUsingSimpleCell = [&] () { - hostTraverserBenchmark.addOneUsingParallelForAndGridEntity(); + hostTraverserBenchmark.addOneUsingSimpleCell(); }; - benchmark.setOperation( "par.for+grid ent.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + benchmark.setOperation( "simple cell", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); if( withHost ) - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndGridEntity ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingSimpleCell ); #ifdef HAVE_CUDA - auto cudaWriteOneUsingParallelForAndGridEntity = [&] () + auto cudaAddOneUsingSimpleCell = [&] () { - cudaTraverserBenchmark.addOneUsingParallelForAndGridEntity(); + cudaTraverserBenchmark.addOneUsingSimpleCell(); }; if( withCuda ) - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelForAndGridEntity ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingSimpleCell ); #endif } @@ -152,21 +152,21 @@ bool runBenchmark( const Config::ParameterContainer& parameters, */ if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for-and-mesh-function" ) ) { - auto hostWriteOneUsingParallelForAndMeshFunction = [&] () + auto hostAddOneUsingParallelForAndMeshFunction = [&] () { hostTraverserBenchmark.addOneUsingParallelForAndMeshFunction(); }; benchmark.setOperation( "par.for+mesh fc.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); if( withHost ) - benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelForAndMeshFunction ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingParallelForAndMeshFunction ); #ifdef HAVE_CUDA - auto cudaWriteOneUsingParallelForAndMeshFunction = [&] () + auto cudaAddOneUsingParallelForAndMeshFunction = [&] () { cudaTraverserBenchmark.addOneUsingParallelForAndMeshFunction(); }; if( withCuda ) - benchmark.time< Devices::Cuda >( 
cudaReset, "GPU", cudaWriteOneUsingParallelForAndMeshFunction ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingParallelForAndMeshFunction ); #endif } -- GitLab From 8ad63ca53c9502c711aada9c3c92b556212bd8b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Tue, 8 Jan 2019 10:41:17 +0100 Subject: [PATCH 122/130] Benchmarks: set minTime = 0.0 by default due to backwards compatibility --- src/Benchmarks/Benchmarks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 355fb4671..48e496c1e 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -332,7 +332,7 @@ public: protected: int loops = 1; - double minTime = 1; + double minTime = 0.0; double datasetSize = 0.0; double baseTime = 0.0; bool timing = true; -- GitLab From 1db10725d0878ed5674f2de32abee072793af455 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Tue, 8 Jan 2019 19:50:28 +0100 Subject: [PATCH 123/130] Added check of the benchmark results. 
--- src/Benchmarks/Benchmarks.h | 42 +++++++---- src/Benchmarks/FunctionTimer.h | 27 ++++--- .../Traversers/GridTraversersBenchmark_1D.h | 10 ++- .../Traversers/GridTraversersBenchmark_2D.h | 11 ++- .../Traversers/GridTraversersBenchmark_3D.h | 12 ++- .../Traversers/tnl-benchmark-traversers.h | 74 +++++++++++++++++-- 6 files changed, 141 insertions(+), 35 deletions(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 48e496c1e..b05958f17 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -202,33 +202,35 @@ public: BenchmarkResult & result ) { result.time = std::numeric_limits::quiet_NaN(); + FunctionTimer< Device > functionTimer; try { if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); if( this->timing ) if( this->reset ) - result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor ); else - result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor ); + result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); else if( this->reset ) - result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor ); else - result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor ); + result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); } else { if( this->timing ) if( this->reset ) - result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + result.time = functionTimer. 
template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor ); else - result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor ); + result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); else if( this->reset ) - result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); + result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor ); else - result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor ); + result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); } + this->performedLoops = functionTimer.getPerformedLoops(); } catch ( const std::exception& e ) { std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl; @@ -269,24 +271,25 @@ public: BenchmarkResult & result ) { result.time = std::numeric_limits::quiet_NaN(); + FunctionTimer< Device > functionTimer; try { if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); if( this->timing ) - result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor ); + result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); else - result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor ); + result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); } else { if( this->timing ) - result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor ); + result.time = functionTimer. 
template timeFunction< true >( compute, loops, minTime, verbose, monitor ); else - result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor ); + result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); } } catch ( const std::exception& e ) { - std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl; + std::cerr << "Function timer failed due to a C++ exception with description: " << e.what() << std::endl; } result.bandwidth = datasetSize / result.time; @@ -320,6 +323,7 @@ public: // each computation has 3 subcolumns const int colspan = 3 * numberOfComputations; writeErrorMessage( msg, colspan ); + std::cerr << msg << std::endl; } using Logging::save; @@ -330,8 +334,18 @@ public: return monitor; } + int getPerformedLoops() const + { + return this->performedLoops; + } + + bool isResetingOn() const + { + return reset; + } + protected: - int loops = 1; + int loops = 1, performedLoops = 0; double minTime = 0.0; double datasetSize = 0.0; double baseTime = 0.0; diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h index 601cfc16c..05b59d28a 100644 --- a/src/Benchmarks/FunctionTimer.h +++ b/src/Benchmarks/FunctionTimer.h @@ -22,17 +22,17 @@ namespace TNL { namespace Benchmarks { -template< typename Device, - bool timing > +template< typename Device > class FunctionTimer { public: using DeviceType = Device; - template< typename ComputeFunction, + template< bool timing, + typename ComputeFunction, typename ResetFunction, typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > - static double + double timeFunction( ComputeFunction compute, ResetFunction reset, int maxLoops, @@ -52,7 +52,6 @@ class FunctionTimer reset(); compute(); - int loops; // If we do not perform reset function and don't need // the monitor, the timer is not interrupted after each loop. if( ! 
performReset && verbose < 2 ) @@ -67,7 +66,7 @@ class FunctionTimer for( loops = 0; loops < maxLoops || ( timing && timer.getRealTime() < minTime ); - ++loops) + ++loops) compute(); // Explicit synchronization of the CUDA device #ifdef HAVE_CUDA @@ -85,7 +84,6 @@ class FunctionTimer { // abuse the monitor's "time" for loops monitor.setTime( loops + 1 ); - reset(); // Explicit synchronization of the CUDA device @@ -104,15 +102,17 @@ class FunctionTimer timer.stop(); } } + std::cerr << loops << std::endl; if( timing ) return timer.getRealTime() / ( double ) loops; else return std::numeric_limits::quiet_NaN(); } - template< typename ComputeFunction, + template< bool timing, + typename ComputeFunction, typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > - static double + double timeFunction( ComputeFunction compute, int maxLoops, const double& minTime, @@ -120,8 +120,15 @@ class FunctionTimer Monitor && monitor = Monitor() ) { auto noReset = [] () {}; - return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor, false ); + return timeFunction< timing >( compute, noReset, maxLoops, minTime, verbose, monitor, false ); } + + int getPerformedLoops() const + { + return this->loops; + } + protected: + int loops; }; } // namespace Benchmarks diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h index 145f42ca9..fb79acfc8 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -54,12 +54,12 @@ class GridTraversersBenchmark< 1, Device, Real, Index > userData( this->u ) { v_data = v.getData(); + u->getData().bind( v ); } void reset() { v.setValue( 0.0 ); - u->getData().setValue( 0.0 ); }; void addOneUsingPureC() @@ -146,6 +146,14 @@ class GridTraversersBenchmark< 1, Device, Real, Index > size );*/ } + bool checkAddOne( int loops, bool reseting ) + { + std::cout << loops << " -> " << v << std::endl; 
+ if( reseting ) + return v.containsOnlyValue( 1.0 ); + return v.containsOnlyValue( ( Real ) loops ); + } + void traverseUsingPureC() { if( std::is_same< Device, Devices::Host >::value ) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h index 66462eb1a..a707d0e9c 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h @@ -52,12 +52,12 @@ class GridTraversersBenchmark< 2, Device, Real, Index > userData( u ) { v_data = v.getData(); + u->getData().bind( v ); } void reset() { v.setValue( 0.0 ); - u->getData().setValue( 0.0 ); }; void addOneUsingPureC() @@ -71,7 +71,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > else // Device == Devices::Cuda { #ifdef HAVE_CUDA - dim3 blockSize( 256 ), blocksCount, gridsCount; + dim3 blockSize( 16, 16 ), blocksCount, gridsCount; Devices::Cuda::setupThreads( blockSize, blocksCount, @@ -183,6 +183,13 @@ class GridTraversersBenchmark< 2, Device, Real, Index > }*/ } + bool checkAddOne( int loops, bool reseting ) + { + if( reseting ) + return v.containsOnlyValue( 1.0 ); + return v.containsOnlyValue( ( Real ) loops ); + } + void traverseUsingPureC() { if( std::is_same< Device, Devices::Host >::value ) diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h index b6f9bd4e1..833c15126 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h @@ -58,12 +58,12 @@ class GridTraversersBenchmark< 3, Device, Real, Index > userData( u ) { v_data = v.getData(); + u->getData().bind( v ); } void reset() { v.setValue( 0.0 ); - u->getData().setValue( 0.0 ); }; void addOneUsingPureC() @@ -78,7 +78,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > else // Device == Devices::Cuda { #ifdef HAVE_CUDA - dim3 blockSize( 256 ), blocksCount, gridsCount; + 
dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount; Devices::Cuda::setupThreads( blockSize, blocksCount, @@ -174,13 +174,19 @@ class GridTraversersBenchmark< 3, Device, Real, Index > f, v.getData() ); } - void addOneUsingTraverser() { traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > ( grid, userData ); } + bool checkAddOne( int loops, bool reseting ) + { + if( reseting ) + return v.containsOnlyValue( 1.0 ); + return v.containsOnlyValue( ( Real ) loops ); + } + void traverseUsingPureC() { if( std::is_same< Device, Devices::Host >::value ) diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index f329d5640..59441bbbb 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -48,6 +48,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, #else const bool withCuda = false; #endif + const bool check = parameters.getParameter< bool >( "check" ); /**** * Full grid traversing with no boundary conditions @@ -77,7 +78,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, { {"size", convertToString( size ) }, } ) ); /**** - * Write one using C for + * Add one using pure C code */ if( tests.containsValue( "all" ) || tests.containsValue( "add-one-pure-c" ) ) { @@ -88,7 +89,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters, hostTraverserBenchmark.addOneUsingPureC(); }; if( withHost ) + { benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); + if( check && ! hostTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." 
); + } #ifdef HAVE_CUDA auto cudaWriteOneUsingPureC = [&] () @@ -96,12 +103,18 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.addOneUsingPureC(); }; if( withCuda ) + { benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); + if( check && ! cudaTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." ); + } #endif } /**** - * Write one using parallel for + * Add one using parallel for */ if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for" ) ) { @@ -112,7 +125,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters, hostTraverserBenchmark.addOneUsingParallelFor(); }; if( withHost ) + { benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); + if( check && ! hostTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." ); + } #ifdef HAVE_CUDA auto cudaWriteOneUsingParallelFor = [&] () @@ -120,12 +139,19 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.addOneUsingParallelFor(); }; if( withCuda ) + { benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); + if( check && ! cudaTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." 
); + } + #endif } /**** - * Write one using parallel for with grid entity + * Add one using parallel for with grid entity */ if( tests.containsValue( "all" ) || tests.containsValue( "add-one-simple-cell" ) ) { @@ -135,7 +161,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; benchmark.setOperation( "simple cell", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); if( withHost ) + { benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingSimpleCell ); + if( check && ! hostTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." ); + } #ifdef HAVE_CUDA auto cudaAddOneUsingSimpleCell = [&] () @@ -143,12 +175,19 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.addOneUsingSimpleCell(); }; if( withCuda ) + { benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingSimpleCell ); + if( check && ! cudaTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." ); + } + #endif } /**** - * Write one using parallel for with mesh function + * Add one using parallel for with mesh function */ if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for-and-mesh-function" ) ) { @@ -158,7 +197,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; benchmark.setOperation( "par.for+mesh fc.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); if( withHost ) + { benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingParallelForAndMeshFunction ); + if( check && ! hostTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." 
); + } #ifdef HAVE_CUDA auto cudaAddOneUsingParallelForAndMeshFunction = [&] () @@ -166,13 +211,19 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.addOneUsingParallelForAndMeshFunction(); }; if( withCuda ) + { benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingParallelForAndMeshFunction ); + if( check && ! cudaTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." ); + } #endif } /**** - * Write one using traverser + * Add one using traverser */ if( tests.containsValue( "all" ) || tests.containsValue( "add-one-traverser" ) ) { @@ -182,7 +233,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters, hostTraverserBenchmark.addOneUsingTraverser(); }; if( withHost ) + { benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser ); + if( check && ! hostTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." ); + } #ifdef HAVE_CUDA auto cudaWriteOneUsingTraverser = [&] () @@ -190,7 +247,13 @@ bool runBenchmark( const Config::ParameterContainer& parameters, cudaTraverserBenchmark.addOneUsingTraverser(); }; if( withCuda ) + { benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser ); + if( check && ! cudaTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." 
); + } #endif } std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl; @@ -343,6 +406,7 @@ void setupConfig( Config::ConfigDescription& config ) #else config.addEntry< bool >( "with-cuda", "Perform CUDA benchmarks.", false ); #endif + config.addEntry< bool >( "check", "Checking correct results of benchmark tests.", false ); config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-traversers.log"); config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); config.addEntryEnum( "append" ); -- GitLab From 3e42bec669bbc44ba95a7cbad4be0cd34db2736a Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 24 Jan 2019 13:50:41 +0100 Subject: [PATCH 124/130] Added build parameter --with-profiling. --- CMakeLists.txt | 11 +++++++++-- build | 4 ++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fe5519d12..85ad15652 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,6 +21,7 @@ set(WITH_CUDA_ARCH "auto" CACHE STRING "Build for these CUDA architectures") option(WITH_OPENMP "Build with OpenMP support" ON) option(WITH_GMP "Build with GMP support" OFF) option(WITH_TESTS "Build tests" ON) +option(WITH_PROFILING "Enable code profiling compiler flags" OFF ) option(WITH_COVERAGE "Enable code coverage reports from unit tests" OFF) option(WITH_EXAMPLES "Compile the 'examples' directory" ON) option(WITH_TOOLS "Compile the 'src/Tools' directory" ON) @@ -74,7 +75,7 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") endif() # set Debug/Release options -set( CMAKE_CXX_FLAGS "-std=c++11 -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable -g" ) +set( CMAKE_CXX_FLAGS "-std=c++11 -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable" ) set( CMAKE_CXX_FLAGS_DEBUG "-g" ) set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG" ) #set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native 
-mtune=native -DNDEBUG -ftree-vectorizer-verbose=1 -ftree-vectorize -fopt-info-vec-missed -funroll-loops" ) @@ -229,7 +230,7 @@ if( ${WITH_CUDA} ) endif() endif() endif() - set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; ${CUDA_ARCH} -D_FORCE_INLINES --generate-line-info) + set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; ${CUDA_ARCH} -D_FORCE_INLINES ) # TODO: this is necessary only due to a bug in cmake set( CUDA_ADD_LIBRARY_OPTIONS -shared ) endif() @@ -243,6 +244,11 @@ if( OPENMP_FOUND AND ${WITH_OPENMP} ) set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_OPENMP ${OpenMP_CXX_FLAGS}" ) endif() +if( ${WITH_PROFILING} ) + set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g" ) + set( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-line-info") +endif() + find_package( DCMTK ) if( DCMTK_FOUND ) set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_DCMTK_H" ) @@ -414,6 +420,7 @@ message( " WITH_CUDA_ARCH = ${WITH_CUDA_ARCH}" ) message( " WITH_OPENMP = ${WITH_OPENMP}" ) message( " WITH_GMP = ${WITH_GMP}" ) message( " WITH_TESTS = ${WITH_TESTS}" ) +message( " WITH_PROFILING = ${WITH_PROFILING}" ) message( " WITH_COVERAGE = ${WITH_COVERAGE}" ) message( " WITH_EXAMPLES = ${WITH_EXAMPLES}" ) message( " WITH_TOOLS = ${WITH_TOOLS}" ) diff --git a/build b/build index f11dbffbc..c009a2608 100755 --- a/build +++ b/build @@ -22,6 +22,7 @@ WITH_CUDA_ARCH="auto" WITH_OPENMP="yes" WITH_GMP="no" WITH_TESTS="yes" +WITH_PROFILING="no" WITH_COVERAGE="no" WITH_EXAMPLES="yes" WITH_PYTHON="yes" @@ -57,6 +58,7 @@ do --with-openmp=* ) WITH_OPENMP="${option#*=}" ;; --with-gmp=* ) WITH_GMP="${option#*=}" ;; --with-tests=* ) WITH_TESTS="${option#*=}" ;; + --with-profiling=* ) WITH_PROFILING="${option#*=}" ;; --with-coverage=* ) WITH_COVERAGE="${option#*=}" ;; --with-examples=* ) WITH_EXAMPLES="${option#*=}" ;; --with-tools=* ) WITH_TOOLS="${option#*=}" ;; @@ -95,6 +97,7 @@ if [[ ${HELP} == "yes" ]]; then echo " --with-openmp=yes/no Enables OpenMP. 'yes' by default." 
echo " --with-gmp=yes/no Enables the wrapper for GNU Multiple Precision Arithmetic Library. 'no' by default." echo " --with-tests=yes/no Enables unit tests. 'yes' by default." + echo " --with-profiling=yes/no Enables code profiling compiler flags. 'no' by default." echo " --with-coverage=yes/no Enables code coverage reports for unit tests. 'no' by default (lcov is required)." echo " --with-examples=yes/no Compile the 'examples' directory. 'yes' by default." echo " --with-tools=yes/no Compile the 'src/Tools' directory. 'yes' by default." @@ -165,6 +168,7 @@ cmake_command=( -DWITH_OPENMP=${WITH_OPENMP} -DWITH_GMP=${WITH_GMP} -DWITH_TESTS=${WITH_TESTS} + -DWITH_PROFILING=${WITH_PROFILING} -DWITH_COVERAGE=${WITH_COVERAGE} -DWITH_EXAMPLES=${WITH_EXAMPLES} -DWITH_TOOLS=${WITH_TOOLS} -- GitLab From ad6afe25b4baaa69998ae4568709616b75059623 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 30 Jan 2019 21:28:37 +0100 Subject: [PATCH 125/130] Turned off the build of traverser benchmark until it is split into several files.
--- src/Benchmarks/Traversers/CMakeLists.txt | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/Benchmarks/Traversers/CMakeLists.txt b/src/Benchmarks/Traversers/CMakeLists.txt index a80487135..b4e830a33 100644 --- a/src/Benchmarks/Traversers/CMakeLists.txt +++ b/src/Benchmarks/Traversers/CMakeLists.txt @@ -1,10 +1,12 @@ -if( BUILD_CUDA ) - CUDA_ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cu ) - TARGET_LINK_LIBRARIES( tnl-benchmark-traversers tnl ${CUDA_cusparse_LIBRARY} ) -else() - ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cpp ) - TARGET_LINK_LIBRARIES( tnl-benchmark-traversers tnl ) -endif() -SET_TARGET_PROPERTIES( tnl-benchmark-traversers PROPERTIES COMPILE_OPTIONS "-g" ) +# TODO: Split the benchmark into several files for faster build -install( TARGETS tnl-benchmark-traversers RUNTIME DESTINATION bin ) +#if( BUILD_CUDA ) +# CUDA_ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cu ) +# TARGET_LINK_LIBRARIES( tnl-benchmark-traversers tnl ${CUDA_cusparse_LIBRARY} ) +#else() +# ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cpp ) +# TARGET_LINK_LIBRARIES( tnl-benchmark-traversers tnl ) +#endif() +#SET_TARGET_PROPERTIES( tnl-benchmark-traversers PROPERTIES COMPILE_OPTIONS "-g" ) + +#install( TARGETS tnl-benchmark-traversers RUNTIME DESTINATION bin ) -- GitLab From 12fd6d02575eb38de2d4112d2c5665ad2f5c4feb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 10 Feb 2019 13:04:42 +0100 Subject: [PATCH 126/130] Fixing benchmarks --- src/Benchmarks/Benchmarks.h | 4 ++-- src/Benchmarks/FunctionTimer.h | 2 +- src/Benchmarks/Traversers/tnl-benchmark-traversers.h | 12 ++++++++---- src/TNL/Config/ConfigEntry.h | 2 +- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index b05958f17..b58ea5007 100644 --- a/src/Benchmarks/Benchmarks.h +++ 
b/src/Benchmarks/Benchmarks.h @@ -81,11 +81,11 @@ public: void setup( const Config::ParameterContainer& parameters ) { - this->loops = parameters.getParameter< unsigned >( "loops" ); + this->loops = parameters.getParameter< int >( "loops" ); this->reset = parameters.getParameter< bool >( "reset" ); this->minTime = parameters.getParameter< double >( "min-time" ); this->timing = parameters.getParameter< bool >( "timing" ); - const int verbose = parameters.getParameter< unsigned >( "verbose" ); + const int verbose = parameters.getParameter< int >( "verbose" ); Logging::setVerbose( verbose ); } // TODO: ensure that this is not called in the middle of the benchmark diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h index 05b59d28a..6cef44aaf 100644 --- a/src/Benchmarks/FunctionTimer.h +++ b/src/Benchmarks/FunctionTimer.h @@ -16,6 +16,7 @@ #include #include +#include #include namespace TNL { @@ -102,7 +103,6 @@ class FunctionTimer timer.stop(); } } - std::cerr << loops << std::endl; if( timing ) return timer.getRealTime() / ( double ) loops; else diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 59441bbbb..38e22efeb 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -34,7 +34,10 @@ bool runBenchmark( const Config::ParameterContainer& parameters, Benchmark& benchmark, Benchmark::MetadataMap& metadata ) { - const Containers::List< String >& tests = parameters.getParameter< Containers::List< String > >( "tests" ); + // FIXME: the --tests is just a string because list does not work with enums +// const Containers::List< String >& tests = parameters.getParameter< Containers::List< String > >( "tests" ); + Containers::List< String > tests; + tests.Append( parameters.getParameter< String >( "tests" ) ); // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(), 
// which have a default value. The workaround below works for int values, but it is not possible // to pass 64-bit integer values @@ -258,7 +261,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters, } std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl; } - return true; /**** @@ -391,7 +393,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters, void setupConfig( Config::ConfigDescription& config ) { - config.addList< String >( "tests", "Tests to be performed.", "all" ); + // FIXME: addList does not work with addEntryEnum - ConfigDescription::addEntryEnum throws std::bad_cast +// config.addList< String >( "tests", "Tests to be performed.", "all" ); + config.addEntry< String >( "tests", "Tests to be performed.", "all" ); config.addEntryEnum( "all" ); config.addEntryEnum( "add-one-pure-c" ); config.addEntryEnum( "add-one-parallel-for" ); @@ -433,7 +437,7 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) const String & logFileName = parameters.getParameter< String >( "log-file" ); const String & outputMode = parameters.getParameter< String >( "output-mode" ); const String & precision = parameters.getParameter< String >( "precision" ); - const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" ); + const int sizeStepFactor = parameters.getParameter< int >( "size-step-factor" ); Benchmark benchmark; //( loops, verbose ); benchmark.setup( parameters ); diff --git a/src/TNL/Config/ConfigEntry.h b/src/TNL/Config/ConfigEntry.h index 1608a5b4b..1b56574cc 100644 --- a/src/TNL/Config/ConfigEntry.h +++ b/src/TNL/Config/ConfigEntry.h @@ -61,7 +61,7 @@ struct ConfigEntry : public ConfigEntryBase String printDefaultValue() const { return convertToString( defaultValue ); - }; + } std::vector< EntryType >& getEnumValues() { -- GitLab From 078ad1543b4c5810882617e14b3b08f5f5e31de4 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 10 Feb 2019 10:43:25 +0100 Subject: [PATCH 127/130] Disabled unused parameters in tnl-benchmark-traversers --- .../Traversers/tnl-benchmark-traversers.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 38e22efeb..32b5dc1e6 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -415,14 +415,14 @@ void setupConfig( Config::ConfigDescription& config ) config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); config.addEntryEnum( "append" ); config.addEntryEnum( "overwrite" ); - config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); - config.addEntryEnum( "float" ); - config.addEntryEnum( "double" ); - config.addEntryEnum( "all" ); +// config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); +// config.addEntryEnum( "float" ); +// config.addEntryEnum( "double" ); +// config.addEntryEnum( "all" ); config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 ); config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 ); config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 ); - config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 ); +// config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. 
First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 ); Benchmark::configSetup( config ); @@ -436,8 +436,8 @@ bool setupBenchmark( const Config::ParameterContainer& parameters ) { const String & logFileName = parameters.getParameter< String >( "log-file" ); const String & outputMode = parameters.getParameter< String >( "output-mode" ); - const String & precision = parameters.getParameter< String >( "precision" ); - const int sizeStepFactor = parameters.getParameter< int >( "size-step-factor" ); +// const String & precision = parameters.getParameter< String >( "precision" ); +// const int sizeStepFactor = parameters.getParameter< int >( "size-step-factor" ); Benchmark benchmark; //( loops, verbose ); benchmark.setup( parameters ); -- GitLab From 336bddf83737bc11d6b4e2cc1b51cc9611e18822 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 10 Feb 2019 10:51:16 +0100 Subject: [PATCH 128/130] Removed useless HAVE_CUDA from tnl-benchmark-traversers --- .../Traversers/tnl-benchmark-traversers.h | 43 ++----------------- 1 file changed, 4 insertions(+), 39 deletions(-) diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 32b5dc1e6..3552f9090 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -60,21 +60,17 @@ bool runBenchmark( const Config::ParameterContainer& parameters, for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size ); -#ifdef HAVE_CUDA GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size ); -#endif auto hostReset = [&]() { hostTraverserBenchmark.reset(); }; -#ifdef HAVE_CUDA auto cudaReset = [&]() { cudaTraverserBenchmark.reset(); }; -#endif benchmark.setMetadataColumns( Benchmark::MetadataColumns( @@ -100,7 
+96,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters, benchmark.addErrorMessage( "Test results are not correct." ); } -#ifdef HAVE_CUDA auto cudaWriteOneUsingPureC = [&] () { cudaTraverserBenchmark.addOneUsingPureC(); @@ -113,7 +108,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters, benchmark.isResetingOn() ) ) benchmark.addErrorMessage( "Test results are not correct." ); } -#endif } /**** @@ -136,7 +130,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters, benchmark.addErrorMessage( "Test results are not correct." ); } -#ifdef HAVE_CUDA auto cudaWriteOneUsingParallelFor = [&] () { cudaTraverserBenchmark.addOneUsingParallelFor(); @@ -149,8 +142,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters, benchmark.isResetingOn() ) ) benchmark.addErrorMessage( "Test results are not correct." ); } - -#endif } /**** @@ -172,7 +163,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters, benchmark.addErrorMessage( "Test results are not correct." ); } -#ifdef HAVE_CUDA auto cudaAddOneUsingSimpleCell = [&] () { cudaTraverserBenchmark.addOneUsingSimpleCell(); @@ -185,8 +175,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters, benchmark.isResetingOn() ) ) benchmark.addErrorMessage( "Test results are not correct." ); } - -#endif } /**** @@ -208,7 +196,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters, benchmark.addErrorMessage( "Test results are not correct." ); } -#ifdef HAVE_CUDA auto cudaAddOneUsingParallelForAndMeshFunction = [&] () { cudaTraverserBenchmark.addOneUsingParallelForAndMeshFunction(); @@ -221,8 +208,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters, benchmark.isResetingOn() ) ) benchmark.addErrorMessage( "Test results are not correct." ); } -#endif - } /**** @@ -244,7 +229,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters, benchmark.addErrorMessage( "Test results are not correct." 
); } -#ifdef HAVE_CUDA auto cudaWriteOneUsingTraverser = [&] () { cudaTraverserBenchmark.addOneUsingTraverser(); @@ -257,7 +241,6 @@ bool runBenchmark( const Config::ParameterContainer& parameters, benchmark.isResetingOn() ) ) benchmark.addErrorMessage( "Test results are not correct." ); } -#endif } std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl; } @@ -277,12 +260,10 @@ bool runBenchmark( const Config::ParameterContainer& parameters, hostTraverserBenchmark.reset(); }; -#ifdef HAVE_CUDA auto cudaReset = [&]() { cudaTraverserBenchmark.reset(); }; -#endif benchmark.setMetadataColumns( Benchmark::MetadataColumns( @@ -296,30 +277,24 @@ bool runBenchmark( const Config::ParameterContainer& parameters, hostTraverserBenchmark.traverseUsingPureC(); }; -#ifdef HAVE_CUDA auto cudaTraverseUsingPureC = [&] () { cudaTraverserBenchmark.traverseUsingPureC(); }; -#endif if( tests.containsValue( "all" ) || tests.containsValue( "bc-pure-c" ) ) { benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); if( withHost ) benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC ); -#ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingPureC ); -#endif benchmark.setOperation( "Pure C RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); if( withHost ) benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC ); -#ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingPureC ); -#endif } /**** @@ -330,30 +305,24 @@ bool runBenchmark( const Config::ParameterContainer& parameters, hostTraverserBenchmark.addOneUsingParallelFor(); }; -#ifdef HAVE_CUDA auto cudaTraverseUsingParallelFor = [&] () { cudaTraverserBenchmark.addOneUsingParallelFor(); }; -#endif if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) ) 
{ benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); if( withHost ) benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor ); -#ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor ); -#endif benchmark.setOperation( "parallel for RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); if( withHost ) benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor ); -#ifdef HAVE_CUDA if( withCuda ) benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor ); -#endif } /**** @@ -364,28 +333,24 @@ bool runBenchmark( const Config::ParameterContainer& parameters, hostTraverserBenchmark.addOneUsingTraverser(); }; -#ifdef HAVE_CUDA auto cudaTraverseUsingTraverser = [&] () { cudaTraverserBenchmark.addOneUsingTraverser(); }; -#endif if( tests.containsValue( "all" ) || tests.containsValue( "bc-traverser" ) ) { benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); if( withHost ) benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser ); -#ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser ); -#endif + if( withCuda ) + benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser ); benchmark.setOperation( "traverser RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); if( withHost ) benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser ); -#ifdef HAVE_CUDA - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser ); -#endif + if( withCuda ) + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser ); } } return true; -- GitLab From 2c674bba2678ddaf8887c37f13dbf28b31d3ecc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 10 Feb 2019 13:05:20 +0100 Subject: 
[PATCH 129/130] Fixing benchmarks for traversers with BC --- .../Traversers/AddTwoEntitiesProcessor.h | 43 +++++++++++++++ .../Traversers/GridTraversersBenchmark.h | 2 + .../Traversers/GridTraversersBenchmark_1D.h | 6 ++- .../Traversers/GridTraversersBenchmark_2D.h | 9 ++-- .../Traversers/GridTraversersBenchmark_3D.h | 10 ++-- .../Traversers/tnl-benchmark-traversers.h | 53 ++++++++++--------- 6 files changed, 87 insertions(+), 36 deletions(-) create mode 100644 src/Benchmarks/Traversers/AddTwoEntitiesProcessor.h diff --git a/src/Benchmarks/Traversers/AddTwoEntitiesProcessor.h b/src/Benchmarks/Traversers/AddTwoEntitiesProcessor.h new file mode 100644 index 000000000..94f6d5807 --- /dev/null +++ b/src/Benchmarks/Traversers/AddTwoEntitiesProcessor.h @@ -0,0 +1,43 @@ +/*************************************************************************** + AddTwoEntitiesProcessor.h - description + ------------------- + begin : Jan 5, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +template< typename TraverserUserData > +class AddTwoEntitiesProcessor +{ + public: + + using MeshType = typename TraverserUserData::MeshType; + using DeviceType = typename MeshType::DeviceType; + using RealType = typename MeshType::RealType; + + template< typename GridEntity > + __cuda_callable__ + static inline void processEntity( const MeshType& mesh, + TraverserUserData& userData, + const GridEntity& entity ) + { + auto& u = *userData.u; + u( entity ) += ( RealType ) 2.0; + } +}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h index be4f41d31..72ca102bc 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -22,6 +22,8 @@ #include #include +#include "AddOneEntitiesProcessor.h" +#include "AddTwoEntitiesProcessor.h" #include "GridTraverserBenchmarkHelper.h" #include "BenchmarkTraverserUserData.h" #include "cuda-kernels.h" diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h index fb79acfc8..94f8fa0d2 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -48,6 +48,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index > using Traverser = Meshes::Traverser< GridType, CellType >; using UserDataType = BenchmarkTraverserUserData< MeshFunction >; using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + using AddTwoEntitiesProcessorType = AddTwoEntitiesProcessor< UserDataType >; GridTraversersBenchmark( Index size ) :size( size ), v( size ), grid( size ), u( grid ), @@ -199,8 +200,9 @@ class GridTraversersBenchmark< 1, Device, Real, Index > void traverseUsingTraverser() { - // TODO !!!!!!!!!!!!!!!!!!!!!! 
- traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > + traverser.template processBoundaryEntities< UserDataType, AddTwoEntitiesProcessorType > + ( grid, userData ); + traverser.template processInteriorEntities< UserDataType, AddOneEntitiesProcessorType > ( grid, userData ); } diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h index a707d0e9c..803e598a4 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h @@ -22,6 +22,7 @@ #include #include #include "cuda-kernels.h" +#include "GridTraversersBenchmark.h" #include "SimpleCell.h" namespace TNL { @@ -46,6 +47,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > using Traverser = Meshes::Traverser< GridType, CellType >; using UserDataType = BenchmarkTraverserUserData< MeshFunction >; using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + using AddTwoEntitiesProcessorType = AddTwoEntitiesProcessor< UserDataType >; GridTraversersBenchmark( Index size ) :size( size ), v( size * size ), grid( size, size ), u( grid ), @@ -246,10 +248,11 @@ class GridTraversersBenchmark< 2, Device, Real, Index > } } - void traversingUsingTraverser() + void traverseUsingTraverser() { - // TODO !!!!!!!!!!!!!!!!!!!!!! 
- traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > + traverser.template processBoundaryEntities< UserDataType, AddTwoEntitiesProcessorType > + ( grid, userData ); + traverser.template processInteriorEntities< UserDataType, AddOneEntitiesProcessorType > ( grid, userData ); } diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h index 833c15126..b7abb8b29 100644 --- a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h @@ -21,10 +21,8 @@ #include #include #include - #include "cuda-kernels.h" -#include "AddOneEntitiesProcessor.h" -#include "BenchmarkTraverserUserData.h" +#include "GridTraversersBenchmark.h" #include "SimpleCell.h" namespace TNL { @@ -49,6 +47,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > using Traverser = Meshes::Traverser< GridType, CellType >; using UserDataType = BenchmarkTraverserUserData< MeshFunction >; using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + using AddTwoEntitiesProcessorType = AddTwoEntitiesProcessor< UserDataType >; GridTraversersBenchmark( Index size ) : size( size ), @@ -258,8 +257,9 @@ class GridTraversersBenchmark< 3, Device, Real, Index > void traverseUsingTraverser() { - // TODO !!!!!!!!!!!!!!!!!!!!!! 
- traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > + traverser.template processBoundaryEntities< UserDataType, AddTwoEntitiesProcessorType > + ( grid, userData ); + traverser.template processInteriorEntities< UserDataType, AddOneEntitiesProcessorType > ( grid, userData ); } diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index 3552f9090..7b44b9eb0 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -300,42 +300,43 @@ bool runBenchmark( const Config::ParameterContainer& parameters, /**** * Write one and two (as BC) using parallel for */ - auto hostTraverseUsingParallelFor = [&] () - { - hostTraverserBenchmark.addOneUsingParallelFor(); - }; - - auto cudaTraverseUsingParallelFor = [&] () - { - cudaTraverserBenchmark.addOneUsingParallelFor(); - }; - - if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) ) - { - benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - if( withHost ) - benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor ); - if( withCuda ) - benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor ); - - benchmark.setOperation( "parallel for RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - if( withHost ) - benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor ); - if( withCuda ) - benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor ); - } +// TODO: implement the benchmark (addOneUsingParallelFor does not consider BC) +// auto hostTraverseUsingParallelFor = [&] () +// { +// hostTraverserBenchmark.addOneUsingParallelFor(); +// }; +// +// auto cudaTraverseUsingParallelFor = [&] () +// { +// cudaTraverserBenchmark.addOneUsingParallelFor(); +// }; +// +// if( 
tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) ) +// { +// benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); +// if( withHost ) +// benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor ); +// if( withCuda ) +// benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor ); +// +// benchmark.setOperation( "parallel for RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); +// if( withHost ) +// benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor ); +// if( withCuda ) +// benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor ); +// } /**** * Write one and two (as BC) using traverser */ auto hostTraverseUsingTraverser = [&] () { - hostTraverserBenchmark.addOneUsingTraverser(); + hostTraverserBenchmark.traverseUsingTraverser(); }; auto cudaTraverseUsingTraverser = [&] () { - cudaTraverserBenchmark.addOneUsingTraverser(); + cudaTraverserBenchmark.traverseUsingTraverser(); }; if( tests.containsValue( "all" ) || tests.containsValue( "bc-traverser" ) ) -- GitLab From c11141216ca4458cb9c52ea7480a39324c906853 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 10 Feb 2019 13:29:13 +0100 Subject: [PATCH 130/130] Removed TARGET_LINK_LIBRARIES and SET_TARGET_PROPERTIES from CMakeLists.txt in the benchmark of traversers [ci skip] --- src/Benchmarks/Traversers/CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/Benchmarks/Traversers/CMakeLists.txt b/src/Benchmarks/Traversers/CMakeLists.txt index b4e830a33..8e7c12d45 100644 --- a/src/Benchmarks/Traversers/CMakeLists.txt +++ b/src/Benchmarks/Traversers/CMakeLists.txt @@ -2,11 +2,8 @@ #if( BUILD_CUDA ) # CUDA_ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cu ) -# TARGET_LINK_LIBRARIES( tnl-benchmark-traversers tnl ${CUDA_cusparse_LIBRARY} ) #else() # ADD_EXECUTABLE( 
tnl-benchmark-traversers tnl-benchmark-traversers.cpp ) -# TARGET_LINK_LIBRARIES( tnl-benchmark-traversers tnl ) #endif() -#SET_TARGET_PROPERTIES( tnl-benchmark-traversers PROPERTIES COMPILE_OPTIONS "-g" ) #install( TARGETS tnl-benchmark-traversers RUNTIME DESTINATION bin ) -- GitLab