diff --git a/CMakeLists.txt b/CMakeLists.txt index 348ad4ac27dc1a8294201591288660e23a363595..85ad156528e68100d2c2953b7e3f28e857cc1969 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,6 +21,7 @@ set(WITH_CUDA_ARCH "auto" CACHE STRING "Build for these CUDA architectures") option(WITH_OPENMP "Build with OpenMP support" ON) option(WITH_GMP "Build with GMP support" OFF) option(WITH_TESTS "Build tests" ON) +option(WITH_PROFILING "Enable code profiling compiler flags" OFF ) option(WITH_COVERAGE "Enable code coverage reports from unit tests" OFF) option(WITH_EXAMPLES "Compile the 'examples' directory" ON) option(WITH_TOOLS "Compile the 'src/Tools' directory" ON) @@ -243,6 +244,11 @@ if( OPENMP_FOUND AND ${WITH_OPENMP} ) set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_OPENMP ${OpenMP_CXX_FLAGS}" ) endif() +if( ${WITH_PROFILING} ) + set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g" ) + set( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-line-info") +endif() + find_package( DCMTK ) if( DCMTK_FOUND ) set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_DCMTK_H" ) @@ -414,6 +420,7 @@ message( " WITH_CUDA_ARCH = ${WITH_CUDA_ARCH}" ) message( " WITH_OPENMP = ${WITH_OPENMP}" ) message( " WITH_GMP = ${WITH_GMP}" ) message( " WITH_TESTS = ${WITH_TESTS}" ) +message( " WITH_PROFILING = ${WITH_PROFILING}" ) message( " WITH_COVERAGE = ${WITH_COVERAGE}" ) message( " WITH_EXAMPLES = ${WITH_EXAMPLES}" ) message( " WITH_TOOLS = ${WITH_TOOLS}" ) diff --git a/build b/build index f11dbffbc1ed3b23bfb1ef514cc21755ed0283d8..c009a2608423e52bdb71bee1841cde5efd54377b 100755 --- a/build +++ b/build @@ -22,6 +22,7 @@ WITH_CUDA_ARCH="auto" WITH_OPENMP="yes" WITH_GMP="no" WITH_TESTS="yes" +WITH_PROFILING="no" WITH_COVERAGE="no" WITH_EXAMPLES="yes" WITH_PYTHON="yes" @@ -57,6 +58,7 @@ do --with-openmp=* ) WITH_OPENMP="${option#*=}" ;; --with-gmp=* ) WITH_GMP="${option#*=}" ;; --with-tests=* ) WITH_TESTS="${option#*=}" ;; + --with-profiling=* ) WITH_PROFILING="${option#*=}" ;; --with-coverage=* ) WITH_COVERAGE="${option#*=}" ;; --with-examples=* ) WITH_EXAMPLES="${option#*=}" ;; --with-tools=* ) WITH_TOOLS="${option#*=}" ;; @@ -95,6 +97,7 @@ if [[ ${HELP} == "yes" ]]; then echo " --with-openmp=yes/no Enables OpenMP. 'yes' by default." echo " --with-gmp=yes/no Enables the wrapper for GNU Multiple Precision Arithmetic Library. 'no' by default." echo " --with-tests=yes/no Enables unit tests. 'yes' by default." + echo " --with-profiling=yes/no Enables code profiling compiler falgs. 'no' by default." echo " --with-coverage=yes/no Enables code coverage reports for unit tests. 'no' by default (lcov is required)." echo " --with-examples=yes/no Compile the 'examples' directory. 'yes' by default." echo " --with-tools=yes/no Compile the 'src/Tools' directory. 'yes' by default." @@ -165,6 +168,7 @@ cmake_command=( -DWITH_OPENMP=${WITH_OPENMP} -DWITH_GMP=${WITH_GMP} -DWITH_TESTS=${WITH_TESTS} + -DWITH_PROFILING=${WITH_PROFILING} -DWITH_COVERAGE=${WITH_COVERAGE} -DWITH_EXAMPLES=${WITH_EXAMPLES} -DWITH_TOOLS=${WITH_TOOLS} diff --git a/src/Benchmarks/BLAS/array-operations.h b/src/Benchmarks/BLAS/array-operations.h index 9ee6ff8a0699bbc96c63e34942f9ee890d88685f..b5cf9ff58da754aaa4c8ce645989afbb25e61f7c 100644 --- a/src/Benchmarks/BLAS/array-operations.h +++ b/src/Benchmarks/BLAS/array-operations.h @@ -72,9 +72,9 @@ benchmarkArrayOperations( Benchmark & benchmark, resultDevice = (int) deviceArray == deviceArray2; }; benchmark.setOperation( "comparison (operator==)", 2 * datasetSize ); - benchmark.time( reset1, "CPU", compareHost ); + benchmark.time< Devices::Host >( reset1, "CPU", compareHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", compareCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", compareCuda ); #endif @@ -87,9 +87,9 @@ benchmarkArrayOperations( Benchmark & benchmark, benchmark.setOperation( "copy (operator=)", 2 * datasetSize ); // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will // complain when compiling without CUDA - const double copyBasetime = benchmark.time( reset1, "CPU", copyAssignHostHost ); + const double copyBasetime = benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", copyAssignCudaCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", copyAssignCudaCuda ); #endif @@ -101,8 +101,8 @@ benchmarkArrayOperations( Benchmark & benchmark, }; #ifdef HAVE_CUDA benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime ); - benchmark.time( reset1, "CPU->GPU", copyAssignHostCuda ); - benchmark.time( reset1, "GPU->CPU", copyAssignCudaHost ); + benchmark.time< Devices::Cuda >( reset1, "CPU->GPU", copyAssignHostCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU->CPU", copyAssignCudaHost ); #endif @@ -113,9 +113,9 @@ benchmarkArrayOperations( Benchmark & benchmark, deviceArray.setValue( 3.0 ); }; benchmark.setOperation( "setValue", datasetSize ); - benchmark.time( reset1, "CPU", setValueHost ); + benchmark.time< Devices::Host >( reset1, "CPU", setValueHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", setValueCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", setValueCuda ); #endif @@ -132,9 +132,9 @@ benchmarkArrayOperations( Benchmark & benchmark, #endif }; benchmark.setOperation( "allocation (setSize)", datasetSize ); - benchmark.time( resetSize1, "CPU", setSizeHost ); + benchmark.time< Devices::Host >( resetSize1, "CPU", setSizeHost ); #ifdef HAVE_CUDA - benchmark.time( resetSize1, "GPU", setSizeCuda ); + benchmark.time< Devices::Cuda >( resetSize1, "GPU", setSizeCuda ); #endif @@ -151,9 +151,9 @@ benchmarkArrayOperations( Benchmark & benchmark, #endif }; benchmark.setOperation( "deallocation (reset)", datasetSize ); - benchmark.time( setSize1, "CPU", resetSizeHost ); + benchmark.time< Devices::Host >( setSize1, "CPU", resetSizeHost ); #ifdef HAVE_CUDA - benchmark.time( setSize1, "GPU", resetSizeCuda ); + benchmark.time< Devices::Cuda >( setSize1, "GPU", resetSizeCuda ); #endif return true; diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h index 5c3813b0a9f1798582ce2ee8a04f52ae4ee78408..966a4ec06bc90d8b4e220a2f4e49108c7cb0ece1 100644 --- a/src/Benchmarks/BLAS/spmv.h +++ b/src/Benchmarks/BLAS/spmv.h @@ -161,9 +161,9 @@ benchmarkSpMV( Benchmark & benchmark, }; benchmark.setOperation( datasetSize ); - benchmark.time( reset, "CPU", spmvHost ); + benchmark.time< Devices::Host >( reset, "CPU", spmvHost ); #ifdef HAVE_CUDA - benchmark.time( reset, "GPU", spmvCuda ); + benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda ); #endif return true; diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h index b9a68d618b1c7caf5eb47a21e630143927367e54..e191b8fbb9a7949b7aba399591c5b2928909d893 100644 --- a/src/Benchmarks/BLAS/vector-operations.h +++ b/src/Benchmarks/BLAS/vector-operations.h @@ -64,7 +64,7 @@ benchmarkVectorOperations( Benchmark & benchmark, deviceVector.setValue( 1.0 ); #endif // A relatively harmless call to keep the compiler from realizing we - // don't actually do any useful work with the result of the reduciton. + // don't actually do any useful work with the result of the reduction. srand48(resultHost); resultHost = resultDevice = 0.0; }; @@ -90,9 +90,9 @@ benchmarkVectorOperations( Benchmark & benchmark, resultDevice = deviceVector.max(); }; benchmark.setOperation( "max", datasetSize ); - benchmark.time( reset1, "CPU", maxHost ); + benchmark.time< Devices::Host >( reset1, "CPU", maxHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", maxCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", maxCuda ); #endif @@ -103,9 +103,9 @@ benchmarkVectorOperations( Benchmark & benchmark, resultDevice = deviceVector.min(); }; benchmark.setOperation( "min", datasetSize ); - benchmark.time( reset1, "CPU", minHost ); + benchmark.time< Devices::Host >( reset1, "CPU", minHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", minCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", minCuda ); #endif @@ -125,10 +125,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "absMax", datasetSize ); - benchmark.time( reset1, "CPU", absMaxHost ); + benchmark.time< Devices::Host >( reset1, "CPU", absMaxHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", absMaxCuda ); - benchmark.time( reset1, "cuBLAS", absMaxCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", absMaxCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMaxCublas ); #endif @@ -148,10 +148,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "absMin", datasetSize ); - benchmark.time( reset1, "CPU", absMinHost ); + benchmark.time< Devices::Host >( reset1, "CPU", absMinHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", absMinCuda ); - benchmark.time( reset1, "cuBLAS", absMinCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", absMinCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMinCublas ); #endif @@ -162,9 +162,9 @@ benchmarkVectorOperations( Benchmark & benchmark, resultDevice = deviceVector.sum(); }; benchmark.setOperation( "sum", datasetSize ); - benchmark.time( reset1, "CPU", sumHost ); + benchmark.time< Devices::Host >( reset1, "CPU", sumHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", sumCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", sumCuda ); #endif @@ -182,10 +182,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "l1 norm", datasetSize ); - benchmark.time( reset1, "CPU", l1normHost ); + benchmark.time< Devices::Host >( reset1, "CPU", l1normHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", l1normCuda ); - benchmark.time( reset1, "cuBLAS", l1normCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", l1normCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l1normCublas ); #endif @@ -203,10 +203,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "l2 norm", datasetSize ); - benchmark.time( reset1, "CPU", l2normHost ); + benchmark.time< Devices::Host >( reset1, "CPU", l2normHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", l2normCuda ); - benchmark.time( reset1, "cuBLAS", l2normCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", l2normCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l2normCublas ); #endif @@ -217,9 +217,9 @@ benchmarkVectorOperations( Benchmark & benchmark, resultDevice = deviceVector.lpNorm( 3.0 ); }; benchmark.setOperation( "l3 norm", datasetSize ); - benchmark.time( reset1, "CPU", l3normHost ); + benchmark.time< Devices::Host >( reset1, "CPU", l3normHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", l3normCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", l3normCuda ); #endif @@ -238,10 +238,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "scalar product", 2 * datasetSize ); - benchmark.time( reset1, "CPU", scalarProductHost ); + benchmark.time< Devices::Host >( reset1, "CPU", scalarProductHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", scalarProductCuda ); - benchmark.time( reset1, "cuBLAS", scalarProductCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", scalarProductCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", scalarProductCublas ); #endif /* @@ -289,10 +289,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "scalar multiplication", 2 * datasetSize ); - benchmark.time( reset1, "CPU", multiplyHost ); + benchmark.time< Devices::Host >( reset1, "CPU", multiplyHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", multiplyCuda ); - benchmark.time( reset1, "cuBLAS", multiplyCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", multiplyCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", multiplyCublas ); #endif @@ -312,10 +312,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "vector addition", 3 * datasetSize ); - benchmark.time( reset1, "CPU", addVectorHost ); + benchmark.time< Devices::Host >( reset1, "CPU", addVectorHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", addVectorCuda ); - benchmark.time( reset1, "cuBLAS", addVectorCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", addVectorCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", addVectorCublas ); #endif diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 559e27ee26fe6c03d1df5d3d76584376f6f0f7ed..b58ea50077161a23b938a898fab39311d18d5af6 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -8,24 +8,25 @@ /* See Copyright Notice in tnl/Copyright */ -// Implemented by: Jakub Klinkovsky +// Implemented by: Jakub Klinkovsky, +// Tomas Oberhuber #pragma once +#include "FunctionTimer.h" +#include "Logging.h" + #include #include -#include -#include #include #include -#include #include -#include #include #include #include +#include #include namespace TNL { @@ -33,252 +34,6 @@ namespace Benchmarks { const double oneGB = 1024.0 * 1024.0 * 1024.0; -template< typename ComputeFunction, - typename ResetFunction, - typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > -double -timeFunction( ComputeFunction compute, - ResetFunction reset, - int loops, - Monitor && monitor = Monitor() ) -{ - // the timer is constructed zero-initialized and stopped - Timer timer; - - // set timer to the monitor - monitor.setTimer( timer ); - - // warm up - reset(); - compute(); - - for(int i = 0; i < loops; ++i) { - // abuse the monitor's "time" for loops - monitor.setTime( i + 1 ); - - reset(); - - // Explicit synchronization of the CUDA device - // TODO: not necessary for host computations -#ifdef HAVE_CUDA - cudaDeviceSynchronize(); -#endif - timer.start(); - compute(); -#ifdef HAVE_CUDA - cudaDeviceSynchronize(); -#endif - timer.stop(); - } - - return timer.getRealTime() / loops; -} - - -class Logging -{ -public: - using MetadataElement = std::pair< const char*, String >; - using MetadataMap = std::map< const char*, String >; - using MetadataColumns = std::vector; - - using HeaderElements = std::vector< String >; - using RowElements = std::vector< double >; - - Logging( bool verbose = true ) - : verbose(verbose) - {} - - void - writeTitle( const String & title ) - { - if( verbose ) - std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl; - log << ": title = " << title << std::endl; - } - - void - writeMetadata( const MetadataMap & metadata ) - { - if( verbose ) - std::cout << "properties:" << std::endl; - - for( auto & it : metadata ) { - if( verbose ) - std::cout << " " << it.first << " = " << it.second << std::endl; - log << ": " << it.first << " = " << it.second << std::endl; - } - if( verbose ) - std::cout << std::endl; - } - - void - writeTableHeader( const String & spanningElement, - const HeaderElements & subElements ) - { - if( verbose && header_changed ) { - for( auto & it : metadataColumns ) { - std::cout << std::setw( 20 ) << it.first; - } - - // spanning element is printed as usual column to stdout, - // but is excluded from header - std::cout << std::setw( 15 ) << ""; - - for( auto & it : subElements ) { - std::cout << std::setw( 15 ) << it; - } - std::cout << std::endl; - - header_changed = false; - } - - // initial indent string - header_indent = "!"; - log << std::endl; - for( auto & it : metadataColumns ) { - log << header_indent << " " << it.first << std::endl; - } - - // dump stacked spanning columns - if( horizontalGroups.size() > 0 ) - while( horizontalGroups.back().second <= 0 ) { - horizontalGroups.pop_back(); - header_indent.pop_back(); - } - for( size_t i = 0; i < horizontalGroups.size(); i++ ) { - if( horizontalGroups[ i ].second > 0 ) { - log << header_indent << " " << horizontalGroups[ i ].first << std::endl; - header_indent += "!"; - } - } - - log << header_indent << " " << spanningElement << std::endl; - for( auto & it : subElements ) { - log << header_indent << "! " << it << std::endl; - } - - if( horizontalGroups.size() > 0 ) { - horizontalGroups.back().second--; - header_indent.pop_back(); - } - } - - void - writeTableRow( const String & spanningElement, - const RowElements & subElements ) - { - if( verbose ) { - for( auto & it : metadataColumns ) { - std::cout << std::setw( 20 ) << it.second; - } - // spanning element is printed as usual column to stdout - std::cout << std::setw( 15 ) << spanningElement; - for( auto & it : subElements ) { - std::cout << std::setw( 15 ); - if( it != 0.0 )std::cout << it; - else std::cout << "N/A"; - } - std::cout << std::endl; - } - - // only when changed (the header has been already adjusted) - // print each element on separate line - for( auto & it : metadataColumns ) { - log << it.second << std::endl; - } - - // benchmark data are indented - const String indent = " "; - for( auto & it : subElements ) { - if( it != 0.0 ) log << indent << it << std::endl; - else log << indent << "N/A" << std::endl; - } - } - - void - writeErrorMessage( const char* msg, - int colspan = 1 ) - { - // initial indent string - header_indent = "!"; - log << std::endl; - for( auto & it : metadataColumns ) { - log << header_indent << " " << it.first << std::endl; - } - - // make sure there is a header column for the message - if( horizontalGroups.size() == 0 ) - horizontalGroups.push_back( {"", 1} ); - - // dump stacked spanning columns - while( horizontalGroups.back().second <= 0 ) { - horizontalGroups.pop_back(); - header_indent.pop_back(); - } - for( size_t i = 0; i < horizontalGroups.size(); i++ ) { - if( horizontalGroups[ i ].second > 0 ) { - log << header_indent << " " << horizontalGroups[ i ].first << std::endl; - header_indent += "!"; - } - } - if( horizontalGroups.size() > 0 ) { - horizontalGroups.back().second -= colspan; - header_indent.pop_back(); - } - - // only when changed (the header has been already adjusted) - // print each element on separate line - for( auto & it : metadataColumns ) { - log << it.second << std::endl; - } - log << msg << std::endl; - } - - void - closeTable() - { - log << std::endl; - header_indent = body_indent = ""; - header_changed = true; - horizontalGroups.clear(); - } - - bool save( std::ostream & logFile ) - { - closeTable(); - logFile << log.str(); - if( logFile.good() ) { - log.str() = ""; - return true; - } - return false; - } - -protected: - - // manual double -> String conversion with fixed precision - static String - _to_string( double num, int precision = 0, bool fixed = false ) - { - std::stringstream str; - if( fixed ) - str << std::fixed; - if( precision ) - str << std::setprecision( precision ); - str << num; - return String( str.str().data() ); - } - - std::stringstream log; - std::string header_indent; - std::string body_indent; - - bool verbose; - MetadataColumns metadataColumns; - bool header_changed = true; - std::vector< std::pair< String, int > > horizontalGroups; -}; struct BenchmarkResult @@ -309,12 +64,30 @@ public: using Logging::MetadataElement; using Logging::MetadataMap; using Logging::MetadataColumns; - + Benchmark( int loops = 10, bool verbose = true ) : Logging(verbose), loops(loops) {} + + static void configSetup( Config::ConfigDescription& config ) + { + config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); + config.addEntry< bool >( "reset", "Call reset function between loops.", true ); + config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 0.0 ); + config.addEntry< bool >( "timing", "Turns off (or on) the timing (for the purpose of profiling).", true ); + config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 ); + } + void setup( const Config::ParameterContainer& parameters ) + { + this->loops = parameters.getParameter< int >( "loops" ); + this->reset = parameters.getParameter< bool >( "reset" ); + this->minTime = parameters.getParameter< double >( "min-time" ); + this->timing = parameters.getParameter< bool >( "timing" ); + const int verbose = parameters.getParameter< int >( "verbose" ); + Logging::setVerbose( verbose ); + } // TODO: ensure that this is not called in the middle of the benchmark // (or just remove it completely?) void @@ -322,6 +95,11 @@ public: { this->loops = loops; } + + void setMinTime( const double& minTime ) + { + this->minTime = minTime; + } // Marks the start of a new benchmark void @@ -338,8 +116,11 @@ public: { closeTable(); writeTitle( title ); - // add loops to metadata + // add loops and reset flag to metadata metadata["loops"] = convertToString(loops); + metadata["reset"] = convertToString( reset ); + metadata["minimal test time"] = convertToString( minTime ); + metadata["timing"] = convertToString( timing ); writeMetadata( metadata ); } @@ -411,7 +192,8 @@ public: // "speedup" columns. // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation) // Also terminates the recursion of the following variadic template. - template< typename ResetFunction, + template< typename Device, + typename ResetFunction, typename ComputeFunction > double time( ResetFunction reset, @@ -420,15 +202,35 @@ public: BenchmarkResult & result ) { result.time = std::numeric_limits::quiet_NaN(); + FunctionTimer< Device > functionTimer; try { - if( verbose ) { + if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); - result.time = timeFunction( compute, reset, loops, monitor ); + if( this->timing ) + if( this->reset ) + result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor ); + else + result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); + else + if( this->reset ) + result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor ); + else + result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); } else { - result.time = timeFunction( compute, reset, loops, monitor ); + if( this->timing ) + if( this->reset ) + result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor ); + else + result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); + else + if( this->reset ) + result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor ); + else + result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); } + this->performedLoops = functionTimer.getPerformedLoops(); } catch ( const std::exception& e ) { std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl; @@ -445,7 +247,8 @@ public: return this->baseTime; } - template< typename ResetFunction, + template< typename Device, + typename ResetFunction, typename ComputeFunction, typename... NextComputations > inline double @@ -454,7 +257,61 @@ public: ComputeFunction & compute ) { BenchmarkResult result; - return time( reset, performer, compute, result ); + return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result ); + } + + /**** + * The same methods as above but without reset function + */ + template< typename Device, + typename ComputeFunction > + double + time( const String & performer, + ComputeFunction & compute, + BenchmarkResult & result ) + { + result.time = std::numeric_limits::quiet_NaN(); + FunctionTimer< Device > functionTimer; + try { + if( verbose > 1 ) { + // run the monitor main loop + Solvers::SolverMonitorThread monitor_thread( monitor ); + if( this->timing ) + result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); + else + result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); + } + else { + if( this->timing ) + result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); + else + result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); + } + } + catch ( const std::exception& e ) { + std::cerr << "Function timer failed due to a C++ exception with description: " << e.what() << std::endl; + } + + result.bandwidth = datasetSize / result.time; + result.speedup = this->baseTime / result.time; + if( this->baseTime == 0.0 ) + this->baseTime = result.time; + + writeTableHeader( performer, result.getTableHeader() ); + writeTableRow( performer, result.getRowElements() ); + + return this->baseTime; + } + + template< typename Device, + typename ComputeFunction, + typename... NextComputations > + inline double + time( const String & performer, + ComputeFunction & compute ) + { + BenchmarkResult result; + return time< Device, ComputeFunction >( performer, compute, result ); } // Adds an error message to the log. Should be called in places where the @@ -466,6 +323,7 @@ public: // each computation has 3 subcolumns const int colspan = 3 * numberOfComputations; writeErrorMessage( msg, colspan ); + std::cerr << msg << std::endl; } using Logging::save; @@ -476,10 +334,23 @@ public: return monitor; } + int getPerformedLoops() const + { + return this->performedLoops; + } + + bool isResetingOn() const + { + return reset; + } + protected: - int loops; + int loops = 1, performedLoops = 0; + double minTime = 0.0; double datasetSize = 0.0; double baseTime = 0.0; + bool timing = true; + bool reset = true; Solvers::IterativeSolverMonitor< double, int > monitor; }; diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt index e0637205f760be7975e27a06867a9fe3c0c40a46..556dc1604436e707a659f9680b42d7d470ddb372 100644 --- a/src/Benchmarks/CMakeLists.txt +++ b/src/Benchmarks/CMakeLists.txt @@ -3,9 +3,12 @@ add_subdirectory( BLAS ) add_subdirectory( SpMV ) add_subdirectory( DistSpMV ) add_subdirectory( LinearSolvers ) +add_subdirectory( Traversers ) set( headers Benchmarks.h + FunctionTimer.h + Logging.h ) install( FILES ${headers} DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY}/Benchmarks ) diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h index 73001e95820b8d10e1e7dc1845bb825f02e05c02..23f08152724222009ff8e818bfee9d6e78f81611 100644 --- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h +++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h @@ -62,7 +62,7 @@ benchmarkSpmv( Benchmark& benchmark, matrix.vectorProduct( x, y ); }; - benchmark.time( reset, performer, compute ); + benchmark.time< typename Matrix::DeviceType >( reset, performer, compute ); } template< typename Matrix, typename Vector > @@ -114,7 +114,7 @@ benchmarkDistributedSpmv( Benchmark& benchmark, Matrix::CommunicatorType::Barrier( matrix.getCommunicationGroup() ); }; - benchmark.time( reset, performer, compute ); + benchmark.time< typename Matrix::DeviceType >( reset, performer, compute ); } template< typename Matrix, typename Vector > diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h new file mode 100644 index 0000000000000000000000000000000000000000..6cef44aaf673e99459851509ca1318d904d8ecf4 --- /dev/null +++ b/src/Benchmarks/FunctionTimer.h @@ -0,0 +1,135 @@ +/*************************************************************************** + FunctionTimer.h - description + ------------------- + begin : Dec 25, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky, +// Tomas Oberhuber + +#pragma once + +#include + +#include +#include +#include + +namespace TNL { + namespace Benchmarks { + + +template< typename Device > +class FunctionTimer +{ + public: + using DeviceType = Device; + + template< bool timing, + typename ComputeFunction, + typename ResetFunction, + typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > + double + timeFunction( ComputeFunction compute, + ResetFunction reset, + int maxLoops, + const double& minTime, + int verbose = 1, + Monitor && monitor = Monitor(), + bool performReset = true ) + { + // the timer is constructed zero-initialized and stopped + Timer timer; + + // set timer to the monitor + if( verbose > 1 ) + monitor.setTimer( timer ); + + // warm up + reset(); + compute(); + + // If we do not perform reset function and don't need + // the monitor, the timer is not interrupted after each loop. + if( ! performReset && verbose < 2 ) + { + // Explicit synchronization of the CUDA device +#ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); +#endif + if( timing ) + timer.start(); + + for( loops = 0; + loops < maxLoops || ( timing && timer.getRealTime() < minTime ); + ++loops) + compute(); + // Explicit synchronization of the CUDA device +#ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); +#endif + if( timing ) + timer.stop(); + } + else + { + for( loops = 0; + loops < maxLoops || ( timing && timer.getRealTime() < minTime ); + ++loops) + { + // abuse the monitor's "time" for loops + monitor.setTime( loops + 1 ); + reset(); + + // Explicit synchronization of the CUDA device +#ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); +#endif + if( timing ) + timer.start(); + compute(); +#ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); +#endif + if( timing ) + timer.stop(); + } + } + if( timing ) + return timer.getRealTime() / ( double ) loops; + else + return std::numeric_limits::quiet_NaN(); + } + + template< bool timing, + typename ComputeFunction, + typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > + double + timeFunction( ComputeFunction compute, + int maxLoops, + const double& minTime, + int verbose = 1, + Monitor && monitor = Monitor() ) + { + auto noReset = [] () {}; + return timeFunction< timing >( compute, noReset, maxLoops, minTime, verbose, monitor, false ); + } + + int getPerformedLoops() const + { + return this->loops; + } + protected: + int loops; +}; + + } // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/LinearSolvers/benchmarks.h b/src/Benchmarks/LinearSolvers/benchmarks.h index a82ec2dc297ba008d45b5291ece3df07d47fefe7..c6278a76b1e10de7c830a6bae27476623796e42b 100644 --- a/src/Benchmarks/LinearSolvers/benchmarks.h +++ b/src/Benchmarks/LinearSolvers/benchmarks.h @@ -73,7 +73,7 @@ benchmarkPreconditionerUpdate( Benchmark& benchmark, barrier( matrix ); }; - benchmark.time( reset, performer, compute ); + benchmark.time< typename Matrix::DeviceType >( reset, performer, compute ); } template< template class Solver, template class Preconditioner, typename Matrix, typename Vector > @@ -166,7 +166,7 @@ benchmarkSolver( Benchmark& benchmark, }; MyBenchmarkResult benchmarkResult( solver, matrix, x, b ); - benchmark.time( reset, performer, compute, benchmarkResult ); + benchmark.time< typename Matrix::DeviceType >( reset, performer, compute, benchmarkResult ); } #ifdef HAVE_ARMADILLO diff --git a/src/Benchmarks/Logging.h b/src/Benchmarks/Logging.h new file mode 100644 index 0000000000000000000000000000000000000000..b10ab7199337b68fe5c51181b4924fcc8a3f89b6 --- /dev/null +++ b/src/Benchmarks/Logging.h @@ -0,0 +1,240 @@ +/*************************************************************************** + Logging.h - description + ------------------- + begin : Dec 25, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky, +// Tomas Oberhuber + +#pragma once + +#include +#include +#include +#include +#include + +namespace TNL { + namespace Benchmarks { + +class Logging +{ + public: + using MetadataElement = std::pair< const char*, String >; + using MetadataMap = std::map< const char*, String >; + using MetadataColumns = std::vector; + + using HeaderElements = std::vector< String >; + using RowElements = std::vector< double >; + + Logging( int verbose = true ) + : verbose(verbose) + {} + + void + setVerbose( int verbose) + { + this->verbose = verbose; + } + + void + writeTitle( const String & title ) + { + if( verbose ) + std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl; + log << ": title = " << title << std::endl; + } + + void + writeMetadata( const MetadataMap & metadata ) + { + if( verbose ) + std::cout << "properties:" << std::endl; + + for( auto & it : metadata ) { + if( verbose ) + std::cout << " " << it.first << " = " << it.second << std::endl; + log << ": " << it.first << " = " << it.second << std::endl; + } + if( verbose ) + std::cout << std::endl; + } + + void + writeTableHeader( const String & spanningElement, + const HeaderElements & subElements ) + { + if( verbose && header_changed ) { + for( auto & it : metadataColumns ) { + std::cout << std::setw( 20 ) << it.first; + } + + // spanning element is printed as usual column to stdout, + // but is excluded from header + std::cout << std::setw( 15 ) << ""; + + for( auto & it : subElements ) { + std::cout << std::setw( 15 ) << it; + } + std::cout << std::endl; + + header_changed = false; + } + + // initial indent string + header_indent = "!"; + log << std::endl; + for( auto & it : metadataColumns ) { + log << header_indent << " " << it.first << std::endl; + } + + // dump stacked spanning columns + if( horizontalGroups.size() > 0 ) + while( horizontalGroups.back().second <= 0 ) { + horizontalGroups.pop_back(); + header_indent.pop_back(); + } + for( size_t i = 0; i < horizontalGroups.size(); i++ ) { + if( horizontalGroups[ i ].second > 0 ) { + log << header_indent << " " << horizontalGroups[ i ].first << std::endl; + header_indent += "!"; + } + } + + log << header_indent << " " << spanningElement << std::endl; + for( auto & it : subElements ) { + log << header_indent << "! " << it << std::endl; + } + + if( horizontalGroups.size() > 0 ) { + horizontalGroups.back().second--; + header_indent.pop_back(); + } + } + + void + writeTableRow( const String & spanningElement, + const RowElements & subElements ) + { + if( verbose ) { + for( auto & it : metadataColumns ) { + std::cout << std::setw( 20 ) << it.second; + } + // spanning element is printed as usual column to stdout + std::cout << std::setw( 15 ) << spanningElement; + for( auto & it : subElements ) { + std::cout << std::setw( 15 ); + if( it != 0.0 )std::cout << it; + else std::cout << "N/A"; + } + std::cout << std::endl; + } + + // only when changed (the header has been already adjusted) + // print each element on separate line + for( auto & it : metadataColumns ) { + log << it.second << std::endl; + } + + // benchmark data are indented + const String indent = " "; + for( auto & it : subElements ) { + if( it != 0.0 ) log << indent << it << std::endl; + else log << indent << "N/A" << std::endl; + } + } + + void + writeErrorMessage( const char* msg, + int colspan = 1 ) + { + // initial indent string + header_indent = "!"; + log << std::endl; + for( auto & it : metadataColumns ) { + log << header_indent << " " << it.first << std::endl; + } + + // make sure there is a header column for the message + if( horizontalGroups.size() == 0 ) + horizontalGroups.push_back( {"", 1} ); + + // dump stacked spanning columns + while( horizontalGroups.back().second <= 0 ) { + horizontalGroups.pop_back(); + header_indent.pop_back(); + } + for( size_t i = 0; i < horizontalGroups.size(); i++ ) { + if( horizontalGroups[ i ].second > 0 ) { + log << header_indent << " " << horizontalGroups[ i ].first << std::endl; + header_indent += "!"; + } + } + if( horizontalGroups.size() > 0 ) { + horizontalGroups.back().second -= colspan; + header_indent.pop_back(); + } + + // only when changed (the header has been already adjusted) + // print each element on separate line + for( auto & it : metadataColumns ) { + log << it.second << std::endl; + } + log << msg << std::endl; + } + + void + closeTable() + { + log << std::endl; + header_indent = body_indent = ""; + header_changed = true; + horizontalGroups.clear(); + } + + bool save( std::ostream & logFile ) + { + closeTable(); + logFile << log.str(); + if( logFile.good() ) { + log.str() = ""; + return true; + } + return false; + } + + protected: + + // manual double -> String conversion with fixed precision + static String + _to_string( double num, int precision = 0, bool fixed = false ) + { + std::stringstream str; + if( fixed ) + str << std::fixed; + if( precision ) + str << std::setprecision( precision ); + str << num; + return String( str.str().data() ); + } + + std::stringstream log; + std::string header_indent; + std::string body_indent; + + int verbose; + MetadataColumns metadataColumns; + bool header_changed = true; + std::vector< std::pair< String, int > > horizontalGroups; +}; + + + } // namespace Benchmarks +} // namespace TNL + + diff --git a/src/Benchmarks/Traversers/AddOneEntitiesProcessor.h b/src/Benchmarks/Traversers/AddOneEntitiesProcessor.h new file mode 100644 index 0000000000000000000000000000000000000000..6b136d0740a629748a9e3f4774ccf519a39e06c6 --- /dev/null +++ b/src/Benchmarks/Traversers/AddOneEntitiesProcessor.h @@ -0,0 +1,43 @@ +/*************************************************************************** + BenchmarkTraverserUserData.h - description + ------------------- + begin : Jan 5, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +template< typename TraverserUserData > +class AddOneEntitiesProcessor +{ + public: + + using MeshType = typename TraverserUserData::MeshType; + using DeviceType = typename MeshType::DeviceType; + using RealType = typename MeshType::RealType; + + template< typename GridEntity > + __cuda_callable__ + static inline void processEntity( const MeshType& mesh, + TraverserUserData& userData, + const GridEntity& entity ) + { + auto& u = *userData.u; + u( entity ) += ( RealType ) 1.0; + } +}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Traversers/AddTwoEntitiesProcessor.h b/src/Benchmarks/Traversers/AddTwoEntitiesProcessor.h new file mode 100644 index 0000000000000000000000000000000000000000..94f6d5807c8b212aac85c189b530964d8ceaadf8 --- /dev/null +++ b/src/Benchmarks/Traversers/AddTwoEntitiesProcessor.h @@ -0,0 +1,43 @@ +/*************************************************************************** + BenchmarkTraverserUserData.h - description + ------------------- + begin : Jan 5, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +template< typename TraverserUserData > +class AddTwoEntitiesProcessor +{ + public: + + using MeshType = typename TraverserUserData::MeshType; + using DeviceType = typename MeshType::DeviceType; + using RealType = typename MeshType::RealType; + + template< typename GridEntity > + __cuda_callable__ + static inline void processEntity( const MeshType& mesh, + TraverserUserData& userData, + const GridEntity& entity ) + { + auto& u = *userData.u; + u( entity ) += ( RealType ) 2.0; + } +}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h b/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h new file mode 100644 index 0000000000000000000000000000000000000000..2ae00ec697452aad99163752bbd83fee4c5af1f1 --- /dev/null +++ b/src/Benchmarks/Traversers/BenchmarkTraverserUserData.h @@ -0,0 +1,39 @@ +/*************************************************************************** + BenchmarkTraverserUserData.h - description + ------------------- + begin : Jan 5, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +template< typename MeshFunction > +class BenchmarkTraverserUserData +{ + public: + + using MeshType = typename MeshFunction::MeshType; + using RealType = typename MeshType::RealType; + using DeviceType = typename MeshType::DeviceType; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + + BenchmarkTraverserUserData( MeshFunctionPointer& f ) + : u( &f.template modifyData< DeviceType >() ), data( f->getData().getData() ){} + + MeshFunction* u; + RealType* data; +}; + + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Traversers/CMakeLists.txt b/src/Benchmarks/Traversers/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5932d2606db852736f1ce665f4d52f53d0fa5d09 --- /dev/null +++ b/src/Benchmarks/Traversers/CMakeLists.txt @@ -0,0 +1,10 @@ +# TODO: Split the benchmark into several files for faster build + +#if( BUILD_CUDA ) +# CUDA_ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cu ) +#else() +# ADD_EXECUTABLE( tnl-benchmark-traversers tnl-benchmark-traversers.cpp ) +#endif() + +#install( TARGETS tnl-benchmark-traversers RUNTIME DESTINATION bin ) + diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h new file mode 100644 index 0000000000000000000000000000000000000000..6da7ec09bb05d1591689688bf9fb85420fe5e820 --- /dev/null +++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper.h @@ -0,0 +1,34 @@ +/*************************************************************************** + GridTraversersBenchmarkHelper.h - description + ------------------- + begin : Jan 5, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include "AddOneEntitiesProcessor.h" +#include "BenchmarkTraverserUserData.h" +#include "SimpleCell.h" + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +template< typename Grid > +class GridTraverserBenchmarkHelper{}; + + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL + +#include "GridTraverserBenchmarkHelper_1D.h" +#include "GridTraverserBenchmarkHelper_2D.h" +#include "GridTraverserBenchmarkHelper_3D.h" + diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h new file mode 100644 index 0000000000000000000000000000000000000000..e460a8bca4ac8edb77dcab823576349335de6a73 --- /dev/null +++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_1D.h @@ -0,0 +1,154 @@ +/*************************************************************************** + GridTraversersBenchmarkHelper_1D.h - description + ------------------- + begin : Jan 6, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include "GridTraverserBenchmarkHelper.h" +#include "AddOneEntitiesProcessor.h" +#include "BenchmarkTraverserUserData.h" +#include "SimpleCell.h" + + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +#ifdef HAVE_CUDA +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor > +__global__ void +_GridTraverser1D( + const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid, + UserData userData, + const typename GridEntity::CoordinatesType begin, + const typename GridEntity::CoordinatesType end, + const Index gridIdx ) +{ + typedef Real RealType; + typedef Index IndexType; + typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; + //typename GridType::CoordinatesType coordinates; + + GridEntity entity( *grid ); + entity.getCoordinates().x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + //coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( entity.getCoordinates() <= end ) + { + entity.refresh(); + //( userData.u->getData() )[ entity.getIndex( coordinates ) ] += ( RealType ) 1.0; + //( userData.u->getData() )[ coordinates.x() ] += ( RealType ) 1.0; + //userData.data[ entity.getIndex() ] += ( RealType ) 1.0; + //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0; + ( *userData.u )( entity ) += ( RealType ) 1.0; + //EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +} +#endif + +template< typename Real, + typename Index > +class GridTraverserBenchmarkHelper< Meshes::Grid< 1, Real, Devices::Host, Index > > +{ + public: + + constexpr static int Dimension = 1; + using GridType = Meshes::Grid< Dimension, Real, Devices::Host, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using RealType = typename GridType::RealType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + + static void simpleCellTest( const GridPointer& grid, + UserDataType& userData, + std::size_t size ) + { + const CoordinatesType begin( 0 ); + const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); + SimpleCellType entity( *grid ); + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0; + ( *userData.u )( entity ) += ( RealType ) 1.0; + } + + } +}; + +template< typename Real, + typename Index > +class GridTraverserBenchmarkHelper< Meshes::Grid< 1, Real, Devices::Cuda, Index > > +{ + public: + + constexpr static int Dimension = 1; + using GridType = Meshes::Grid< Dimension, Real, Devices::Cuda, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using RealType = typename GridType::RealType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + + static void simpleCellTest( const GridPointer& grid, + UserDataType& userData, + std::size_t size ) + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size ); + dim3 gridIdx; + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + _GridTraverser1D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType > + <<< blocksCount, blockSize >>> + ( &grid.template getData< Devices::Cuda >(), + userData, + CoordinatesType( 0 ), + CoordinatesType( size ) - CoordinatesType( 1 ), + gridIdx.x ); + + } +#endif + } +}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h new file mode 100644 index 0000000000000000000000000000000000000000..eca6c7fee0057e2689fa5f473a214d97266471e6 --- /dev/null +++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_2D.h @@ -0,0 +1,152 @@ +/*************************************************************************** + GridTraversersBenchmarkHelper_2D.h - description + ------------------- + begin : Jan 6, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include "GridTraverserBenchmarkHelper.h" +#include "AddOneEntitiesProcessor.h" +#include "BenchmarkTraverserUserData.h" +#include "SimpleCell.h" + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +#ifdef HAVE_CUDA +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor > +__global__ void +_GridTraverser2D( + const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, + UserData userData, + const typename GridEntity::CoordinatesType begin, + const typename GridEntity::CoordinatesType end, + const dim3 gridIdx ) +{ + typedef Real RealType; + typedef Index IndexType; + typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; + + GridEntity entity( *grid ); + entity.getCoordinates().x() = begin.x() + ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + entity.getCoordinates().y() = begin.y() + ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + if( entity.getCoordinates() <= end ) + { + entity.refresh(); + ( *userData.u )( entity ) += ( RealType ) 1.0; + } +} +#endif + +template< typename Real, + typename Index > +class GridTraverserBenchmarkHelper< Meshes::Grid< 2, Real, Devices::Host, Index > > +{ + public: + + constexpr static int Dimension = 2; + using GridType = Meshes::Grid< Dimension, Real, Devices::Host, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using RealType = typename GridType::RealType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + + static void simpleCellTest( const GridPointer& grid, + UserDataType& userData, + std::size_t size ) + { + const CoordinatesType begin( 0 ); + const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); + SimpleCellType entity( *grid ); + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y()++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + //userData.u->getData()[ entity.getIndex() ] += ( RealType ) 1.0; + ( *userData.u )( entity ) += ( RealType ) 1.0; + } + + } +}; + +template< typename Real, + typename Index > +class GridTraverserBenchmarkHelper< Meshes::Grid< 2, Real, Devices::Cuda, Index > > +{ + public: + + constexpr static int Dimension = 2; + using GridType = Meshes::Grid< Dimension, Real, Devices::Cuda, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using RealType = typename GridType::RealType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + + static void simpleCellTest( const GridPointer& grid, + UserDataType& userData, + std::size_t size ) + { +#ifdef HAVE_CUDA + dim3 blockSize( 16, 16 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size, + size ); + dim3 gridIdx; + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + _GridTraverser2D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType > + <<< blocksCount, blockSize >>> + ( &grid.template getData< Devices::Cuda >(), + userData, + CoordinatesType( 0 ), + CoordinatesType( size ) - CoordinatesType( 1 ), + gridIdx.x ); + } +#endif + } +}; + + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h new file mode 100644 index 0000000000000000000000000000000000000000..4a5da6fd4b89c1cebc716ccfdff31d6ecaf96470 --- /dev/null +++ b/src/Benchmarks/Traversers/GridTraverserBenchmarkHelper_3D.h @@ -0,0 +1,156 @@ +/*************************************************************************** + GridTraversersBenchmarkHelper_3D.h - description + ------------------- + begin : Jan 6, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include "GridTraverserBenchmarkHelper.h" +#include "AddOneEntitiesProcessor.h" +#include "BenchmarkTraverserUserData.h" +#include "SimpleCell.h" + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +#ifdef HAVE_CUDA +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor > +__global__ void +_GridTraverser3D( + const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, + UserData userData, + const typename GridEntity::CoordinatesType begin, + const typename GridEntity::CoordinatesType end, + const dim3 gridIdx ) +{ + typedef Real RealType; + typedef Index IndexType; + typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; + + GridEntity entity( *grid ); + entity.getCoordinates().x() = begin.x() + ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + entity.getCoordinates().y() = begin.y() + ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + entity.getCoordinates().z() = begin.z() + ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; + + if( entity.getCoordinates() <= end ) + { + entity.refresh(); + ( *userData.u )( entity ) += ( RealType ) 1.0; + } +} +#endif + +template< typename Real, + typename Index > +class GridTraverserBenchmarkHelper< Meshes::Grid< 3, Real, Devices::Host, Index > > +{ + public: + + constexpr static int Dimension = 3; + using GridType = Meshes::Grid< Dimension, Real, Devices::Host, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using RealType = typename GridType::RealType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + + static void simpleCellTest( const GridPointer& grid, + UserDataType& userData, + std::size_t size ) + { + const CoordinatesType begin( 0 ); + const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); + SimpleCellType entity( *grid ); + for( entity.getCoordinates().z() = begin.z(); + entity.getCoordinates().z() <= end.z(); + entity.getCoordinates().z()++ ) + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y()++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + ( *userData.u )( entity ) += ( RealType ) 1.0; + } + } +}; + +template< typename Real, + typename Index > +class GridTraverserBenchmarkHelper< Meshes::Grid< 3, Real, Devices::Cuda, Index > > +{ + public: + + constexpr static int Dimension = 3; + using GridType = Meshes::Grid< Dimension, Real, Devices::Cuda, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using RealType = typename GridType::RealType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using CellType = typename GridType::template EntityType< Dimension, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + + static void simpleCellTest( const GridPointer& grid, + UserDataType& userData, + std::size_t size ) + { +#ifdef HAVE_CUDA + dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size, + size, + size ); + dim3 gridIdx; + for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ ) + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + _GridTraverser3D< RealType, IndexType, SimpleCellType, UserDataType, AddOneEntitiesProcessorType > + <<< blocksCount, blockSize >>> + ( &grid.template getData< Devices::Cuda >(), + userData, + CoordinatesType( 0 ), + CoordinatesType( size ) - CoordinatesType( 1 ), + gridIdx.x ); + } +#endif + } +}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h new file mode 100644 index 0000000000000000000000000000000000000000..72ca102bcc131067eec286390e819be91db22e04 --- /dev/null +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h @@ -0,0 +1,49 @@ +/*************************************************************************** + GridTraversersBenchmark.h - description + ------------------- + begin : Dec 19, 2018 + copyright : (C) 2018 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "AddOneEntitiesProcessor.h" +#include "AddTwoEntitiesProcessor.h" +#include "GridTraverserBenchmarkHelper.h" +#include "BenchmarkTraverserUserData.h" +#include "cuda-kernels.h" + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + + + +template< int Dimension, + typename Device, + typename Real, + typename Index > +class GridTraversersBenchmark{}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL + +#include "GridTraversersBenchmark_1D.h" +#include "GridTraversersBenchmark_2D.h" +#include "GridTraversersBenchmark_3D.h" diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h new file mode 100644 index 0000000000000000000000000000000000000000..8ec5cdf888b35185becbcda3841cc2cd46a9a176 --- /dev/null +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h @@ -0,0 +1,225 @@ +/*************************************************************************** + GridTraversersBenchmark_1D.h - description + ------------------- + begin : Jan 3, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda-kernels.h" +#include "GridTraversersBenchmark.h" +#include "SimpleCell.h" + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + + +template< typename Device, + typename Real, + typename Index > +class GridTraversersBenchmark< 1, Device, Real, Index > +{ + public: + + using Vector = Containers::Vector< Real, Device, Index >; + using GridType = Meshes::Grid< 1, Real, Device, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using Coordinates = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using CellType = typename GridType::template EntityType< 1, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + using AddTwoEntitiesProcessorType = AddTwoEntitiesProcessor< UserDataType >; + + GridTraversersBenchmark( Index size ) + :size( size ), v( size ), grid( size ), u( grid ), + userData( this->u ) + { + v_data = v.getData(); + u->getData().bind( v ); + } + + void reset() + { + v.setValue( 0.0 ); + }; + + void addOneUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + for( int i = 0; i < size; i++ ) + v_data[ i ] += (Real) 1.0; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size ); + dim3 gridIdx; + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + fullGridTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } + } + + void addOneUsingParallelFor() + { + auto f = [] __cuda_callable__ ( Index i, Real* data ) + { + data[ i ] += (Real) 1.0; + }; + ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() ); + } + + void addOneUsingSimpleCell() + { + /*const GridType* currentGrid = &grid.template getData< Device >(); + auto f = [=] __cuda_callable__ ( Index i, Real* data ) + { + SimpleCellType entity( *currentGrid ); + entity.getCoordinates().x() = i; + entity.refresh(); + data[ entity.getIndex() ] += (Real) 1.0; + }; + ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f, v.getData() );*/ + GridTraverserBenchmarkHelper< GridType >::simpleCellTest( + grid, + userData, + size ); + } + + void addOneUsingParallelForAndMeshFunction() + { + const GridType* currentGrid = &grid.template getData< Device >(); + MeshFunction* _u = &u.template modifyData< Device >(); + auto f = [=] __cuda_callable__ ( Index i ) + { + SimpleCellType entity( *currentGrid ); + entity.getCoordinates().x() = i; + entity.refresh(); + _u->getData().getData()[ entity.getIndex() ] += (Real) 1.0; + // ( *_u )( entity ) += (Real) 1.0; + }; + ParallelFor< Device, AsynchronousMode >::exec( ( Index ) 0, size, f ); + } + + void addOneUsingTraverser() + { + using CoordinatesType = typename GridType::CoordinatesType; + traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > + ( grid, userData ); + + /*GridTraverserBenchmarkHelper< GridType >::noBCTraverserTest( + grid, + userData, + size );*/ + } + + bool checkAddOne( int loops, bool reseting ) + { + std::cout << loops << " -> " << v << std::endl; + if( reseting ) + return v.containsOnlyValue( 1.0 ); + return v.containsOnlyValue( ( Real ) loops ); + } + + void traverseUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + v_data[ 0 ] += (Real) 2; + for( int i = 1; i < size - 1; i++ ) + v_data[ i ] += (Real) 1.0; + v_data[ size - 1 ] += (Real) 2; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size ); + dim3 gridIdx; + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + boundariesTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + interiorTraverseKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } + } + + void traverseUsingTraverser() + { + // TODO !!!!!!!!!!!!!!!!!!!!!! + //traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > + + traverser.template processBoundaryEntities< UserDataType, AddTwoEntitiesProcessorType > + ( grid, userData ); + traverser.template processInteriorEntities< UserDataType, AddOneEntitiesProcessorType > + ( grid, userData ); + } + + protected: + + Index size; + Vector v; + Real* v_data; + GridPointer grid; + MeshFunctionPointer u; + Traverser traverser; + UserDataType userData; +}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h new file mode 100644 index 0000000000000000000000000000000000000000..3c2037f40b7e33da59d1af2a3b0552d49d06ebb2 --- /dev/null +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h @@ -0,0 +1,273 @@ +/*************************************************************************** + GridTraversersBenchmark_2D.h - description + ------------------- + begin : Jan 3, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda-kernels.h" +#include "GridTraversersBenchmark.h" +#include "SimpleCell.h" + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +template< typename Device, + typename Real, + typename Index > +class GridTraversersBenchmark< 2, Device, Real, Index > +{ + public: + + using Vector = Containers::Vector< Real, Device, Index >; + using GridType = Meshes::Grid< 2, Real, Device, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using Coordinates = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using CellType = typename GridType::template EntityType< 2, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + using AddTwoEntitiesProcessorType = AddTwoEntitiesProcessor< UserDataType >; + + GridTraversersBenchmark( Index size ) + :size( size ), v( size * size ), grid( size, size ), u( grid ), + userData( u ) + { + v_data = v.getData(); + u->getData().bind( v ); + } + + void reset() + { + v.setValue( 0.0 ); + }; + + void addOneUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + for( int i = 0; i < size; i++ ) + for( int j = 0; j < size; j++ ) + v_data[ i * size + j ] += (Real) 1.0; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 16, 16 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size, + size ); + dim3 gridIdx; + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + fullGridTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } + } + + void addOneUsingParallelFor() + { + Index _size = this->size; + auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) + { + data[ j * _size + i ] += (Real) 1.0; + }; + + ParallelFor2D< Device, AsynchronousMode >::exec( + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + f, v.getData() ); + } + + void addOneUsingSimpleCell() + { + /*const GridType* currentGrid = &grid.template getData< Device >(); + auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) + { + SimpleCellType entity( *currentGrid ); + entity.getCoordinates().x() = i; + entity.getCoordinates().y() = j; + entity.refresh(); + data[ entity.getIndex() ] += (Real) 1.0; + }; + + ParallelFor2D< Device, AsynchronousMode >::exec( + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + f, v.getData() );*/ + GridTraverserBenchmarkHelper< GridType >::simpleCellTest( + grid, + userData, + size ); + + } + + void addOneUsingParallelForAndMeshFunction() + { + const GridType* currentGrid = &grid.template getData< Device >(); + MeshFunction* _u = &u.template modifyData< Device >(); + auto f = [=] __cuda_callable__ ( Index i, Index j, Real* data ) + { + SimpleCellType entity( *currentGrid ); + entity.getCoordinates().x() = i; + entity.getCoordinates().y() = j; + entity.refresh(); + //( *_u )( entity ) += (Real) 1.0; + _u->getData().getData()[ entity.getIndex() ] += (Real) 1.0; + }; + + ParallelFor2D< Device, AsynchronousMode >::exec( + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + f, v.getData() ); + } + + + void addOneUsingTraverser() + { + using CoordinatesType = typename GridType::CoordinatesType; + traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > + ( grid, userData ); + + /*Meshes::GridTraverser< Grid >::template processEntities< Cell, WriteOneEntitiesProcessorType, WriteOneTraverserUserDataType, false >( + grid, + CoordinatesType( 0 ), + grid->getDimensions() - CoordinatesType( 1 ), + userData );*/ + /*const CoordinatesType begin( 0 ); + const CoordinatesType end = CoordinatesType( size ) - CoordinatesType( 1 ); + MeshFunction* _u = &u.template modifyData< Device >(); + Cell entity( *grid ); + for( Index y = begin.y(); y <= end.y(); y ++ ) + for( Index x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.getCoordinates().y() = y; + entity.refresh(); + WriteOneEntitiesProcessorType::processEntity( entity.getMesh(), userData, entity ); + }*/ + } + + bool checkAddOne( int loops, bool reseting ) + { + if( reseting ) + return v.containsOnlyValue( 1.0 ); + return v.containsOnlyValue( ( Real ) loops ); + } + + void traverseUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + for( int i = 0; i < size; i++ ) + { + v_data[ i * size ] += (Real) 2.0; + v_data[ i * size + size - 1 ] += (Real) 2.0; + } + for( int j = 1; j < size - 1; j++ ) + { + v_data[ j ] += (Real) 2.0; + v_data[ ( size - 1 ) * size + j ] += (Real) 2.0; + } + + for( int i = 1; i < size - 1; i++ ) + for( int j = 1; j < size - 1; j++ ) + v_data[ i * size + j ] += (Real) 1.0; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 32, 8 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size, + size ); + dim3 gridIdx; + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + boundariesTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + interiorTraverseKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } + } + + void traverseUsingTraverser() + { + //traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > + traverser.template processBoundaryEntities< UserDataType, AddTwoEntitiesProcessorType > + ( grid, userData ); + traverser.template processInteriorEntities< UserDataType, AddOneEntitiesProcessorType > + ( grid, userData ); + } + + protected: + + Index size; + Vector v; + Real* v_data; + GridPointer grid; + MeshFunctionPointer u; + Traverser traverser; + UserDataType userData; +}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h new file mode 100644 index 0000000000000000000000000000000000000000..9dfeadb056461623f12b51992c3efee9a8c8767e --- /dev/null +++ b/src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h @@ -0,0 +1,284 @@ +/*************************************************************************** + GridTraversersBenchmark_3D.h - description + ------------------- + begin : Jan 3, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda-kernels.h" +#include "AddOneEntitiesProcessor.h" +#include "BenchmarkTraverserUserData.h" +#include "GridTraversersBenchmark.h" +#include "SimpleCell.h" + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +template< typename Device, + typename Real, + typename Index > +class GridTraversersBenchmark< 3, Device, Real, Index > +{ + public: + + using Vector = Containers::Vector< Real, Device, Index >; + using GridType = Meshes::Grid< 3, Real, Device, Index >; + using GridPointer = Pointers::SharedPointer< GridType >; + using Coordinates = typename GridType::CoordinatesType; + using MeshFunction = Functions::MeshFunction< GridType >; + using MeshFunctionPointer = Pointers::SharedPointer< MeshFunction >; + using CellType = typename GridType::template EntityType< 3, Meshes::GridEntityNoStencilStorage >; + using SimpleCellType = SimpleCell< GridType >; + using Traverser = Meshes::Traverser< GridType, CellType >; + using UserDataType = BenchmarkTraverserUserData< MeshFunction >; + using AddOneEntitiesProcessorType = AddOneEntitiesProcessor< UserDataType >; + using AddTwoEntitiesProcessorType = AddTwoEntitiesProcessor< UserDataType >; + + GridTraversersBenchmark( Index size ) + : size( size ), + v( size * size * size ), + grid( size, size, size ), + u( grid ), + userData( u ) + { + v_data = v.getData(); + u->getData().bind( v ); + } + + void reset() + { + v.setValue( 0.0 ); + }; + + void addOneUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + for( int i = 0; i < size; i++ ) + for( int j = 0; j < size; j++ ) + for( int k = 0; k < size; k++ ) + v_data[ ( i * size + j ) * size + k ] += (Real) 1.0; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size, + size, + size ); + dim3 gridIdx; + for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ ) + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + fullGridTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } + } + + void addOneUsingParallelFor() + { + Index _size = this->size; + auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) + { + data[ ( k * _size + j ) * _size + i ] += (Real) 1.0; + }; + + ParallelFor3D< Device, AsynchronousMode >::exec( + ( Index ) 0, + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + this->size, + f, v.getData() ); + } + + void addOneUsingSimpleCell() + { + /*const GridType* currentGrid = &grid.template getData< Device >(); + auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) + { + SimpleCellType entity( *currentGrid ); + entity.getCoordinates().x() = i; + entity.getCoordinates().y() = j; + entity.getCoordinates().z() = k; + entity.refresh(); + data[ entity.getIndex() ] += (Real) 1.0; + }; + + ParallelFor3D< Device, AsynchronousMode >::exec( + ( Index ) 0, + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + this->size, + f, v.getData() );*/ + GridTraverserBenchmarkHelper< GridType >::simpleCellTest( + grid, + userData, + size ); + + } + + void addOneUsingParallelForAndMeshFunction() + { + const GridType* currentGrid = &grid.template getData< Device >(); + MeshFunction* _u = &u.template modifyData< Device >(); + auto f = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data ) + { + SimpleCellType entity( *currentGrid ); + entity.getCoordinates().x() = i; + entity.getCoordinates().y() = j; + entity.getCoordinates().z() = k; + entity.refresh(); + //( *_u )( entity ) += (Real) 1.0; + _u->getData().getData()[ entity.getIndex() ] += (Real) 1.0; + }; + + ParallelFor3D< Device, AsynchronousMode >::exec( + ( Index ) 0, + ( Index ) 0, + ( Index ) 0, + this->size, + this->size, + this->size, + f, v.getData() ); + } + + void addOneUsingTraverser() + { + traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > + ( grid, userData ); + } + + bool checkAddOne( int loops, bool reseting ) + { + if( reseting ) + return v.containsOnlyValue( 1.0 ); + return v.containsOnlyValue( ( Real ) loops ); + } + + void traverseUsingPureC() + { + if( std::is_same< Device, Devices::Host >::value ) + { + for( int i = 0; i < size; i++ ) + for( int j = 0; j < size; j++ ) + { + v_data[ ( i * size + j ) * size ] += (Real) 2.0; + v_data[ ( i * size + j ) * size + size - 1 ] += (Real) 2.0; + } + for( int j = 0; j < size; j++ ) + for( int k = 1; k < size - 1; k++ ) + { + v_data[ j * size + k ] += (Real) 1.0; + v_data[ ( ( size - 1) * size + j ) * size + k ] += (Real) 1.0; + } + + for( int i = 1; i < size -1; i++ ) + for( int k = 1; k < size - 1; k++ ) + { + v_data[ ( i * size ) * size + k ] += (Real) 2.0; + v_data[ ( i * size + size - 1 ) * size + k ] += (Real) 2.0; + } + + for( int i = 1; i < size -1; i++ ) + for( int j = 1; j < size -1; j++ ) + for( int k = 1; k < size - 1; k++ ) + v_data[ ( i * size + j ) * size + k ] += (Real) 1.0; + } + else // Device == Devices::Cuda + { +#ifdef HAVE_CUDA + dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + size, + size, + size ); + dim3 gridIdx; + for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ ) + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + boundariesTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } + for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ ) + for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + interiorTraverseKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data ); + } +#endif + } + } + + void traverseUsingTraverser() + { + // TODO !!!!!!!!!!!!!!!!!!!!!! + //traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > + + traverser.template processBoundaryEntities< UserDataType, AddTwoEntitiesProcessorType > + ( grid, userData ); + traverser.template processInteriorEntities< UserDataType, AddOneEntitiesProcessorType > + ( grid, userData ); + } + + protected: + + Index size; + Vector v; + Real* v_data; + GridPointer grid; + MeshFunctionPointer u; + Traverser traverser; + UserDataType userData; +}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Traversers/SimpleCell.h b/src/Benchmarks/Traversers/SimpleCell.h new file mode 100644 index 0000000000000000000000000000000000000000..9776ef26cc49292017e64ba80f3936e8a1ebe374 --- /dev/null +++ b/src/Benchmarks/Traversers/SimpleCell.h @@ -0,0 +1,146 @@ +/*************************************************************************** + SimpleCell.h - description + ------------------- + begin : Jan 5, 2019 + copyright : (C) 2019 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include +#include + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +template< typename Grid > +class SimpleCell{}; + +template< typename Real, + typename Device, + typename Index > +class SimpleCell< Meshes::Grid< 1, Real, Device, Index > > +{ + public: + using GridType = Meshes::Grid< 1, Real, Device, Index >; + using RealType = typename GridType::RealType; + using DeviceType = typename GridType::DeviceType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + + constexpr static int getEntityDimension() { return 1; }; + + __cuda_callable__ + SimpleCell( const GridType& grid ) : + grid( grid ){}; + + __cuda_callable__ + const GridType& getMesh() const { return this->grid;}; + + __cuda_callable__ + CoordinatesType& getCoordinates() { return this->coordinates; }; + + __cuda_callable__ + const CoordinatesType& getCoordinates() const { return this->coordinates; }; + + __cuda_callable__ + void refresh() {index = this->grid.getEntityIndex( *this );}; + + __cuda_callable__ + const IndexType& getIndex() const { return this->index; }; + + protected: + const GridType& grid; + CoordinatesType coordinates; + IndexType index; +}; + +template< typename Real, + typename Device, + typename Index > +class SimpleCell< Meshes::Grid< 2, Real, Device, Index > > +{ + public: + using GridType = Meshes::Grid< 2, Real, Device, Index >; + using RealType = typename GridType::RealType; + using DeviceType = typename GridType::DeviceType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + + constexpr static int getEntityDimension() { return 2; }; + + __cuda_callable__ + SimpleCell( const GridType& grid ) : + grid( grid ){}; + + __cuda_callable__ + const GridType& getMesh() const { return this->grid;}; + + __cuda_callable__ + CoordinatesType& getCoordinates() { return this->coordinates; }; + + __cuda_callable__ + const CoordinatesType& getCoordinates() const { return this->coordinates; }; + + __cuda_callable__ + void refresh() {index = this->grid.getEntityIndex( *this );}; + + __cuda_callable__ + const IndexType& getIndex() const { return this->index; }; + + protected: + const GridType& grid; + CoordinatesType coordinates; + IndexType index; + +}; + +template< typename Real, + typename Device, + typename Index > +class SimpleCell< Meshes::Grid< 3, Real, Device, Index > > +{ + public: + using GridType = Meshes::Grid< 3, Real, Device, Index >; + using RealType = typename GridType::RealType; + using DeviceType = typename GridType::DeviceType; + using IndexType = typename GridType::IndexType; + using CoordinatesType = typename GridType::CoordinatesType; + + constexpr static int getEntityDimension() { return 3; }; + + __cuda_callable__ + SimpleCell( const GridType& grid ) : + grid( grid ){}; + + __cuda_callable__ + const GridType& getMesh() const { return this->grid;}; + + __cuda_callable__ + CoordinatesType& getCoordinates() { return this->coordinates; }; + + __cuda_callable__ + const CoordinatesType& getCoordinates() const { return this->coordinates; }; + + __cuda_callable__ + void refresh() { index = this->grid.getEntityIndex( *this ); }; + + __cuda_callable__ + const IndexType& getIndex() const { return this->index; }; + + protected: + const GridType& grid; + CoordinatesType coordinates; + IndexType index; + +}; + + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Traversers/cuda-kernels.h b/src/Benchmarks/Traversers/cuda-kernels.h new file mode 100644 index 0000000000000000000000000000000000000000..a90baf5b02075d9c80d572bd51b93ed5cd97b391 --- /dev/null +++ b/src/Benchmarks/Traversers/cuda-kernels.h @@ -0,0 +1,128 @@ +/*************************************************************************** + cuda-kernels.h - description + ------------------- + begin : Dec 19, 2018 + copyright : (C) 2018 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +namespace TNL { + namespace Benchmarks { + namespace Traversers { + +#ifdef HAVE_CUDA + +/**** + * Full grid traversing + */ +template< typename Real, + typename Index > +__global__ void fullGridTraverseKernel1D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( threadIdx_x < size ) + v_data[ threadIdx_x ] += (Real) 1.0; +} + +template< typename Real, + typename Index > +__global__ void fullGridTraverseKernel2D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + if( threadIdx_x < size && threadIdx_y < size ) + v_data[ threadIdx_y * size + threadIdx_x ] += (Real) 1.0; +} + +template< typename Real, + typename Index > +__global__ void fullGridTraverseKernel3D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; + if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size ) + v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += (Real) 1.0; +} + +/**** + * Traversing interior cells + */ +template< typename Real, + typename Index > +__global__ void interiorTraverseKernel1D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( threadIdx_x > 0 && threadIdx_x < size - 1 ) + v_data[ threadIdx_x ] += (Real) 1.0; +} + +template< typename Real, + typename Index > +__global__ void interiorTraverseKernel2D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + if( threadIdx_x > 0 && threadIdx_y > 0 && + threadIdx_x < size - 1 && threadIdx_y < size - 1 ) + v_data[ threadIdx_y * size + threadIdx_x ] += (Real) 1.0; +} + +template< typename Real, + typename Index > +__global__ void interiorTraverseKernel3D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; + if( threadIdx_x > 0 && threadIdx_y > 0 && threadIdx_z > 0 && + threadIdx_x < size - 1 && threadIdx_y < size - 1 && threadIdx_z < size - 1 ) + v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += (Real) 1.0; +} + +/**** + * Grid boundaries traversing + */ +template< typename Real, + typename Index > +__global__ void boundariesTraverseKernel1D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( threadIdx_x == 0 || threadIdx_x == size - 1 ) + v_data[ threadIdx_x ] += (Real) 2.0; +} + +template< typename Real, + typename Index > +__global__ void boundariesTraverseKernel2D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + if( threadIdx_x > 0 && threadIdx_y > 0 && + threadIdx_x < size - 1 && threadIdx_y < size - 1 ) + v_data[ threadIdx_y * size + threadIdx_x ] += (Real) 2.0; +} + +template< typename Real, + typename Index > +__global__ void boundariesTraverseKernel3D( const Index size, const dim3 gridIdx, Real* v_data ) +{ + const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; + if( threadIdx_x == 0 || threadIdx_y == 0 || threadIdx_z == 0 || + threadIdx_x == size - 1 || threadIdx_y == size - 1 || threadIdx_z == size - 1 ) + v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] += (Real) 2.0; +} + +#endif + } // namespace Traversers + } // namespace Benchmarks +} // namespace TNL + diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.cpp b/src/Benchmarks/Traversers/tnl-benchmark-traversers.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cf69b41ddb1a5887e7345bf8ac7951e73c3cf4f1 --- /dev/null +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.cpp @@ -0,0 +1,11 @@ +/*************************************************************************** + tnl-benchmark-traversers.cpp - description + ------------------- + begin : Dec 17, 2018 + copyright : (C) 2018 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include "tnl-benchmark-traversers.h" \ No newline at end of file diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.cu b/src/Benchmarks/Traversers/tnl-benchmark-traversers.cu new file mode 100644 index 0000000000000000000000000000000000000000..614b0d2000c9da3293aa95ef7bd82bcf6b643368 --- /dev/null +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.cu @@ -0,0 +1,11 @@ +/*************************************************************************** + tnl-benchmark-traversers.cu - description + ------------------- + begin : Dec 17, 2018 + copyright : (C) 2018 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include "tnl-benchmark-traversers.h" \ No newline at end of file diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h new file mode 100644 index 0000000000000000000000000000000000000000..63b3cc8c94a58b1616cb77c4fa43348bc53e93e8 --- /dev/null +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -0,0 +1,528 @@ +/*************************************************************************** + tnl-benchmark-traversers.h - description + ------------------- + begin : Dec 17, 2018 + copyright : (C) 2018 by oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include "../Benchmarks.h" +//#include "grid-traversing.h" +#include "GridTraversersBenchmark.h" + +#include +#include +#include +#include +#include + +using namespace TNL; +using namespace TNL::Benchmarks; +using namespace TNL::Benchmarks::Traversers; + + +template< int Dimension, + typename Real = float, + typename Index = int > +bool runBenchmark( const Config::ParameterContainer& parameters, + Benchmark& benchmark, + Benchmark::MetadataMap& metadata ) +{ + const Containers::List< String >& tests = parameters.getParameter< Containers::List< String > >( "tests" ); + // FIXME: the --tests is just a string because list does not work with enums +// const Containers::List< String >& tests = parameters.getParameter< Containers::List< String > >( "tests" ); + //Containers::List< String > tests; + //tests.Append( parameters.getParameter< String >( "tests" ) ); + // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(), + // which have a default value. The workaround below works for int values, but it is not possible + // to pass 64-bit integer values + // const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" ); + // const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); + const std::size_t minSize = parameters.getParameter< int >( "min-size" ); + const std::size_t maxSize = parameters.getParameter< int >( "max-size" ); + const bool withHost = parameters.getParameter< bool >( "with-host" ); +#ifdef HAVE_CUDA + const bool withCuda = parameters.getParameter< bool >( "with-cuda" ); +#else + const bool withCuda = false; +#endif + const bool check = parameters.getParameter< bool >( "check" ); + + /**** + * Full grid traversing with no boundary conditions + */ + benchmark.newBenchmark( String("Traversing without boundary conditions" + convertToString( Dimension ) + "D" ), metadata ); + for( std::size_t size = minSize; size <= maxSize; size *= 2 ) + { + GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size ); +#ifdef HAVE_CUDA + GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size ); +#endif + + auto hostReset = [&]() + { + hostTraverserBenchmark.reset(); + }; + +#ifdef HAVE_CUDA + auto cudaReset = [&]() + { + cudaTraverserBenchmark.reset(); + }; +#endif + benchmark.setMetadataColumns( + Benchmark::MetadataColumns( + { {"size", convertToString( size ) }, } ) ); + + /**** + * Add one using pure C code + */ + if( tests.containsValue( "all" ) || tests.containsValue( "add-one-pure-c" ) ) + { + benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + + auto hostWriteOneUsingPureC = [&] () + { + hostTraverserBenchmark.addOneUsingPureC(); + }; + if( withHost ) + { + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); + if( check && ! hostTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." ); + } +#ifdef HAVE_CUDA + auto cudaWriteOneUsingPureC = [&] () + { + cudaTraverserBenchmark.addOneUsingPureC(); + }; + if( withCuda ) + { + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); + if( check && ! cudaTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." ); + } +#endif + } + + /**** + * Add one using parallel for + */ + if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for" ) ) + { + benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + + auto hostWriteOneUsingParallelFor = [&] () + { + hostTraverserBenchmark.addOneUsingParallelFor(); + }; + if( withHost ) + { + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); + if( check && ! hostTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." ); + } + +#ifdef HAVE_CUDA + auto cudaWriteOneUsingParallelFor = [&] () + { + cudaTraverserBenchmark.addOneUsingParallelFor(); + }; + if( withCuda ) + { + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); + if( check && ! cudaTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." ); + } +#endif + } + + /**** + * Add one using parallel for with grid entity + */ + if( tests.containsValue( "all" ) || tests.containsValue( "add-one-simple-cell" ) ) + { + auto hostAddOneUsingSimpleCell = [&] () + { + hostTraverserBenchmark.addOneUsingSimpleCell(); + }; + benchmark.setOperation( "simple cell", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + if( withHost ) + { + benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingSimpleCell ); + if( check && ! hostTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." ); + } +#ifdef HAVE_CUDA + auto cudaAddOneUsingSimpleCell = [&] () + { + cudaTraverserBenchmark.addOneUsingSimpleCell(); + }; + if( withCuda ) + { + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingSimpleCell ); + if( check && ! cudaTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." ); + } +#endif + } + + /**** + * Add one using parallel for with mesh function + */ + if( tests.containsValue( "all" ) || tests.containsValue( "add-one-parallel-for-and-mesh-function" ) ) + { + auto hostAddOneUsingParallelForAndMeshFunction = [&] () + { + hostTraverserBenchmark.addOneUsingParallelForAndMeshFunction(); + }; + benchmark.setOperation( "par.for+mesh fc.", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + if( withHost ) + { + benchmark.time< Devices::Host >( hostReset, "CPU", hostAddOneUsingParallelForAndMeshFunction ); + if( check && ! hostTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." ); + } +#ifdef HAVE_CUDA + auto cudaAddOneUsingParallelForAndMeshFunction = [&] () + { + cudaTraverserBenchmark.addOneUsingParallelForAndMeshFunction(); + }; + if( withCuda ) + { + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaAddOneUsingParallelForAndMeshFunction ); + if( check && ! cudaTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." ); + } +#endif + } + + /**** + * Add one using traverser + */ + if( tests.containsValue( "all" ) || tests.containsValue( "add-one-traverser" ) ) + { + benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + auto hostWriteOneUsingTraverser = [&] () + { + hostTraverserBenchmark.addOneUsingTraverser(); + }; + if( withHost ) + { + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser ); + if( check && ! hostTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." ); + } + +#ifdef HAVE_CUDA + auto cudaWriteOneUsingTraverser = [&] () + { + cudaTraverserBenchmark.addOneUsingTraverser(); + }; + if( withCuda ) + { + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser ); + if( check && ! cudaTraverserBenchmark.checkAddOne( + benchmark.getPerformedLoops(), + benchmark.isResetingOn() ) ) + benchmark.addErrorMessage( "Test results are not correct." ); + } +#endif + } + std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl; + } + return true; + } + std::cout << "--------------------------------------------------------------------------------------------------------" << std::endl; + } + + /**** + * Full grid traversing including boundary conditions + */ + benchmark.newBenchmark( String("Traversing with boundary conditions" + convertToString( Dimension ) + "D" ), metadata ); + for( std::size_t size = minSize; size <= maxSize; size *= 2 ) + { + GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size ); + GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size ); + + auto hostReset = [&]() + { + hostTraverserBenchmark.reset(); + }; + +#ifdef HAVE_CUDA + auto cudaReset = [&]() + { + cudaTraverserBenchmark.reset(); + }; +#endif + + benchmark.setMetadataColumns( + Benchmark::MetadataColumns( + { {"size", convertToString( size ) }, } ) ); + + /**** + * Write one and two (as BC) using C for + */ + auto hostTraverseUsingPureC = [&] () + { + hostTraverserBenchmark.traverseUsingPureC(); + }; + +#ifdef HAVE_CUDA + auto cudaTraverseUsingPureC = [&] () + { + cudaTraverserBenchmark.traverseUsingPureC(); + }; +#endif + + if( tests.containsValue( "all" ) || tests.containsValue( "bc-pure-c" ) ) + { + benchmark.setOperation( "Pure C", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + if( withHost ) + benchmark.time< Devices::Host >( "CPU", hostTraverseUsingPureC ); + +#ifdef HAVE_CUDA + if( withCuda ) + benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingPureC ); +#endif + benchmark.setOperation( "Pure C RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + if( withHost ) + benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingPureC ); + +#ifdef HAVE_CUDA + if( withCuda ) + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingPureC ); +#endif + } + + /**** + * Write one and two (as BC) using parallel for + */ + auto hostTraverseUsingParallelFor = [&] () + { + hostTraverserBenchmark.addOneUsingParallelFor(); + }; + +#ifdef HAVE_CUDA + auto cudaTraverseUsingParallelFor = [&] () + { + cudaTraverserBenchmark.addOneUsingParallelFor(); + }; +#endif + + if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) ) + { + benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + if( withHost ) + benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor ); +#ifdef HAVE_CUDA + if( withCuda ) + benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor ); +#endif + + benchmark.setOperation( "parallel for RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + if( withHost ) + benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor ); +#ifdef HAVE_CUDA + if( withCuda ) + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor ); +#endif + } +// TODO: implement the benchmark (addOneUsingParallelFor does not consider BC) +// auto hostTraverseUsingParallelFor = [&] () +// { +// hostTraverserBenchmark.addOneUsingParallelFor(); +// }; +// +// auto cudaTraverseUsingParallelFor = [&] () +// { +// cudaTraverserBenchmark.addOneUsingParallelFor(); +// }; +// +// if( tests.containsValue( "all" ) || tests.containsValue( "bc-parallel-for" ) ) +// { +// benchmark.setOperation( "parallel for", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); +// if( withHost ) +// benchmark.time< Devices::Host >( "CPU", hostTraverseUsingParallelFor ); +// if( withCuda ) +// benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingParallelFor ); +// +// benchmark.setOperation( "parallel for RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); +// if( withHost ) +// benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingParallelFor ); +// if( withCuda ) +// benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingParallelFor ); +// } + + /**** + * Write one and two (as BC) using traverser + */ + auto hostTraverseUsingTraverser = [&] () + { + hostTraverserBenchmark.addOneUsingTraverser(); + }; + +#ifdef HAVE_CUDA + auto cudaTraverseUsingTraverser = [&] () + { + cudaTraverserBenchmark.addOneUsingTraverser(); + }; +#endif + + if( tests.containsValue( "all" ) || tests.containsValue( "bc-traverser" ) ) + { + benchmark.setOperation( "traverser", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + if( withHost ) + benchmark.time< Devices::Host >( "CPU", hostTraverseUsingTraverser ); + +#ifdef HAVE_CUDA + if( withCuda ) + benchmark.time< Devices::Cuda >( "GPU", cudaTraverseUsingTraverser ); +#endif + + benchmark.setOperation( "traverser RST", 2 * pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); + if( withHost ) + benchmark.time< Devices::Host >( hostReset, "CPU", hostTraverseUsingTraverser ); + +#ifdef HAVE_CUDA + if( withCuda ) + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaTraverseUsingTraverser ); +#endif + } + } + return true; +} + +void setupConfig( Config::ConfigDescription& config ) +{ + config.addList< String >( "tests", "Tests to be performed.", "all" ); + // FIXME: addList does not work with addEntryEnum - ConfigDescription::addEntryEnum throws std::bad_cast + // config.addList< String >( "tests", "Tests to be performed.", "all" ); + config.addEntryEnum( "all" ); + config.addEntryEnum( "add-one-pure-c" ); + config.addEntryEnum( "add-one-parallel-for" ); + config.addEntryEnum( "add-one-parallel-for-and-grid-entity" ); + config.addEntryEnum( "add-one-traverser" ); + config.addEntryEnum( "bc-pure-c" ); + config.addEntryEnum( "bc-parallel-for" ); + config.addEntryEnum( "bc-traverser" ); + config.addEntry< bool >( "with-host", "Perform CPU benchmarks.", true ); +#ifdef HAVE_CUDA + config.addEntry< bool >( "with-cuda", "Perform CUDA benchmarks.", true ); +#else + config.addEntry< bool >( "with-cuda", "Perform CUDA benchmarks.", false ); +#endif + config.addEntry< bool >( "check", "Checking correct results of benchmark tests.", false ); + config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-traversers.log"); + config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); + config.addEntryEnum( "append" ); + config.addEntryEnum( "overwrite" ); + + config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); + config.addEntryEnum( "float" ); + config.addEntryEnum( "double" ); + config.addEntryEnum( "all" ); + config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 ); + config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 ); + config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 ); + config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 ); + Benchmark::configSetup( config ); + + config.addDelimiter( "Device settings:" ); + Devices::Host::configSetup( config ); + Devices::Cuda::configSetup( config ); +} + +template< int Dimension > +bool setupBenchmark( const Config::ParameterContainer& parameters ) +{ + const String & logFileName = parameters.getParameter< String >( "log-file" ); + const String & outputMode = parameters.getParameter< String >( "output-mode" ); + const String & precision = parameters.getParameter< String >( "precision" ); + const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" ); + + Benchmark benchmark; //( loops, verbose ); + benchmark.setup( parameters ); + Benchmark::MetadataMap metadata = getHardwareMetadata(); + runBenchmark< Dimension >( parameters, benchmark, metadata ); + + auto mode = std::ios::out; + if( outputMode == "append" ) + mode |= std::ios::app; + std::ofstream logFile( logFileName.getString(), mode ); + + if( ! benchmark.save( logFile ) ) + { + std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl; + return false; + } + return true; +} + +int main( int argc, char* argv[] ) +{ + Config::ConfigDescription config; + Config::ParameterContainer parameters; + + setupConfig( config ); + if( ! parseCommandLine( argc, argv, config, parameters ) ) { + config.printUsage( argv[ 0 ] ); + return EXIT_FAILURE; + } + + if( ! Devices::Host::setup( parameters ) || + ! Devices::Cuda::setup( parameters ) ) + return EXIT_FAILURE; + + const int dimension = parameters.getParameter< int >( "dimension" ); + bool status( false ); + if( ! dimension ) + { + status = setupBenchmark< 1 >( parameters ); + status |= setupBenchmark< 2 >( parameters ); + status |= setupBenchmark< 3 >( parameters ); + } + else + { + switch( dimension ) + { + case 1: + status = setupBenchmark< 1 >( parameters ); + break; + case 2: + status = setupBenchmark< 2 >( parameters ); + break; + case 3: + status = setupBenchmark< 3 >( parameters ); + break; + } + } + if( status == false ) + return EXIT_FAILURE; + return EXIT_SUCCESS; +} diff --git a/src/Benchmarks/scripts/CMakeLists.txt b/src/Benchmarks/scripts/CMakeLists.txt index 1388c7984ad603c804e6e11ebcbfc9167e3b1a3a..31acdeb7d739db08d10f289bc3e25b952b108147 100644 --- a/src/Benchmarks/scripts/CMakeLists.txt +++ b/src/Benchmarks/scripts/CMakeLists.txt @@ -1,16 +1,13 @@ -INSTALL( FILES matrix-market - florida-matrix-market - get-matrices - convert-matrices - draw-matrices +INSTALL( FILES tnl-run-heat-equation-benchmark + run-tnl-benchmark-spmv + run-tnl-benchmark-traversers run-matrix-solvers-benchmark run-tnl-benchmark-spmv run-tnl-benchmark-linear-solvers - tnl-run-heat-equation-benchmark - cuda-profiler.conf - process-cuda-profile.pl + DESTINATION ${TNL_TARGET_DATA_DIRECTORY}/benchmark-scripts ) -INSTALL( FILES tnl-run-spmv-benchmark +INSTALL( FILES run-tnl-benchmark-spmv + run-tnl-benchmark-traversers DESTINATION bin PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE ) diff --git a/src/Benchmarks/scripts/cuda-profiler.conf b/src/Benchmarks/scripts/cuda-profiler.conf deleted file mode 100644 index 8ff91fe3b3cd7d1fce3759c4458ec557b98bfeb2..0000000000000000000000000000000000000000 --- a/src/Benchmarks/scripts/cuda-profiler.conf +++ /dev/null @@ -1,7 +0,0 @@ -== cuda-kernel.conf == -timestamp -threadblocksize -l1_global_load_hit -l1_global_load_miss -gld_incoherent -gst_incoherent \ No newline at end of file diff --git a/src/Benchmarks/scripts/process-cuda-profile.pl b/src/Benchmarks/scripts/process-cuda-profile.pl deleted file mode 100644 index 187623da942b20ff9c57fe11f9aecc8768573a2b..0000000000000000000000000000000000000000 --- a/src/Benchmarks/scripts/process-cuda-profile.pl +++ /dev/null @@ -1,42 +0,0 @@ -open( INPUT, "$ARGV[0]" ) - or die "Can not open file $ARGV[ 0 ]"; -$blockSize = 0; -$testNumber = 0; -while( $line = ) -{ - if( $line =~ m/.*sparseCSRMatrixVectorProductKernel.*threadblocksize=\[ (.*), 1, 1 \] occupancy=\[ (.*) \] tex_cache_hit=\[ (.*) \] tex_cache_miss=\[ (.*) \] gld_incoherent=\[ (.*) \] gst_incoherent=\[ (.*) \].*/ ) - { - if( $blockSize != $1 ) - { - $blockSize = $1; - $occupancy{$testNumber} = $2; - $texCacheHit{$testNumber} = $3; - $texCacheMiss{$testNumber} = $4; - $gldIncoherent{$testNumber} = $5; - $gstIncoherent{$testNumber} = $6; - $testNumber = $testNumber + 1; - } - } -} -close( INPUT ); - -print "There were $testNumber tests."; - -open( LOG, ">>$ARGV[1]" ) - or die "Can not open file $ARGV[1]"; -printf LOG "| %97s |", $ARGV[ 0 ]; -$testOutput = 0; -while( $testOutput < $testNumber ) -{ - printf LOG "%10.3f |", $occupancy{$testOutput}; - printf LOG "%10.3f |", $texCahceHit{$testOutput}; - printf LOG "%10.3f |", $texCacheMiss{$testOutput}; - printf LOG "%10.3f |", $gldIncoherent{$testOutput}; - printf LOG "%10.3f |", $gstIncoherent{$testOutput}; - $testOutput = $testOutput + 1; -} -print LOG "\n"; -close( LOG ); - - - diff --git a/src/Benchmarks/scripts/run-tnl-benchmark-traversers b/src/Benchmarks/scripts/run-tnl-benchmark-traversers new file mode 100644 index 0000000000000000000000000000000000000000..00cd1e1ac64f0a9318c7ea749aad7014ce4d8e20 --- /dev/null +++ b/src/Benchmarks/scripts/run-tnl-benchmark-traversers @@ -0,0 +1,5 @@ +#!/bin/bash + +tnl-benchmark-traversers --dimension 1 --loops 1 --min-size 16 --max-size 100000 --min-time 1 +tnl-benchmark-traversers --dimension 2 --loops 1 --min-size 16 --max-size 10000 --min-time 1 --output-mode append +tnl-benchmark-traversers --dimension 3 --loops 1 --min-size 16 --max-size 1000 --min-time 1 --output-mode append diff --git a/src/TNL/Config/ConfigEntry.h b/src/TNL/Config/ConfigEntry.h index 1608a5b4b1311ded6ca6d02c01fbe6afd7524d75..1b56574cc3983a3425ab023e7466c699fbe9f982 100644 --- a/src/TNL/Config/ConfigEntry.h +++ b/src/TNL/Config/ConfigEntry.h @@ -61,7 +61,7 @@ struct ConfigEntry : public ConfigEntryBase String printDefaultValue() const { return convertToString( defaultValue ); - }; + } std::vector< EntryType >& getEnumValues() { diff --git a/src/TNL/Containers/List.h b/src/TNL/Containers/List.h index 2c175bcce772236d54c8cd0a2f664822e9c97a8c..0cf6f762dbfce6057af4132659064fc889c91082 100644 --- a/src/TNL/Containers/List.h +++ b/src/TNL/Containers/List.h @@ -109,6 +109,13 @@ template< class T > class List template< typename Array > void toArray( Array& array ); + /*** + * \brief Checks if there is an element with value \e v in given array. + * + * \param v Reference to a value. + */ + bool containsValue( const T& v ) const; + /// Erases data element at given position. /// /// \param ind Index of the data element one chooses to remove. @@ -146,7 +153,7 @@ template< class T > class List /// /// \param file Name of file. bool DeepLoad( File& file ); - + protected: /// Pointer to the first element. ListDataElement< T >* first; diff --git a/src/TNL/Containers/List_impl.h b/src/TNL/Containers/List_impl.h index e67be136cd19e28db2f3f2da1183e8b2d7bf3236..36fd5dbdc2c928d2feb07bf505eb5c441ab85101 100644 --- a/src/TNL/Containers/List_impl.h +++ b/src/TNL/Containers/List_impl.h @@ -207,6 +207,14 @@ void List< T >::toArray( Array& array ) for( int i = 0; i < this->getSize(); i++ ) array[ i ] = ( *this )[ i ]; } +template< typename T > +bool List< T >::containsValue( const T& v ) const +{ + for( int i = 0; i < this->getSize(); i++ ) + if( ( *this )[ i ] == v ) + return true; + return false; +} template< typename T > void List< T >::Erase( const int& ind ) diff --git a/src/TNL/Functions/MeshFunction.h b/src/TNL/Functions/MeshFunction.h index 4ccdab9f312433d262e654eb4686df45216406fc..32d54ec2139a05e7e2456fe9436b53d1801b8ffe 100644 --- a/src/TNL/Functions/MeshFunction.h +++ b/src/TNL/Functions/MeshFunction.h @@ -20,7 +20,7 @@ namespace TNL { -namespace Functions { +namespace Functions { template< typename Mesh, int MeshEntityDimension = Mesh::getMeshDimension(), @@ -32,155 +32,152 @@ class MeshFunction : //static_assert( Mesh::DeviceType::DeviceType == Vector::DeviceType::DeviceType, // "Both mesh and vector of a mesh function must reside on the same device."); public: - + using MeshType = Mesh; using DeviceType = typename MeshType::DeviceType; using IndexType = typename MeshType::GlobalIndexType; - using MeshPointer = Pointers::SharedPointer< MeshType >; + using MeshPointer = Pointers::SharedPointer< MeshType >; using RealType = Real; using VectorType = Containers::Vector< RealType, DeviceType, IndexType >; using ThisType = Functions::MeshFunction< MeshType, MeshEntityDimension, RealType >; using DistributedMeshType = Meshes::DistributedMeshes::DistributedMesh; using DistributedMeshSynchronizerType = Meshes::DistributedMeshes::DistributedMeshSynchronizer; - + static constexpr int getEntitiesDimension() { return MeshEntityDimension; } - + static constexpr int getMeshDimension() { return MeshType::getMeshDimension(); } - + MeshFunction(); - - MeshFunction( const MeshPointer& meshPointer ); - + + MeshFunction( const MeshPointer& meshPointer ); + MeshFunction( const ThisType& meshFunction ); - + template< typename Vector > MeshFunction( const MeshPointer& meshPointer, Vector& data, - const IndexType& offset = 0 ); - - + const IndexType& offset = 0 ); + template< typename Vector > MeshFunction( const MeshPointer& meshPointer, Pointers::SharedPointer< Vector >& data, - const IndexType& offset = 0 ); - + const IndexType& offset = 0 ); + static String getType(); - + String getTypeVirtual() const; - + static String getSerializationType(); virtual String getSerializationTypeVirtual() const; - + static void configSetup( Config::ConfigDescription& config, const String& prefix = "" ); bool setup( const MeshPointer& meshPointer, const Config::ParameterContainer& parameters, const String& prefix = "" ); - + void bind( ThisType& meshFunction ); - + template< typename Vector > void bind( const Vector& data, const IndexType& offset = 0 ); - + template< typename Vector > void bind( const MeshPointer& meshPointer, const Vector& data, const IndexType& offset = 0 ); - + template< typename Vector > void bind( const MeshPointer& meshPointer, const Pointers::SharedPointer< Vector >& dataPtr, const IndexType& offset = 0 ); - + void setMesh( const MeshPointer& meshPointer ); - + template< typename Device = Devices::Host > __cuda_callable__ const MeshType& getMesh() const; - + const MeshPointer& getMeshPointer() const; - + static IndexType getDofs( const MeshPointer& meshPointer ); - - __cuda_callable__ const VectorType& getData() const; - + + __cuda_callable__ const VectorType& getData() const; + __cuda_callable__ VectorType& getData(); - + bool refresh( const RealType& time = 0.0 ) const; - + bool deepRefresh( const RealType& time = 0.0 ) const; - + template< typename EntityType > RealType getValue( const EntityType& meshEntity ) const; - + template< typename EntityType > void setValue( const EntityType& meshEntity, const RealType& value ); - + template< typename EntityType > __cuda_callable__ RealType& operator()( const EntityType& meshEntity, - const RealType& time = 0.0 ); - + const RealType& time = 0 ); + template< typename EntityType > __cuda_callable__ const RealType& operator()( const EntityType& meshEntity, - const RealType& time = 0.0 ) const; - + const RealType& time = 0 ) const; + __cuda_callable__ RealType& operator[]( const IndexType& meshEntityIndex ); - __cuda_callable__ const RealType& operator[]( const IndexType& meshEntityIndex ) const; template< typename Function > ThisType& operator = ( const Function& f ); - + template< typename Function > ThisType& operator -= ( const Function& f ); template< typename Function > ThisType& operator += ( const Function& f ); - + RealType getLpNorm( const RealType& p ) const; - + RealType getMaxNorm() const; - + bool save( File& file ) const; bool load( File& file ); - + bool boundLoad( File& file ); - + bool write( const String& fileName, const String& format = "vtk", const double& scale = 1.0 ) const; - + using Object::save; - + using Object::load; - + using Object::boundLoad; template< typename CommunicatorType, typename PeriodicBoundariesMaskType = MeshFunction< Mesh, MeshEntityDimension, bool > > void synchronize( bool withPeriodicBoundaryConditions = false, const Pointers::SharedPointer< PeriodicBoundariesMaskType, DeviceType >& mask = - Pointers::SharedPointer< PeriodicBoundariesMaskType, DeviceType >( nullptr ) ); + Pointers::SharedPointer< PeriodicBoundariesMaskType, DeviceType >( nullptr ) ); - protected: //DistributedMeshSynchronizerType synchronizer; Meshes::DistributedMeshes::DistributedMeshSynchronizer< Functions::MeshFunction< MeshType, MeshEntityDimension, RealType > > synchronizer; - + MeshPointer meshPointer; - + VectorType data; - + template< typename, typename > friend class MeshFunctionEvaluator; private: diff --git a/src/TNL/Functions/MeshFunction_impl.h b/src/TNL/Functions/MeshFunction_impl.h index 49b75d52fdbdb14eca36cd9d5b543fb838f4426c..16d17914d52955af64068d9516922580de6515d9 100644 --- a/src/TNL/Functions/MeshFunction_impl.h +++ b/src/TNL/Functions/MeshFunction_impl.h @@ -19,7 +19,7 @@ #pragma once namespace TNL { -namespace Functions { + namespace Functions { template< typename Mesh, int MeshEntityDimension, @@ -48,7 +48,6 @@ template< typename Mesh, MeshFunction< Mesh, MeshEntityDimension, Real >:: MeshFunction( const ThisType& meshFunction ) { - setupSynchronizer(meshFunction.meshPointer->getDistributedMesh()); this->meshPointer=meshFunction.meshPointer; @@ -241,7 +240,6 @@ bind( const MeshPointer& meshPointer, this->data.bind( *data, offset, getMesh().template getEntitiesCount< typename Mesh::template EntityType< MeshEntityDimension > >() ); } - template< typename Mesh, int MeshEntityDimension, typename Real > @@ -578,7 +576,6 @@ operator << ( std::ostream& str, const MeshFunction< Mesh, MeshEntityDimension, return str; } - -} // namespace Functions + } // namespace Functions } // namespace TNL diff --git a/src/TNL/Meshes/GridDetails/CMakeLists.txt b/src/TNL/Meshes/GridDetails/CMakeLists.txt index 0da067f142c99ed0db8435a2c3818133d007026f..3386ec242531431fc12b59649760a8a9e35b6cb7 100644 --- a/src/TNL/Meshes/GridDetails/CMakeLists.txt +++ b/src/TNL/Meshes/GridDetails/CMakeLists.txt @@ -14,7 +14,9 @@ SET( headers BoundaryGridEntityChecker.h GridEntityMeasureGetter.h GridEntityTopology.h GridTraverser.h - GridTraverser_impl.h + GridTraverser_1D.hpp + GridTraverser_2D.hpp + GridTraverser_3D.hpp NeighborGridEntitiesStorage.h NeighborGridEntityGetter1D_impl.h NeighborGridEntityGetter2D_impl.h diff --git a/src/TNL/Meshes/GridDetails/Grid1D.h b/src/TNL/Meshes/GridDetails/Grid1D.h index 426428ae4144910e3f77f4f68033df7d0e133601..9a8f1460076d578837b48b7f0dbcee258e70d9f2 100644 --- a/src/TNL/Meshes/GridDetails/Grid1D.h +++ b/src/TNL/Meshes/GridDetails/Grid1D.h @@ -60,6 +60,8 @@ class Grid< 1, Real, Device, Index > : public Object * \brief Basic constructor. */ Grid(); + + Grid( const Index xSize ); /** * \brief Returns type of grid Real (value), Device type and the type of Index. diff --git a/src/TNL/Meshes/GridDetails/Grid1D_impl.h b/src/TNL/Meshes/GridDetails/Grid1D_impl.h index 1754edc587fee6202d59f778cb6c9485d3a05f4e..995fa6dab31446cdd5e2a293e3377f04e97cb7ab 100644 --- a/src/TNL/Meshes/GridDetails/Grid1D_impl.h +++ b/src/TNL/Meshes/GridDetails/Grid1D_impl.h @@ -33,6 +33,17 @@ Grid< 1, Real, Device, Index >::Grid() { } +template< typename Real, + typename Device, + typename Index > +Grid< 1, Real, Device, Index >::Grid( const Index xSize ) +: numberOfCells( 0 ), + numberOfVertices( 0 ), + distGrid(nullptr) +{ + this->setDimensions( xSize ); +} + template< typename Real, typename Device, typename Index > diff --git a/src/TNL/Meshes/GridDetails/Grid2D.h b/src/TNL/Meshes/GridDetails/Grid2D.h index 84c6b4f33f9ba717da2aa0ba2bd6359cb1f69384..f2dbebc5c78c36959a68a574886496c75857ca81 100644 --- a/src/TNL/Meshes/GridDetails/Grid2D.h +++ b/src/TNL/Meshes/GridDetails/Grid2D.h @@ -61,6 +61,8 @@ class Grid< 2, Real, Device, Index > : public Object /** * \brief See Grid1D::getType(). */ + Grid( const Index xSize, const Index ySize ); + static String getType(); /** @@ -80,8 +82,8 @@ class Grid< 2, Real, Device, Index > : public Object /** * \brief Sets the size of dimensions. - * \param xSize Size of dimesion x. - * \param ySize Size of dimesion y. + * \param xSize Size of dimension x. + * \param ySize Size of dimension y. */ void setDimensions( const Index xSize, const Index ySize ); diff --git a/src/TNL/Meshes/GridDetails/Grid2D_impl.h b/src/TNL/Meshes/GridDetails/Grid2D_impl.h index b315d5d086aee7532817edd67b9870735d4a49a5..41e05d8b5a5853f8f5cc9d08ef8a0360b7202636 100644 --- a/src/TNL/Meshes/GridDetails/Grid2D_impl.h +++ b/src/TNL/Meshes/GridDetails/Grid2D_impl.h @@ -36,6 +36,20 @@ Grid< 2, Real, Device, Index > :: Grid() { } +template< typename Real, + typename Device, + typename Index > +Grid< 2, Real, Device, Index >::Grid( const Index xSize, const Index ySize ) +: numberOfCells( 0 ), + numberOfNxFaces( 0 ), + numberOfNyFaces( 0 ), + numberOfFaces( 0 ), + numberOfVertices( 0 ), + distGrid(nullptr) +{ + this->setDimensions( xSize, ySize ); +} + template< typename Real, typename Device, typename Index > diff --git a/src/TNL/Meshes/GridDetails/Grid3D.h b/src/TNL/Meshes/GridDetails/Grid3D.h index 565198077f68271e58022b314ba8ecd5ca19c3fb..617efe7f31bc9e8b5ef3898e0f75eb9cf1ada11a 100644 --- a/src/TNL/Meshes/GridDetails/Grid3D.h +++ b/src/TNL/Meshes/GridDetails/Grid3D.h @@ -58,6 +58,8 @@ class Grid< 3, Real, Device, Index > : public Object */ Grid(); + Grid( const Index xSize, const Index ySize, const Index zSize ); + /** * \brief See Grid1D::getType(). */ diff --git a/src/TNL/Meshes/GridDetails/Grid3D_impl.h b/src/TNL/Meshes/GridDetails/Grid3D_impl.h index cc6805ac0632de43899d124c87fe4427f84487e5..edbee0c006bc50e2575262efc64f5b2978d13034 100644 --- a/src/TNL/Meshes/GridDetails/Grid3D_impl.h +++ b/src/TNL/Meshes/GridDetails/Grid3D_impl.h @@ -43,6 +43,28 @@ Grid< 3, Real, Device, Index > :: Grid() { } +template< typename Real, + typename Device, + typename Index > +Grid< 3, Real, Device, Index >::Grid( const Index xSize, const Index ySize, const Index zSize ) +: numberOfCells( 0 ), + numberOfNxFaces( 0 ), + numberOfNyFaces( 0 ), + numberOfNzFaces( 0 ), + numberOfNxAndNyFaces( 0 ), + numberOfFaces( 0 ), + numberOfDxEdges( 0 ), + numberOfDyEdges( 0 ), + numberOfDzEdges( 0 ), + numberOfDxAndDyEdges( 0 ), + numberOfEdges( 0 ), + numberOfVertices( 0 ), + distGrid(nullptr) +{ + this->setDimensions( xSize, ySize, zSize ); +} + + template< typename Real, typename Device, typename Index > diff --git a/src/TNL/Meshes/GridDetails/GridTraverser.h b/src/TNL/Meshes/GridDetails/GridTraverser.h index 3a74c085bcd24b9935cc6750cc2da3587a795a15..fb6b34da12fb750c0ad74cc3ba05b086727adf01 100644 --- a/src/TNL/Meshes/GridDetails/GridTraverser.h +++ b/src/TNL/Meshes/GridDetails/GridTraverser.h @@ -25,6 +25,8 @@ class GridTraverser { }; +enum GridTraverserMode { synchronousMode, asynchronousMode }; + /**** * 1D grid, Devices::Host */ @@ -52,6 +54,7 @@ class GridTraverser< Meshes::Grid< 1, Real, Devices::Host, Index > > const CoordinatesType begin, const CoordinatesType end, UserData& userData, + GridTraverserMode mode = synchronousMode, const int& stream = 0 ); }; @@ -82,6 +85,7 @@ class GridTraverser< Meshes::Grid< 1, Real, Devices::Cuda, Index > > const CoordinatesType& begin, const CoordinatesType& end, UserData& userData, + GridTraverserMode mode = synchronousMode, const int& stream = 0 ); }; @@ -112,6 +116,7 @@ class GridTraverser< Meshes::Grid< 1, Real, Devices::MIC, Index > > const CoordinatesType& begin, const CoordinatesType& end, UserData& userData, + GridTraverserMode mode = synchronousMode, const int& stream = 0 ); }; @@ -148,7 +153,9 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > > const CoordinatesType end, UserData& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) -// const int& stream = 0, + //GridTraverserMode mode = synchronousMode, + GridTraverserMode mode, + // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. orientation and basis for faces) @@ -186,7 +193,9 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > > const CoordinatesType& end, UserData& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) -// const int& stream = 0, + //GridTraverserMode mode = synchronousMode, + GridTraverserMode mode, + // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. orientation and basis for faces) @@ -224,7 +233,9 @@ class GridTraverser< Meshes::Grid< 2, Real, Devices::MIC, Index > > const CoordinatesType& end, UserData& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) -// const int& stream = 0, + //GridTraverserMode mode = synchronousMode, + GridTraverserMode mode, + // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. orientation and basis for faces) @@ -263,7 +274,9 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::Host, Index > > const CoordinatesType end, UserData& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) -// const int& stream = 0, + //GridTraverserMode mode = synchronousMode, + GridTraverserMode mode, + // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. orientation and basis for faces and edges) @@ -302,7 +315,9 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::Cuda, Index > > const CoordinatesType& end, UserData& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) -// const int& stream = 0, + //GridTraverserMode mode = synchronousMode, + GridTraverserMode mode, + // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. orientation and basis for faces and edges) @@ -341,7 +356,9 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::MIC, Index > > const CoordinatesType& end, UserData& userData, // FIXME: hack around nvcc bug (error: default argument not at end of parameter list) -// const int& stream = 0, + //GridTraverserMode mode = synchronousMode, + GridTraverserMode mode, + // const int& stream = 0, const int& stream, // gridEntityParameters are passed to GridEntity's constructor // (i.e. orientation and basis for faces and edges) @@ -351,5 +368,7 @@ class GridTraverser< Meshes::Grid< 3, Real, Devices::MIC, Index > > } // namespace Meshes } // namespace TNL -#include +#include +#include +#include diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp new file mode 100644 index 0000000000000000000000000000000000000000..5b35d5be972516d945da497837fc5711d62bb693 --- /dev/null +++ b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp @@ -0,0 +1,322 @@ +/*************************************************************************** + GridTraverser_1D.hpp - description + ------------------- + begin : Jan 4, 2019 + copyright : (C) 2019 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber, +// Jakub Klinkovsky, +// Vit Hanousek + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace TNL { +namespace Meshes { + +/**** + * 1D traverser, host + */ +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities > +void +GridTraverser< Meshes::Grid< 1, Real, Devices::Host, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType begin, + const CoordinatesType end, + UserData& userData, + GridTraverserMode mode, + const int& stream ) +{ + GridEntity entity( *gridPointer ); + if( processOnlyBoundaryEntities ) + { + GridEntity entity( *gridPointer ); + + entity.getCoordinates() = begin; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + entity.getCoordinates() = end; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + else + { +#ifdef HAVE_OPENMP + if( Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 ) + { +#pragma omp parallel firstprivate( begin, end ) + { + GridEntity entity( *gridPointer ); +#pragma omp for + // TODO: g++ 5.5 crashes when coding this loop without auxiliary x as bellow + for( IndexType x = begin.x(); x <= end.x(); x++ ) + { + entity.getCoordinates().x() = x; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } + } + else + { + GridEntity entity( *gridPointer ); + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } +#else + GridEntity entity( *gridPointer ); + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +#endif + } +} + +/**** + * 1D traverser, CUDA + */ +#ifdef HAVE_CUDA +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor > +__global__ void +GridTraverser1D( + const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid, + UserData userData, + const typename GridEntity::CoordinatesType begin, + const typename GridEntity::CoordinatesType end, + const Index gridIdx ) +{ + typedef Real RealType; + typedef Index IndexType; + typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( coordinates <= end ) + { + GridEntity entity( *grid, coordinates ); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +} + +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor > +__global__ void +GridBoundaryTraverser1D( + const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid, + UserData userData, + const typename GridEntity::CoordinatesType begin, + const typename GridEntity::CoordinatesType end ) +{ + typedef Real RealType; + typedef Index IndexType; + typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + if( threadIdx.x == 0 ) + { + coordinates.x() = begin.x(); + GridEntity entity( *grid, coordinates ); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + if( threadIdx.x == 1 ) + { + coordinates.x() = end.x(); + GridEntity entity( *grid, coordinates ); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +} + +#endif + +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities > +void +GridTraverser< Meshes::Grid< 1, Real, Devices::Cuda, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType& begin, + const CoordinatesType& end, + UserData& userData, + GridTraverserMode mode, + const int& stream ) +{ +#ifdef HAVE_CUDA + auto& pool = CudaStreamPool::getInstance(); + const cudaStream_t& s = pool.getStream( stream ); + + //Devices::Cuda::synchronizeDevice(); + if( processOnlyBoundaryEntities ) + { + dim3 cudaBlockSize( 2 ); + dim3 cudaBlocks( 1 ); + GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > + <<< cudaBlocks, cudaBlockSize, 0, s >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin, + end ); + } + else + { + dim3 blockSize( 256 ), blocksCount, gridsCount; + Devices::Cuda::setupThreads( + blockSize, + blocksCount, + gridsCount, + end.x() - begin.x() + 1 ); + dim3 gridIdx; + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) + { + dim3 gridSize; + Devices::Cuda::setupGrid( + blocksCount, + gridsCount, + gridIdx, + gridSize ); + GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > + <<< blocksCount, blockSize, 0, s >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin, + end, + gridIdx.x ); + } + + /*dim3 cudaBlockSize( 256 ); + dim3 cudaBlocks; + cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); + const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); + + for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) + GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > + <<< cudaBlocks, cudaBlockSize, 0, s >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin, + end, + gridXIdx );*/ + } + +#ifdef NDEBUG + if( mode == synchronousMode ) + { + cudaStreamSynchronize( s ); + TNL_CHECK_CUDA_DEVICE; + } +#else + cudaStreamSynchronize( s ); + TNL_CHECK_CUDA_DEVICE; +#endif + +#else + throw Exceptions::CudaSupportMissing(); +#endif +} + +/**** + * 1D traverser, MIC + */ + +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities > +void +GridTraverser< Meshes::Grid< 1, Real, Devices::MIC, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType& begin, + const CoordinatesType& end, + UserData& userData, + GridTraverserMode mode, + const int& stream ) +{ + std::cout << "Not Implemented yet Grid Traverser <1, Real, Device::MIC>" << std::endl; +/* + auto& pool = CudaStreamPool::getInstance(); + const cudaStream_t& s = pool.getStream( stream ); + + Devices::Cuda::synchronizeDevice(); + if( processOnlyBoundaryEntities ) + { + dim3 cudaBlockSize( 2 ); + dim3 cudaBlocks( 1 ); + GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > + <<< cudaBlocks, cudaBlockSize, 0, s >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin, + end ); + } + else + { + dim3 cudaBlockSize( 256 ); + dim3 cudaBlocks; + cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); + const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); + + for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) + GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > + <<< cudaBlocks, cudaBlockSize, 0, s >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin, + end, + gridXIdx ); + } + + // only launches into the stream 0 are synchronized + if( stream == 0 ) + { + cudaStreamSynchronize( s ); + TNL_CHECK_CUDA_DEVICE; + } +*/ +} + + } // namespace Meshes +} // namespace TNL diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp new file mode 100644 index 0000000000000000000000000000000000000000..50b30c0190bdda8c6c266385ecd785884f3282ac --- /dev/null +++ b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp @@ -0,0 +1,656 @@ +/*************************************************************************** + GridTraverser_2D.hpp - description + ------------------- + begin : Jan 4, 2019 + copyright : (C) 2019 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace TNL { +namespace Meshes { + +//#define GRID_TRAVERSER_USE_STREAMS + + +/**** + * 2D traverser, host + */ +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities, + int XOrthogonalBoundary, + int YOrthogonalBoundary, + typename... GridEntityParameters > +void +GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType begin, + const CoordinatesType end, + UserData& userData, + GridTraverserMode mode, + const int& stream, + const GridEntityParameters&... gridEntityParameters ) +{ + if( processOnlyBoundaryEntities ) + { + GridEntity entity( *gridPointer, begin, gridEntityParameters... ); + + if( YOrthogonalBoundary ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.getCoordinates().y() = begin.y(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + entity.getCoordinates().y() = end.y(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + if( XOrthogonalBoundary ) + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + { + entity.getCoordinates().x() = begin.x(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + entity.getCoordinates().x() = end.x(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } + else + { +#ifdef HAVE_OPENMP + if( Devices::Host::isOMPEnabled() ) + { +#pragma omp parallel firstprivate( begin, end ) + { + GridEntity entity( *gridPointer ); +#pragma omp for + // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow + for( IndexType y = begin.y(); y <= end.y(); y ++ ) + for( IndexType x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.getCoordinates().y() = y; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } + } + else + { + GridEntity entity( *gridPointer ); + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } +#else + GridEntity entity( *gridPointer ); + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +#endif + } +} + +/**** + * 2D traverser, CUDA + */ +#ifdef HAVE_CUDA +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor, + bool processOnlyBoundaryEntities, + typename... GridEntityParameters > +__global__ void +GridTraverser2D( + const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, + UserData userData, + const typename GridEntity::CoordinatesType begin, + const typename GridEntity::CoordinatesType end, + const dim3 gridIdx, + const GridEntityParameters... gridEntityParameters ) +{ + typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); + + if( coordinates <= end ) + { + GridEntity entity( *grid, coordinates, gridEntityParameters... ); + entity.refresh(); + if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() ) + { + EntitiesProcessor::processEntity + ( *grid, + userData, + entity ); + } + } +} + +// Boundary traverser using streams +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor, + bool processOnlyBoundaryEntities, + typename... GridEntityParameters > +__global__ void +GridTraverser2DBoundaryAlongX( + const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, + UserData userData, + const Index beginX, + const Index endX, + const Index fixedY, + const dim3 gridIdx, + const GridEntityParameters... gridEntityParameters ) +{ + typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.y() = fixedY; + + if( coordinates.x() <= endX ) + { + GridEntity entity( *grid, coordinates, gridEntityParameters... ); + entity.refresh(); + EntitiesProcessor::processEntity + ( *grid, + userData, + entity ); + } +} + +// Boundary traverser using streams +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor, + bool processOnlyBoundaryEntities, + typename... GridEntityParameters > +__global__ void +GridTraverser2DBoundaryAlongY( + const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, + UserData userData, + const Index beginY, + const Index endY, + const Index fixedX, + const dim3 gridIdx, + const GridEntityParameters... gridEntityParameters ) +{ + typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = fixedX; + coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + + if( coordinates.y() <= endY ) + { + GridEntity entity( *grid, coordinates, gridEntityParameters... ); + entity.refresh(); + EntitiesProcessor::processEntity + ( *grid, + userData, + entity ); + } +} + + +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor, + bool processOnlyBoundaryEntities, + typename... GridEntityParameters > +__global__ void +GridTraverser2DBoundary( + const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, + UserData userData, + const Index beginX, + const Index endX, + const Index beginY, + const Index endY, + const Index blocksPerFace, + const dim3 gridIdx, + const GridEntityParameters... gridEntityParameters ) +{ + using GridType = Meshes::Grid< 2, Real, Devices::Cuda, Index >; + using CoordinatesType = typename GridType::CoordinatesType; + + const Index faceIdx = blockIdx.x / blocksPerFace; + const Index faceBlockIdx = blockIdx.x % blocksPerFace; + const Index threadId = faceBlockIdx * blockDim. x + threadIdx.x; + if( faceIdx < 2 ) + { + const Index entitiesAlongX = endX - beginX + 1; + if( threadId < entitiesAlongX ) + { + GridEntity entity( *grid, + CoordinatesType( beginX + threadId, faceIdx == 0 ? beginY : endY ), + gridEntityParameters... ); + //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); + entity.refresh(); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + } + else + { + const Index entitiesAlongY = endY - beginY - 1; + if( threadId < entitiesAlongY ) + { + GridEntity entity( *grid, + CoordinatesType( faceIdx == 2 ? beginX : endX, beginY + threadId + 1 ), + gridEntityParameters... ); + //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); + entity.refresh(); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + } + + + + /*const Index aux = max( entitiesAlongX, entitiesAlongY ); + const Index& warpSize = Devices::Cuda::getWarpSize(); + const Index threadsPerAxis = warpSize * ( aux / warpSize + ( aux % warpSize != 0 ) ); + + Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + GridEntity entity( *grid, + CoordinatesType( 0, 0 ), + gridEntityParameters... ); + CoordinatesType& coordinates = entity.getCoordinates(); + const Index axisIndex = threadId / threadsPerAxis; + //printf( "axisIndex %d, threadId %d thradsPerAxis %d \n", axisIndex, threadId, threadsPerAxis ); + threadId -= axisIndex * threadsPerAxis; + switch( axisIndex ) + { + case 1: + coordinates = CoordinatesType( beginX + threadId, beginY ); + if( threadId < entitiesAlongX ) + { + //printf( "X1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() ); + entity.refresh(); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + break; + case 2: + coordinates = CoordinatesType( beginX + threadId, endY ); + if( threadId < entitiesAlongX ) + { + //printf( "X2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() ); + entity.refresh(); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + break; + case 3: + coordinates = CoordinatesType( beginX, beginY + threadId + 1 ); + if( threadId < entitiesAlongY ) + { + //printf( "Y1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() ); + entity.refresh(); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + break; + case 4: + coordinates = CoordinatesType( endX, beginY + threadId + 1 ); + if( threadId < entitiesAlongY ) + { + //printf( "Y2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() ); + entity.refresh(); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + break; + }*/ + + /*if( threadId < entitiesAlongX ) + { + GridEntity entity( *grid, + CoordinatesType( beginX + threadId, beginY ), + gridEntityParameters... ); + //printf( "X1: Thread %d -> %d %d x %d %d \n ", threadId, + // entity.getCoordinates().x(), entity.getCoordinates().y(), + // grid->getDimensions().x(), grid->getDimensions().y() ); + entity.refresh(); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + else if( ( threadId -= entitiesAlongX ) < entitiesAlongX && threadId >= 0 ) + { + GridEntity entity( *grid, + CoordinatesType( beginX + threadId, endY ), + gridEntityParameters... ); + entity.refresh(); + //printf( "X2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + else if( ( ( threadId -= entitiesAlongX ) < entitiesAlongY - 1 ) && threadId >= 0 ) + { + GridEntity entity( *grid, + CoordinatesType( beginX, beginY + threadId + 1 ), + gridEntityParameters... ); + entity.refresh(); + //printf( "Y1: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); + EntitiesProcessor::processEntity( *grid, userData, entity ); + } + else if( ( ( threadId -= entitiesAlongY - 1 ) < entitiesAlongY - 1 ) && threadId >= 0 ) + { + GridEntity entity( *grid, + CoordinatesType( endX, beginY + threadId + 1 ), + gridEntityParameters... ); + entity.refresh(); + //printf( "Y2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); + EntitiesProcessor::processEntity( *grid, userData, entity ); + }*/ +} + + +#endif // HAVE_CUDA + +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities, + int XOrthogonalBoundary, + int YOrthogonalBoundary, + typename... GridEntityParameters > +void +GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType& begin, + const CoordinatesType& end, + UserData& userData, + GridTraverserMode mode, + const int& stream, + const GridEntityParameters&... gridEntityParameters ) +{ +#ifdef HAVE_CUDA + if( processOnlyBoundaryEntities && + ( GridEntity::getEntityDimension() == 2 || GridEntity::getEntityDimension() == 0 ) ) + { +#ifdef GRID_TRAVERSER_USE_STREAMS + dim3 cudaBlockSize( 256 ); + dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX, + cudaBlocksCountAlongY, cudaGridsCountAlongY; + Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 ); + Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 ); + + auto& pool = CudaStreamPool::getInstance(); + Devices::Cuda::synchronizeDevice(); + + const cudaStream_t& s1 = pool.getStream( stream ); + const cudaStream_t& s2 = pool.getStream( stream + 1 ); + dim3 gridIdx, cudaGridSize; + for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongX.x; gridIdx.x++ ) + { + Devices::Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize ); + //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX ); + GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaGridSize, cudaBlockSize, 0, s1 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.x(), + end.x(), + begin.y(), + gridIdx, + gridEntityParameters... ); + GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaGridSize, cudaBlockSize, 0, s2 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.x(), + end.x(), + end.y(), + gridIdx, + gridEntityParameters... ); + } + const cudaStream_t& s3 = pool.getStream( stream + 2 ); + const cudaStream_t& s4 = pool.getStream( stream + 3 ); + for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongY.x; gridIdx.x++ ) + { + Devices::Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize ); + GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaGridSize, cudaBlockSize, 0, s3 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.y() + 1, + end.y() - 1, + begin.x(), + gridIdx, + gridEntityParameters... ); + GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaGridSize, cudaBlockSize, 0, s4 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.y() + 1, + end.y() - 1, + end.x(), + gridIdx, + gridEntityParameters... ); + } + cudaStreamSynchronize( s1 ); + cudaStreamSynchronize( s2 ); + cudaStreamSynchronize( s3 ); + cudaStreamSynchronize( s4 ); +#else // not defined GRID_TRAVERSER_USE_STREAMS + dim3 cudaBlockSize( 256 ); + dim3 cudaBlocksCount, cudaGridsCount; + const IndexType entitiesAlongX = end.x() - begin.x() + 1; + const IndexType entitiesAlongY = end.x() - begin.x() - 1; + const IndexType maxFaceSize = max( entitiesAlongX, entitiesAlongY ); + const IndexType blocksPerFace = maxFaceSize / cudaBlockSize.x + ( maxFaceSize % cudaBlockSize.x != 0 ); + IndexType cudaThreadsCount = 4 * cudaBlockSize.x * blocksPerFace; + Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount ); + //std::cerr << "blocksPerFace = " << blocksPerFace << "Threads count = " << cudaThreadsCount + // << "cudaBlockCount = " << cudaBlocksCount.x << std::endl; + dim3 gridIdx, cudaGridSize; + Devices::Cuda::synchronizeDevice(); + for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ ) + { + Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize ); + //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX ); + GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaGridSize, cudaBlockSize >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.x(), + end.x(), + begin.y(), + end.y(), + blocksPerFace, + gridIdx, + gridEntityParameters... ); + } +#endif //GRID_TRAVERSER_USE_STREAMS + //getchar(); + TNL_CHECK_CUDA_DEVICE; + } + else + { + dim3 cudaBlockSize( 16, 16 ); + dim3 cudaBlocksCount, cudaGridsCount; + Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, + end.x() - begin.x() + 1, + end.y() - begin.y() + 1 ); + + auto& pool = CudaStreamPool::getInstance(); + const cudaStream_t& s = pool.getStream( stream ); + + Devices::Cuda::synchronizeDevice(); + dim3 gridIdx, cudaGridSize; + for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ ) + for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ ) + { + Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize ); + //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount ); + GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaGridSize, cudaBlockSize, 0, s >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin, + end, + gridIdx, + gridEntityParameters... ); + } + +#ifdef NDEBUG + if( mode == synchronousMode ) + { + cudaStreamSynchronize( s ); + TNL_CHECK_CUDA_DEVICE; + } +#else + cudaStreamSynchronize( s ); + TNL_CHECK_CUDA_DEVICE; +#endif + } + +#else + throw Exceptions::CudaSupportMissing(); +#endif +} + + +/**** + * 2D traverser, MIC + */ +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities, + int XOrthogonalBoundary, + int YOrthogonalBoundary, + typename... GridEntityParameters > +void +GridTraverser< Meshes::Grid< 2, Real, Devices::MIC, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType& begin, + const CoordinatesType& end, + UserData& userData, + GridTraverserMode mode, + const int& stream, + const GridEntityParameters&... gridEntityParameters ) +{ + + +#ifdef HAVE_MIC + Devices::MIC::synchronizeDevice(); + + //TOHLE JE PRUSER -- nemim poslat vypustku -- + //GridEntity entity( gridPointer.template getData< Devices::MIC >(), begin, gridEntityParameters... ); + + + Devices::MICHider hMicGrid; + hMicGrid.pointer=& gridPointer.template getData< Devices::MIC >(); + Devices::MICHider hMicUserData; + hMicUserData.pointer=& userDataPointer.template modifyData(); + TNLMICSTRUCT(begin, const CoordinatesType); + TNLMICSTRUCT(end, const CoordinatesType); + + #pragma offload target(mic) in(sbegin,send,hMicUserData,hMicGrid) + { + + #pragma omp parallel firstprivate( sbegin, send ) + { + TNLMICSTRUCTUSE(begin, const CoordinatesType); + TNLMICSTRUCTUSE(end, const CoordinatesType); + GridEntity entity( *(hMicGrid.pointer), *(kernelbegin) ); + + if( processOnlyBoundaryEntities ) + { + if( YOrthogonalBoundary ) + #pragma omp for + for( auto k = kernelbegin->x(); + k <= kernelend->x(); + k ++ ) + { + entity.getCoordinates().x() = k; + entity.getCoordinates().y() = kernelbegin->y(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity ); + entity.getCoordinates().y() = kernelend->y(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity ); + } + if( XOrthogonalBoundary ) + #pragma omp for + for( auto k = kernelbegin->y(); + k <= kernelend->y(); + k ++ ) + { + entity.getCoordinates().y() = k; + entity.getCoordinates().x() = kernelbegin->x(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity ); + entity.getCoordinates().x() = kernelend->x(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity ); + } + } + else + { + #pragma omp for + for( IndexType y = kernelbegin->y(); y <= kernelend->y(); y ++ ) + for( IndexType x = kernelbegin->x(); x <= kernelend->x(); x ++ ) + { + // std::cerr << x << " " < +#include +#include +#include +#include +#include + +namespace TNL { +namespace Meshes { + + +/**** + * 3D traverser, host + */ +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities, + int XOrthogonalBoundary, + int YOrthogonalBoundary, + int ZOrthogonalBoundary, + typename... GridEntityParameters > +void +GridTraverser< Meshes::Grid< 3, Real, Devices::Host, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType begin, + const CoordinatesType end, + UserData& userData, + GridTraverserMode mode, + const int& stream, + const GridEntityParameters&... gridEntityParameters ) +{ + if( processOnlyBoundaryEntities ) + { + GridEntity entity( *gridPointer, begin, gridEntityParameters... ); + + if( ZOrthogonalBoundary ) + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.getCoordinates().z() = begin.z(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + entity.getCoordinates().z() = end.z(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + if( YOrthogonalBoundary ) + for( entity.getCoordinates().z() = begin.z(); + entity.getCoordinates().z() <= end.z(); + entity.getCoordinates().z() ++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.getCoordinates().y() = begin.y(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + entity.getCoordinates().y() = end.y(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + if( XOrthogonalBoundary ) + for( entity.getCoordinates().z() = begin.z(); + entity.getCoordinates().z() <= end.z(); + entity.getCoordinates().z() ++ ) + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + { + entity.getCoordinates().x() = begin.x(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + entity.getCoordinates().x() = end.x(); + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } + else + { +#ifdef HAVE_OPENMP + if( Devices::Host::isOMPEnabled() ) + { +#pragma omp parallel firstprivate( begin, end ) + { + GridEntity entity( *gridPointer ); +#pragma omp for + // TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow + for( IndexType z = begin.z(); z <= end.z(); z ++ ) + for( IndexType y = begin.y(); y <= end.y(); y ++ ) + for( IndexType x = begin.x(); x <= end.x(); x ++ ) + { + entity.getCoordinates().x() = x; + entity.getCoordinates().y() = y; + entity.getCoordinates().z() = z; + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } + } + else + { + GridEntity entity( *gridPointer ); + for( entity.getCoordinates().z() = begin.z(); + entity.getCoordinates().z() <= end.z(); + entity.getCoordinates().z() ++ ) + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } + } +#else + GridEntity entity( *gridPointer ); + for( entity.getCoordinates().z() = begin.z(); + entity.getCoordinates().z() <= end.z(); + entity.getCoordinates().z() ++ ) + for( entity.getCoordinates().y() = begin.y(); + entity.getCoordinates().y() <= end.y(); + entity.getCoordinates().y() ++ ) + for( entity.getCoordinates().x() = begin.x(); + entity.getCoordinates().x() <= end.x(); + entity.getCoordinates().x() ++ ) + { + entity.refresh(); + EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); + } +#endif + } +} + +/**** + * 3D traverser, CUDA + */ +#ifdef HAVE_CUDA +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor, + bool processOnlyBoundaryEntities, + typename... GridEntityParameters > +__global__ void +GridTraverser3D( + const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, + UserData userData, + const typename GridEntity::CoordinatesType begin, + const typename GridEntity::CoordinatesType end, + const dim3 gridIdx, + const GridEntityParameters... gridEntityParameters ) +{ + typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); + coordinates.z() = begin.z() + Devices::Cuda::getGlobalThreadIdx_z( gridIdx ); + + if( coordinates <= end ) + { + GridEntity entity( *grid, coordinates, gridEntityParameters... ); + entity.refresh(); + if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() ) + { + EntitiesProcessor::processEntity + ( *grid, + userData, + entity ); + } + } +} + +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor, + bool processOnlyBoundaryEntities, + typename... GridEntityParameters > +__global__ void +GridTraverser3DBoundaryAlongXY( + const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, + UserData userData, + const Index beginX, + const Index endX, + const Index beginY, + const Index endY, + const Index fixedZ, + const dim3 gridIdx, + const GridEntityParameters... gridEntityParameters ) +{ + typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); + coordinates.z() = fixedZ; + + if( coordinates.x() <= endX && coordinates.y() <= endY ) + { + GridEntity entity( *grid, coordinates, gridEntityParameters... ); + entity.refresh(); + EntitiesProcessor::processEntity + ( *grid, + userData, + entity ); + } +} + +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor, + bool processOnlyBoundaryEntities, + typename... GridEntityParameters > +__global__ void +GridTraverser3DBoundaryAlongXZ( + const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, + UserData userData, + const Index beginX, + const Index endX, + const Index beginZ, + const Index endZ, + const Index fixedY, + const dim3 gridIdx, + const GridEntityParameters... gridEntityParameters ) +{ + typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.y() = fixedY; + coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); + + if( coordinates.x() <= endX && coordinates.z() <= endZ ) + { + GridEntity entity( *grid, coordinates, gridEntityParameters... ); + entity.refresh(); + EntitiesProcessor::processEntity + ( *grid, + userData, + entity ); + } +} + +template< typename Real, + typename Index, + typename GridEntity, + typename UserData, + typename EntitiesProcessor, + bool processOnlyBoundaryEntities, + typename... GridEntityParameters > +__global__ void +GridTraverser3DBoundaryAlongYZ( + const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, + UserData userData, + const Index beginY, + const Index endY, + const Index beginZ, + const Index endZ, + const Index fixedX, + const dim3 gridIdx, + const GridEntityParameters... gridEntityParameters ) +{ + typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; + typename GridType::CoordinatesType coordinates; + + coordinates.x() = fixedX; + coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); + + if( coordinates.y() <= endY && coordinates.z() <= endZ ) + { + GridEntity entity( *grid, coordinates, gridEntityParameters... ); + entity.refresh(); + EntitiesProcessor::processEntity + ( *grid, + userData, + entity ); + } +} +#endif + +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities, + int XOrthogonalBoundary, + int YOrthogonalBoundary, + int ZOrthogonalBoundary, + typename... GridEntityParameters > +void +GridTraverser< Meshes::Grid< 3, Real, Devices::Cuda, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType& begin, + const CoordinatesType& end, + UserData& userData, + GridTraverserMode mode, + const int& stream, + const GridEntityParameters&... gridEntityParameters ) +{ +#ifdef HAVE_CUDA + if( processOnlyBoundaryEntities && + ( GridEntity::getEntityDimension() == 3 || GridEntity::getEntityDimension() == 0 ) ) + { + dim3 cudaBlockSize( 16, 16 ); + const IndexType entitiesAlongX = end.x() - begin.x() + 1; + const IndexType entitiesAlongY = end.y() - begin.y() + 1; + const IndexType entitiesAlongZ = end.z() - begin.z() + 1; + + dim3 cudaBlocksCountAlongXY, cudaBlocksCountAlongXZ, cudaBlocksCountAlongYZ, + cudaGridsCountAlongXY, cudaGridsCountAlongXZ, cudaGridsCountAlongYZ; + + Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXY, cudaGridsCountAlongXY, entitiesAlongX, entitiesAlongY ); + Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, entitiesAlongX, entitiesAlongZ - 2 ); + Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, entitiesAlongY - 2, entitiesAlongZ - 2 ); + + auto& pool = CudaStreamPool::getInstance(); + Devices::Cuda::synchronizeDevice(); + + const cudaStream_t& s1 = pool.getStream( stream ); + const cudaStream_t& s2 = pool.getStream( stream + 1 ); + const cudaStream_t& s3 = pool.getStream( stream + 2 ); + const cudaStream_t& s4 = pool.getStream( stream + 3 ); + const cudaStream_t& s5 = pool.getStream( stream + 4 ); + const cudaStream_t& s6 = pool.getStream( stream + 5 ); + + dim3 gridIdx, gridSize; + for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXY.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXY.x; gridIdx.x++ ) + { + Devices::Cuda::setupGrid( cudaBlocksCountAlongXY, cudaGridsCountAlongXY, gridIdx, gridSize ); + GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaBlocksCountAlongXY, cudaBlockSize, 0 , s1 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.x(), + end.x(), + begin.y(), + end.y(), + begin.z(), + gridIdx, + gridEntityParameters... ); + GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaBlocksCountAlongXY, cudaBlockSize, 0, s2 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.x(), + end.x(), + begin.y(), + end.y(), + end.z(), + gridIdx, + gridEntityParameters... ); + } + for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXZ.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXZ.x; gridIdx.x++ ) + { + Devices::Cuda::setupGrid( cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, gridIdx, gridSize ); + GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s3 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.x(), + end.x(), + begin.z() + 1, + end.z() - 1, + begin.y(), + gridIdx, + gridEntityParameters... ); + GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s4 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.x(), + end.x(), + begin.z() + 1, + end.z() - 1, + end.y(), + gridIdx, + gridEntityParameters... ); + } + for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongYZ.y; gridIdx.y++ ) + for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongYZ.x; gridIdx.x++ ) + { + Devices::Cuda::setupGrid( cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, gridIdx, gridSize ); + GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s5 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.y() + 1, + end.y() - 1, + begin.z() + 1, + end.z() - 1, + begin.x(), + gridIdx, + gridEntityParameters... ); + GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s6 >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin.y() + 1, + end.y() - 1, + begin.z() + 1, + end.z() - 1, + end.x(), + gridIdx, + gridEntityParameters... ); + } + cudaStreamSynchronize( s1 ); + cudaStreamSynchronize( s2 ); + cudaStreamSynchronize( s3 ); + cudaStreamSynchronize( s4 ); + cudaStreamSynchronize( s5 ); + cudaStreamSynchronize( s6 ); + TNL_CHECK_CUDA_DEVICE; + } + else + { + dim3 cudaBlockSize( 8, 8, 8 ); + dim3 cudaBlocksCount, cudaGridsCount; + + Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, + end.x() - begin.x() + 1, + end.y() - begin.y() + 1, + end.z() - begin.z() + 1 ); + + auto& pool = CudaStreamPool::getInstance(); + const cudaStream_t& s = pool.getStream( stream ); + + Devices::Cuda::synchronizeDevice(); + dim3 gridIdx, gridSize; + for( gridIdx.z = 0; gridIdx.z < cudaGridsCount.z; gridIdx.z ++ ) + for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ ) + for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ ) + { + Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, gridSize ); + GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< gridSize, cudaBlockSize, 0, s >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin, + end, + gridIdx, + gridEntityParameters... ); + } + + // only launches into the stream 0 are synchronized + if( stream == 0 ) + { + cudaStreamSynchronize( s ); + TNL_CHECK_CUDA_DEVICE; + } + } +#else + throw Exceptions::CudaSupportMissing(); +#endif +} + +/**** + * 3D traverser, MIC + */ +template< typename Real, + typename Index > + template< + typename GridEntity, + typename EntitiesProcessor, + typename UserData, + bool processOnlyBoundaryEntities, + int XOrthogonalBoundary, + int YOrthogonalBoundary, + int ZOrthogonalBoundary, + typename... GridEntityParameters > +void +GridTraverser< Meshes::Grid< 3, Real, Devices::MIC, Index > >:: +processEntities( + const GridPointer& gridPointer, + const CoordinatesType& begin, + const CoordinatesType& end, + UserData& userData, + GridTraverserMode mode, + const int& stream, + const GridEntityParameters&... gridEntityParameters ) +{ + std::cout << "Not Implemented yet Grid Traverser <3, Real, Device::MIC>" << std::endl; + +/* HAVE_CUDA + dim3 cudaBlockSize( 8, 8, 8 ); + dim3 cudaBlocks; + cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); + cudaBlocks.y = Devices::Cuda::getNumberOfBlocks( end.y() - begin.y() + 1, cudaBlockSize.y ); + cudaBlocks.z = Devices::Cuda::getNumberOfBlocks( end.z() - begin.z() + 1, cudaBlockSize.z ); + const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); + const IndexType cudaYGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.y ); + const IndexType cudaZGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.z ); + + auto& pool = CudaStreamPool::getInstance(); + const cudaStream_t& s = pool.getStream( stream ); + + Devices::Cuda::synchronizeDevice(); + for( IndexType gridZIdx = 0; gridZIdx < cudaZGrids; gridZIdx ++ ) + for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ ) + for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) + GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > + <<< cudaBlocks, cudaBlockSize, 0, s >>> + ( &gridPointer.template getData< Devices::Cuda >(), + userData, + begin, + end, + gridXIdx, + gridYIdx, + gridZIdx, + gridEntityParameters... ); + + // only launches into the stream 0 are synchronized + if( stream == 0 ) + { + cudaStreamSynchronize( s ); + TNL_CHECK_CUDA_DEVICE; + } + */ +} + } // namespace Meshes +} // namespace TNL diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h b/src/TNL/Meshes/GridDetails/GridTraverser_impl.h deleted file mode 100644 index 258325a768cde7c37fdecedd34829c07a1374bc8..0000000000000000000000000000000000000000 --- a/src/TNL/Meshes/GridDetails/GridTraverser_impl.h +++ /dev/null @@ -1,1389 +0,0 @@ -/*************************************************************************** - GridTraverser_impl.h - description - ------------------- - begin : Jan 2, 2016 - copyright : (C) 2016 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#include - -#pragma once - -//#define GRID_TRAVERSER_USE_STREAMS - -#include "GridTraverser.h" - -#include - -namespace TNL { -namespace Meshes { - -/**** - * 1D traverser, host - */ -template< typename Real, - typename Index > - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities > -void -GridTraverser< Meshes::Grid< 1, Real, Devices::Host, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType begin, - const CoordinatesType end, - UserData& userData, - const int& stream ) -{ - GridEntity entity( *gridPointer ); - if( processOnlyBoundaryEntities ) - { - GridEntity entity( *gridPointer ); - - entity.getCoordinates() = begin; - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - entity.getCoordinates() = end; - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - else - { - //TODO: This does not work with gcc-5.4 and older, should work at gcc 6.x -/*#pragma omp parallel for firstprivate( entity, begin, end ) if( Devices::Host::isOMPEnabled() ) - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - }*/ -#ifdef HAVE_OPENMP -#pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() ) -#endif - { - GridEntity entity( *gridPointer ); -#ifdef HAVE_OPENMP -#pragma omp for -#endif - for( IndexType x = begin.x(); x <= end.x(); x ++ ) - { - entity.getCoordinates().x() = x; - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - } - - } -} - -/**** - * 1D traverser, CUDA - */ -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor > -__global__ void -GridTraverser1D( - const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid, - UserData userData, - const typename GridEntity::CoordinatesType begin, - const typename GridEntity::CoordinatesType end, - const Index gridIdx ) -{ - typedef Real RealType; - typedef Index IndexType; - typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - if( coordinates <= end ) - { - GridEntity entity( *grid, coordinates ); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } -} - -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor > -__global__ void -GridBoundaryTraverser1D( - const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid, - UserData userData, - const typename GridEntity::CoordinatesType begin, - const typename GridEntity::CoordinatesType end ) -{ - typedef Real RealType; - typedef Index IndexType; - typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - if( threadIdx.x == 0 ) - { - coordinates.x() = begin.x(); - GridEntity entity( *grid, coordinates ); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - if( threadIdx.x == 1 ) - { - coordinates.x() = end.x(); - GridEntity entity( *grid, coordinates ); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } -} - -#endif - -template< typename Real, - typename Index > - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities > -void -GridTraverser< Meshes::Grid< 1, Real, Devices::Cuda, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType& begin, - const CoordinatesType& end, - UserData& userData, - const int& stream ) -{ -#ifdef HAVE_CUDA - auto& pool = CudaStreamPool::getInstance(); - const cudaStream_t& s = pool.getStream( stream ); - - Devices::Cuda::synchronizeDevice(); - if( processOnlyBoundaryEntities ) - { - dim3 cudaBlockSize( 2 ); - dim3 cudaBlocks( 1 ); - GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > - <<< cudaBlocks, cudaBlockSize, 0, s >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin, - end ); - } - else - { - dim3 cudaBlockSize( 256 ); - dim3 cudaBlocks; - cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); - const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); - - for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) - GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > - <<< cudaBlocks, cudaBlockSize, 0, s >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin, - end, - gridXIdx ); - } - - // only launches into the stream 0 are synchronized - if( stream == 0 ) - { - cudaStreamSynchronize( s ); - TNL_CHECK_CUDA_DEVICE; - } -#else - throw Exceptions::CudaSupportMissing(); -#endif -} - -/**** - * 1D traverser, MIC - */ - -template< typename Real, - typename Index > - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities > -void -GridTraverser< Meshes::Grid< 1, Real, Devices::MIC, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType& begin, - const CoordinatesType& end, - UserData& userData, - const int& stream ) -{ - std::cout << "Not Implemented yet Grid Traverser <1, Real, Device::MIC>" << std::endl; -/* - auto& pool = CudaStreamPool::getInstance(); - const cudaStream_t& s = pool.getStream( stream ); - - Devices::Cuda::synchronizeDevice(); - if( processOnlyBoundaryEntities ) - { - dim3 cudaBlockSize( 2 ); - dim3 cudaBlocks( 1 ); - GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > - <<< cudaBlocks, cudaBlockSize, 0, s >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin, - end ); - } - else - { - dim3 cudaBlockSize( 256 ); - dim3 cudaBlocks; - cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); - const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); - - for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) - GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > - <<< cudaBlocks, cudaBlockSize, 0, s >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin, - end, - gridXIdx ); - } - - // only launches into the stream 0 are synchronized - if( stream == 0 ) - { - cudaStreamSynchronize( s ); - TNL_CHECK_CUDA_DEVICE; - } -*/ -} - -/**** - * 2D traverser, host - */ -template< typename Real, - typename Index > - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities, - int XOrthogonalBoundary, - int YOrthogonalBoundary, - typename... GridEntityParameters > -void -GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType begin, - const CoordinatesType end, - UserData& userData, - const int& stream, - const GridEntityParameters&... gridEntityParameters ) -{ - if( processOnlyBoundaryEntities ) - { - GridEntity entity( *gridPointer, begin, gridEntityParameters... ); - - if( YOrthogonalBoundary ) - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.getCoordinates().y() = begin.y(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - entity.getCoordinates().y() = end.y(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - if( XOrthogonalBoundary ) - for( entity.getCoordinates().y() = begin.y(); - entity.getCoordinates().y() <= end.y(); - entity.getCoordinates().y() ++ ) - { - entity.getCoordinates().x() = begin.x(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - entity.getCoordinates().x() = end.x(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - } - else - { - //TODO: This does not work with gcc-5.4 and older, should work at gcc 6.x -/*#pragma omp parallel for firstprivate( entity, begin, end ) if( Devices::Host::isOMPEnabled() ) - for( entity.getCoordinates().y() = begin.y(); - entity.getCoordinates().y() <= end.y(); - entity.getCoordinates().y() ++ ) - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - }*/ -#ifdef HAVE_OPENMP -#pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() ) -#endif - { - GridEntity entity( *gridPointer, begin, gridEntityParameters... ); -#ifdef HAVE_OPENMP -#pragma omp for -#endif - for( IndexType y = begin.y(); y <= end.y(); y ++ ) - for( IndexType x = begin.x(); x <= end.x(); x ++ ) - { - entity.getCoordinates().x() = x; - entity.getCoordinates().y() = y; - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - } - } -} - -/**** - * 2D traverser, CUDA - */ -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor, - bool processOnlyBoundaryEntities, - typename... GridEntityParameters > -__global__ void -GridTraverser2D( - const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, - UserData userData, - const typename GridEntity::CoordinatesType begin, - const typename GridEntity::CoordinatesType end, - const dim3 gridIdx, - const GridEntityParameters... gridEntityParameters ) -{ - typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); - - if( coordinates <= end ) - { - GridEntity entity( *grid, coordinates, gridEntityParameters... ); - entity.refresh(); - if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() ) - { - EntitiesProcessor::processEntity - ( *grid, - userData, - entity ); - } - } -} - -// Boundary traverser using streams -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor, - bool processOnlyBoundaryEntities, - typename... GridEntityParameters > -__global__ void -GridTraverser2DBoundaryAlongX( - const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, - UserData userData, - const Index beginX, - const Index endX, - const Index fixedY, - const dim3 gridIdx, - const GridEntityParameters... gridEntityParameters ) -{ - typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.y() = fixedY; - - if( coordinates.x() <= endX ) - { - GridEntity entity( *grid, coordinates, gridEntityParameters... ); - entity.refresh(); - EntitiesProcessor::processEntity - ( *grid, - userData, - entity ); - } -} - -// Boundary traverser using streams -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor, - bool processOnlyBoundaryEntities, - typename... GridEntityParameters > -__global__ void -GridTraverser2DBoundaryAlongY( - const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, - UserData userData, - const Index beginY, - const Index endY, - const Index fixedX, - const dim3 gridIdx, - const GridEntityParameters... gridEntityParameters ) -{ - typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - coordinates.x() = fixedX; - coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - - if( coordinates.y() <= endY ) - { - GridEntity entity( *grid, coordinates, gridEntityParameters... ); - entity.refresh(); - EntitiesProcessor::processEntity - ( *grid, - userData, - entity ); - } -} - - -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor, - bool processOnlyBoundaryEntities, - typename... GridEntityParameters > -__global__ void -GridTraverser2DBoundary( - const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid, - UserData userData, - const Index beginX, - const Index endX, - const Index beginY, - const Index endY, - const Index blocksPerFace, - const dim3 gridIdx, - const GridEntityParameters... gridEntityParameters ) -{ - using GridType = Meshes::Grid< 2, Real, Devices::Cuda, Index >; - using CoordinatesType = typename GridType::CoordinatesType; - - const Index faceIdx = blockIdx.x / blocksPerFace; - const Index faceBlockIdx = blockIdx.x % blocksPerFace; - const Index threadId = faceBlockIdx * blockDim. x + threadIdx.x; - if( faceIdx < 2 ) - { - const Index entitiesAlongX = endX - beginX + 1; - if( threadId < entitiesAlongX ) - { - GridEntity entity( *grid, - CoordinatesType( beginX + threadId, faceIdx == 0 ? beginY : endY ), - gridEntityParameters... ); - //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); - entity.refresh(); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - } - else - { - const Index entitiesAlongY = endY - beginY - 1; - if( threadId < entitiesAlongY ) - { - GridEntity entity( *grid, - CoordinatesType( faceIdx == 2 ? beginX : endX, beginY + threadId + 1 ), - gridEntityParameters... ); - //printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); - entity.refresh(); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - } - - - - /*const Index aux = max( entitiesAlongX, entitiesAlongY ); - const Index& warpSize = Devices::Cuda::getWarpSize(); - const Index threadsPerAxis = warpSize * ( aux / warpSize + ( aux % warpSize != 0 ) ); - - Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - GridEntity entity( *grid, - CoordinatesType( 0, 0 ), - gridEntityParameters... ); - CoordinatesType& coordinates = entity.getCoordinates(); - const Index axisIndex = threadId / threadsPerAxis; - //printf( "axisIndex %d, threadId %d thradsPerAxis %d \n", axisIndex, threadId, threadsPerAxis ); - threadId -= axisIndex * threadsPerAxis; - switch( axisIndex ) - { - case 1: - coordinates = CoordinatesType( beginX + threadId, beginY ); - if( threadId < entitiesAlongX ) - { - //printf( "X1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() ); - entity.refresh(); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - break; - case 2: - coordinates = CoordinatesType( beginX + threadId, endY ); - if( threadId < entitiesAlongX ) - { - //printf( "X2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() ); - entity.refresh(); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - break; - case 3: - coordinates = CoordinatesType( beginX, beginY + threadId + 1 ); - if( threadId < entitiesAlongY ) - { - //printf( "Y1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() ); - entity.refresh(); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - break; - case 4: - coordinates = CoordinatesType( endX, beginY + threadId + 1 ); - if( threadId < entitiesAlongY ) - { - //printf( "Y2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() ); - entity.refresh(); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - break; - }*/ - - /*if( threadId < entitiesAlongX ) - { - GridEntity entity( *grid, - CoordinatesType( beginX + threadId, beginY ), - gridEntityParameters... ); - //printf( "X1: Thread %d -> %d %d x %d %d \n ", threadId, - // entity.getCoordinates().x(), entity.getCoordinates().y(), - // grid->getDimensions().x(), grid->getDimensions().y() ); - entity.refresh(); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - else if( ( threadId -= entitiesAlongX ) < entitiesAlongX && threadId >= 0 ) - { - GridEntity entity( *grid, - CoordinatesType( beginX + threadId, endY ), - gridEntityParameters... ); - entity.refresh(); - //printf( "X2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - else if( ( ( threadId -= entitiesAlongX ) < entitiesAlongY - 1 ) && threadId >= 0 ) - { - GridEntity entity( *grid, - CoordinatesType( beginX, beginY + threadId + 1 ), - gridEntityParameters... ); - entity.refresh(); - //printf( "Y1: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); - EntitiesProcessor::processEntity( *grid, userData, entity ); - } - else if( ( ( threadId -= entitiesAlongY - 1 ) < entitiesAlongY - 1 ) && threadId >= 0 ) - { - GridEntity entity( *grid, - CoordinatesType( endX, beginY + threadId + 1 ), - gridEntityParameters... ); - entity.refresh(); - //printf( "Y2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() ); - EntitiesProcessor::processEntity( *grid, userData, entity ); - }*/ -} - - -#endif // HAVE_CUDA - -template< typename Real, - typename Index > - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities, - int XOrthogonalBoundary, - int YOrthogonalBoundary, - typename... GridEntityParameters > -void -GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType& begin, - const CoordinatesType& end, - UserData& userData, - const int& stream, - const GridEntityParameters&... gridEntityParameters ) -{ -#ifdef HAVE_CUDA - if( processOnlyBoundaryEntities && - ( GridEntity::getEntityDimension() == 2 || GridEntity::getEntityDimension() == 0 ) ) - { -#ifdef GRID_TRAVERSER_USE_STREAMS - dim3 cudaBlockSize( 256 ); - dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX, - cudaBlocksCountAlongY, cudaGridsCountAlongY; - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 ); - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 ); - - auto& pool = CudaStreamPool::getInstance(); - Devices::Cuda::synchronizeDevice(); - - const cudaStream_t& s1 = pool.getStream( stream ); - const cudaStream_t& s2 = pool.getStream( stream + 1 ); - dim3 gridIdx, cudaGridSize; - for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongX.x; gridIdx.x++ ) - { - Devices::Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize ); - //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX ); - GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaGridSize, cudaBlockSize, 0, s1 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.x(), - end.x(), - begin.y(), - gridIdx, - gridEntityParameters... ); - GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaGridSize, cudaBlockSize, 0, s2 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.x(), - end.x(), - end.y(), - gridIdx, - gridEntityParameters... ); - } - const cudaStream_t& s3 = pool.getStream( stream + 2 ); - const cudaStream_t& s4 = pool.getStream( stream + 3 ); - for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongY.x; gridIdx.x++ ) - { - Devices::Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize ); - GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaGridSize, cudaBlockSize, 0, s3 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.y() + 1, - end.y() - 1, - begin.x(), - gridIdx, - gridEntityParameters... ); - GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaGridSize, cudaBlockSize, 0, s4 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.y() + 1, - end.y() - 1, - end.x(), - gridIdx, - gridEntityParameters... ); - } - cudaStreamSynchronize( s1 ); - cudaStreamSynchronize( s2 ); - cudaStreamSynchronize( s3 ); - cudaStreamSynchronize( s4 ); -#else // not defined GRID_TRAVERSER_USE_STREAMS - dim3 cudaBlockSize( 256 ); - dim3 cudaBlocksCount, cudaGridsCount; - const IndexType entitiesAlongX = end.x() - begin.x() + 1; - const IndexType entitiesAlongY = end.x() - begin.x() - 1; - const IndexType maxFaceSize = max( entitiesAlongX, entitiesAlongY ); - const IndexType blocksPerFace = maxFaceSize / cudaBlockSize.x + ( maxFaceSize % cudaBlockSize.x != 0 ); - IndexType cudaThreadsCount = 4 * cudaBlockSize.x * blocksPerFace; - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount ); - //std::cerr << "blocksPerFace = " << blocksPerFace << "Threads count = " << cudaThreadsCount - // << "cudaBlockCount = " << cudaBlocksCount.x << std::endl; - dim3 gridIdx, cudaGridSize; - Devices::Cuda::synchronizeDevice(); - for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ ) - { - Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize ); - //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX ); - GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaGridSize, cudaBlockSize >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.x(), - end.x(), - begin.y(), - end.y(), - blocksPerFace, - gridIdx, - gridEntityParameters... ); - } -#endif //GRID_TRAVERSER_USE_STREAMS - //getchar(); - TNL_CHECK_CUDA_DEVICE; - } - else - { - dim3 cudaBlockSize( 16, 16 ); - dim3 cudaBlocksCount, cudaGridsCount; - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, - end.x() - begin.x() + 1, - end.y() - begin.y() + 1 ); - - auto& pool = CudaStreamPool::getInstance(); - const cudaStream_t& s = pool.getStream( stream ); - - Devices::Cuda::synchronizeDevice(); - dim3 gridIdx, cudaGridSize; - for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ ) - for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ ) - { - Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize ); - //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount ); - GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaGridSize, cudaBlockSize, 0, s >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin, - end, - gridIdx, - gridEntityParameters... ); - } - - // only launches into the stream 0 are synchronized - if( stream == 0 ) - { - cudaStreamSynchronize( s ); - TNL_CHECK_CUDA_DEVICE; - } - } -#else - throw Exceptions::CudaSupportMissing(); -#endif -} - - -/**** - * 2D traverser, MIC - */ -template< typename Real, - typename Index > - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities, - int XOrthogonalBoundary, - int YOrthogonalBoundary, - typename... GridEntityParameters > -void -GridTraverser< Meshes::Grid< 2, Real, Devices::MIC, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType& begin, - const CoordinatesType& end, - UserData& userData, - const int& stream, - const GridEntityParameters&... gridEntityParameters ) -{ - - -#ifdef HAVE_MIC - Devices::MIC::synchronizeDevice(); - - //TOHLE JE PRUSER -- nemim poslat vypustku -- - //GridEntity entity( gridPointer.template getData< Devices::MIC >(), begin, gridEntityParameters... ); - - - Devices::MICHider hMicGrid; - hMicGrid.pointer=& gridPointer.template getData< Devices::MIC >(); - Devices::MICHider hMicUserData; - hMicUserData.pointer=& userDataPointer.template modifyData(); - TNLMICSTRUCT(begin, const CoordinatesType); - TNLMICSTRUCT(end, const CoordinatesType); - - #pragma offload target(mic) in(sbegin,send,hMicUserData,hMicGrid) - { - - #pragma omp parallel firstprivate( sbegin, send ) - { - TNLMICSTRUCTUSE(begin, const CoordinatesType); - TNLMICSTRUCTUSE(end, const CoordinatesType); - GridEntity entity( *(hMicGrid.pointer), *(kernelbegin) ); - - if( processOnlyBoundaryEntities ) - { - if( YOrthogonalBoundary ) - #pragma omp for - for( auto k = kernelbegin->x(); - k <= kernelend->x(); - k ++ ) - { - entity.getCoordinates().x() = k; - entity.getCoordinates().y() = kernelbegin->y(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity ); - entity.getCoordinates().y() = kernelend->y(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity ); - } - if( XOrthogonalBoundary ) - #pragma omp for - for( auto k = kernelbegin->y(); - k <= kernelend->y(); - k ++ ) - { - entity.getCoordinates().y() = k; - entity.getCoordinates().x() = kernelbegin->x(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity ); - entity.getCoordinates().x() = kernelend->x(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity ); - } - } - else - { - #pragma omp for - for( IndexType y = kernelbegin->y(); y <= kernelend->y(); y ++ ) - for( IndexType x = kernelbegin->x(); x <= kernelend->x(); x ++ ) - { - // std::cerr << x << " " < - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities, - int XOrthogonalBoundary, - int YOrthogonalBoundary, - int ZOrthogonalBoundary, - typename... GridEntityParameters > -void -GridTraverser< Meshes::Grid< 3, Real, Devices::Host, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType begin, - const CoordinatesType end, - UserData& userData, - const int& stream, - const GridEntityParameters&... gridEntityParameters ) -{ - if( processOnlyBoundaryEntities ) - { - GridEntity entity( *gridPointer, begin, gridEntityParameters... ); - - if( ZOrthogonalBoundary ) - for( entity.getCoordinates().y() = begin.y(); - entity.getCoordinates().y() <= end.y(); - entity.getCoordinates().y() ++ ) - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.getCoordinates().z() = begin.z(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - entity.getCoordinates().z() = end.z(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - if( YOrthogonalBoundary ) - for( entity.getCoordinates().z() = begin.z(); - entity.getCoordinates().z() <= end.z(); - entity.getCoordinates().z() ++ ) - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.getCoordinates().y() = begin.y(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - entity.getCoordinates().y() = end.y(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - if( XOrthogonalBoundary ) - for( entity.getCoordinates().z() = begin.z(); - entity.getCoordinates().z() <= end.z(); - entity.getCoordinates().z() ++ ) - for( entity.getCoordinates().y() = begin.y(); - entity.getCoordinates().y() <= end.y(); - entity.getCoordinates().y() ++ ) - { - entity.getCoordinates().x() = begin.x(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - entity.getCoordinates().x() = end.x(); - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - } - else - { - // TODO: this does not work with gcc-5.4 and older, should work at gcc 6.x -/*#pragma omp parallel for firstprivate( entity, begin, end ) if( Devices::Host::isOMPEnabled() ) - for( entity.getCoordinates().z() = begin.z(); - entity.getCoordinates().z() <= end.z(); - entity.getCoordinates().z() ++ ) - for( entity.getCoordinates().y() = begin.y(); - entity.getCoordinates().y() <= end.y(); - entity.getCoordinates().y() ++ ) - for( entity.getCoordinates().x() = begin.x(); - entity.getCoordinates().x() <= end.x(); - entity.getCoordinates().x() ++ ) - { - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - }*/ -#ifdef HAVE_OPENMP -#pragma omp parallel firstprivate( begin, end ) if( Devices::Host::isOMPEnabled() ) -#endif - { - GridEntity entity( *gridPointer, begin, gridEntityParameters... ); -#ifdef HAVE_OPENMP -#pragma omp for -#endif - for( IndexType z = begin.z(); z <= end.z(); z ++ ) - for( IndexType y = begin.y(); y <= end.y(); y ++ ) - for( IndexType x = begin.x(); x <= end.x(); x ++ ) - { - entity.getCoordinates().x() = x; - entity.getCoordinates().y() = y; - entity.getCoordinates().z() = z; - entity.refresh(); - EntitiesProcessor::processEntity( entity.getMesh(), userData, entity ); - } - } - } -} - -/**** - * 3D traverser, CUDA - */ -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor, - bool processOnlyBoundaryEntities, - typename... GridEntityParameters > -__global__ void -GridTraverser3D( - const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, - UserData userData, - const typename GridEntity::CoordinatesType begin, - const typename GridEntity::CoordinatesType end, - const dim3 gridIdx, - const GridEntityParameters... gridEntityParameters ) -{ - typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); - coordinates.z() = begin.z() + Devices::Cuda::getGlobalThreadIdx_z( gridIdx ); - - if( coordinates <= end ) - { - GridEntity entity( *grid, coordinates, gridEntityParameters... ); - entity.refresh(); - if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() ) - { - EntitiesProcessor::processEntity - ( *grid, - userData, - entity ); - } - } -} - -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor, - bool processOnlyBoundaryEntities, - typename... GridEntityParameters > -__global__ void -GridTraverser3DBoundaryAlongXY( - const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, - UserData userData, - const Index beginX, - const Index endX, - const Index beginY, - const Index endY, - const Index fixedZ, - const dim3 gridIdx, - const GridEntityParameters... gridEntityParameters ) -{ - typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); - coordinates.z() = fixedZ; - - if( coordinates.x() <= endX && coordinates.y() <= endY ) - { - GridEntity entity( *grid, coordinates, gridEntityParameters... ); - entity.refresh(); - EntitiesProcessor::processEntity - ( *grid, - userData, - entity ); - } -} - -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor, - bool processOnlyBoundaryEntities, - typename... GridEntityParameters > -__global__ void -GridTraverser3DBoundaryAlongXZ( - const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, - UserData userData, - const Index beginX, - const Index endX, - const Index beginZ, - const Index endZ, - const Index fixedY, - const dim3 gridIdx, - const GridEntityParameters... gridEntityParameters ) -{ - typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.y() = fixedY; - coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); - - if( coordinates.x() <= endX && coordinates.z() <= endZ ) - { - GridEntity entity( *grid, coordinates, gridEntityParameters... ); - entity.refresh(); - EntitiesProcessor::processEntity - ( *grid, - userData, - entity ); - } -} - -template< typename Real, - typename Index, - typename GridEntity, - typename UserData, - typename EntitiesProcessor, - bool processOnlyBoundaryEntities, - typename... GridEntityParameters > -__global__ void -GridTraverser3DBoundaryAlongYZ( - const Meshes::Grid< 3, Real, Devices::Cuda, Index >* grid, - UserData userData, - const Index beginY, - const Index endY, - const Index beginZ, - const Index endZ, - const Index fixedX, - const dim3 gridIdx, - const GridEntityParameters... gridEntityParameters ) -{ - typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; - typename GridType::CoordinatesType coordinates; - - coordinates.x() = fixedX; - coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); - - if( coordinates.y() <= endY && coordinates.z() <= endZ ) - { - GridEntity entity( *grid, coordinates, gridEntityParameters... ); - entity.refresh(); - EntitiesProcessor::processEntity - ( *grid, - userData, - entity ); - } -} -#endif - -template< typename Real, - typename Index > - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities, - int XOrthogonalBoundary, - int YOrthogonalBoundary, - int ZOrthogonalBoundary, - typename... GridEntityParameters > -void -GridTraverser< Meshes::Grid< 3, Real, Devices::Cuda, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType& begin, - const CoordinatesType& end, - UserData& userData, - const int& stream, - const GridEntityParameters&... gridEntityParameters ) -{ -#ifdef HAVE_CUDA - if( processOnlyBoundaryEntities && - ( GridEntity::getEntityDimension() == 3 || GridEntity::getEntityDimension() == 0 ) ) - { - dim3 cudaBlockSize( 16, 16 ); - const IndexType entitiesAlongX = end.x() - begin.x() + 1; - const IndexType entitiesAlongY = end.y() - begin.y() + 1; - const IndexType entitiesAlongZ = end.z() - begin.z() + 1; - - dim3 cudaBlocksCountAlongXY, cudaBlocksCountAlongXZ, cudaBlocksCountAlongYZ, - cudaGridsCountAlongXY, cudaGridsCountAlongXZ, cudaGridsCountAlongYZ; - - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXY, cudaGridsCountAlongXY, entitiesAlongX, entitiesAlongY ); - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, entitiesAlongX, entitiesAlongZ - 2 ); - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, entitiesAlongY - 2, entitiesAlongZ - 2 ); - - auto& pool = CudaStreamPool::getInstance(); - Devices::Cuda::synchronizeDevice(); - - const cudaStream_t& s1 = pool.getStream( stream ); - const cudaStream_t& s2 = pool.getStream( stream + 1 ); - const cudaStream_t& s3 = pool.getStream( stream + 2 ); - const cudaStream_t& s4 = pool.getStream( stream + 3 ); - const cudaStream_t& s5 = pool.getStream( stream + 4 ); - const cudaStream_t& s6 = pool.getStream( stream + 5 ); - - dim3 gridIdx, gridSize; - for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXY.y; gridIdx.y++ ) - for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXY.x; gridIdx.x++ ) - { - Devices::Cuda::setupGrid( cudaBlocksCountAlongXY, cudaGridsCountAlongXY, gridIdx, gridSize ); - GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaBlocksCountAlongXY, cudaBlockSize, 0 , s1 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.x(), - end.x(), - begin.y(), - end.y(), - begin.z(), - gridIdx, - gridEntityParameters... ); - GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaBlocksCountAlongXY, cudaBlockSize, 0, s2 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.x(), - end.x(), - begin.y(), - end.y(), - end.z(), - gridIdx, - gridEntityParameters... ); - } - for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXZ.y; gridIdx.y++ ) - for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXZ.x; gridIdx.x++ ) - { - Devices::Cuda::setupGrid( cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, gridIdx, gridSize ); - GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s3 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.x(), - end.x(), - begin.z() + 1, - end.z() - 1, - begin.y(), - gridIdx, - gridEntityParameters... ); - GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s4 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.x(), - end.x(), - begin.z() + 1, - end.z() - 1, - end.y(), - gridIdx, - gridEntityParameters... ); - } - for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongYZ.y; gridIdx.y++ ) - for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongYZ.x; gridIdx.x++ ) - { - Devices::Cuda::setupGrid( cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, gridIdx, gridSize ); - GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s5 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.y() + 1, - end.y() - 1, - begin.z() + 1, - end.z() - 1, - begin.x(), - gridIdx, - gridEntityParameters... ); - GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s6 >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin.y() + 1, - end.y() - 1, - begin.z() + 1, - end.z() - 1, - end.x(), - gridIdx, - gridEntityParameters... ); - } - cudaStreamSynchronize( s1 ); - cudaStreamSynchronize( s2 ); - cudaStreamSynchronize( s3 ); - cudaStreamSynchronize( s4 ); - cudaStreamSynchronize( s5 ); - cudaStreamSynchronize( s6 ); - TNL_CHECK_CUDA_DEVICE; - } - else - { - dim3 cudaBlockSize( 8, 8, 8 ); - dim3 cudaBlocksCount, cudaGridsCount; - - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, - end.x() - begin.x() + 1, - end.y() - begin.y() + 1, - end.z() - begin.z() + 1 ); - - auto& pool = CudaStreamPool::getInstance(); - const cudaStream_t& s = pool.getStream( stream ); - - Devices::Cuda::synchronizeDevice(); - dim3 gridIdx, gridSize; - for( gridIdx.z = 0; gridIdx.z < cudaGridsCount.z; gridIdx.z ++ ) - for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ ) - for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ ) - { - Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, gridSize ); - GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< gridSize, cudaBlockSize, 0, s >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin, - end, - gridIdx, - gridEntityParameters... ); - } - - // only launches into the stream 0 are synchronized - if( stream == 0 ) - { - cudaStreamSynchronize( s ); - TNL_CHECK_CUDA_DEVICE; - } - } -#else - throw Exceptions::CudaSupportMissing(); -#endif -} - -/**** - * 3D traverser, MIC - */ -template< typename Real, - typename Index > - template< - typename GridEntity, - typename EntitiesProcessor, - typename UserData, - bool processOnlyBoundaryEntities, - int XOrthogonalBoundary, - int YOrthogonalBoundary, - int ZOrthogonalBoundary, - typename... GridEntityParameters > -void -GridTraverser< Meshes::Grid< 3, Real, Devices::MIC, Index > >:: -processEntities( - const GridPointer& gridPointer, - const CoordinatesType& begin, - const CoordinatesType& end, - UserData& userData, - const int& stream, - const GridEntityParameters&... gridEntityParameters ) -{ - std::cout << "Not Implemented yet Grid Traverser <3, Real, Device::MIC>" << std::endl; - -/* HAVE_CUDA - dim3 cudaBlockSize( 8, 8, 8 ); - dim3 cudaBlocks; - cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); - cudaBlocks.y = Devices::Cuda::getNumberOfBlocks( end.y() - begin.y() + 1, cudaBlockSize.y ); - cudaBlocks.z = Devices::Cuda::getNumberOfBlocks( end.z() - begin.z() + 1, cudaBlockSize.z ); - const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); - const IndexType cudaYGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.y ); - const IndexType cudaZGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.z ); - - auto& pool = CudaStreamPool::getInstance(); - const cudaStream_t& s = pool.getStream( stream ); - - Devices::Cuda::synchronizeDevice(); - for( IndexType gridZIdx = 0; gridZIdx < cudaZGrids; gridZIdx ++ ) - for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ ) - for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) - GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > - <<< cudaBlocks, cudaBlockSize, 0, s >>> - ( &gridPointer.template getData< Devices::Cuda >(), - userData, - begin, - end, - gridXIdx, - gridYIdx, - gridZIdx, - gridEntityParameters... ); - - // only launches into the stream 0 are synchronized - if( stream == 0 ) - { - cudaStreamSynchronize( s ); - TNL_CHECK_CUDA_DEVICE; - } - */ -} - -} // namespace Meshes -} // namespace TNL diff --git a/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h b/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h index c0ddcf2dae86cf1b4a8da74d7d8d18307f240b73..7413315385eed74ce918e5e82db6c3523713bc76 100644 --- a/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h +++ b/src/TNL/Meshes/GridDetails/Traverser_Grid1D_impl.h @@ -43,7 +43,8 @@ processBoundaryEntities( const GridPointer& gridPointer, gridPointer, CoordinatesType( 0 ), gridPointer->getDimensions() - CoordinatesType( 1 ), - userData ); + userData, + asynchronousMode ); } else //Distributed { @@ -54,7 +55,8 @@ processBoundaryEntities( const GridPointer& gridPointer, gridPointer, CoordinatesType( 0 ) + distributedGrid->getLowerOverlap(), CoordinatesType( 0 ) + distributedGrid->getLowerOverlap(), - userData ); + userData, + asynchronousMode ); } if( neighbors[ Meshes::DistributedMeshes::ZzYzXp ] == -1 ) @@ -63,7 +65,8 @@ processBoundaryEntities( const GridPointer& gridPointer, gridPointer, gridPointer->getDimensions() - CoordinatesType( 1 ) - distributedGrid->getUpperOverlap(), gridPointer->getDimensions() - CoordinatesType( 1 ) - distributedGrid->getUpperOverlap(), - userData ); + userData, + asynchronousMode ); } } @@ -92,7 +95,8 @@ processInteriorEntities( const GridPointer& gridPointer, gridPointer, CoordinatesType( 1 ), gridPointer->getDimensions() - CoordinatesType( 2 ), - userData ); + userData, + asynchronousMode ); } else //Distributed { @@ -117,7 +121,8 @@ processInteriorEntities( const GridPointer& gridPointer, gridPointer, begin, end, - userData ); + userData, + asynchronousMode ); } } @@ -146,7 +151,8 @@ processAllEntities( gridPointer, CoordinatesType( 0 ), gridPointer->getDimensions() - CoordinatesType( 1 ), - userData ); + userData, + asynchronousMode ); } else //Distributed { @@ -157,7 +163,8 @@ processAllEntities( gridPointer, begin, end, - userData ); + userData, + asynchronousMode ); } } @@ -185,7 +192,8 @@ processBoundaryEntities( const GridPointer& gridPointer, gridPointer, CoordinatesType( 0 ), gridPointer->getDimensions(), - userData ); + userData, + asynchronousMode ); } template< typename Real, @@ -208,7 +216,8 @@ processInteriorEntities( const GridPointer& gridPointer, gridPointer, CoordinatesType( 1 ), gridPointer->getDimensions() - CoordinatesType( 1 ), - userData ); + userData, + asynchronousMode ); } template< typename Real, @@ -232,7 +241,8 @@ processAllEntities( gridPointer, CoordinatesType( 0 ), gridPointer->getDimensions(), - userData ); + userData, + asynchronousMode ); } } // namespace Meshes diff --git a/src/TNL/Meshes/GridDetails/Traverser_Grid2D_impl.h b/src/TNL/Meshes/GridDetails/Traverser_Grid2D_impl.h index 41e1612565dd66a61eb5a505ff9ffdbdb67adf41..7809c9739ceca545a8e3bb9359793de2f69c270d 100644 --- a/src/TNL/Meshes/GridDetails/Traverser_Grid2D_impl.h +++ b/src/TNL/Meshes/GridDetails/Traverser_Grid2D_impl.h @@ -42,6 +42,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1 ), userData, + asynchronousMode, 0 ); } else //Distributed @@ -57,6 +58,7 @@ processBoundaryEntities( const GridPointer& gridPointer, begin, CoordinatesType( begin.x(), end.y() ), userData, + asynchronousMode, 0 ); } @@ -67,6 +69,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( end.x(), begin.y() ), end, userData, + asynchronousMode, 0 ); } @@ -77,6 +80,7 @@ processBoundaryEntities( const GridPointer& gridPointer, begin, CoordinatesType( end.x(), begin.y() ), userData, + asynchronousMode, 0 ); } @@ -87,6 +91,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( begin.x(), end.y() ), end, userData, + asynchronousMode, 0 ); } } @@ -116,6 +121,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 1, 1 ), gridPointer->getDimensions() - CoordinatesType( 2, 2 ), userData, + asynchronousMode, 0 ); } else // distributed @@ -141,6 +147,7 @@ processInteriorEntities( const GridPointer& gridPointer, begin, end, userData, + asynchronousMode, 0); } } @@ -169,6 +176,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1 ), userData, + asynchronousMode, 0 ); } else @@ -182,6 +190,7 @@ processAllEntities( const GridPointer& gridPointer, begin, end, userData, + asynchronousMode, 0); } } @@ -210,6 +219,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 0, 1 ), userData, + asynchronousMode, 1, CoordinatesType( 1, 0 ), CoordinatesType( 0, 1 ) ); @@ -219,6 +229,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 0 ), userData, + asynchronousMode, 0, CoordinatesType( 0, 1 ), CoordinatesType( 1, 0 ) ); @@ -245,6 +256,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 1, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1 ), userData, + asynchronousMode, 1, CoordinatesType( 1, 0 ), CoordinatesType( 0, 1 ) ); @@ -254,6 +266,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 0, 1 ), gridPointer->getDimensions() - CoordinatesType( 1, 1 ), userData, + asynchronousMode, 0, CoordinatesType( 0, 1 ), CoordinatesType( 1, 0 ) ); @@ -280,6 +293,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 0, 1 ), userData, + asynchronousMode, 1, CoordinatesType( 1, 0 ), CoordinatesType( 0, 1 ) ); @@ -289,6 +303,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 0 ), userData, + asynchronousMode, 0, CoordinatesType( 0, 1 ), CoordinatesType( 1, 0 ) ); @@ -315,6 +330,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0 ), gridPointer->getDimensions(), userData, + asynchronousMode, 0 ); } @@ -339,6 +355,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 1, 1 ), gridPointer->getDimensions() - CoordinatesType( 1, 1 ), userData, + asynchronousMode, 0 ); } @@ -363,6 +380,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0 ), gridPointer->getDimensions(), userData, + asynchronousMode, 0 ); } diff --git a/src/TNL/Meshes/GridDetails/Traverser_Grid3D_impl.h b/src/TNL/Meshes/GridDetails/Traverser_Grid3D_impl.h index e32c5a12ef0080ce8b7e0d57e3dacaf8a75425f7..ec242e36774f964cf249ec385efab1c558c36993 100644 --- a/src/TNL/Meshes/GridDetails/Traverser_Grid3D_impl.h +++ b/src/TNL/Meshes/GridDetails/Traverser_Grid3D_impl.h @@ -44,6 +44,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ), userData, + asynchronousMode, 0 ); } else // distributed @@ -59,6 +60,7 @@ processBoundaryEntities( const GridPointer& gridPointer, begin, CoordinatesType( begin.x(), end.y(), end.z() ), userData, + asynchronousMode, 0 ); } @@ -69,6 +71,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( end.x() , begin.y(), begin.z() ), end, userData, + asynchronousMode, 0 ); } @@ -79,6 +82,7 @@ processBoundaryEntities( const GridPointer& gridPointer, begin, CoordinatesType( end.x(), begin.y(), end.z() ), userData, + asynchronousMode, 0 ); } @@ -89,6 +93,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( begin.x(), end.y(), begin.z() ), end, userData, + asynchronousMode, 0 ); } @@ -99,6 +104,7 @@ processBoundaryEntities( const GridPointer& gridPointer, begin, CoordinatesType( end.x(), end.y(), begin.z() ), userData, + asynchronousMode, 0 ); } @@ -109,6 +115,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( begin.x(), begin.y(), end.z() ), end, userData, + asynchronousMode, 0 ); } } @@ -138,6 +145,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 1, 1, 1 ), gridPointer->getDimensions() - CoordinatesType( 2, 2, 2 ), userData, + asynchronousMode, 0 ); } else @@ -169,7 +177,8 @@ processInteriorEntities( const GridPointer& gridPointer, begin, end, userData, - 0); + asynchronousMode, + 0 ); } } @@ -197,6 +206,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ), userData, + asynchronousMode, 0 ); } else @@ -209,6 +219,7 @@ processAllEntities( const GridPointer& gridPointer, begin, end, userData, + asynchronousMode, 0 ); } } @@ -237,6 +248,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 0, 1, 1 ), userData, + asynchronousMode, 2, CoordinatesType( 1, 0, 0 ), CoordinatesType( 0, 1, 1 ) ); @@ -246,6 +258,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 0, 1 ), userData, + asynchronousMode, 1, CoordinatesType( 0, 1, 0 ), CoordinatesType( 1, 0, 1 ) ); @@ -255,6 +268,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 0 ), userData, + asynchronousMode, 0, CoordinatesType( 0, 0, 1 ), CoordinatesType( 1, 1, 0 ) ); @@ -281,6 +295,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 1, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ), userData, + asynchronousMode, 2, CoordinatesType( 1, 0, 0 ), CoordinatesType( 0, 1, 1 ) ); @@ -290,6 +305,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 0, 1, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ), userData, + asynchronousMode, 1, CoordinatesType( 0, 1, 0 ), CoordinatesType( 1, 0, 1 ) ); @@ -299,6 +315,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 1 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ), userData, + asynchronousMode, 0, CoordinatesType( 0, 0, 1 ), CoordinatesType( 1, 1, 0 ) ); @@ -324,6 +341,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 0, 1, 1 ), userData, + asynchronousMode, 2, CoordinatesType( 1, 0, 0 ), CoordinatesType( 0, 1, 1 ) ); @@ -333,6 +351,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 0, 1 ), userData, + asynchronousMode, 1, CoordinatesType( 0, 1, 0 ), CoordinatesType( 1, 0, 1 ) ); @@ -342,6 +361,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 0 ), userData, + asynchronousMode, 0, CoordinatesType( 0, 0, 1 ), CoordinatesType( 1, 1, 0 ) ); @@ -371,6 +391,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 0, 0 ), userData, + asynchronousMode, 2, CoordinatesType( 0, 1, 1 ), CoordinatesType( 1, 0, 0 ) ); @@ -380,6 +401,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 0, 1, 0 ), userData, + asynchronousMode, 1, CoordinatesType( 1, 0, 1 ), CoordinatesType( 0, 1, 0 ) ); @@ -389,6 +411,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 0, 0, 1 ), userData, + asynchronousMode, 0, CoordinatesType( 1, 1, 0 ), CoordinatesType( 0, 0, 1 ) ); @@ -415,6 +438,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 0, 1, 1 ), gridPointer->getDimensions() - CoordinatesType( 0, 1, 1 ), userData, + asynchronousMode, 2, CoordinatesType( 0, 1, 1 ), CoordinatesType( 1, 0, 0 ) ); @@ -424,6 +448,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 1, 0, 1 ), gridPointer->getDimensions() - CoordinatesType( 1, 0, 1 ), userData, + asynchronousMode, 1, CoordinatesType( 1, 0, 1 ), CoordinatesType( 0, 1, 0 ) ); @@ -433,6 +458,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 1, 1, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 0 ), userData, + asynchronousMode, 0, CoordinatesType( 1, 1, 0 ), CoordinatesType( 0, 0, 1 ) ); @@ -458,6 +484,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 1, 0, 0 ), userData, + asynchronousMode, 2, CoordinatesType( 0, 1, 1 ), CoordinatesType( 1, 0, 0 ) ); @@ -467,6 +494,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 0, 1, 0 ), userData, + asynchronousMode, 1, CoordinatesType( 1, 0, 1 ), CoordinatesType( 0, 1, 0 ) ); @@ -476,6 +504,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions() - CoordinatesType( 0, 0, 1 ), userData, + asynchronousMode, 0, CoordinatesType( 1, 1, 0 ), CoordinatesType( 0, 0, 1 ) ); @@ -505,6 +534,7 @@ processBoundaryEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions(), userData, + asynchronousMode, 0 ); } @@ -529,6 +559,7 @@ processInteriorEntities( const GridPointer& gridPointer, CoordinatesType( 1, 1, 1 ), gridPointer->getDimensions() - CoordinatesType( 1, 1, 1 ), userData, + asynchronousMode, 0 ); } @@ -553,6 +584,7 @@ processAllEntities( const GridPointer& gridPointer, CoordinatesType( 0, 0, 0 ), gridPointer->getDimensions(), userData, + asynchronousMode, 0 ); } diff --git a/src/TNL/ParallelFor.h b/src/TNL/ParallelFor.h index 9989954b56101398ac8cbf6aa8c9b1344a67e44a..40e2af8f3e1c32ae4ab31aaf2c1bb9571e3c639a 100644 --- a/src/TNL/ParallelFor.h +++ b/src/TNL/ParallelFor.h @@ -15,7 +15,7 @@ #include #include -/* +/**** * The implementation of ParallelFor is not meant to provide maximum performance * at every cost, but maximum flexibility for operating with data stored on the * device. @@ -28,7 +28,10 @@ namespace TNL { -template< typename Device = Devices::Host > +enum ParallelForMode { SynchronousMode, AsynchronousMode }; + +template< typename Device = Devices::Host, + ParallelForMode Mode = SynchronousMode > struct ParallelFor { template< typename Index, @@ -41,7 +44,7 @@ struct ParallelFor // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )' if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 ) { - #pragma omp parallel for +#pragma omp parallel for for( Index i = start; i < end; i++ ) f( i, args... ); } @@ -55,7 +58,8 @@ struct ParallelFor } }; -template< typename Device = Devices::Host > +template< typename Device = Devices::Host, + ParallelForMode Mode = SynchronousMode > struct ParallelFor2D { template< typename Index, @@ -68,7 +72,7 @@ struct ParallelFor2D // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )' if( TNL::Devices::Host::isOMPEnabled() ) { - #pragma omp parallel for +#pragma omp parallel for for( Index j = startY; j < endY; j++ ) for( Index i = startX; i < endX; i++ ) f( i, j, args... ); @@ -86,7 +90,8 @@ struct ParallelFor2D } }; -template< typename Device = Devices::Host > +template< typename Device = Devices::Host, + ParallelForMode Mode = SynchronousMode > struct ParallelFor3D { template< typename Index, @@ -97,15 +102,16 @@ struct ParallelFor3D #ifdef HAVE_OPENMP // Benchmarks show that this is significantly faster compared // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )' - if( TNL::Devices::Host::isOMPEnabled() ) - { - #pragma omp parallel for collapse(2) - for( Index k = startZ; k < endZ; k++ ) - for( Index j = startY; j < endY; j++ ) - for( Index i = startX; i < endX; i++ ) - f( i, j, k, args... ); + if( TNL::Devices::Host::isOMPEnabled() ) + { +#pragma omp parallel for collapse(2) + for( Index k = startZ; k < endZ; k++ ) + for( Index j = startY; j < endY; j++ ) + for( Index i = startX; i < endX; i++ ) + f( i, j, k, args... ); } - else { + else + { for( Index k = startZ; k < endZ; k++ ) for( Index j = startY; j < endY; j++ ) for( Index i = startX; i < endX; i++ ) @@ -185,8 +191,8 @@ ParallelFor3DKernel( Index startX, Index startY, Index startZ, Index endX, Index } #endif -template<> -struct ParallelFor< Devices::Cuda > +template< ParallelForMode Mode > +struct ParallelFor< Devices::Cuda, Mode > { template< typename Index, typename Function, @@ -208,8 +214,11 @@ struct ParallelFor< Devices::Cuda > ParallelForKernel< true ><<< gridSize, blockSize >>>( start, end, f, args... ); } - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; + if( Mode == SynchronousMode ) + { + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + } } #else throw Exceptions::CudaSupportMissing(); @@ -217,8 +226,8 @@ struct ParallelFor< Devices::Cuda > } }; -template<> -struct ParallelFor2D< Devices::Cuda > +template< ParallelForMode Mode > +struct ParallelFor2D< Devices::Cuda, Mode > { template< typename Index, typename Function, @@ -264,8 +273,11 @@ struct ParallelFor2D< Devices::Cuda > ParallelFor2DKernel< true, true ><<< gridSize, blockSize >>> ( startX, startY, endX, endY, f, args... ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; + if( Mode == SynchronousMode ) + { + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + } } #else throw Exceptions::CudaSupportMissing(); @@ -273,8 +285,8 @@ struct ParallelFor2D< Devices::Cuda > } }; -template<> -struct ParallelFor3D< Devices::Cuda > +template< ParallelForMode Mode > +struct ParallelFor3D< Devices::Cuda, Mode > { template< typename Index, typename Function, @@ -359,8 +371,11 @@ struct ParallelFor3D< Devices::Cuda > ParallelFor3DKernel< true, true, true ><<< gridSize, blockSize >>> ( startX, startY, startZ, endX, endY, endZ, f, args... ); - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; + if( Mode == SynchronousMode ) + { + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + } } #else throw Exceptions::CudaSupportMissing(); diff --git a/src/UnitTests/Meshes/DistributedMeshes/CMakeLists.txt b/src/UnitTests/Meshes/DistributedMeshes/CMakeLists.txt index 068c0485a0e417a5108864e78119794c68a19b4d..644e4b08b60b3fc8c922c53be655a9909c382487 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/CMakeLists.txt +++ b/src/UnitTests/Meshes/DistributedMeshes/CMakeLists.txt @@ -67,19 +67,19 @@ SET (mpi_test_parameters_3d -np 27 -H localhost:27 "${EXECUTABLE_OUTPUT_PATH}/Di ADD_TEST( NAME DistributedGridTest_3D COMMAND "mpirun" ${mpi_test_parameters_3d}) SET (mpi_test_parameters_IO -np 4 -H localhost:4 "${EXECUTABLE_OUTPUT_PATH}/DistributedGridIOTest${CMAKE_EXECUTABLE_SUFFIX}") -ADD_TEST( NAME DistributedGridIOTest COMMAND "mpirun" ${mpi_test_parameters_IO}) +#ADD_TEST( NAME DistributedGridIOTest COMMAND "mpirun" ${mpi_test_parameters_IO}) SET (mpi_test_parameters_IOMPIIO -np 4 -H localhost:4 "${EXECUTABLE_OUTPUT_PATH}/DistributedGridIO_MPIIOTest${CMAKE_EXECUTABLE_SUFFIX}") -ADD_TEST( NAME DistributedGridIO_MPIIOTest COMMAND "mpirun" ${mpi_test_parameters_IOMPIIO}) +#ADD_TEST( NAME DistributedGridIO_MPIIOTest COMMAND "mpirun" ${mpi_test_parameters_IOMPIIO}) SET (mpi_test_parameters_CutDistributedGridTest -np 12 -H localhost:12 "${EXECUTABLE_OUTPUT_PATH}/CutDistributedGridTest${CMAKE_EXECUTABLE_SUFFIX}") -ADD_TEST( NAME CutDistributedGridTest COMMAND "mpirun" ${mpi_test_parameters_CutDistributedGridTest}) +#ADD_TEST( NAME CutDistributedGridTest COMMAND "mpirun" ${mpi_test_parameters_CutDistributedGridTest}) SET (mpi_test_parameters_CutDistributedMeshFunctionTest -np 12 -H localhost:12 "${EXECUTABLE_OUTPUT_PATH}/CutDistributedMeshFunctionTest${CMAKE_EXECUTABLE_SUFFIX}") -ADD_TEST( NAME CutDistributedMeshFunctionTest COMMAND "mpirun" ${mpi_test_parameters_CutDistributedMeshFunctionTest}) +#ADD_TEST( NAME CutDistributedMeshFunctionTest COMMAND "mpirun" ${mpi_test_parameters_CutDistributedMeshFunctionTest}) SET (mpi_test_parameters_DistributedVectorFieldIO_MPIIOTest -np 4 -H localhost:4 "${EXECUTABLE_OUTPUT_PATH}/DistributedVectorFieldIO_MPIIOTest ${CMAKE_EXECUTABLE_SUFFIX}") -ADD_TEST( NAME DistributedVectorFieldIO_MPIIOTest COMMAND "mpirun" ${mpi_test_parameters_IOMPIIO}) +#ADD_TEST( NAME DistributedVectorFieldIO_MPIIOTest COMMAND "mpirun" ${mpi_test_parameters_IOMPIIO}) endif()