diff --git a/CMakeLists.txt b/CMakeLists.txt index 05a0fd0b6849f69b166dfcb400ff705df68ee61f..b85842c1f14a9eca598427b6b7240d2f1cbba417 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -210,7 +210,7 @@ if( ${WITH_CUDA} ) set( CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER} ) endif() endif() - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -DHAVE_CUDA --expt-relaxed-constexpr --expt-extended-lambda) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -DHAVE_CUDA --expt-relaxed-constexpr --expt-extended-lambda --default-stream per-thread) # disable false compiler warnings # reference for the -Xcudafe --diag_suppress and --display_error_number flags: https://stackoverflow.com/a/54142937 # incomplete list of tokens: http://www.ssl.berkeley.edu/~jimm/grizzly_docs/SSL/opt/intel/cc/9.0/lib/locale/en_US/mcpcom.msg diff --git a/Documentation/Pages/main-page.md b/Documentation/Pages/main-page.md index db9aceccbbf7e887db7b76c8145204b8380f132a..5693f92a0df917185a705df474389bb554f08aa4 100644 --- a/Documentation/Pages/main-page.md +++ b/Documentation/Pages/main-page.md @@ -109,9 +109,9 @@ computing platform, and (optionally) some libraries. - [CUDA](https://docs.nvidia.com/cuda/index.html) 9.0 or later -- for computations on Nvidia GPUs. - [MPI](https://en.wikipedia.org/wiki/Message_Passing_Interface) -- TNL can - use an MPI library such as [OpenMPI](https://www.open-mpi.org/) for - distributed computing. For distributed CUDA computations, the library must - be [CUDA-aware]( + use a library implementing the MPI-3 standard for distributed computing (e.g. + [OpenMPI](https://www.open-mpi.org/)). For distributed CUDA computations, + the library must be [CUDA-aware]( https://developer.nvidia.com/blog/introduction-cuda-aware-mpi/).
- __Libraries:__ diff --git a/Documentation/Tutorials/index.md b/Documentation/Tutorials/index.md index 56a51cc2234f964866bdb7ae3d2f07e03851ebea..55b92ad81a27a198dcc6cb71d534b84664ed3d78 100644 --- a/Documentation/Tutorials/index.md +++ b/Documentation/Tutorials/index.md @@ -2,11 +2,10 @@ ## Tutorials -1. [Building applications with TNL](tutorial_building_applications_with_tnl.html) -2. [General concepts](tutorial_GeneralConcepts.html) -3. [Arrays](tutorial_Arrays.html) -4. [Vectors](tutorial_Vectors.html) -5. [Flexible parallel reduction and scan](tutorial_ReductionAndScan.html) -6. [For loops](tutorial_ForLoops.html) -7. [Cross-device pointers](tutorial_Pointers.html) -8. [Matrices](tutorial_Matrices.html) +1. [General concepts](tutorial_GeneralConcepts.html) +2. [Arrays](tutorial_Arrays.html) +3. [Vectors](tutorial_Vectors.html) +4. [Flexible parallel reduction and scan](tutorial_ReductionAndScan.html) +5. [For loops](tutorial_ForLoops.html) +6. [Cross-device pointers](tutorial_Pointers.html) +7. 
[Matrices](tutorial_Matrices.html) diff --git a/src/3rdparty/CMakeLists.txt b/src/3rdparty/CMakeLists.txt index 6dba288f0332c40c2e6eaf068e46af089519492e..01550de19eb337db7856628463ef7233763c4729 100644 --- a/src/3rdparty/CMakeLists.txt +++ b/src/3rdparty/CMakeLists.txt @@ -1,3 +1,9 @@ install( DIRECTORY mpark Leksys TYPE INCLUDE MESSAGE_NEVER FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp" ) + +if( ${WITH_PYTHON} ) + install( DIRECTORY cctbx TYPE INCLUDE + MESSAGE_NEVER + FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp" ) +endif() diff --git a/src/3rdparty/async/README.md b/src/3rdparty/async/README.md new file mode 100644 index 0000000000000000000000000000000000000000..36106864ab9f712faf762188a58104947ac5e4f5 --- /dev/null +++ b/src/3rdparty/async/README.md @@ -0,0 +1,532 @@ +# async +Homepage: https://github.com/d36u9/async + +[[License(Boost Software License - Version 1.0)](http://www.boost.org/LICENSE_1_0.txt)] + +## Welcome +async is a tiny C++ header-only high-performance library for async calls handled by a thread-pool, which is built on top of an unbounded MPMC lock-free queue. +It's written in pure C++14 (C++11 support with preprocessor macros), no dependencies on other 3rd party libraries. + +Note: This library was originally designed for 64-bit systems. It has been tested on arch X86-64 and ARMV8(64bit), and ARMV7(32bit). + +## change logs +* Jun. 2018: + * Added support for ARMV7 & V8 + * Tested on Raspberry Pi 3 B+ with Gentoo ARMV8 64bit (Linux Pi64 4.14.44-V8 AArch64) + * Tested on Raspberry Pi 3 B+ with Raspbian ARMV7 32bit (Linux 4.14.34-v7 armv7l) + * Added Benchmark Results for Raspberry Pi 3 B+ ARMV8 (Linux Pi64 4.14.44-V8 AArch64) + * Added Benchmark Results for Raspberry Pi 3 B+ ARMV7 32bit (Linux 4.14.34-v7 armv7l) +* Sept. 2017: + * Significantly improved the performance of async::queue without bulk operations. + * async::threadpool also benefits from this change.
+ * A bounded MPMC queue `async::bounded_queue` was added to the lib, which is pretty useful for memory-constrained systems or some fixed-size message pipeline design. The overall performance of this buffer based `async::bounded_queue` is comparable to bulk operations of node-based `async::queue`. `async::bounded_queue` shares an almost identical interface with `async::queue`, except for bulk operations, and a size parameter has to be passed to `bounded_queue`'s constructor, and also added blocking methods (`blocking_enqueue` & `blocking_dequeue`). `TRAIT::NOEXCEPT_CHECK` setting is also similar to `async::queue` to help handle exceptions that may be thrown in element's ctor. `bounded_queue` is basically a C++ implementation of [PTLQueue](https://blogs.oracle.com/dave/ptlqueue-:-a-scalable-bounded-capacity-mpmc-queue) design (Please read Dave Dice's article for details and references). + +## Features +* interchangeable with std::async, accepts all kinds of callable instances, like static functions, member functions, functors, lambdas +* dynamically changeable thread-pool size at run-time +* tasks are managed in a lock-free queue +* provided lock-free queue doesn't have restricted limitation as boost::lockfree::queue +* low-latency for the task execution thanks to underlying lock-free queue + +## Tested Platforms& Compilers +(old versions of OSs or compilers may work, but not tested) +* Windows 10 Visual Studio 2015+ +* Linux Ubuntu 16.04 gcc4.9.2+/clang 3.8+ +* MacOS Sierra 10.12.5 clang-802.0.42 + +## Getting Started +## Building the test& benchmark + +### C++11 compilers +If your compiler only supports C++11, please edit CMakeLists.txt with the following change: +``` +set(CMAKE_CXX_STANDARD 14) +#change to +set(CMAKE_CXX_STANDARD 11) +``` + +### Build& test with Microsoft C++ REST SDK +If your OS is Windows or has cpprestsdk installed& configured on Linux or Mac, please edit CMakeLists.txt to enable PPL test: +``` +option(WITH_CPPRESTSDK "Build Cpprestsdk Test" 
OFF) +#to +option(WITH_CPPRESTSDK "Build Cpprestsdk Test" ON) +``` + + +### Build for Linux or Mac (x86-64 & ARMV7&V8) +``` +#to use clang (linux) with following export command +#EXPORT CC=clang-3.8 +#EXPORT CXX=clang++-3.8 +#run the following to set up release build, (for MasOS Xcode, you can remove -DCMAKE_BUILD_TYPE for now, and choose build type at build-time) +cmake -H. -Bbuild -DCMAKE_BUILD_TYPE=RELEASE +#now build the release +cmake --build build --config Release +#or debug +cmake --build build --config Debug +#or other builds +cmake --build build --config RelWithDebInfo +cmake --build build --config MinSizeRel +``` + +### Build for Windows (X86-64) +``` +#for VS 2015 +cmake -H. -Bbuild -G "Visual Studio 14 2015 Win64" +#or VS 2017 +cmake -H. -Bbuild -G "Visual Studio 15 2017 Win64" +#build the release from command line or you can open the project file in Visual Studio, and build from there +cmake --build build --config Release +``` + +## How to use it in your project/application +simply copy all headers in async sub-folder to your project, and include those headers in your source code. 
+ +## Thread Pool Introduction +### Thread Pool initializations + +``` +async::threadpool tp; //by default, thread pool size will be the same number of your hardware CPU core/threads +async::threadpool tp(8); //create a thread pool with 8 threads +async::threadpool tp(0); //create a thread pool with no threads available, it's in pause mode +``` + +### resize the thread pool +``` +async::threadpool tp(32); +...//some operations +tp.configurepool(16);// can be called at anytime (as long as tp is still valid) to reset the pool size + // no interruption for running tasks +``` +### submit the task +*static functions, member functions, functors, lambdas are all supported +``` +int foo(int i) { return ++i; } +auto pkg = tp.post(foo, i); //returns a std::future +pkg.get(); //will block +``` + +## multi-producer multi-consumer unbounded lock-free queue Introduction +The design: A simple and classic implementation. It's link-based 3-level depth nested container with local array for each level storage and simulated tagged pointer for linking. +The size of each level, and tag bits can be configured through TRAITS (please see source for details). +The queue with default traits settings can store up to 1 Trillion elements/nodes (at least 1 Terabyte memory space). + +### element type requirements +* nothrow destructible +* optional (better to be true) + * nothrow constructible + * nothrow move-assignable + +NOTE: the exception thrown by constructor is acceptable. Although it'd be better to keep ctor noexcept if possible. +noexcept detection is turned off by default, it can be turned on by setting `TRAIT::NOEXCEPT_CHECK` to true. +With `TRAIT::NOEXCEPT_CHECK` on(true), queue will enable exception handling if ctor or move assignment may throw exceptions. 
+ + +### queue initializations +``` +async::queue<T> q; //default constructor, it's unbounded + +async::queue<T> q(1000); // pre-allocated 1000 storage nodes, the capacity will increase automatically after 1000 nodes are used +``` +### usage +``` +// enqueues a T constructed from args, supports the following constructions: +// move, if args is a T rvalue +// copy, if args is a T lvalue, or +// emplacement if args is an initializer list that can be passed to a T constructor +async::queue<T>::enqueue(Args... args) + +async::queue<T>::dequeue(T& data) //type T should have move assignment operator, +//e.g. +async::queue<int> q; +q.enqueue(11); +int i(0); +q.dequeue(i); + +``` +### bulk operations +It's convenient for bulk data, and also can boost the throughput. +exception handling is not available in bulk operations even with `TRAIT::NOEXCEPT_CHECK` being true. +bulk operations are suitable for plain data types, like network/event messages. + +``` +int a[] = {1,2,3,4,5}; +int b[5]; +q.bulk_enqueue(std::begin(a), 5); +auto popcount = q.bulk_dequeue(std::begin(b), 5); //popcount is the number of elements successfully pulled from the queue. +//or like the following code: +std::vector<int> v; +auto it = std::inserter(v, std::begin(v)); +popcount = q.bulk_dequeue(it, 5); +``` + +## Unit Test +The unit test code provides most samples for usage. + +## Benchmark +NOTE: the results may vary on different OS platforms and hardware. +### thread pool benchmark +The benchmark is a simple demonstration. +NOTE: may require extra config, please see CMakeLists.txt for detailed settings +The test benchmarks the following task/job based async implementation: +* async::threadpool (this library) +* std::async +* boost::async +* AsioThreadPool (another implementation of mine based on boost::asio, has very stable and good performance, especially on Windows with iocp) +* Microsoft::PPL (pplx from [cpprestsdk](https://github.com/Microsoft/cpprestsdk) on Linux& MacOS or PPL on windows) + + +e.g. 
Windows 10 64bit Intel i7-6700K 16GB RAM 480GB SSD Visual Studio 2017 (cl 19.11.25507.1 x64) +``` +Benchmark Test Run: 1 Producers 7(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 1130 ns max: 1227 ns min: 1066 ns avg_task_post: 1032 ns + *std::async (time/task) avg: 1469 ns max: 1549 ns min: 1423 ns avg_task_post: 1250 ns + *Microsoft::PPL (time/task) avg: 1148 ns max: 1216 ns min: 1114 ns avg_task_post: 1088 ns + AsioThreadPool (time/task) avg: 1166 ns max: 1319 ns min: 1013 ns avg_task_post: 1073 ns + *boost::async (time/task) avg: 29153 ns max: 30028 ns min: 27990 ns avg_task_post: 23343 ns +... +Benchmark Test Run: 4 Producers 4(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 439 ns max: 557 ns min: 398 ns avg_task_post: 356 ns + *std::async (time/task) avg: 800 ns max: 890 ns min: 759 ns avg_task_post: 629 ns + *Microsoft::PPL (time/task) avg: 666 ns max: 701 ns min: 640 ns avg_task_post: 605 ns + AsioThreadPool (time/task) avg: 448 ns max: 541 ns min: 389 ns avg_task_post: 365 ns + *boost::async (time/task) avg: 32419 ns max: 33296 ns min: 30105 ns avg_task_post: 25561 ns +... +Benchmark Test Run: 7 Producers 1(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 262 ns max: 300 ns min: 252 ns avg_task_post: 176 ns + *std::async (time/task) avg: 873 ns max: 961 ns min: 821 ns avg_task_post: 701 ns + *Microsoft::PPL (time/task) avg: 727 ns max: 755 ns min: 637 ns avg_task_post: 662 ns + AsioThreadPool (time/task) avg: 607 ns max: 645 ns min: 567 ns avg_task_post: 210 ns + *boost::async (time/task) avg: 33158 ns max: 150331 ns min: 28560 ns avg_task_post: 28655 ns +``` + +e.g. 
Ubuntu 17.04 Intel i7-6700K 16GB RAM 100GB HDD gcc 6.3.0 +``` +Benchmark Test Run: 1 Producers 7(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 1320 ns max: 1357 ns min: 1301 ns avg_task_post: 1266 ns + *std::async (time/task) avg: 11817 ns max: 12469 ns min: 11533 ns avg_task_post: 9580 ns + *Microsoft::PPL (time/task) avg: 1368 ns max: 1498 ns min: 1325 ns avg_task_post: 1349 ns + AsioThreadPool (time/task) avg: 1475 ns max: 1499 ns min: 1318 ns avg_task_post: 1332 ns + *boost::async (time/task) avg: 4574 ns max: 4697 ns min: 4450 ns avg_task_post: 4531 ns +... +Benchmark Test Run: 4 Producers 4(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 516 ns max: 688 ns min: 239 ns avg_task_post: 522 ns + *std::async (time/task) avg: 41630 ns max: 44316 ns min: 41334 ns avg_task_post: 38151 ns + *Microsoft::PPL (time/task) avg: 3652 ns max: 3710 ns min: 3598 ns avg_task_post: 3629 ns + AsioThreadPool (time/task) avg: 529 ns max: 814 ns min: 494 ns avg_task_post: 447 ns + *boost::async (time/task) avg: 14634 ns max: 14669 ns min: 14598 ns avg_task_post: 14583 ns +... +Benchmark Test Run: 7 Producers 1(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 398 ns max: 468 ns min: 337 ns avg_task_post: 177 ns + *std::async (time/task) avg: 44603 ns max: 46904 ns min: 44272 ns avg_task_post: 40877 ns + *Microsoft::PPL (time/task) avg: 3714 ns max: 3816 ns min: 3656 ns avg_task_post: 3690 ns + AsioThreadPool (time/task) avg: 564 ns max: 605 ns min: 533 ns avg_task_post: 253 ns + *boost::async (time/task) avg: 20421 ns max: 21738 ns min: 19105 ns avg_task_post: 20375 ns +``` + +e.g. 
MacOS 10.12.5 clang Intel i7-6700K 16GB RAM 250GB SSD clang-802.0.42 (Microsoft::PPL(cpprestsdk::pplx) is superisingly good compared with other libraries on MacOS, not sure if it's due to some comipiler optimization) +``` +Benchmark Test Run: 1 Producers 7(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 8517 ns max: 8641 ns min: 7400 ns avg_task_post: 8393 ns + *std::async (time/task) avg: 13618 ns max: 13845 ns min: 13276 ns avg_task_post: 13476 ns + *Microsoft::PPL (time/task) avg: 747 ns max: 938 ns min: 626 ns avg_task_post: 718 ns + AsioThreadPool (time/task) avg: 8647 ns max: 8807 ns min: 8558 ns avg_task_post: 8524 ns + *boost::async (time/task) avg: 11732 ns max: 12028 ns min: 11526 ns avg_task_post: 11698 ns +... +Benchmark Test Run: 4 Producers 4(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 5964 ns max: 6017 ns min: 5790 ns avg_task_post: 5830 ns + *std::async (time/task) avg: 9690 ns max: 10043 ns min: 9132 ns avg_task_post: 9531 ns + *Microsoft::PPL (time/task) avg: 380 ns max: 425 ns min: 342 ns avg_task_post: 353 ns + AsioThreadPool (time/task) avg: 6173 ns max: 6459 ns min: 6116 ns avg_task_post: 6042 ns + *boost::async (time/task) avg: 8643 ns max: 9470 ns min: 8513 ns avg_task_post: 8591 ns +... +Benchmark Test Run: 7 Producers 1(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 3469 ns max: 3527 ns min: 3415 ns avg_task_post: 3339 ns + *std::async (time/task) avg: 10902 ns max: 11164 ns min: 10709 ns avg_task_post: 10738 ns + *Microsoft::PPL (time/task) avg: 367 ns max: 426 ns min: 326 ns avg_task_post: 323 ns + AsioThreadPool (time/task) avg: 3920 ns max: 3975 ns min: 3832 ns avg_task_post: 3409 ns + *boost::async (time/task) avg: 9800 ns max: 10223 ns min: 9196 ns avg_task_post: 9744 ns +``` + +e.g. 
Windows 7 64bit Intel i7-4790 16GB RAM Visual Studio 2015 Update 3 +``` +Benchmark Test Run: 1 Producers 7(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 809 ns max: 924 ns min: 687 ns avg_task_post: 774 ns + *std::async (time/task) avg: 1914 ns max: 2032 ns min: 1790 ns avg_task_post: 1877 ns + *Microsoft::PPL (time/task) avg: 1718 ns max: 2181 ns min: 1623 ns avg_task_post: 1677 ns + AsioThreadPool (time/task) avg: 1100 ns max: 1137 ns min: 1076 ns avg_task_post: 1065 ns + *boost::async (time/task) avg: 191532 ns max: 203716 ns min: 186114 ns avg_task_post: 191507 ns +... +Benchmark Test Run: 4 Producers 4(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 423 ns max: 538 ns min: 338 ns avg_task_post: 388 ns + *std::async (time/task) avg: 1249 ns max: 1279 ns min: 1233 ns avg_task_post: 1211 ns + *Microsoft::PPL (time/task) avg: 1229 ns max: 1246 ns min: 1208 ns avg_task_post: 1186 ns + AsioThreadPool (time/task) avg: 563 ns max: 577 ns min: 499 ns avg_task_post: 528 ns + *boost::async (time/task) avg: 95484 ns max: 112569 ns min: 93808 ns avg_task_post: 95458 ns +... +Benchmark Test Run: 7 Producers 1(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 267 ns max: 323 ns min: 255 ns avg_task_post: 232 ns + *std::async (time/task) avg: 1202 ns max: 1257 ns min: 1182 ns avg_task_post: 1009 ns + *Microsoft::PPL (time/task) avg: 1199 ns max: 1262 ns min: 1175 ns avg_task_post: 988 ns + AsioThreadPool (time/task) avg: 783 ns max: 960 ns min: 706 ns avg_task_post: 375 ns + *boost::async (time/task) avg: 103572 ns max: 107041 ns min: 101993 ns avg_task_post: 103542 ns +``` + +e.g. 
Gentoo ARMV8 64bit (Linux Pi64 4.14.44-V8 AArch64) gcc 7.3.0 on Raspberry Pi 3 B+ +``` +Benchmark Test Run: 1 Producers 3(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 7809 ns max: 10467 ns min: 7453 ns avg_task_post: 7261 ns + *std::async (time/task) avg: 139664 ns max: 3453077 ns min: 104589 ns avg_task_post: 117819 ns + AsioThreadPool (time/task) avg: 6545 ns max: 8804 ns min: 5678 ns avg_task_post: 5654 ns + *boost::async (time/task) avg: 37629 ns max: 38978 ns min: 36769 ns avg_task_post: 36933 ns + +Benchmark Test Run: 2 Producers 2(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 2207 ns max: 4084 ns min: 1809 ns avg_task_post: 1325 ns + *std::async (time/task) avg: 431781 ns max: 17500817 ns min: 91919 ns avg_task_post: 407595 ns + AsioThreadPool (time/task) avg: 2251 ns max: 3351 ns min: 1839 ns avg_task_post: 1405 ns + *boost::async (time/task) avg: 48456 ns max: 50578 ns min: 46698 ns avg_task_post: 47753 ns + +Benchmark Test Run: 3 Producers 1(* not applied) Consumers with 21000 tasks and run 100 batches + async::threapool (time/task) avg: 3346 ns max: 3974 ns min: 2635 ns avg_task_post: 1017 ns + *std::async (time/task) avg: 110853 ns max: 768224 ns min: 103045 ns avg_task_post: 86361 ns + AsioThreadPool (time/task) avg: 3828 ns max: 4209 ns min: 3354 ns avg_task_post: 976 ns + *boost::async (time/task) avg: 59094 ns max: 67042 ns min: 54802 ns avg_task_post: 58365 ns +``` + +### queue benchmark +The benchmark uses producers-consumers model, and doesn't provide all the detailed measurements. +* async::bounded_queue +* async::queue +* boost::lockfree::queue +* boost::lockfree::spsc_queue (only for single-producer-single-consumer test) + +e.g. 
Windows 10 64bit Intel i7-6700K 16GB RAM 480GB SSD Visual Studio 2017 (cl 19.11.25507.1 x64) +``` +Single Producer Single Consumer Benchmark with 10000 Ops and run 1000 batches +Benchmark Test Run: 1 Producers 1 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 18 ns max: 55 ns min: 17 ns +async::queue::bulk(16) (time/op) avg: 26 ns max: 50 ns min: 23 ns + async::queue (time/op) avg: 28 ns max: 66 ns min: 27 ns +boost::lockfree::queue (time/op) avg: 167 ns max: 195 ns min: 70 ns +boost::lockfree::spsc_queue (time/op) avg: 10 ns max: 38 ns min: 8 ns + +Benchmark Test Run: 1 Producers 7 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 27 ns max: 62 ns min: 25 ns +async::queue::bulk(16) (time/op) avg: 28 ns max: 124 ns min: 24 ns + async::queue (time/op) avg: 42 ns max: 115 ns min: 29 ns +boost::lockfree::queue (time/op) avg: 240 ns max: 576 ns min: 119 ns + +Benchmark Test Run: 2 Producers 6 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 44 ns max: 78 ns min: 29 ns +async::queue::bulk(16) (time/op) avg: 34 ns max: 109 ns min: 28 ns + async::queue (time/op) avg: 90 ns max: 122 ns min: 44 ns +boost::lockfree::queue (time/op) avg: 213 ns max: 227 ns min: 161 ns + +Benchmark Test Run: 3 Producers 5 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 53 ns max: 82 ns min: 27 ns +async::queue::bulk(16) (time/op) avg: 34 ns max: 107 ns min: 29 ns + async::queue (time/op) avg: 100 ns max: 114 ns min: 51 ns +boost::lockfree::queue (time/op) avg: 197 ns max: 207 ns min: 186 ns + +Benchmark Test Run: 4 Producers 4 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 31 ns max: 81 ns min: 25 ns +async::queue::bulk(16) (time/op) avg: 31 ns max: 104 ns min: 28 ns + async::queue (time/op) avg: 93 ns max: 117 ns min: 73 ns +boost::lockfree::queue (time/op) avg: 211 ns max: 222 ns min: 162 ns + +Benchmark Test Run: 5 
Producers 3 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 52 ns max: 79 ns min: 30 ns +async::queue::bulk(16) (time/op) avg: 33 ns max: 103 ns min: 29 ns + async::queue (time/op) avg: 94 ns max: 126 ns min: 74 ns +boost::lockfree::queue (time/op) avg: 199 ns max: 217 ns min: 174 ns + +Benchmark Test Run: 6 Producers 2 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 49 ns max: 81 ns min: 35 ns +async::queue::bulk(16) (time/op) avg: 33 ns max: 60 ns min: 28 ns + async::queue (time/op) avg: 97 ns max: 134 ns min: 51 ns +boost::lockfree::queue (time/op) avg: 185 ns max: 198 ns min: 152 ns + +Benchmark Test Run: 7 Producers 1 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 36 ns max: 81 ns min: 34 ns +async::queue::bulk(16) (time/op) avg: 30 ns max: 60 ns min: 26 ns + async::queue (time/op) avg: 48 ns max: 89 ns min: 45 ns +boost::lockfree::queue (time/op) avg: 161 ns max: 179 ns min: 120 ns +``` + +e.g. 
MacOS 10.12.5 Intel i7-6700K 16GB RAM 250GB SSD clang-802.0.42 +``` +SSingle Producer Single Consumer Benchmark with 10000 Ops and run 1000 batches +Benchmark Test Run: 1 Producers 1 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 12 ns max: 37 ns min: 12 ns +async::queue::bulk(16) (time/op) avg: 26 ns max: 54 ns min: 25 ns + async::queue (time/op) avg: 23 ns max: 61 ns min: 23 ns +boost::lockfree::queue (time/op) avg: 156 ns max: 172 ns min: 118 ns +boost::lockfree::spsc_queue (time/op) avg: 11 ns max: 30 ns min: 5 ns + +Benchmark Test Run: 1 Producers 7 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 84 ns max: 98 ns min: 60 ns +async::queue::bulk(16) (time/op) avg: 27 ns max: 125 ns min: 24 ns + async::queue (time/op) avg: 104 ns max: 115 ns min: 92 ns +boost::lockfree::queue (time/op) avg: 231 ns max: 326 ns min: 213 ns + +Benchmark Test Run: 2 Producers 6 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 82 ns max: 100 ns min: 61 ns +async::queue::bulk(16) (time/op) avg: 36 ns max: 108 ns min: 31 ns + async::queue (time/op) avg: 102 ns max: 122 ns min: 90 ns +boost::lockfree::queue (time/op) avg: 192 ns max: 229 ns min: 184 ns + +Benchmark Test Run: 3 Producers 5 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 79 ns max: 93 ns min: 61 ns +async::queue::bulk(16) (time/op) avg: 31 ns max: 94 ns min: 29 ns + async::queue (time/op) avg: 98 ns max: 116 ns min: 70 ns +boost::lockfree::queue (time/op) avg: 189 ns max: 198 ns min: 175 ns + +Benchmark Test Run: 4 Producers 4 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 77 ns max: 146 ns min: 56 ns +async::queue::bulk(16) (time/op) avg: 28 ns max: 92 ns min: 26 ns + async::queue (time/op) avg: 93 ns max: 167 ns min: 73 ns +boost::lockfree::queue (time/op) avg: 200 ns max: 218 ns min: 182 ns + +Benchmark Test Run: 5 Producers 3 Consumers with 
10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 76 ns max: 92 ns min: 48 ns +async::queue::bulk(16) (time/op) avg: 27 ns max: 89 ns min: 24 ns + async::queue (time/op) avg: 97 ns max: 140 ns min: 83 ns +boost::lockfree::queue (time/op) avg: 200 ns max: 211 ns min: 163 ns + +Benchmark Test Run: 6 Producers 2 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 80 ns max: 98 ns min: 59 ns +async::queue::bulk(16) (time/op) avg: 28 ns max: 97 ns min: 24 ns + async::queue (time/op) avg: 105 ns max: 122 ns min: 78 ns +boost::lockfree::queue (time/op) avg: 182 ns max: 194 ns min: 153 ns + +Benchmark Test Run: 7 Producers 1 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 86 ns max: 103 ns min: 64 ns +async::queue::bulk(16) (time/op) avg: 27 ns max: 82 ns min: 23 ns + async::queue (time/op) avg: 107 ns max: 127 ns min: 91 ns +boost::lockfree::queue (time/op) avg: 154 ns max: 180 ns min: 146 ns +``` + +e.g. Ubuntu 17.04 Intel i7-6700K 16GB RAM 100GB HDD gcc 6.3.0 +``` +Single Producer Single Consumer Benchmark with 10000 Ops and run 1000 batches +Benchmark Test Run: 1 Producers 1 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 12 ns max: 71 ns min: 11 ns +async::queue::bulk(16) (time/op) avg: 65 ns max: 134 ns min: 24 ns + async::queue (time/op) avg: 48 ns max: 107 ns min: 33 ns +boost::lockfree::queue (time/op) avg: 179 ns max: 198 ns min: 60 ns +boost::lockfree::spsc_queue (time/op) avg: 7 ns max: 47 ns min: 4 ns + +Benchmark Test Run: 1 Producers 7 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 68 ns max: 505 ns min: 35 ns +async::queue::bulk(16) (time/op) avg: 29 ns max: 135 ns min: 25 ns + async::queue (time/op) avg: 93 ns max: 138 ns min: 73 ns +boost::lockfree::queue (time/op) avg: 234 ns max: 292 ns min: 208 ns + +Benchmark Test Run: 2 Producers 6 Consumers with 10000 Ops and run 1000 batches + 
async::bounded_queue (time/op) avg: 68 ns max: 106 ns min: 39 ns +async::queue::bulk(16) (time/op) avg: 35 ns max: 117 ns min: 19 ns + async::queue (time/op) avg: 92 ns max: 135 ns min: 79 ns +boost::lockfree::queue (time/op) avg: 193 ns max: 227 ns min: 175 ns + +Benchmark Test Run: 3 Producers 5 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 73 ns max: 251 ns min: 49 ns +async::queue::bulk(16) (time/op) avg: 31 ns max: 110 ns min: 26 ns + async::queue (time/op) avg: 96 ns max: 178 ns min: 70 ns +boost::lockfree::queue (time/op) avg: 179 ns max: 359 ns min: 164 ns + +Benchmark Test Run: 4 Producers 4 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 81 ns max: 220 ns min: 61 ns +async::queue::bulk(16) (time/op) avg: 27 ns max: 114 ns min: 23 ns + async::queue (time/op) avg: 102 ns max: 159 ns min: 74 ns +boost::lockfree::queue (time/op) avg: 177 ns max: 541 ns min: 162 ns + +Benchmark Test Run: 5 Producers 3 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 83 ns max: 443 ns min: 53 ns +async::queue::bulk(16) (time/op) avg: 26 ns max: 297 ns min: 23 ns + async::queue (time/op) avg: 110 ns max: 512 ns min: 79 ns +boost::lockfree::queue (time/op) avg: 176 ns max: 505 ns min: 161 ns + +Benchmark Test Run: 6 Producers 2 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 83 ns max: 437 ns min: 36 ns +async::queue::bulk(16) (time/op) avg: 26 ns max: 261 ns min: 23 ns + async::queue (time/op) avg: 112 ns max: 449 ns min: 84 ns +boost::lockfree::queue (time/op) avg: 178 ns max: 547 ns min: 164 ns + +Benchmark Test Run: 7 Producers 1 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 90 ns max: 805 ns min: 28 ns +async::queue::bulk(16) (time/op) avg: 26 ns max: 78 ns min: 21 ns + async::queue (time/op) avg: 123 ns max: 695 ns min: 80 ns +boost::lockfree::queue (time/op) avg: 195 ns max: 615 ns min: 154 ns 
+``` + +e.g. Gentoo ARMV8 64bit (Linux Pi64 4.14.44-V8 AArch64) gcc 7.3.0 on Raspberry Pi 3 B+ +``` +Single Producer Single Consumer Benchmark with 10000 Ops and run 1000 batches +Benchmark Test Run: 1 Producers 1 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 67 ns max: 697 ns min: 53 ns +async::queue::bulk(16) (time/op) avg: 144 ns max: 434 ns min: 130 ns + async::queue (time/op) avg: 141 ns max: 441 ns min: 115 ns +boost::lockfree::queue (time/op) avg: 182 ns max: 514 ns min: 168 ns +boost::lockfree::spsc_queue (time/op) avg: 62 ns max: 430 ns min: 53 ns + +Benchmark Test Run: 1 Producers 3 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 72 ns max: 574 ns min: 59 ns +async::queue::bulk(16) (time/op) avg: 141 ns max: 515 ns min: 116 ns + async::queue (time/op) avg: 181 ns max: 590 ns min: 134 ns +boost::lockfree::queue (time/op) avg: 192 ns max: 1045 ns min: 172 ns + +Benchmark Test Run: 2 Producers 2 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 82 ns max: 457 ns min: 65 ns +async::queue::bulk(16) (time/op) avg: 99 ns max: 701 ns min: 84 ns + async::queue (time/op) avg: 124 ns max: 550 ns min: 108 ns +boost::lockfree::queue (time/op) avg: 151 ns max: 847 ns min: 138 ns + +Benchmark Test Run: 3 Producers 1 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 88 ns max: 538 ns min: 67 ns +async::queue::bulk(16) (time/op) avg: 89 ns max: 717 ns min: 71 ns + async::queue (time/op) avg: 131 ns max: 631 ns min: 118 ns +boost::lockfree::queue (time/op) avg: 165 ns max: 644 ns min: 149 ns +``` + +e.g. 
Raspbian ARMV7 32bit (Linux 4.14.34-v7 armv7l) gcc 6.3.0 on Raspberry Pi 3 B+ +``` +Single Producer Single Consumer Benchmark with 10000 Ops and run 1000 batches +Benchmark Test Run: 1 Producers 1 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 227 ns max: 912 ns min: 179 ns +async::queue::bulk(16) (time/op) avg: 442 ns max: 1236 ns min: 365 ns + async::queue (time/op) avg: 423 ns max: 1249 ns min: 364 ns +boost::lockfree::queue (time/op) avg: 474 ns max: 1017 ns min: 410 ns +boost::lockfree::spsc_queue (time/op) avg: 70 ns max: 761 ns min: 48 ns + +Benchmark Test Run: 1 Producers 3 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 241 ns max: 1482 ns min: 187 ns +async::queue::bulk(16) (time/op) avg: 470 ns max: 1259 ns min: 354 ns + async::queue (time/op) avg: 488 ns max: 1482 ns min: 375 ns +boost::lockfree::queue (time/op) avg: 462 ns max: 1158 ns min: 427 ns + + +Benchmark Test Run: 2 Producers 2 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 208 ns max: 348 ns min: 158 ns +async::queue::bulk(16) (time/op) avg: 285 ns max: 543 ns min: 237 ns + async::queue (time/op) avg: 306 ns max: 761 ns min: 234 ns +boost::lockfree::queue (time/op) avg: 334 ns max: 1481 ns min: 261 ns + + +Benchmark Test Run: 3 Producers 1 Consumers with 10000 Ops and run 1000 batches + async::bounded_queue (time/op) avg: 241 ns max: 884 ns min: 192 ns +async::queue::bulk(16) (time/op) avg: 210 ns max: 651 ns min: 180 ns + async::queue (time/op) avg: 439 ns max: 682 ns min: 375 ns +boost::lockfree::queue (time/op) avg: 420 ns max: 903 ns min: 320 ns +``` + +## coding style +all code has been formated by clang-format. It may be more easy to read in text editor or may be not :) + +## Many Thanks to 3rd party and their developers +* [Boost](http://www.boost.org/) +* [Boost CMake](https://github.com/Orphis/boost-cmake) Easy Boost integration in CMake projects! 
+* [Catch](https://github.com/philsquared/Catch) A powerful test framework for unit test. +* [cpprestsdk](https://github.com/Microsoft/cpprestsdk) The C++ REST SDK is a Microsoft project for cloud-based client-server communication in native code using a modern asynchronous C++ API design. +* [rlutil](https://github.com/tapio/rlutil) provides cross-platform console-mode functions to position and colorize text. +* [sakaki](https://github.com/sakaki-/gentoo-on-rpi3-64bit) Bootable 64-bit Gentoo image for the Raspberry Pi 3 B / B+, with Linux 4.14 diff --git a/src/3rdparty/async/bounded_queue.h b/src/3rdparty/async/bounded_queue.h new file mode 100644 index 0000000000000000000000000000000000000000..341e5f307498714657d37457042da26bf8f455cd --- /dev/null +++ b/src/3rdparty/async/bounded_queue.h @@ -0,0 +1,342 @@ +///////////////////////////////////////////////////////////////////// +// Copyright Yibo Zhu 2017 +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +///////////////////////////////////////////////////////////////////// + +#pragma once + +#include "utility.h" +#include +#include +#include + +namespace async { + +struct bounded_traits { + static constexpr bool NOEXCEPT_CHECK = false; // exception handling flag + static constexpr std::size_t CachelineSize = 64; + static constexpr std::size_t CachelineAlignment = 16; // must not be larger than alignof(std::max_align_t), see issue #1 + using sequence_type = std::uint64_t; +}; + +template class bounded_queue { +private: + static_assert(std::is_nothrow_destructible::value, + "T must be nothrow destructible"); + +public: + static constexpr std::size_t cacheline_size = TRAITS::CachelineSize; + static constexpr std::size_t cacheline_alignment = TRAITS::CachelineAlignment; + using seq_t = typename TRAITS::sequence_type; + explicit bounded_queue(std::size_t size) + : fastmodulo((size > 0 && ((size & (size - 1)) == 
0))), + bitshift(fastmodulo ? getShiftBitsCount(size) : 0), + elements(new element[size]), mask(fastmodulo ? size - 1 : 0), + qsize(size), enqueueIx(0), dequeueIx(0) { + assert(qsize > 0); // any size <= 0 is illegal + } + bounded_queue(bounded_queue const &) = delete; + bounded_queue(bounded_queue &&) = delete; + bounded_queue &operator=(bounded_queue const &) = delete; + bounded_queue &operator=(bounded_queue &&) = delete; + ~bounded_queue() { delete[] elements; } + std::size_t size() { return qsize; } + + template ::value>::type> + inline void blocking_enqueue(Args &&... args) noexcept { + auto enqidx = enqueueIx.fetch_add(1, std::memory_order_acq_rel); + auto &ele = elements[index(enqidx)]; + auto enq_tkt = ticket(enqidx); + while (enq_tkt != ele.tkt.load(std::memory_order_acquire)) + continue; + ele.construct(std::forward(args)...); + ele.tkt.store(enq_tkt + 1, std::memory_order_release); + } + + template ::value>::type> + inline bool blocking_enqueue(Args &&... args) noexcept { + auto enqidx = enqueueIx.fetch_add(1, std::memory_order_acq_rel); + auto &ele = elements[index(enqidx)]; + auto enq_tkt = ticket(enqidx); + while (enq_tkt != ele.tkt.load(std::memory_order_acquire)) + continue; + if (ele.construct(std::forward(args)...)) { + ele.hasdata.store(true, std::memory_order_release); + ele.tkt.store(enq_tkt + 1, std::memory_order_release); + return true; + } else { + ele.hasdata.store(false, std::memory_order_release); + ele.tkt.store(enq_tkt + 1, std::memory_order_release); + return false; + } + } + + template ::value, + int>::type = 0> + inline bool enqueue(Args &&... 
args) noexcept { + auto enqidx = enqueueIx.load(std::memory_order_acquire); + for (;;) { + auto &ele = elements[index(enqidx)]; + seq_t tkt = ele.tkt.load(std::memory_order_acquire); + seq_t enq_tkt = ticket(enqidx); + seq_t diff = tkt - enq_tkt; + if (diff == 0) { + if (enqueueIx.compare_exchange_strong(enqidx, enqidx + 1, + std::memory_order_release, + std::memory_order_relaxed)) { + ele.construct(std::forward(args)...); + ele.tkt.store(enq_tkt + 1, std::memory_order_release); + return true; + } + } else if (diff >= std::numeric_limits::max() / 2) + return false; // queue is full + else + enqidx = enqueueIx.load(std::memory_order_acquire); + } + } + + template ::value, + int>::type = 0> + inline bool enqueue(Args &&... args) noexcept { + auto enqidx = enqueueIx.load(std::memory_order_relaxed); + for (;;) { + auto &ele = elements[index(enqidx)]; + seq_t tkt = ele.tkt.load(std::memory_order_acquire); + seq_t enq_tkt = ticket(enqidx); + seq_t diff = tkt - enq_tkt; + if (diff == 0) { + if (enqueueIx.compare_exchange_strong(enqidx, enqidx + 1, + std::memory_order_release, + std::memory_order_relaxed)) { + if (ele.construct(std::forward(args)...)) { + ele.hasdata.store(true, std::memory_order_release); + ele.tkt.store(enq_tkt + 1, std::memory_order_release); + return true; + } else { + ele.hasdata.store(false, std::memory_order_release); + ele.tkt.store(enq_tkt + 1, std::memory_order_release); + return false; + } + } + } else if (diff >= std::numeric_limits::max() / 2) + return false; // queue is full + else + enqidx = enqueueIx.load(std::memory_order_acquire); + } + } + + template ::value>::type> + inline void blocking_dequeue(U &data) noexcept { + auto deqidx = dequeueIx.fetch_add(1, std::memory_order_acq_rel); + auto &ele = elements[index(deqidx)]; + seq_t deq_tkt = ticket(deqidx) + 1; + while (deq_tkt != ele.tkt.load(std::memory_order_acquire)) + continue; + ele.move(data); + ele.tkt.store(deq_tkt + 1, std::memory_order_release); + } + + template ::value>::type> + 
inline bool blocking_dequeue(U &data) noexcept { + auto deqidx = dequeueIx.fetch_add(1, std::memory_order_acq_rel); + auto &ele = elements[index(deqidx)]; + seq_t deq_tkt = ticket(deqidx) + 1; + while (deq_tkt != ele.tkt.load(std::memory_order_acquire)) + continue; + if (ele.hasdata.load(std::memory_order_acquire)) { + ele.move(data); + ele.tkt.store(deq_tkt + 1, std::memory_order_release); + return true; + } else { + ele.tkt.store(deq_tkt + 1, std::memory_order_release); + return false; + } + } + + template ::value, + int>::type = 0> + inline bool dequeue(U &data) { + + auto deqidx = dequeueIx.load(std::memory_order_acquire); + for (;;) { + auto &ele = elements[index(deqidx)]; + seq_t tkt = ele.tkt.load(std::memory_order_acquire); + seq_t deq_tkt = ticket(deqidx) + 1; + seq_t diff = tkt - deq_tkt; + if (diff == 0) { + if (dequeueIx.compare_exchange_strong(deqidx, deqidx + 1, + std::memory_order_acq_rel, + std::memory_order_relaxed)) { + ele.move(data); + ele.tkt.store(deq_tkt + 1, std::memory_order_release); + return true; + } + } else if (diff >= std::numeric_limits::max() / 2) + return false; // queue is empty + else { + + deqidx = dequeueIx.load(std::memory_order_acquire); + } + } + } + + template < + typename U = T, // SAFE-IMPL + typename std::enable_if::value, + int>::type = 0> + inline bool + dequeue(U &data) // false could be queue is empty, or skip an invalid element + { + + auto deqidx = dequeueIx.load(std::memory_order_acquire); + for (;;) { + auto &ele = elements[index(deqidx)]; + seq_t tkt = ele.tkt.load(std::memory_order_acquire); + seq_t deq_tkt = ticket(deqidx) + 1; + seq_t diff = tkt - deq_tkt; + if (diff == 0) { + if (dequeueIx.compare_exchange_strong(deqidx, deqidx + 1, + std::memory_order_acq_rel, + std::memory_order_relaxed)) { + if (ele.hasdata.load(std::memory_order_acquire)) { + ele.move(data); + ele.tkt.store(deq_tkt + 1, std::memory_order_release); + return true; + } else { + ele.tkt.store(deq_tkt + 1, std::memory_order_release); + return 
false; + } + } + } else if (diff >= std::numeric_limits::max() / 2) + return false; // queue is empty + else { + deqidx = dequeueIx.load(std::memory_order_acquire); + } + } + } + +private: + inline seq_t index(seq_t const seq) { + if (fastmodulo) + return seq & mask; + else + return seq >= qsize ? seq % qsize : seq; + } + + inline seq_t ticket(seq_t const seq) { + if (fastmodulo) + return (seq >> bitshift) << 1; + else + return (seq / static_cast(qsize)) << 1; + } + //TODO& Review: replace the following with c++ concepts + template struct checkdata {}; + + template + struct checkdata::value>::type> {}; + + template + struct checkdata::value>::type> { + checkdata() : hasdata(false) {} + std::atomic hasdata; + }; + + struct element : public checkdata { + element() : tkt(0) {} + ~element() { + if (tkt & 1) // enqueue op visited + destruct(); + } + + template ::value>::type> + inline void construct(Args &&... args) noexcept { + new (&storage) T(std::forward(args)...); + } + + template ::value>::type> + inline bool construct(Args &&... args) noexcept { + try { + new (&storage) T(std::forward(args)...); + } catch (...) { + return false; + } + return true; + } + + inline void destruct() noexcept { reinterpret_cast(&storage)->~T(); } + + inline T *getptr() { return reinterpret_cast(&storage); } + + template < + typename U = T, // NON-SAFE + typename std::enable_if::value, + int>::type = 0> + inline void move(U &data) { + data = std::move(*getptr()); + destruct(); + } + + template < + typename U = T, // SAFE-IMPL + typename std::enable_if::value, + int>::type = 0> + inline void move(U &data) { + try { + data = std::move(*getptr()); + } catch (...) 
{ + } + destruct(); + } + + std::atomic tkt; + typename std::aligned_storage::type storage; + std::atomic hasdata; + }; + + bool const fastmodulo; // true if qsize is power of 2 + int const bitshift; // used if fastmodulo is true + element *const elements; // pointer to buffer + std::size_t const mask; // used if fastmodulo is true + std::size_t const qsize; // queue size + alignas(cacheline_alignment) char cacheline_padding1[cacheline_size]; + alignas(cacheline_alignment) std::atomic enqueueIx; + alignas(cacheline_alignment) char cacheline_padding2[cacheline_size]; + alignas(cacheline_alignment) std::atomic dequeueIx; + alignas(cacheline_alignment) char cacheline_padding3[cacheline_size]; +}; +} // namespace async diff --git a/src/3rdparty/async/queue.h b/src/3rdparty/async/queue.h new file mode 100644 index 0000000000000000000000000000000000000000..6b00d1d61fd18ff0278cc43c11ddccb131bef930 --- /dev/null +++ b/src/3rdparty/async/queue.h @@ -0,0 +1,429 @@ +///////////////////////////////////////////////////////////////////// +// Copyright Yibo Zhu 2017 +// Distributed under the Boost Software License, Version 1.0. 
+// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +///////////////////////////////////////////////////////////////////// +#pragma once +#include "utility.h" +#include +#include +#include + +namespace async { +struct traits // 3-level (L3, L2, L1) depth of nested group design, total + // indexing space is pow(2, 64-Tagbits) +{ // user can change the bits settings by providing your own TRAITS + static constexpr std::uint64_t Tagbits = 24; + static constexpr std::uint64_t L3bits = 10; + static constexpr std::uint64_t L2bits = 10; + static constexpr std::uint64_t L1bits = 12; + static constexpr std::uint64_t Basebits = 8; + static constexpr bool NOEXCEPT_CHECK = false; // exception handling flag + static constexpr std::size_t CachelineSize = 64; + static constexpr std::size_t CachelineAlignment = 16; // must not be larger than alignof(std::max_align_t), see issue #1 +}; + +template class queue final { +public: + static bool is_lock_free_v() { + return std::atomic{}.is_lock_free(); + } + static constexpr std::size_t cacheline_size = TRAITS::CachelineSize; + static constexpr std::size_t cacheline_alignment = TRAITS::CachelineAlignment; + static constexpr std::uint64_t BaseMask = getBitmask(TRAITS::Basebits); + static constexpr std::uint64_t L1Mask = getBitmask(TRAITS::L1bits) + << TRAITS::Basebits; + static constexpr std::uint64_t L2Mask = getBitmask(TRAITS::L2bits) + << (TRAITS::Basebits + TRAITS::L1bits); + static constexpr std::uint64_t L3Mask = + getBitmask(TRAITS::L3bits) + << (TRAITS::Basebits + TRAITS::L1bits + TRAITS::L2bits); + static constexpr std::uint64_t TagMask = + getBitmask(TRAITS::Tagbits) + << (TRAITS::Basebits + TRAITS::L1bits + TRAITS::L2bits + TRAITS::L3bits); + static constexpr std::uint64_t TagShift = 64 - TRAITS::Tagbits; + static constexpr std::uint64_t TagPlus1 = static_cast(1) << TagShift; + +public: // assert bits settings meet requirements + static_assert(TRAITS::Tagbits + TRAITS::L3bits + 
TRAITS::L2bits + + TRAITS::L1bits + TRAITS::Basebits == + 64, + "The sum of all bits settings should be 64"); + static_assert(TRAITS::Tagbits > 0 && TRAITS::L3bits > 0 && + TRAITS::L2bits > 0 && TRAITS::L1bits > 0 && + TRAITS::Basebits > 3, + "All bits settings should be > 0 and Basebits must be > 3"); + static_assert(std::is_nothrow_destructible::value, + "T must be nothrow destructible"); + +public: + queue() : nodeCount(3), dequeueIx(2), enqueueIx(2), spawnIx(1), recycleIx(1) { + container.get(index(0)); // allocate initial space + } + queue(std::size_t size) // pre-allocate size + : nodeCount(3), dequeueIx(2), enqueueIx(2), spawnIx(1), recycleIx(1) { + container.get(index(0)); + + if (size > (static_cast(1) << TRAITS::Basebits)) { + index ix; + for (std::size_t i = (static_cast(1) << TRAITS::Basebits); i < size; + ++i) { + auto &node = getNode(ix); + recycle(ix); + } + } + } + + queue(queue const &other) = delete; + queue &operator=(queue const &other) = delete; + queue(queue &&other) = delete; + queue &operator=(queue &&other) = delete; + + template ::value>::type> + inline void enqueue(Args &&... args) noexcept { + auto ix = encapsulate(std::forward(args)...); + auto enqidx = enqueueIx.load(std::memory_order_relaxed); + while (!enqueueIx.compare_exchange_weak( + enqidx, ix, std::memory_order_release, std::memory_order_relaxed)) + continue; + container[enqidx].next.store(ix, std::memory_order_release); + } + + template ::value>::type> + inline bool enqueue(Args &&... 
args) noexcept { + auto ix = encapsulate(std::forward(args)...); + if (ix == 0) + return false; + auto enqidx = enqueueIx.load(std::memory_order_relaxed); + while (!enqueueIx.compare_exchange_weak( + enqidx, ix, std::memory_order_release, std::memory_order_relaxed)) + continue; + container[enqidx].next.store(ix, std::memory_order_release); + return true; + } + + template void bulk_enqueue(IT it, std::size_t count) { + index firstidx(0), preidx(0), lastidx(0); + for (std::size_t i = 0; i < count; ++i) { + lastidx = encapsulate(*it++); + if (firstidx == 0) + firstidx = lastidx; + if (preidx != 0) { + container[preidx].next.store(lastidx, std::memory_order_relaxed); + } + preidx = lastidx; + } + auto enqidx = enqueueIx.load(std::memory_order_relaxed); + while (!enqueueIx.compare_exchange_weak( + enqidx, lastidx, std::memory_order_release, std::memory_order_relaxed)) + continue; + container[enqidx].next.store(firstidx, std::memory_order_release); + } + + template + std::size_t bulk_dequeue(IT &&it, std::size_t maxcount) // or IT& it to return the + { + std::size_t count(0); + while (maxcount-- && dequeue(*it++)) { + ++count; + } + return count; + } + + template // U could be T, or any kinds of iterators/adapters, + // like insert_iterator + inline bool dequeue(U &data) noexcept // return false if queue is empty + { + for (;;) { + auto deqidx = dequeueIx.load(std::memory_order_acquire); + auto &node = container[deqidx]; + auto next = node.next.load(std::memory_order_relaxed); + if (next == 0) { + auto ready_for_consume = + node.consume_ready.load(std::memory_order_relaxed); + if (!ready_for_consume) { + return false; + } + + if (node.consume_ready.compare_exchange_strong( + ready_for_consume, false, std::memory_order_release, + std::memory_order_relaxed)) { + node.template move(data); + return true; + } + } else { + if (dequeueIx.compare_exchange_weak(deqidx, next, + std::memory_order_acq_rel, + std::memory_order_relaxed)) { + auto ready_for_consume = + 
node.consume_ready.load(std::memory_order_acquire); + if (ready_for_consume && + node.consume_ready.compare_exchange_strong( + ready_for_consume, false, std::memory_order_release, + std::memory_order_relaxed)) { + node.template move(data); + } else { // the node is being consumed by another thread, waiting for + // it finishes + for (; !node.recycle_ready.load(std::memory_order_acquire);) { + } + } + node.next.store( + 0, std::memory_order_relaxed); // reset link to avoid chain effect + recycle(deqidx); + if (ready_for_consume) + return ready_for_consume; + } + } + } + } + std::uint64_t getNodeCount() { return nodeCount; } // get in-use-nodes count + +private: // internal data structures + struct index // simulate tagged pointer + { + index(std::uint64_t newval) noexcept + : value(newval) {} // is_trivially_copyable must be true + index() noexcept : value(0) {} + inline operator std::uint64_t() const { return value; } + std::uint64_t getVersion() { return (value & TagMask) >> TagShift; } + inline void increTag() { + value = (value & ~TagMask) | ((value + TagPlus1) & TagMask); + } + std::uint64_t value; + }; + + struct node // to store the data + { + node() : next(0), consume_ready(false), recycle_ready(true) {} + ~node() noexcept { + if (consume_ready.load(std::memory_order_relaxed)) { + destruct(); + } + } + + template ::value>::type> + inline void construct(Args &&... args) noexcept { + new (&storage) T(std::forward(args)...); + consume_ready.store(true, std::memory_order_release); + recycle_ready.store(false, std::memory_order_release); + } + + template ::value>::type> + inline bool construct(Args &&... args) noexcept { + try { + new (&storage) T(std::forward(args)...); + } catch (...) 
{ + return false; + } + + consume_ready.store(true, std::memory_order_release); + recycle_ready.store(false, std::memory_order_release); + return true; + } + + inline void destruct() noexcept { reinterpret_cast(&storage)->~T(); } + + template < + typename TR, typename U, // NON-SAFE + typename std::enable_if::value, + int>::type = 0> + inline void move(U &data) { + data = std::move(*getptr()); + destruct(); + recycle_ready.store(true, std::memory_order_release); + } + + template < + typename TR, typename U, // SAFE-IMPL + typename std::enable_if::value, + int>::type = 0> + inline void move(U &data) { + try { + data = std::move(*getptr()); + } catch (...) { + } + destruct(); + recycle_ready.store(true, std::memory_order_release); + } + inline T *getptr() { return reinterpret_cast(&storage); } + std::atomic next; // link + std::atomic consume_ready; // if true, consume ready + std::atomic recycle_ready; // if true, recycle ready + typename std::aligned_storage::type storage; // data + }; + + struct basecontainer { + inline node &get(index const &ix) { return operator[](ix); } + inline node &at(index const &ix) { return operator[](ix); } + inline node &operator[](index const &ix) { return nodes[ix & BaseMask]; } + std::array(1) << TRAITS::Basebits> nodes; + }; + + template struct nestedcontainer { + static constexpr std::uint64_t mask = BitMask; + static constexpr std::uint64_t bits = getSetBitsCount(mask); + static constexpr std::uint64_t shift = getShiftBitsCount(mask); + std::array, static_cast(1) << bits> + subgroups; + nestedcontainer() { + for (auto &gptr : subgroups) { + gptr.store(nullptr, std::memory_order_release); + } + } + ~nestedcontainer() { + for (auto &gptr : subgroups) { + if (gptr.load(std::memory_order_relaxed) != nullptr) + delete gptr.load(std::memory_order_relaxed); + } + } + + inline node &get(index const &ix) // will trigger the new operation if + // subgroup doesn't exist + { + auto ptr = + subgroups[(ix & mask) >> 
shift].load(std::memory_order_acquire); + if (ptr == nullptr) { + auto newgroup = std::make_unique(); // if ComExch fails, + // unique_ptr will self + // delete + if (subgroups[(ix & mask) >> shift].compare_exchange_strong( + ptr, newgroup.get(), std::memory_order_release, + std::memory_order_acquire)) { + ptr = newgroup.release(); + } + } + return ptr->get(ix); // recursively calling get 'til get the node + } + + inline node &operator[](index const &ix) { + return subgroups[(ix & mask) >> shift] + .load(std::memory_order_relaxed) + -> + operator[](ix); + } + + inline node &at(index const &ix) { // balanced performance and safety + auto ptr = + subgroups[(ix & mask) >> shift].load(std::memory_order_relaxed); + if (ptr) + return ptr->at(ix); + else + return get(ix); + } + }; + + inline node &getNode(index &ix) { // return an existing or new node + #if defined(__arm__) && (!defined(__aarch64__)) + //for ARMV7 or below + ix.value = nodeCount.load(std::memory_order_relaxed); + auto val = ix.value + 1; + while(!nodeCount.compare_exchange_weak( + ix.value, val, std::memory_order_release, std::memory_order_relaxed)) { + val = ix.value + 1; + } + #else + ix.value = nodeCount.fetch_add(static_cast(1), + std::memory_order_relaxed); + #endif + if ((ix.value & BaseMask) == 0) + return container.get(ix); + else + return container.at(ix); + } + + template ::value, + int>::type = 0> + inline index encapsulate(Args &&... args) noexcept { + auto ix = spawn(); + auto &node = container[ix]; + node.construct(std::forward(args)...); + node.next.store(0, std::memory_order_relaxed); + return ix; + } + + template ::value, + int>::type = 0> + inline index encapsulate(Args &&... 
args) noexcept { + auto ix = spawn(); + auto &node = container[ix]; + node.next.store(0, std::memory_order_relaxed); + if (node.construct(std::forward(args)...)) + return ix; + else { + recycle(ix); // construction failed, recycle the node + return index(0); + } + } + + inline void recycle(index const &ix) { + auto recycle = recycleIx.load(std::memory_order_relaxed); + while (!recycleIx.compare_exchange_weak( + recycle, ix, std::memory_order_release, std::memory_order_relaxed)) + continue; + container[recycle].next.store(ix, std::memory_order_release); + } + + inline auto spawn() +#if ((defined(__clang__) || defined(__GNUC__)) && __cplusplus <= 201103L) || \ + (defined(_MSC_VER) && _MSC_VER < 1800) + -> index +#endif + { + index ix(0); + for (;;) { + auto spaidx = spawnIx.load(std::memory_order_acquire); + auto next = container[spaidx].next.load(std::memory_order_relaxed); + if (next == 0) { + getNode(ix); + return ix; + } else { + if (spawnIx.compare_exchange_weak(spaidx, next, + std::memory_order_acq_rel, + std::memory_order_relaxed)) { + if (spaidx != 0) { + spaidx.increTag(); + } + return spaidx; + } + } + } + } + + using L1container = nestedcontainer; + using L2container = nestedcontainer; + nestedcontainer container; + alignas(cacheline_alignment) char cacheline_padding1[cacheline_size]; + alignas(cacheline_alignment) std::atomic nodeCount; // # of allocated nodes, not the # + // of elements stored in the queue + alignas(cacheline_alignment) char cacheline_padding2[cacheline_size]; + alignas(cacheline_alignment) std::atomic dequeueIx; // dequeue pointer + alignas(cacheline_alignment) char cacheline_padding3[cacheline_size]; + alignas(cacheline_alignment) std::atomic enqueueIx; // enqueue pointer + alignas(cacheline_alignment) char cacheline_padding4[cacheline_size]; + alignas(cacheline_alignment) std::atomic spawnIx; // spawn pointer + alignas(cacheline_alignment) char cacheline_padding5[cacheline_size]; + alignas(cacheline_alignment) std::atomic recycleIx; 
// recycle pointer + alignas(cacheline_alignment) char cacheline_padding6[cacheline_size]; +}; +} // namespace async diff --git a/src/3rdparty/async/threadpool.h b/src/3rdparty/async/threadpool.h new file mode 100644 index 0000000000000000000000000000000000000000..395a9d85041b5631d8a79d3a0721db0a8bb091c8 --- /dev/null +++ b/src/3rdparty/async/threadpool.h @@ -0,0 +1,192 @@ +///////////////////////////////////////////////////////////////////// +// Copyright Yibo Zhu 2017 +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +///////////////////////////////////////////////////////////////////// +#pragma once +#include "queue.h" +#include +#include +#include +#include +#include +#include +#include +#include +namespace async { +// thread pool to execute functions, functors, lamdas asynchronously, +// default poolsize = machine's logical CPU cores/threads +class threadpool final { +public: + static int defaultpoolsize() { return std::thread::hardware_concurrency(); } + + threadpool(int poolsize = defaultpoolsize()) + : idlecount(0), conflag(false) { + configurepool(poolsize); + } + + threadpool(const threadpool &) = delete; + threadpool(threadpool &&) = delete; + threadpool &operator=(const threadpool &) = delete; + threadpool &operator=(threadpool &&) = delete; + + ~threadpool() { cleanup(); } + + inline std::size_t size() { + std::lock_guard lg(poolmux); + return threads.size(); + } + + inline int idlesize() { return idlecount; } + + // can be called to resize the pool at any time after construction and before + // destruction, recommand to be called from main thread or manager thread even + // though it is thread-safe + void configurepool(std::size_t poolsize) { + std::unique_lock veclk(poolmux); + auto currentsize = threads.size(); + if (currentsize < poolsize) { // expand the pool + for (std::size_t i = currentsize; i < poolsize; i++) { + 
tpstops.emplace_back(addthread()); + } + } else if (currentsize > poolsize) { // shrink the pool + std::vector> dumpthreads; + std::vector *> dumpthreadstops; + std::move(threads.begin() + poolsize, threads.end(), + std::back_inserter(dumpthreads)); + std::move(tpstops.begin() + poolsize, tpstops.end(), + std::back_inserter(dumpthreadstops)); + tpstops.resize(poolsize); + threads.resize(poolsize); + veclk.unlock(); + for (auto &a : dumpthreadstops) { + *a = true; + } + for (auto &t : dumpthreads) { + t->detach(); + } + { + std::unique_lock lk(qcvmux); // suspended threads to quit + qcv.notify_all(); + } + } + } + + template + inline auto post(Func &&func, Args &&... args) +#if ((defined(__clang__) || defined(__GNUC__)) && __cplusplus <= 201103L) || \ + (defined(_MSC_VER) && _MSC_VER <= 1800) + -> std::future::type> +#endif + { // TODO: replace result_of with invoke_result_t when migrate to c++17 + auto taskptr = std::make_shared< + std::packaged_task::type()>>( + std::bind(std::forward(func), std::forward(args)...)); + taskqueue.enqueue([taskptr]() { (*taskptr)(); }); + { + std::lock_guard lg(qcvmux); + conflag = true; + } + qcv.notify_one(); + return taskptr->get_future(); + } + + template + inline auto post(Func &&func) +#if ((defined(__clang__) || defined(__GNUC__)) && __cplusplus <= 201103L) || \ + (defined(_MSC_VER) && _MSC_VER <= 1800) + -> std::future::type> +#endif + { // a special case for func() type without any parameters, might be + // removed later + auto taskptr = std::make_shared< + std::packaged_task::type()>>( + std::forward(func)); + taskqueue.enqueue([taskptr]() { (*taskptr)(); }); + { + std::lock_guard lg(qcvmux); + conflag = true; + } + qcv.notify_one(); + return taskptr->get_future(); + } + +private: + struct executor { + executor(std::unique_ptr> &&ptr, threadpool &pool) + : stop(std::move(ptr)), thpool(pool) {} + void operator()() { + while (!*stop) { + if (!thpool.executetask_in_loop(*stop)) { + return; // signaled to quit + } + 
thpool.wait_for_task(*stop); // wait for new task + } + } + + private: + std::unique_ptr> stop; + threadpool &thpool; + }; + + std::atomic *addthread() { + auto stopuniptr = std::make_unique>(false); + auto stoprawptr = stopuniptr.get(); + threads.emplace_back( + std::make_unique(executor(std::move(stopuniptr), *this))); + return stoprawptr; + } + + void cleanup() { // make sure no more tasks being pushed to the taskqueue + { + std::lock_guard lk(qcvmux); + qcv.notify_all(); // let running thread drain the task queue? no need, + // should be removed + } + for (auto &stop : tpstops) { + *stop = true; // stop signaled + } + { + std::lock_guard lk(qcvmux); + qcv.notify_all(); // notify again + } + for (auto &thread : threads) { + if (thread->joinable()) + thread->join(); + } + threads.clear(); + tpstops.clear(); + } + + inline void wait_for_task(std::atomic const &stop) { + idlecount.fetch_add(1, std::memory_order_relaxed); + { + std::unique_lock lk(qcvmux); + qcv.wait(lk, [&]() { + return conflag || stop.load(std::memory_order_acquire); + }); //memory_oder can be removed + conflag = false; + } + idlecount.fetch_sub(1, std::memory_order_relaxed); + } + + inline bool executetask_in_loop(std::atomic const &stop) { + std::function func; + for (; taskqueue.dequeue(func);) { + func(); + if (stop) // stop is signaled + return false; + } + return true; + } + + std::vector> threads; + std::vector *> tpstops; // threads terminate flags + async::queue> taskqueue; + std::atomic idlecount; // idle thread count + std::mutex qcvmux, poolmux; + std::condition_variable qcv; + bool conflag; // continue flag for cv +}; +} // namespace async diff --git a/src/3rdparty/async/utility.h b/src/3rdparty/async/utility.h new file mode 100644 index 0000000000000000000000000000000000000000..f5bb2d1f45e41e397645af522fd05889a6422c03 --- /dev/null +++ b/src/3rdparty/async/utility.h @@ -0,0 +1,66 @@ +///////////////////////////////////////////////////////////////////// +// Copyright Yibo Zhu 2017 +// 
Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +///////////////////////////////////////////////////////////////////// +#pragma once + +#if ((defined(__clang__) || defined(__GNUC__)) && __cplusplus < 201103L) || \ + (defined(_MSC_VER) && _MSC_VER < 1800) +#error This library needs at least a C++11 compliant compiler +#endif +#include +#include +#include +#include +template static constexpr T getBitmask(unsigned int const bits) { + return static_cast(-(bits != 0)) & + (static_cast(-1) >> ((sizeof(T) * CHAR_BIT) - bits)); +} + +#if __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1910) +// c++14 impl +static constexpr unsigned int getSetBitsCount(std::uint64_t n) { + unsigned int count{0}; + while (n) { + n &= (n - 1); + count++; + } + return count; +} + +static constexpr unsigned int getShiftBitsCount(std::uint64_t n) { + // requires c++14 + unsigned int count{0}; + if (n == 0) + return count; + while ((n & 0x1) == 0) { + n >>= 1; + ++count; + } + return count; +} + +#elif __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1800) +// c++11 impl +static constexpr unsigned int getSetBitsCount(std::uint64_t n) { + return n == 0 ? 0 : 1 + getSetBitsCount(n & (n - 1)); +} + +static constexpr unsigned int getShiftBitsCount(std::uint64_t n) { + return n == 0 ? 0 : ((n & 0x1) == 0 ? 1 + getShiftBitsCount(n >> 1) : 0); +} + +#if (__cplusplus == 201103L) && (defined(__clang__) || defined(__GNUC__)) +namespace std { // for c+11 +template +std::unique_ptr make_unique(Args &&... 
args) { + return std::unique_ptr(new T(std::forward(args)...)); +} +} // namespace std +#endif + +#else +#error This library needs at least a C++11 compliant compiler +#endif diff --git a/src/3rdparty/cctbx/pystreambuf.h b/src/3rdparty/cctbx/pystreambuf.h new file mode 100644 index 0000000000000000000000000000000000000000..d2d67730ae88d93001a0652ef14d3448c39226a9 --- /dev/null +++ b/src/3rdparty/cctbx/pystreambuf.h @@ -0,0 +1,519 @@ +/* Original code: https://gist.github.com/asford/544323a5da7dddad2c9174490eb5ed06 + * License: + * This component utilizes components derived from cctbx, available at + * http://cci.lbl.gov/cctbx_sources/boost_adaptbx/python_streambuf.h + * + * *** License agreement *** + * + * cctbx Copyright (c) 2006, The Regents of the University of + * California, through Lawrence Berkeley National Laboratory (subject to + * receipt of any required approvals from the U.S. Dept. of Energy). All + * rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * (1) Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * (2) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * (3) Neither the name of the University of California, Lawrence Berkeley + * National Laboratory, U.S. Dept. of Energy nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You are under no obligation whatsoever to provide any bug fixes, + * patches, or upgrades to the features, functionality or performance of + * the source code ("Enhancements") to anyone; however, if you choose to + * make your Enhancements available either publicly, or directly to + * Lawrence Berkeley National Laboratory, without imposing a separate + * written license agreement for such Enhancements, then you hereby grant + * the following license: a non-exclusive, royalty-free perpetual license + * to install, use, modify, prepare derivative works, incorporate into + * other computer software, distribute, and sublicense such enhancements or + * derivative works thereof, in binary and source code form. +*/ + +#pragma once + +#include +#include + +#include + +namespace pystreambuf { + +/// A stream buffer getting data from and putting data into a Python file object +/** The aims are as follow: + + - Given a C++ function acting on a standard stream, e.g. + + \code + void read_inputs(std::istream& input) { + ... 
+ input >> something >> something_else; + } + \endcode + + and given a piece of Python code which creates a file-like object, + to be able to pass this file object to that C++ function, e.g. + + \code + import gzip + gzip_file_obj = gzip.GzipFile(...) + read_inputs(gzip_file_obj) + \endcode + + and have the standard stream pull data from and put data into the Python + file object. + + - When Python \c read_inputs() returns, the Python object is able to + continue reading or writing where the C++ code left off. + + - Operations in C++ on mere files should be competitively fast compared + to the direct use of \c std::fstream. + + + \b Motivation + + - the standard Python library offer of file-like objects (files, + compressed files and archives, network, ...) is far superior to the + offer of streams in the C++ standard library and Boost C++ libraries. + + - i/o code involves a fair amount of text processing which is more + efficiently prototyped in Python but then one may need to rewrite + a time-critical part in C++, in as seamless a manner as possible. + + \b Usage + + This is 2-step: + + - a trivial wrapper function + + \code + using boost_adaptbx::python::streambuf; + void read_inputs_wrapper(streambuf& input) + { + streambuf::istream is(input); + read_inputs(is); + } + + def("read_inputs", read_inputs_wrapper); + \endcode + + which has to be written every time one wants a Python binding for + such a C++ function. + + - the Python side + + \code + from boost.python import streambuf + read_inputs(streambuf(python_file_obj=obj, buffer_size=1024)) + \endcode + + \c buffer_size is optional. See also: \c default_buffer_size + + Note: references are to the C++ standard (the numbers between parentheses + at the end of references are margin markers). 
+*/ +class streambuf : public std::basic_streambuf +{ + private: + typedef std::basic_streambuf base_t; + + public: + /* The syntax + using base_t::char_type; + would be nicer but Visual Studio C++ 8 chokes on it + */ + typedef base_t::char_type char_type; + typedef base_t::int_type int_type; + typedef base_t::pos_type pos_type; + typedef base_t::off_type off_type; + typedef base_t::traits_type traits_type; + + /// The default size of the read and write buffer. + /** They are respectively used to buffer data read from and data written to + the Python file object. NOTE(review): declared static constexpr here, so it + is a compile-time constant and cannot be modified from Python. + */ + static constexpr std::size_t default_buffer_size = 1024; + + /// Construct from a Python file object + /** if buffer_size is 0 the current default_buffer_size is used. + */ + streambuf( + pybind11::object& python_file_obj, + std::size_t buffer_size_=0) + : + /* Each Python method is looked up once here; a missing attribute is stored as None. */ + py_read (getattr(python_file_obj, "read", pybind11::none())), + py_write (getattr(python_file_obj, "write", pybind11::none())), + py_seek (getattr(python_file_obj, "seek", pybind11::none())), + py_tell (getattr(python_file_obj, "tell", pybind11::none())), + buffer_size(buffer_size_ != 0 ? buffer_size_ : default_buffer_size), + write_buffer(0), + pos_of_read_buffer_end_in_py_file(0), + /* Provisional value; replaced in the constructor body by the position reported by tell() when tell() is usable. */ + pos_of_write_buffer_end_in_py_file(buffer_size), + farthest_pptr(0) + { + assert(buffer_size != 0); + /* Some Python file objects (e.g. sys.stdout and sys.stdin) + have non-functional seek and tell. If so, assign None to + py_tell and py_seek. 
+ */ + if (!py_tell.is_none()) { + try { + py_tell(); + } + catch (pybind11::error_already_set& err) { + py_tell = pybind11::none(); + py_seek = pybind11::none(); + err.restore(); + PyErr_Clear(); + } + } + + if (!py_write.is_none()) { + // add one extra byte for characters passed to the overflow() method + write_buffer = new char[buffer_size + 1]; + setp(write_buffer, write_buffer + buffer_size); // 27.5.2.4.5 (5) + farthest_pptr = pptr(); + } + else { + // The first attempt at output will result in a call to overflow + setp(0, 0); + } + + if (!py_tell.is_none()){ + off_type py_pos = py_tell().cast(); + pos_of_read_buffer_end_in_py_file = py_pos; + pos_of_write_buffer_end_in_py_file = py_pos; + } + } + + /* write_buffer is owned through a raw pointer and released with delete[] in + the destructor. A copy would leave two objects freeing the same array, so + copy construction and copy assignment are disabled. + */ + streambuf(const streambuf&) = delete; + streambuf& operator=(const streambuf&) = delete; + + /// Mundane destructor freeing the allocated resources + virtual ~streambuf() { + if (write_buffer) delete[] write_buffer; + } + + /// C.f. C++ standard section 27.5.2.4.3 + /** It is essential to override this virtual function for the stream + member function readsome to work correctly (c.f. 27.6.1.3, alinea 30) + */ + /* Calls underflow() first; returns -1 when that reports eof, otherwise the number of characters left in the get area. */ + virtual std::streamsize showmanyc() { + int_type const failure = traits_type::eof(); + int_type status = underflow(); + if (status == failure) return -1; + return egptr() - gptr(); + } + + /// C.f. 
C++ standard section 27.5.2.4.3 + /* Refills the get area by calling the Python read() method with buffer_size as its argument. The returned object is kept in the member read_buffer and the get area is pointed at the character data extracted from it. Returns eof when zero bytes were obtained, otherwise the first character (not consumed). Throws std::invalid_argument when there is no read method or the data cannot be extracted as bytes. */ + virtual int_type underflow() { + int_type const failure = traits_type::eof(); + if (py_read.is_none()) { + throw std::invalid_argument( + "That Python file object has no 'read' attribute"); + } + read_buffer = py_read(buffer_size); + char *read_buffer_data; + pybind11::ssize_t py_n_read; + if (PYBIND11_BYTES_AS_STRING_AND_SIZE(read_buffer.ptr(), + &read_buffer_data, &py_n_read) == -1) { + setg(0, 0, 0); + throw std::invalid_argument( + "The method 'read' of the Python file object " + "did not return a string."); + } + off_type n_read = (off_type)py_n_read; + pos_of_read_buffer_end_in_py_file += n_read; + setg(read_buffer_data, read_buffer_data, read_buffer_data + n_read); + // ^^^27.5.2.3.1 (4) + if (n_read == 0) return failure; + return traits_type::to_int_type(read_buffer_data[0]); + } + + /// C.f. C++ standard section 27.5.2.4.5 + /* Passes the bytes from pbase() up to farthest_pptr to the Python write() method in one call, appending c first when c is not eof, then resets the put area so that it starts again at pbase(). write() is invoked even when there are zero bytes to send. Throws std::invalid_argument when there is no write method. Returns c, or not_eof(c) when c is eof. */ + virtual int_type overflow(int_type c=traits_type::eof()) { + if (py_write.is_none()) { + throw std::invalid_argument( + "That Python file object has no 'write' attribute"); + } + farthest_pptr = std::max(farthest_pptr, pptr()); + off_type n_written = (off_type)(farthest_pptr - pbase()); + if (!traits_type::eq_int_type(c, traits_type::eof())) { + // add the overflown character to the end of the buffer + // (we have one extra byte just for that) + write_buffer[n_written++] = traits_type::to_char_type(c); + } + pybind11::bytes chunk(pbase(), n_written); + py_write(chunk); + if (n_written) { + pos_of_write_buffer_end_in_py_file += n_written; + setp(pbase(), epptr()); + // ^^^ 27.5.2.4.5 (5) + farthest_pptr = pptr(); + } + return traits_type::eq_int_type( + c, traits_type::eof()) ? traits_type::not_eof(c) : c; + } + + /// Update the python file to reflect the state of this stream buffer + /** Empty the write buffer into the Python file object and set the seek + position of the latter accordingly (C++ standard section 27.5.2.4.2). 
+ If there is no write buffer or it is empty, but there is a non-empty + read buffer, set the Python file object seek position to the + seek position in that read buffer. + */ + /* Returns 0, or -1 when the flush through overflow() reports eof. Every seek is skipped when py_seek is None. */ + virtual int sync() { + int result = 0; + farthest_pptr = std::max(farthest_pptr, pptr()); + if (farthest_pptr && farthest_pptr > pbase()) { + /* delta is <= 0 because farthest_pptr was just raised to at least pptr(): it is the distance from the farthest written byte back to the current put position. It must be taken before overflow() resets the put area. */ + off_type delta = pptr() - farthest_pptr; + int_type status = overflow(); + if (traits_type::eq_int_type(status, traits_type::eof())) result = -1; + /* second argument 1 is the whence value this class uses for std::ios_base::cur (see seekoff) */ + if (!py_seek.is_none()) py_seek(delta, 1); + } + else if (gptr() && gptr() < egptr()) { + /* gptr() - egptr() is negative: move the Python file back over the bytes that were read into the buffer but not consumed */ + if (!py_seek.is_none()) py_seek(gptr() - egptr(), 1); + } + return result; + } + + /// C.f. C++ standard section 27.5.2.4.2 + /** This implementation is optimised to look whether the position is within + the buffers, so as to avoid calling Python seek or tell. It is + important for many applications that the overhead of calling into Python + is avoided as much as possible (e.g. parsers which may do a lot of + backtracking) + */ + virtual + pos_type seekoff(off_type off, std::ios_base::seekdir way, + std::ios_base::openmode which= std::ios_base::in + | std::ios_base::out) + { + /* In practice, "which" is either std::ios_base::in or out + since we end up here because either seekp or seekg was called + on the stream using this buffer. That simplifies the code + in a few places. + */ + int const failure = off_type(-1); + + if (py_seek.is_none()) { + throw std::invalid_argument( + "That Python file object has no 'seek' attribute"); + } + + // we need the read buffer to contain something! 
+ if (which == std::ios_base::in && !gptr()) { + if (traits_type::eq_int_type(underflow(), traits_type::eof())) { + return failure; + } + } + + // compute the whence parameter for Python seek + int whence; + switch (way) { + case std::ios_base::beg: + whence = 0; + break; + case std::ios_base::cur: + whence = 1; + break; + case std::ios_base::end: + whence = 2; + break; + default: + return failure; + } + + // Let's have a go + off_type result; + if (!seekoff_without_calling_python(off, way, which, result)) { + // we need to call Python + if (which == std::ios_base::out) overflow(); + if (way == std::ios_base::cur) { + /* A relative offset is given from the stream position, so it is corrected by the distance between that position and the buffer pointers before being handed to Python. */ + if (which == std::ios_base::in) off -= egptr() - gptr(); + else if (which == std::ios_base::out) off += pptr() - pbase(); + } + py_seek(off, whence); + /* NOTE(review): py_tell is called without a None check; an object that exposes seek() but has no tell attribute would fail at this call - confirm whether that case needs handling. */ + result = off_type(py_tell().cast()); + if (which == std::ios_base::in) underflow(); + } + return result; + } + + /// C.f. C++ standard section 27.5.2.4.2 + /* Absolute positioning: forwards to seekoff() with std::ios_base::beg. */ + virtual + pos_type seekpos(pos_type sp, + std::ios_base::openmode which= std::ios_base::in + | std::ios_base::out) + { + return streambuf::seekoff(sp, std::ios_base::beg, which); + } + + private: + pybind11::object py_read, py_write, py_seek, py_tell; + + std::size_t buffer_size; + + /* This is actually a Python bytes object and the actual read buffer is + its internal data, i.e. an array of characters. + */ + pybind11::bytes read_buffer; + + /* A mere array of char's allocated on the heap at construction time and + de-allocated only at destruction time. 
+ */ + char *write_buffer; + + off_type pos_of_read_buffer_end_in_py_file, + pos_of_write_buffer_end_in_py_file; + + // the farthest place the buffer has been written into + char *farthest_pptr; + + + /* Tries to satisfy a seek by only moving the get or put pointer inside the current buffer. Returns true and stores the resulting file position in result when the target lies inside the buffer. Returns false when Python's seek has to be used instead (target outside the buffer, or way == end). Throws std::runtime_error when which is not exactly in or exactly out, or when way is none of beg/cur/end. */ + bool seekoff_without_calling_python( + off_type off, + std::ios_base::seekdir way, + std::ios_base::openmode which, + off_type & result) + { + // Buffer range and current position + off_type buf_begin, buf_end, buf_cur, upper_bound; + off_type pos_of_buffer_end_in_py_file; + if (which == std::ios_base::in) { + pos_of_buffer_end_in_py_file = pos_of_read_buffer_end_in_py_file; + buf_begin = reinterpret_cast(eback()); + buf_cur = reinterpret_cast(gptr()); + buf_end = reinterpret_cast(egptr()); + upper_bound = buf_end; + } + else if (which == std::ios_base::out) { + pos_of_buffer_end_in_py_file = pos_of_write_buffer_end_in_py_file; + buf_begin = reinterpret_cast(pbase()); + buf_cur = reinterpret_cast(pptr()); + buf_end = reinterpret_cast(epptr()); + farthest_pptr = std::max(farthest_pptr, pptr()); + upper_bound = reinterpret_cast(farthest_pptr) + 1; + } + else { + /* Reached e.g. with which == in|out, the default argument of seekoff. The exception must actually be thrown: falling through would read the uninitialised buf_* variables below. */ + throw std::runtime_error( + "Control flow passes through branch that should be unreachable."); + } + + // Sought position in "buffer coordinate" + off_type buf_sought; + if (way == std::ios_base::cur) { + buf_sought = buf_cur + off; + } + else if (way == std::ios_base::beg) { + buf_sought = buf_end + (off - pos_of_buffer_end_in_py_file); + } + else if (way == std::ios_base::end) { + return false; + } + else { + /* Same as above: without the throw, buf_sought would be used uninitialised. */ + throw std::runtime_error( + "Control flow passes through branch that should be unreachable."); + } + + // if the sought position is not in the buffer, give up + if (buf_sought < buf_begin || buf_sought >= upper_bound) return false; + + // we are in wonderland + if (which == std::ios_base::in) gbump(buf_sought - buf_cur); + else if (which == std::ios_base::out) pbump(buf_sought - buf_cur); + + result = pos_of_buffer_end_in_py_file + (buf_sought - buf_end); + return true; + } + + public: + + class istream : public 
std::istream + { + public: + istream(streambuf& buf) : std::istream(&buf) + { + /* report badbit and failbit as exceptions */ + exceptions(std::ios_base::badbit | std::ios_base::failbit); + } + + /* NOTE(review): sync() runs inside a destructor while exceptions are enabled for badbit/failbit - confirm it cannot throw here. */ + ~istream() { if (this->good()) this->sync(); } + }; + + class ostream : public std::ostream + { + public: + ostream(streambuf& buf) : std::ostream(&buf) + { + /* report badbit and failbit as exceptions */ + exceptions(std::ios_base::badbit | std::ios_base::failbit); + } + + /* NOTE(review): flush() runs inside a destructor while exceptions are enabled for badbit/failbit - confirm it cannot throw here. */ + ~ostream() { if (this->good()) this->flush(); } + }; +}; + +/* Holder for a streambuf built from a Python file object. The stream structs below list it as their first base so that python_streambuf is initialised before the stream base that is handed a reference to it. */ +struct streambuf_capsule +{ + streambuf python_streambuf; + + streambuf_capsule( + pybind11::object& python_file_obj, + std::size_t buffer_size=0) + : + python_streambuf(python_file_obj, buffer_size) + {} +}; + +/* Output stream that owns its streambuf (through streambuf_capsule) instead of borrowing one. The destructor flushes while the stream is still good(). */ +struct ostream : private streambuf_capsule, streambuf::ostream +{ + ostream( + pybind11::object& python_file_obj, + std::size_t buffer_size=0) + : + streambuf_capsule(python_file_obj, buffer_size), + streambuf::ostream(python_streambuf) + {} + + ~ostream() + { + if (this->good()){ + this->flush(); + } + } +}; + +/* Input stream that owns its streambuf (through streambuf_capsule) instead of borrowing one. The destructor calls sync() while the stream is still good(). */ +struct istream : private streambuf_capsule, streambuf::istream +{ + istream( + pybind11::object& python_file_obj, + std::size_t buffer_size=0) + : + streambuf_capsule(python_file_obj, buffer_size), + streambuf::istream(python_streambuf) + {} + + ~istream() + { + if (this->good()) { + this->sync(); + } + } +}; + +} // namespace pystreambuf diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index cbd628b03e13f9c792e2b9dd90b5573ea3ea4568..2b2389e2c02986a7e4c13751b844c19c1ef2b17d 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -26,7 +26,7 @@ #include #include #include -#include +#include namespace TNL { namespace Benchmarks { @@ -55,7 +55,7 @@ struct BenchmarkResult elements << time << stddev << stddev / time << bandwidth; if( speedup != 0 ) elements << speedup; - else + else elements << "N/A"; return elements; } @@ -356,9 +356,7 @@ inline Benchmark::MetadataMap getHardwareMetadata() { "system release", SystemInfo::getSystemRelease() }, { "start time", 
SystemInfo::getCurrentTime() }, #ifdef HAVE_MPI - { "number of MPI processes", convertToString( (Communicators::MpiCommunicator::IsInitialized()) - ? Communicators::MpiCommunicator::GetSize( Communicators::MpiCommunicator::AllGroup ) - : 1 ) }, + { "number of MPI processes", convertToString( TNL::MPI::GetSize() ) }, #endif { "OpenMP enabled", convertToString( Devices::Host::isOMPEnabled() ) }, { "OpenMP threads", convertToString( Devices::Host::getMaxThreadsCount() ) }, diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h index 683e6960ad2f4c9e93adffa95f558b17edbf64aa..e8b5c9de15692c0de5a0ad30cc8b3762a05f76ef 100644 --- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h +++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h @@ -19,9 +19,8 @@ #include #include #include -#include -#include -#include +#include +#include #include #include #include @@ -39,12 +38,6 @@ using SegmentsType = TNL::Algorithms::Segments::SlicedEllpack< _Device, _Index, using namespace TNL; using namespace TNL::Benchmarks; -#ifdef HAVE_MPI -using CommunicatorType = Communicators::MpiCommunicator; -#else -using CommunicatorType = Communicators::NoDistrCommunicator; -#endif - template< typename Matrix, typename Vector > void @@ -115,7 +108,7 @@ benchmarkDistributedSpmv( Benchmark& benchmark, // benchmark function auto compute = [&]() { matrix.vectorProduct( x, y ); - Matrix::CommunicatorType::Barrier( matrix.getCommunicationGroup() ); + TNL::MPI::Barrier( matrix.getCommunicationGroup() ); }; benchmark.time< typename Matrix::DeviceType >( reset, performer, compute ); @@ -155,9 +148,9 @@ struct SpmvBenchmark using IndexType = typename MatrixType::IndexType; using VectorType = Containers::Vector< RealType, DeviceType, IndexType >; - using Partitioner = Containers::Partitioner< IndexType, CommunicatorType >; - using DistributedMatrix = Matrices::DistributedMatrix< MatrixType, CommunicatorType >; - using 
DistributedVector = Containers::DistributedVector< RealType, DeviceType, IndexType, CommunicatorType >; + using Partitioner = Containers::Partitioner< IndexType >; + using DistributedMatrix = Matrices::DistributedMatrix< MatrixType >; + using DistributedVector = Containers::DistributedVector< RealType, DeviceType, IndexType >; using DistributedRowLengths = typename DistributedMatrix::CompressedRowLengthsVector; static bool @@ -174,7 +167,7 @@ struct SpmvBenchmark matrix.getCompressedRowLengths( rowLengths ); const IndexType maxRowLength = max( rowLengths ); - const String name = String( (CommunicatorType::isDistributed()) ? "DistSpMV" : "SpMV" ) + const String name = String( (TNL::MPI::GetSize() > 1) ? "DistSpMV" : "SpMV" ) + " (" + parameters.getParameter< String >( "name" ) + "): "; benchmark.newBenchmark( name, metadata ); benchmark.setMetadataColumns( Benchmark::MetadataColumns({ @@ -194,13 +187,13 @@ struct SpmvBenchmark getTrivialOrdering( matrix, perm, iperm ); MatrixType matrix_perm; Matrices::reorderSparseMatrix( matrix, matrix_perm, perm, iperm ); - if( CommunicatorType::isDistributed() ) + if( TNL::MPI::GetSize() > 1 ) runDistributed( benchmark, metadata, parameters, matrix_perm, vector ); else runNonDistributed( benchmark, metadata, parameters, matrix_perm, vector ); } else { - if( CommunicatorType::isDistributed() ) + if( TNL::MPI::GetSize() > 1 ) runDistributed( benchmark, metadata, parameters, matrix, vector ); else runNonDistributed( benchmark, metadata, parameters, matrix, vector ); @@ -230,13 +223,13 @@ struct SpmvBenchmark VectorType& vector ) { // set up the distributed matrix - const auto group = CommunicatorType::AllGroup; + const auto group = TNL::MPI::AllGroup(); const auto localRange = Partitioner::splitRange( matrix.getRows(), group ); DistributedMatrix distributedMatrix( localRange, matrix.getRows(), matrix.getColumns(), group ); - DistributedVector distributedVector( localRange, matrix.getRows(), group ); + DistributedVector 
distributedVector( localRange, 0, matrix.getRows(), group ); // copy the row lengths from the global matrix to the distributed matrix - DistributedRowLengths distributedRowLengths( localRange, matrix.getRows(), group ); + DistributedRowLengths distributedRowLengths( localRange, 0, matrix.getRows(), group ); for( IndexType i = 0; i < distributedMatrix.getLocalMatrix().getRows(); i++ ) { const auto gi = distributedMatrix.getLocalRowRange().getGlobalIndex( i ); distributedRowLengths[ gi ] = matrix.getRowCapacity( gi ); @@ -272,8 +265,8 @@ struct SpmvBenchmark DistributedVector distributedY; distributedY.setLike( distributedVector ); distributedMatrix.vectorProduct( distributedVector, distributedY ); - const int rank = CommunicatorType::GetRank( distributedMatrix.getCommunicationGroup() ); - const int nproc = CommunicatorType::GetSize( distributedMatrix.getCommunicationGroup() ); + const int rank = TNL::MPI::GetRank( distributedMatrix.getCommunicationGroup() ); + const int nproc = TNL::MPI::GetSize( distributedMatrix.getCommunicationGroup() ); typename VectorType::ViewType subY( &y[ Partitioner::getOffset( matrix.getRows(), rank, nproc ) ], Partitioner::getSizeForRank( matrix.getRows(), rank, nproc ) ); TNL_ASSERT_EQ( distributedY.getLocalView(), subY, "WRONG RESULT !!!" ); @@ -299,7 +292,7 @@ configSetup( Config::ConfigDescription & config ) config.addDelimiter( "Device settings:" ); Devices::Host::configSetup( config ); Devices::Cuda::configSetup( config ); - CommunicatorType::configSetup( config ); + TNL::MPI::configSetup( config ); } int @@ -314,15 +307,15 @@ main( int argc, char* argv[] ) configSetup( conf_desc ); - Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv); - const int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup ); + TNL::MPI::ScopedInitializer mpi(argc, argv); + const int rank = TNL::MPI::GetRank(); if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) return EXIT_FAILURE; if( ! 
Devices::Host::setup( parameters ) || ! Devices::Cuda::setup( parameters ) || - ! CommunicatorType::setup( parameters ) ) + ! TNL::MPI::setup( parameters ) ) return EXIT_FAILURE; const String & logFileName = parameters.getParameter< String >( "log-file" ); diff --git a/src/Benchmarks/LinearSolvers/benchmarks.h b/src/Benchmarks/LinearSolvers/benchmarks.h index a4c04578d5553f2f039a8e7fc575de0ad116c48d..c10c996e36c39448c632e33ba78f8153b5858c17 100644 --- a/src/Benchmarks/LinearSolvers/benchmarks.h +++ b/src/Benchmarks/LinearSolvers/benchmarks.h @@ -33,10 +33,10 @@ void barrier( const Matrix& matrix ) { } -template< typename Matrix, typename Communicator > -void barrier( const Matrices::DistributedMatrix< Matrix, Communicator >& matrix ) +template< typename Matrix > +void barrier( const Matrices::DistributedMatrix< Matrix >& matrix ) { - Communicator::Barrier( matrix.getCommunicationGroup() ); + TNL::MPI::Barrier( matrix.getCommunicationGroup() ); } template< typename Device > diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h index e5a8d9819aa7e3c8fb31eecd62ba4932b6c1c731..3acfb2438c33539594cb3de6aa8f4cc429d21b06 100644 --- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h +++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h @@ -24,9 +24,8 @@ #include #include #include -#include -#include -#include +#include +#include #include #include #include @@ -66,12 +65,6 @@ using namespace TNL; using namespace TNL::Benchmarks; using namespace TNL::Pointers; -#ifdef HAVE_MPI -using CommunicatorType = Communicators::MpiCommunicator; -#else -using CommunicatorType = Communicators::NoDistrCommunicator; -#endif - static const std::set< std::string > valid_solvers = { "gmres", @@ -338,9 +331,9 @@ struct LinearSolversBenchmark using IndexType = typename MatrixType::IndexType; using VectorType = Containers::Vector< RealType, DeviceType, IndexType >; - using Partitioner = 
Containers::Partitioner< IndexType, CommunicatorType >; - using DistributedMatrix = Matrices::DistributedMatrix< MatrixType, CommunicatorType >; - using DistributedVector = Containers::DistributedVector< RealType, DeviceType, IndexType, CommunicatorType >; + using Partitioner = Containers::Partitioner< IndexType >; + using DistributedMatrix = Matrices::DistributedMatrix< MatrixType >; + using DistributedVector = Containers::DistributedVector< RealType, DeviceType, IndexType >; using DistributedRowLengths = typename DistributedMatrix::CompressedRowLengthsVector; static bool @@ -388,7 +381,7 @@ struct LinearSolversBenchmark matrixPointer->getCompressedRowLengths( rowLengths ); const IndexType maxRowLength = max( rowLengths ); - const String name = String( (CommunicatorType::isDistributed()) ? "Distributed linear solvers" : "Linear solvers" ) + const String name = String( (TNL::MPI::GetSize() > 1) ? "Distributed linear solvers" : "Linear solvers" ) + " (" + parameters.getParameter< String >( "name" ) + "): "; benchmark.newBenchmark( name, metadata ); benchmark.setMetadataColumns( Benchmark::MetadataColumns({ @@ -413,13 +406,13 @@ struct LinearSolversBenchmark Matrices::reorderSparseMatrix( *matrixPointer, *matrix_perm, perm, iperm ); Matrices::reorderArray( x0, x0_perm, perm ); Matrices::reorderArray( b, b_perm, perm ); - if( CommunicatorType::isDistributed() ) + if( TNL::MPI::GetSize() > 1 ) runDistributed( benchmark, metadata, parameters, matrix_perm, x0_perm, b_perm ); else runNonDistributed( benchmark, metadata, parameters, matrix_perm, x0_perm, b_perm ); } else { - if( CommunicatorType::isDistributed() ) + if( TNL::MPI::GetSize() > 1 ) runDistributed( benchmark, metadata, parameters, matrixPointer, x0, b ); else runNonDistributed( benchmark, metadata, parameters, matrixPointer, x0, b ); @@ -437,14 +430,14 @@ struct LinearSolversBenchmark const VectorType& b ) { // set up the distributed matrix - const auto group = CommunicatorType::AllGroup; + const auto group = 
TNL::MPI::AllGroup(); const auto localRange = Partitioner::splitRange( matrixPointer->getRows(), group ); SharedPointer< DistributedMatrix > distMatrixPointer( localRange, matrixPointer->getRows(), matrixPointer->getColumns(), group ); - DistributedVector dist_x0( localRange, matrixPointer->getRows(), group ); - DistributedVector dist_b( localRange, matrixPointer->getRows(), group ); + DistributedVector dist_x0( localRange, 0, matrixPointer->getRows(), group ); + DistributedVector dist_b( localRange, 0, matrixPointer->getRows(), group ); // copy the row capacities from the global matrix to the distributed matrix - DistributedRowLengths distributedRowLengths( localRange, matrixPointer->getRows(), group ); + DistributedRowLengths distributedRowLengths( localRange, 0, matrixPointer->getRows(), group ); for( IndexType i = 0; i < distMatrixPointer->getLocalMatrix().getRows(); i++ ) { const auto gi = distMatrixPointer->getLocalRowRange().getGlobalIndex( i ); distributedRowLengths[ gi ] = matrixPointer->getRowCapacity( gi ); @@ -572,7 +565,7 @@ configSetup( Config::ConfigDescription& config ) config.addDelimiter( "Device settings:" ); Devices::Host::configSetup( config ); Devices::Cuda::configSetup( config ); - CommunicatorType::configSetup( config ); + TNL::MPI::configSetup( config ); config.addDelimiter( "Linear solver settings:" ); Solvers::IterativeSolver< double, int >::configSetup( config ); @@ -597,14 +590,14 @@ main( int argc, char* argv[] ) configSetup( conf_desc ); - Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv); - const int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup ); + TNL::MPI::ScopedInitializer mpi(argc, argv); + const int rank = TNL::MPI::GetRank(); if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) return EXIT_FAILURE; if( ! Devices::Host::setup( parameters ) || ! Devices::Cuda::setup( parameters ) || - ! CommunicatorType::setup( parameters ) ) + ! 
TNL::MPI::setup( parameters ) ) return EXIT_FAILURE; const String & logFileName = parameters.getParameter< String >( "log-file" ); diff --git a/src/Benchmarks/ODESolvers/Euler.hpp b/src/Benchmarks/ODESolvers/Euler.hpp index ab975ed078c470f4824d18e7848033e6fed73f2c..fcc8654bec5bbef012b2753843576a6889f2237c 100644 --- a/src/Benchmarks/ODESolvers/Euler.hpp +++ b/src/Benchmarks/ODESolvers/Euler.hpp @@ -10,8 +10,6 @@ #pragma once -#include -#include #include "ComputeBlockResidue.h" namespace TNL { @@ -202,7 +200,7 @@ void Euler< Problem, SolverMonitor >::computeNewTimeLevel( DofVectorPointer& u, } localResidue /= tau * ( RealType ) size; - Problem::CommunicatorType::Allreduce( &localResidue, &currentResidue, 1, MPI_SUM, Problem::CommunicatorType::AllGroup ); + TNL::MPI::Allreduce( &localResidue, &currentResidue, 1, MPI_SUM, TNL::MPI::AllGroup() ); //std::cerr << "Local residue = " << localResidue << " - globalResidue = " << currentResidue << std::endl; } diff --git a/src/Benchmarks/ODESolvers/Merson.hpp b/src/Benchmarks/ODESolvers/Merson.hpp index c97bfc236b8db8f321874c92f23fe11ca9771e08..b45faa1b41d18cbc45eb9307587bfdfeb0c80c74 100644 --- a/src/Benchmarks/ODESolvers/Merson.hpp +++ b/src/Benchmarks/ODESolvers/Merson.hpp @@ -13,8 +13,6 @@ #include #include #include -#include -#include #include "Merson.h" @@ -187,13 +185,13 @@ bool Merson< Problem, SolverMonitor >::solve( DofVectorPointer& u ) time += currentTau; computeNewTimeLevel( time, currentTau, u, newResidue ); this->setResidue( newResidue ); - + /**** * When time is close to stopTime the new residue * may be inaccurate significantly. */ if( abs( time - this->stopTime ) < 1.0e-7 ) this->setResidue( lastResidue ); - + if( ! 
this->nextIteration() ) return false; @@ -209,7 +207,7 @@ bool Merson< Problem, SolverMonitor >::solve( DofVectorPointer& u ) currentTau = min( currentTau, this->getMaxTau() ); #ifdef USE_MPI TNLMPI::Bcast( currentTau, 1, 0 ); -#endif +#endif } if( time + currentTau > this->getStopTime() ) currentTau = this->getStopTime() - time; //we don't want to keep such tau @@ -405,7 +403,7 @@ typename Problem :: RealType Merson< Problem, SolverMonitor >::computeError( con } #endif } - Problem::CommunicatorType::Allreduce( &eps, &maxEps, 1, MPI_MAX, Problem::CommunicatorType::AllGroup ); + TNL::MPI::Allreduce( &eps, &maxEps, 1, MPI_MAX, TNL::MPI::AllGroup() ); return maxEps; } @@ -467,7 +465,7 @@ void Merson< Problem, SolverMonitor >::computeNewTimeLevel( const RealType time, } localResidue /= tau * ( RealType ) size; - Problem::CommunicatorType::Allreduce( &localResidue, &currentResidue, 1, MPI_SUM, Problem::CommunicatorType::AllGroup); + TNL::MPI::Allreduce( &localResidue, &currentResidue, 1, MPI_SUM, TNL::MPI::AllGroup() ); /*#ifdef USE_MPI TNLMPI::Allreduce( localResidue, currentResidue, 1, MPI_SUM); #else diff --git a/src/Benchmarks/ODESolvers/SimpleProblem.h b/src/Benchmarks/ODESolvers/SimpleProblem.h index ff81fd18e4576672a89f35f54ff37eeed4ba9d86..65f769dda7b41157671bda431c4b1454e0934167 100644 --- a/src/Benchmarks/ODESolvers/SimpleProblem.h +++ b/src/Benchmarks/ODESolvers/SimpleProblem.h @@ -17,7 +17,7 @@ namespace TNL { namespace Benchmarks { - + template< typename Real = double, typename Device = Devices::Host, typename Index = int > @@ -27,8 +27,7 @@ struct SimpleProblem using DeviceType = Device; using IndexType = Index; using DofVectorType = Containers::Vector< RealType, DeviceType, IndexType >; - using CommunicatorType = Communicators::NoDistrCommunicator; - + template< typename VectorPointer > void getExplicitUpdate( const RealType& time, const RealType& tau, @@ -45,10 +44,10 @@ struct SimpleProblem }; Algorithms::ParallelFor< DeviceType >::exec( ( IndexType ) 0, 
u.getSize(), computeF, u, fu ); } - + template< typename Vector > void applyBoundaryConditions( const RealType& t, Vector& u ) {}; - + }; } // namespace Benchmarks diff --git a/src/Benchmarks/ODESolvers/benchmarks.h b/src/Benchmarks/ODESolvers/benchmarks.h index 60b5336639c9289bb39c8240d48d2b3121487210..a6ee67a624a01443eeabb12d540fc7d6cecb58d8 100644 --- a/src/Benchmarks/ODESolvers/benchmarks.h +++ b/src/Benchmarks/ODESolvers/benchmarks.h @@ -16,8 +16,6 @@ #include #include "../Benchmarks.h" -#include "SimpleProblem.h" - #include // std::runtime_error @@ -35,31 +33,6 @@ getPerformer() return "CPU"; } -/*template< typename Matrix > -void barrier( const Matrix& matrix ) -{ -} - -template< typename Matrix, typename Communicator > -void barrier( const Matrices::DistributedMatrix< Matrix, Communicator >& matrix ) -{ - Communicator::Barrier( matrix.getCommunicationGroup() ); -}*/ - -template< typename Device > -bool checkDevice( const Config::ParameterContainer& parameters ) -{ - const String device = parameters.getParameter< String >( "device" ); - if( device == "all" ) - return true; - if( std::is_same< Device, Devices::Host >::value && device == "host" ) - return true; - if( std::is_same< Device, Devices::Cuda >::value && device == "cuda" ) - return true; - return false; -} - - template< typename Solver, typename VectorPointer > void benchmarkSolver( Benchmark& benchmark, @@ -90,7 +63,7 @@ benchmarkSolver( Benchmark& benchmark, auto compute = [&]() { solver.solve( u ); }; - + // subclass BenchmarkResult to add extra columns to the benchmark // (iterations, preconditioned residue, true residue) /*struct MyBenchmarkResult : public BenchmarkResult diff --git a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h index bbde8894518baaecf0b89d3ded7667db422de73d..0d8d3c04e6fdc2ba4c00ccbca254a737e432af53 100644 --- a/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h +++ 
b/src/Benchmarks/ODESolvers/tnl-benchmark-ode-solvers.h @@ -23,9 +23,8 @@ #include #include #include -#include -#include -#include +#include +#include #include #include @@ -39,12 +38,6 @@ using namespace TNL; using namespace TNL::Benchmarks; using namespace TNL::Pointers; -#ifdef HAVE_MPI -using CommunicatorType = Communicators::MpiCommunicator; -#else -using CommunicatorType = Communicators::NoDistrCommunicator; -#endif - template< typename Real, typename Index > void @@ -68,7 +61,7 @@ benchmarkODESolvers( Benchmark& benchmark, #ifdef HAVE_CUDA CudaVectorPointer cuda_u( dofs ); *cuda_u = 0.0; -#endif +#endif if( solver == "euler" || solver == "all" ) { using HostSolver = Solvers::ODE::Euler< HostProblem, SolverMonitorType >; benchmark.setOperation("Euler"); @@ -118,7 +111,7 @@ struct ODESolversBenchmark Benchmark::MetadataMap metadata, const Config::ParameterContainer& parameters ) { - const String name = String( (CommunicatorType::isDistributed()) ? "Distributed ODE solvers" : "ODE solvers" ); + const String name = String( (TNL::MPI::GetSize() > 1) ? 
"Distributed ODE solvers" : "ODE solvers" ); //+ " (" + parameters.getParameter< String >( "name" ) + "): "; benchmark.newBenchmark( name, metadata ); for( size_t dofs = 25; dofs <= 10000000; dofs *= 2 ) { @@ -127,7 +120,7 @@ struct ODESolversBenchmark { "DOFs", convertToString( dofs ) }, } )); - if( CommunicatorType::isDistributed() ) + if( TNL::MPI::GetSize() > 1 ) runDistributed( benchmark, metadata, parameters, dofs ); else runNonDistributed( benchmark, metadata, parameters, dofs ); @@ -141,7 +134,7 @@ struct ODESolversBenchmark const Config::ParameterContainer& parameters, size_t dofs ) { - //const auto group = CommunicatorType::AllGroup; + //const auto group = TNL::MPI::AllGroup(); std::cout << "Iterative solvers:" << std::endl; benchmarkODESolvers< Real, Index >( benchmark, parameters, dofs ); @@ -173,10 +166,10 @@ bool resolveRealTypes( Benchmark& benchmark, Config::ParameterContainer& parameters ) { const String& realType = parameters.getParameter< String >( "real-type" ); - if( ( realType == "float" || realType == "all" ) && + if( ( realType == "float" || realType == "all" ) && ! resolveIndexType< float >( benchmark, metadata, parameters ) ) return false; - if( ( realType == "double" || realType == "all" ) && + if( ( realType == "double" || realType == "all" ) && ! 
resolveIndexType< double >( benchmark, metadata, parameters ) ) return false; return true; @@ -209,7 +202,7 @@ configSetup( Config::ConfigDescription& config ) config.addDelimiter( "Device settings:" ); Devices::Host::configSetup( config ); Devices::Cuda::configSetup( config ); - CommunicatorType::configSetup( config ); + TNL::MPI::configSetup( config ); config.addDelimiter( "ODE solver settings:" ); Solvers::IterativeSolver< double, int >::configSetup( config ); @@ -230,14 +223,14 @@ main( int argc, char* argv[] ) configSetup( conf_desc ); - Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv); - const int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup ); + TNL::MPI::ScopedInitializer mpi(argc, argv); + const int rank = TNL::MPI::GetRank(); if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) return EXIT_FAILURE; if( ! Devices::Host::setup( parameters ) || ! Devices::Cuda::setup( parameters ) || - ! CommunicatorType::setup( parameters ) ) + ! TNL::MPI::setup( parameters ) ) return EXIT_FAILURE; const String & logFileName = parameters.getParameter< String >( "log-file" ); diff --git a/src/Python/pytnl/CMakeLists.txt b/src/Python/pytnl/CMakeLists.txt index 2065b0a13b8a9bd42da4deabafa9ba2b371fc756..15b8e6b0a4e56b1e663af2c27d7de9f8480fc275 100644 --- a/src/Python/pytnl/CMakeLists.txt +++ b/src/Python/pytnl/CMakeLists.txt @@ -1,4 +1,7 @@ add_subdirectory( tnl ) +if( BUILD_MPI ) + add_subdirectory( tnl_mpi ) +endif() install( DIRECTORY . 
DESTINATION "include/pytnl" MESSAGE_NEVER diff --git a/src/Python/pytnl/iostream_caster.h b/src/Python/pytnl/iostream_caster.h new file mode 100644 index 0000000000000000000000000000000000000000..38f5d4e16c0c33f87028bebbd5b5e56548670c7d --- /dev/null +++ b/src/Python/pytnl/iostream_caster.h @@ -0,0 +1,59 @@ +#pragma once + +#include + +namespace pybind11 { namespace detail { + template <> struct type_caster { + public: + bool load(handle src, bool) { + if (getattr(src, "read", none()).is_none()){ + return false; + } + + obj = reinterpret_borrow(src); + value = std::unique_ptr(new pystreambuf::istream(obj, 0)); + + return true; + } + + protected: + object obj; + std::unique_ptr value; + + public: + static constexpr auto name = _("istream"); + static handle cast(const std::istream *src, return_value_policy policy, handle parent) { + return none().release(); + } + operator std::istream*() { return value.get(); } + operator std::istream&() { return *value; } + template using cast_op_type = pybind11::detail::cast_op_type<_T>; + }; + + template <> struct type_caster { + public: + bool load(handle src, bool) { + if (getattr(src, "write", none()).is_none()){ + return false; + } + + obj = reinterpret_borrow(src); + value = std::unique_ptr(new pystreambuf::ostream(obj, 0)); + + return true; + } + + protected: + object obj; + std::unique_ptr value; + + public: + static constexpr auto name = _("ostream"); + static handle cast(const std::ostream *src, return_value_policy policy, handle parent) { + return none().release(); + } + operator std::ostream*() { return value.get(); } + operator std::ostream&() { return *value; } + template using cast_op_type = pybind11::detail::cast_op_type<_T>; + }; +}} // namespace pybind11::detail diff --git a/src/Python/pytnl/tnl/CMakeLists.txt b/src/Python/pytnl/tnl/CMakeLists.txt index c7fcd80e24480e3fd8eb21cc8d807dc900ea1d66..dc1c3fcc34f5be21f9f9aba34b567edcf5c06ee8 100644 --- a/src/Python/pytnl/tnl/CMakeLists.txt +++ 
b/src/Python/pytnl/tnl/CMakeLists.txt @@ -6,15 +6,23 @@ set( sources Grid2D.cpp Grid3D.cpp Mesh.cpp + MeshReaders.cpp + MeshWriters.cpp Object.cpp SparseMatrix.cpp String.cpp + VTKTraits.cpp tnl.cpp ) pybind11_add_module( pytnl ${sources} ) # rename the shared library to tnl.cpython-XXm-x86_64-linux-gnu.so -set_target_properties( pytnl PROPERTIES LIBRARY_OUTPUT_NAME tnl ) +set_target_properties( pytnl PROPERTIES LIBRARY_OUTPUT_NAME tnl DEBUG_POSTFIX "_dbg" ) + +# indicate the postfix to the target so that the pybind11 module name can be set accordingly +if( CMAKE_BUILD_TYPE STREQUAL "Debug") + target_compile_options( pytnl PRIVATE -DPYTNL_MODULE_POSTFIX=_dbg ) +endif() # Skip -march=native -mtune=native for pytnl - optimizing python bindings for # a specific architecture is not very useful and prevents using Python tools on diff --git a/src/Python/pytnl/tnl/EntityTypes.h b/src/Python/pytnl/tnl/EntityTypes.h deleted file mode 100644 index 1f10e2827dd2cc24d006c88f509e6b8d5a5cbf90..0000000000000000000000000000000000000000 --- a/src/Python/pytnl/tnl/EntityTypes.h +++ /dev/null @@ -1,36 +0,0 @@ -#pragma once - -#include -namespace py = pybind11; - -enum class EntityTypes { Cell, Face, Vertex }; - -inline void -export_EntityTypes( py::module & m ) -{ - // avoid duplicate conversion -> export only once - static bool exported = false; - if( ! 
exported ) { - // TODO: should be nested types instead - py::enum_< EntityTypes >( m, "EntityTypes" ) - .value("Cell", EntityTypes::Cell) - .value("Face", EntityTypes::Face) - .value("Vertex", EntityTypes::Vertex) - ; - exported = true; - } -} - -template< typename Mesh > -typename Mesh::GlobalIndexType -mesh_getEntitiesCount( const Mesh & self, const EntityTypes & entity ) -{ - if( entity == EntityTypes::Cell ) - return self.template getEntitiesCount< typename Mesh::Cell >(); - else if( entity == EntityTypes::Face ) - return self.template getEntitiesCount< typename Mesh::Face >(); - else if( entity == EntityTypes::Vertex ) - return self.template getEntitiesCount< typename Mesh::Vertex >(); - else - throw py::value_error("The entity parameter must be either Cell, Face or Vertex."); -} diff --git a/src/Python/pytnl/tnl/Grid.h b/src/Python/pytnl/tnl/Grid.h index 8cf28a8f5bd393dfda5bfc01b6547c77ad66ba91..2622bd5c93dc07a5d67bfca2dde02ef72fdca6c7 100644 --- a/src/Python/pytnl/tnl/Grid.h +++ b/src/Python/pytnl/tnl/Grid.h @@ -5,7 +5,7 @@ namespace py = pybind11; #include "StaticVector.h" #include "Grid_getSpaceStepsProducts.h" -#include "EntityTypes.h" +#include "mesh_getters.h" #include @@ -54,8 +54,6 @@ void export_Grid( py::module & m, const char* name ) // void (Grid::* _setDimensions1)(const IndexType) = &Grid::setDimensions; void (Grid::* _setDimensions2)(const typename Grid::CoordinatesType &) = &Grid::setDimensions; - export_EntityTypes(m); - auto grid = py::class_( m, name ) .def(py::init<>()) .def_static("getMeshDimension", &Grid::getMeshDimension) @@ -68,11 +66,13 @@ void export_Grid( py::module & m, const char* name ) .def("setDomain", &Grid::setDomain) .def("getOrigin", &Grid::getOrigin, py::return_value_policy::reference_internal) .def("getProportions", &Grid::getProportions, py::return_value_policy::reference_internal) - .def("getEntitiesCount", &mesh_getEntitiesCount< Grid >) - // TODO: if combined, the return type would depend on the runtime parameter 
(entity) - .def("getEntity_cell", &Grid::template getEntity) - .def("getEntity_face", &Grid::template getEntity) - .def("getEntity_vertex", &Grid::template getEntity) + .def("getEntitiesCount", &mesh_getEntitiesCount< Grid, typename Grid::Cell >) + .def("getEntitiesCount", &mesh_getEntitiesCount< Grid, typename Grid::Face >) + .def("getEntitiesCount", &mesh_getEntitiesCount< Grid, typename Grid::Vertex >) + // NOTE: if combined into getEntity, the return type would depend on the runtime parameter (entity) + .def("getCell", &Grid::template getEntity) + .def("getFace", &Grid::template getEntity) + .def("getVertex", &Grid::template getEntity) .def("getEntityIndex", &Grid::template getEntityIndex) .def("getEntityIndex", &Grid::template getEntityIndex) .def("getEntityIndex", &Grid::template getEntityIndex) diff --git a/src/Python/pytnl/tnl/Mesh.cpp b/src/Python/pytnl/tnl/Mesh.cpp index aa0c8c0355363aeeeb0e7123cae82da3ea005dbf..48e3f939b8b656a7ab023857917377f38b8a974b 100644 --- a/src/Python/pytnl/tnl/Mesh.cpp +++ b/src/Python/pytnl/tnl/Mesh.cpp @@ -2,35 +2,12 @@ #include "../tnl_conversions.h" #include "Mesh.h" -#include -#include - -template< typename Reader > -void export_reader( py::module & m, const char* name ) -{ - py::class_< Reader >( m, name ) - .def(py::init()) - .def("loadMesh", &Reader::template loadMesh< MeshOfEdges >) - .def("loadMesh", &Reader::template loadMesh< MeshOfTriangles >) - .def("loadMesh", &Reader::template loadMesh< MeshOfTetrahedrons >) -// .def("loadMesh", []( Reader& reader, const std::string& name, MeshOfEdges & mesh ) { -// return reader.loadMesh( name.c_str(), mesh ); -// } ) -// .def("loadMesh", []( Reader& reader, const std::string& name, MeshOfTriangles & mesh ) { -// return reader.loadMesh( name.c_str(), mesh ); -// } ) -// .def("loadMesh", []( Reader& reader, const std::string& name, MeshOfTetrahedrons & mesh ) { -// return reader.loadMesh( name.c_str(), mesh ); -// } ) - ; -} void export_Meshes( py::module & m ) { export_Mesh< 
MeshOfEdges >( m, "MeshOfEdges" ); export_Mesh< MeshOfTriangles >( m, "MeshOfTriangles" ); + export_Mesh< MeshOfQuadrangles >( m, "MeshOfQuadrangles" ); export_Mesh< MeshOfTetrahedrons >( m, "MeshOfTetrahedrons" ); - - export_reader< TNL::Meshes::Readers::VTKReader >( m, "VTKReader" ); - export_reader< TNL::Meshes::Readers::VTUReader >( m, "VTUReader" ); + export_Mesh< MeshOfHexahedrons >( m, "MeshOfHexahedrons" ); } diff --git a/src/Python/pytnl/tnl/Mesh.h b/src/Python/pytnl/tnl/Mesh.h index 21fa015fc94d967146cc2e8f046ac0f70fcbdb39..3097f111f528fb06aef62f9d46af611a33509b6d 100644 --- a/src/Python/pytnl/tnl/Mesh.h +++ b/src/Python/pytnl/tnl/Mesh.h @@ -5,7 +5,7 @@ namespace py = pybind11; #include "../typedefs.h" #include "StaticVector.h" -#include "EntityTypes.h" +#include "mesh_getters.h" #include #include @@ -82,8 +82,11 @@ template< typename MeshEntity, typename Scope > void export_MeshEntity( Scope & scope, const char* name ) { auto entity = py::class_< MeshEntity >( scope, name ) +// .def(py::init<>()) +// .def(py::init()) .def_static("getEntityDimension", &MeshEntity::getEntityDimension) .def("getIndex", &MeshEntity::getIndex) + .def("getTag", &MeshEntity::getTag) // TODO ; @@ -95,23 +98,24 @@ void export_MeshEntity( Scope & scope, const char* name ) template< typename Mesh > void export_Mesh( py::module & m, const char* name ) { - // there are two templates - const and non-const - take only the const - auto (Mesh::* getEntity_cell)(const typename Mesh::GlobalIndexType) const = &Mesh::template getEntity; - auto (Mesh::* getEntity_face)(const typename Mesh::GlobalIndexType) const = &Mesh::template getEntity; - auto (Mesh::* getEntity_vertex)(const typename Mesh::GlobalIndexType) const = &Mesh::template getEntity; - - export_EntityTypes(m); - auto mesh = py::class_< Mesh, TNL::Object >( m, name ) .def(py::init<>()) .def_static("getMeshDimension", &Mesh::getMeshDimension) .def_static("getSerializationType", &Mesh::getSerializationType) 
.def("getSerializationTypeVirtual", &Mesh::getSerializationTypeVirtual) - .def("getEntitiesCount", &mesh_getEntitiesCount< Mesh >) - // TODO: if combined, the return type would depend on the runtime parameter (entity) - .def("getEntity_cell", getEntity_cell) - .def("getEntity_face", getEntity_face) - .def("getEntity_vertex", getEntity_vertex) + .def("getEntitiesCount", &mesh_getEntitiesCount< Mesh, typename Mesh::Cell >) + .def("getEntitiesCount", &mesh_getEntitiesCount< Mesh, typename Mesh::Face >) + .def("getEntitiesCount", &mesh_getEntitiesCount< Mesh, typename Mesh::Vertex >) + .def("getGhostEntitiesCount", &mesh_getGhostEntitiesCount< Mesh, typename Mesh::Cell >) + .def("getGhostEntitiesCount", &mesh_getGhostEntitiesCount< Mesh, typename Mesh::Face >) + .def("getGhostEntitiesCount", &mesh_getGhostEntitiesCount< Mesh, typename Mesh::Vertex >) + .def("getGhostEntitiesOffset", &mesh_getGhostEntitiesOffset< Mesh, typename Mesh::Cell >) + .def("getGhostEntitiesOffset", &mesh_getGhostEntitiesOffset< Mesh, typename Mesh::Face >) + .def("getGhostEntitiesOffset", &mesh_getGhostEntitiesOffset< Mesh, typename Mesh::Vertex >) + // NOTE: if combined into getEntity, the return type would depend on the runtime parameter (entity) + .def("getCell", &Mesh::template getEntity) + .def("getFace", &Mesh::template getEntity) + .def("getVertex", &Mesh::template getEntity) .def("getEntityCenter", []( const Mesh& mesh, const typename Mesh::Cell& cell ){ return getEntityCenter( mesh, cell ); } ) .def("getEntityCenter", []( const Mesh& mesh, const typename Mesh::Face& face ){ return getEntityCenter( mesh, face ); } ) .def("getEntityCenter", []( const Mesh& mesh, const typename Mesh::Vertex& vertex ){ return getEntityCenter( mesh, vertex ); } ) @@ -124,6 +128,12 @@ void export_Mesh( py::module & m, const char* name ) return mesh.template isBoundaryEntity< Mesh::Face::getEntityDimension() >( face.getIndex() ); } ) .def("isBoundaryEntity", []( const Mesh& mesh, const typename Mesh::Vertex& 
vertex ){ return mesh.template isBoundaryEntity< Mesh::Vertex::getEntityDimension() >( vertex.getIndex() ); } ) + .def("isGhostEntity", []( const Mesh& mesh, const typename Mesh::Cell& cell ){ + return mesh.template isGhostEntity< Mesh::Cell::getEntityDimension() >( cell.getIndex() ); } ) + .def("isGhostEntity", []( const Mesh& mesh, const typename Mesh::Face& face ){ + return mesh.template isGhostEntity< Mesh::Face::getEntityDimension() >( face.getIndex() ); } ) + .def("isGhostEntity", []( const Mesh& mesh, const typename Mesh::Vertex& vertex ){ + return mesh.template isGhostEntity< Mesh::Vertex::getEntityDimension() >( vertex.getIndex() ); } ) // TODO: more? ; diff --git a/src/Python/pytnl/tnl/MeshReaders.cpp b/src/Python/pytnl/tnl/MeshReaders.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c4abae015de76ee4ff440871ed57752f13a7ad79 --- /dev/null +++ b/src/Python/pytnl/tnl/MeshReaders.cpp @@ -0,0 +1,39 @@ +// conversions have to be registered for each object file +#include "../tnl_conversions.h" + +#include "MeshReaders.h" +#include "../typedefs.h" + +void export_MeshReaders( py::module & m ) +{ + using MeshReader = TNL::Meshes::Readers::MeshReader; + using XMLVTK = TNL::Meshes::Readers::XMLVTK; + + // base class with trampolines for virtual methods + py::class_< MeshReader, PyMeshReader >( m, "MeshReader" ) + .def(py::init()) + // bindings against the actual class, NOT the trampoline + .def("reset", &MeshReader::reset) + .def("detectMesh", &MeshReader::detectMesh) + .def("loadMesh", &MeshReader::template loadMesh< MeshOfEdges >) + .def("loadMesh", &MeshReader::template loadMesh< MeshOfTriangles >) + .def("loadMesh", &MeshReader::template loadMesh< MeshOfQuadrangles >) + .def("loadMesh", &MeshReader::template loadMesh< MeshOfTetrahedrons >) + .def("loadMesh", &MeshReader::template loadMesh< MeshOfHexahedrons >) + ; + + py::class_< TNL::Meshes::Readers::VTKReader, MeshReader >( m, "VTKReader" ) + .def(py::init()) + ; + + // base class for 
VTUReader and PVTUReader + py::class_< XMLVTK, PyXMLVTK, MeshReader >( m, "XMLVTK" ) + .def(py::init()) + .def("readPointData", &XMLVTK::readPointData) + .def("readCellData", &XMLVTK::readCellData) + ; + + py::class_< TNL::Meshes::Readers::VTUReader, XMLVTK >( m, "VTUReader" ) + .def(py::init()) + ; +} diff --git a/src/Python/pytnl/tnl/MeshReaders.h b/src/Python/pytnl/tnl/MeshReaders.h new file mode 100644 index 0000000000000000000000000000000000000000..22b40a6719319251168ad9dfe0922071bd2b9d91 --- /dev/null +++ b/src/Python/pytnl/tnl/MeshReaders.h @@ -0,0 +1,47 @@ +#include +#include + +// trampoline classes needed for overriding virtual methods +// https://pybind11.readthedocs.io/en/stable/advanced/classes.html + +class PyMeshReader +: public TNL::Meshes::Readers::MeshReader +{ + using Parent = TNL::Meshes::Readers::MeshReader; + +public: + // inherit constructors + using TNL::Meshes::Readers::MeshReader::MeshReader; + + // trampolines (one for each virtual method) + void reset() override + { + PYBIND11_OVERRIDE_PURE( void, Parent, reset ); + } + + void detectMesh() override + { + PYBIND11_OVERRIDE_PURE( void, Parent, detectMesh ); + } +}; + +class PyXMLVTK +: public TNL::Meshes::Readers::XMLVTK +{ + using Parent = TNL::Meshes::Readers::XMLVTK; + +public: + // inherit constructors + using TNL::Meshes::Readers::XMLVTK::XMLVTK; + + // trampolines (one for each virtual method) + void reset() override + { + PYBIND11_OVERRIDE_PURE( void, Parent, reset ); + } + + void detectMesh() override + { + PYBIND11_OVERRIDE_PURE( void, Parent, detectMesh ); + } +}; diff --git a/src/Python/pytnl/tnl/MeshWriters.cpp b/src/Python/pytnl/tnl/MeshWriters.cpp new file mode 100644 index 0000000000000000000000000000000000000000..01f79ce2d6a8a3b1f4af2773ac27653148ec4fb3 --- /dev/null +++ b/src/Python/pytnl/tnl/MeshWriters.cpp @@ -0,0 +1,99 @@ +// conversions have to be registered for each object file +#include "../tnl_conversions.h" + +#include "MeshWriters.h" +#include "../typedefs.h" + 
+#include + +#include +#include + +template< typename Writer, TNL::Meshes::VTK::FileFormat default_format > +void export_MeshWriter( py::module & m, const char* name ) +{ + // We cannot use MeshReader::VariantVector for Python bindings, because its variants are + // std::vector<T> for T in std::int8_t, std::uint8_t, std::int16_t, std::uint16_t, std::int32_t, + // std::uint32_t, std::int64_t, std::uint64_t, float and double. Python types do not map + // nicely to C++ types, integers even have unlimited precision, pybind11 even checks if given + // Python value fits into the C++ type when selecting the alternative for a scalar type, and + // for containers like std::vector it merely selects the first possible type. For reference, see + // https://github.com/pybind/pybind11/issues/1625#issuecomment-723499161 + using VariantVector = mpark::variant< std::vector< IndexType >, std::vector< RealType > >; + + // Binding to Writer directly is not possible, because the writer has a std::ostream attribute + // which would reference the streambuf created by the type caster from the Python file-like object. + // However, the streambuf would be destroyed as soon as the writer is constructed and control + // returned to Python, so the following invocations would use an invalid object and segfault. + // To solve this, we use a transient wrapper struct PyWriter which holds the streambuf in its own + // ostream attribute and is initialized by a py::object to avoid type casting. 
+ using PythonWriter = PyWriter< Writer, default_format >; + py::class_< PythonWriter >( m, name ) + .def(py::init(), py::keep_alive<1, 2>(), + py::arg("stream"), py::pos_only(), py::arg("format") = default_format) + .def("writeMetadata", &Writer::writeMetadata, py::kw_only(), py::arg("cycle") = -1, py::arg("time") = -1) + .def("writeVertices", &Writer::template writeEntities< 0 >) + .def("writeCells", &Writer::template writeEntities<>) + // we use the local VariantVector alias defined above (not the one from MeshReader) because we already have a caster for it + .def("writePointData", []( PythonWriter& writer, const VariantVector& array, std::string name, int numberOfComponents = 1 ) { + using mpark::visit; + visit( [&](auto&& array) { + // we need a view for the std::vector + using vector_t = std::decay_t; + using view_t = TNL::Containers::ArrayView< 
std::add_const_t< typename vector_t::value_type >, TNL::Devices::Host, std::int64_t >; + view_t view( array.data(), array.size() ); + writer.writeDataArray( view, name, numberOfComponents ); + }, + array + ); + }, + py::arg("array"), py::arg("name"), py::arg("numberOfComponents") = 1) + ; +} + +void export_MeshWriters( py::module & m ) +{ + export_MeshWriter< TNL::Meshes::Writers::VTKWriter< Grid1D >, TNL::Meshes::VTK::FileFormat::binary >( m, "VTKWriter_Grid1D" ); + export_MeshWriter< TNL::Meshes::Writers::VTUWriter< Grid1D >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_Grid1D" ); + export_MeshWriter< TNL::Meshes::Writers::VTKWriter< Grid2D >, TNL::Meshes::VTK::FileFormat::binary >( m, "VTKWriter_Grid2D" ); + export_MeshWriter< TNL::Meshes::Writers::VTUWriter< Grid2D >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_Grid2D" ); + export_MeshWriter< TNL::Meshes::Writers::VTKWriter< Grid3D >, TNL::Meshes::VTK::FileFormat::binary >( m, "VTKWriter_Grid3D" ); + export_MeshWriter< TNL::Meshes::Writers::VTUWriter< Grid3D >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_Grid3D" ); + + export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfEdges >, TNL::Meshes::VTK::FileFormat::binary >( m, "VTKWriter_MeshOfEdges" ); + export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfEdges >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfEdges" ); + export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfTriangles >, TNL::Meshes::VTK::FileFormat::binary >( m, "VTKWriter_MeshOfTriangles" ); + export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfTriangles >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfTriangles" ); + export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfQuadrangles >, TNL::Meshes::VTK::FileFormat::binary >( m, "VTKWriter_MeshOfQuadrangles" ); + export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfQuadrangles >, 
TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfQuadrangles" ); + export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfTetrahedrons >, TNL::Meshes::VTK::FileFormat::binary >( m, "VTKWriter_MeshOfTetrahedrons" ); + export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfTetrahedrons >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfTetrahedrons" ); + export_MeshWriter< TNL::Meshes::Writers::VTKWriter< MeshOfHexahedrons >, TNL::Meshes::VTK::FileFormat::binary >( m, "VTKWriter_MeshOfHexahedrons" ); + export_MeshWriter< TNL::Meshes::Writers::VTUWriter< MeshOfHexahedrons >, TNL::Meshes::VTK::FileFormat::zlib_compressed >( m, "VTUWriter_MeshOfHexahedrons" ); +} diff --git a/src/Python/pytnl/tnl/MeshWriters.h b/src/Python/pytnl/tnl/MeshWriters.h new file mode 100644 index 0000000000000000000000000000000000000000..9dd7185eadf48c11a4fa484e1e7327e32f92566f --- /dev/null +++ b/src/Python/pytnl/tnl/MeshWriters.h @@ -0,0 +1,22 @@ +#include "../iostream_caster.h" +#include + +// helper struct is needed to ensure correct initialization order in the PyWriter constructor +struct PyOstreamHelper +{ + py::object obj; + pystreambuf::ostream str; + + PyOstreamHelper( py::object src ) + : obj(py::reinterpret_borrow(src)), + str(obj) + {} +}; + +template< typename Writer, TNL::Meshes::VTK::FileFormat default_format > +struct PyWriter : public PyOstreamHelper, public Writer +{ + PyWriter( py::object src, TNL::Meshes::VTK::FileFormat format = default_format ) + : PyOstreamHelper(src), Writer(str, format) + {} +}; diff --git a/src/Python/pytnl/tnl/VTKTraits.cpp b/src/Python/pytnl/tnl/VTKTraits.cpp new file mode 100644 index 0000000000000000000000000000000000000000..85d7964714971d4ef62eda867c5ca84bab1dfbde --- /dev/null +++ b/src/Python/pytnl/tnl/VTKTraits.cpp @@ -0,0 +1,45 @@ +#include +namespace py = pybind11; + +#include + +void export_VTKTraits( py::module & m ) +{ + py::enum_< TNL::Meshes::VTK::FileFormat >( m, "VTKFileFormat") + .value("ascii", 
TNL::Meshes::VTK::FileFormat::ascii) + .value("binary", TNL::Meshes::VTK::FileFormat::binary) + .value("zlib_compressed", TNL::Meshes::VTK::FileFormat::zlib_compressed) + ; + py::enum_< TNL::Meshes::VTK::DataType >( m, "VTKDataType") + .value("CellData", TNL::Meshes::VTK::DataType::CellData) + .value("PointData", TNL::Meshes::VTK::DataType::PointData) + ; + py::enum_< TNL::Meshes::VTK::EntityShape >( m, "VTKEntityShape") + .value("Vertex", TNL::Meshes::VTK::EntityShape::Vertex) + .value("PolyVertex", TNL::Meshes::VTK::EntityShape::PolyVertex) + .value("Line", TNL::Meshes::VTK::EntityShape::Line) + .value("PolyLine", TNL::Meshes::VTK::EntityShape::PolyLine) + .value("Triangle", TNL::Meshes::VTK::EntityShape::Triangle) + .value("TriangleStrip", TNL::Meshes::VTK::EntityShape::TriangleStrip) + .value("Polygon", TNL::Meshes::VTK::EntityShape::Polygon) + .value("Pixel", TNL::Meshes::VTK::EntityShape::Pixel) + .value("Quad", TNL::Meshes::VTK::EntityShape::Quad) + .value("Tetra", TNL::Meshes::VTK::EntityShape::Tetra) + .value("Voxel", TNL::Meshes::VTK::EntityShape::Voxel) + .value("Hexahedron", TNL::Meshes::VTK::EntityShape::Hexahedron) + .value("Wedge", TNL::Meshes::VTK::EntityShape::Wedge) + .value("Pyramid", TNL::Meshes::VTK::EntityShape::Pyramid) + ; + py::enum_< TNL::Meshes::VTK::CellGhostTypes >( m, "VTKCellGhostTypes") + .value("DUPLICATECELL", TNL::Meshes::VTK::CellGhostTypes::DUPLICATECELL, "the cell is present on multiple processors") + .value("HIGHCONNECTIVITYCELL", TNL::Meshes::VTK::CellGhostTypes::HIGHCONNECTIVITYCELL, "the cell has more neighbors than in a regular mesh") + .value("LOWCONNECTIVITYCELL", TNL::Meshes::VTK::CellGhostTypes::LOWCONNECTIVITYCELL, "the cell has less neighbors than in a regular mesh") + .value("REFINEDCELL", TNL::Meshes::VTK::CellGhostTypes::REFINEDCELL, "other cells are present that refines it") + .value("EXTERIORCELL", TNL::Meshes::VTK::CellGhostTypes::EXTERIORCELL, "the cell is on the exterior of the data set") + 
.value("HIDDENCELL", TNL::Meshes::VTK::CellGhostTypes::HIDDENCELL, "the cell is needed to maintain connectivity, but the data values should be ignored") + ; + py::enum_< TNL::Meshes::VTK::PointGhostTypes >( m, "VTKPointGhostTypes") + .value("DUPLICATEPOINT", TNL::Meshes::VTK::PointGhostTypes::DUPLICATEPOINT, "the point is present on multiple processors") + .value("HIDDENPOINT", TNL::Meshes::VTK::PointGhostTypes::HIDDENPOINT, "the point is needed to maintain connectivity, but the data values should be ignored") + ; +} diff --git a/src/Python/pytnl/tnl/mesh_getters.h b/src/Python/pytnl/tnl/mesh_getters.h new file mode 100644 index 0000000000000000000000000000000000000000..c5eddaa5ea3a3f62ad4d40ab3b01b96f5a798aa6 --- /dev/null +++ b/src/Python/pytnl/tnl/mesh_getters.h @@ -0,0 +1,36 @@ +#pragma once + +#include + +template< typename Mesh, typename EntityType > +typename Mesh::GlobalIndexType +mesh_getEntitiesCount( const Mesh & self, const EntityType & entity ) +{ + static_assert( std::is_same< EntityType, typename Mesh::Cell >::value || + std::is_same< EntityType, typename Mesh::Face >::value || + std::is_same< EntityType, typename Mesh::Vertex >::value, + "incompatible entity type" ); + return self.template getEntitiesCount< EntityType::getEntityDimension() >(); +} + +template< typename Mesh, typename EntityType > +typename Mesh::GlobalIndexType +mesh_getGhostEntitiesCount( const Mesh & self, const EntityType & entity ) +{ + static_assert( std::is_same< EntityType, typename Mesh::Cell >::value || + std::is_same< EntityType, typename Mesh::Face >::value || + std::is_same< EntityType, typename Mesh::Vertex >::value, + "incompatible entity type" ); + return self.template getGhostEntitiesCount< EntityType::getEntityDimension() >(); +} + +template< typename Mesh, typename EntityType > +typename Mesh::GlobalIndexType +mesh_getGhostEntitiesOffset( const Mesh & self, const EntityType & entity ) +{ + static_assert( std::is_same< EntityType, typename Mesh::Cell >::value || + 
std::is_same< EntityType, typename Mesh::Face >::value || + std::is_same< EntityType, typename Mesh::Vertex >::value, + "incompatible entity type" ); + return self.template getGhostEntitiesOffset< EntityType::getEntityDimension() >(); +} diff --git a/src/Python/pytnl/tnl/tnl.cpp b/src/Python/pytnl/tnl/tnl.cpp index 0eb7c3e8b0aee48791a546e36291f4790c9ca8a9..65e9c14e47a153cc9d724c0df5ecff443bb65fb3 100644 --- a/src/Python/pytnl/tnl/tnl.cpp +++ b/src/Python/pytnl/tnl/tnl.cpp @@ -13,7 +13,10 @@ void export_String( py::module & m ); void export_Grid1D( py::module & m ); void export_Grid2D( py::module & m ); void export_Grid3D( py::module & m ); +void export_VTKTraits( py::module & m ); void export_Meshes( py::module & m ); +void export_MeshReaders( py::module & m ); +void export_MeshWriters( py::module & m ); void export_SparseMatrices( py::module & m ); template< typename T > @@ -23,7 +26,7 @@ template< typename T > using _vector = TNL::Containers::Vector< T, TNL::Devices::Host, IndexType >; // Python module definition -PYBIND11_MODULE(tnl, m) +PYBIND11_MODULE(PYTNL_MODULE_NAME(tnl), m) { register_exceptions(m); @@ -41,7 +44,11 @@ PYBIND11_MODULE(tnl, m) export_Grid2D(m); export_Grid3D(m); + export_VTKTraits(m); + export_Meshes(m); + export_MeshReaders(m); + export_MeshWriters(m); export_SparseMatrices(m); } diff --git a/src/Python/pytnl/tnl_conversions.h b/src/Python/pytnl/tnl_conversions.h index 602d1cffd1ccd32660b72630c023f2d913f1da19..788a54813fce26d2021d5c8671e5721a49678585 100644 --- a/src/Python/pytnl/tnl_conversions.h +++ b/src/Python/pytnl/tnl_conversions.h @@ -1,3 +1,5 @@ // conversion has to be registered for each object file #include "tnl_str_conversion.h" #include "tnl_tuple_conversion.h" +#include "variant_caster.h" +#include "iostream_caster.h" diff --git a/src/Python/pytnl/tnl_mpi/CMakeLists.txt b/src/Python/pytnl/tnl_mpi/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..2aa8f73dacd05197b0bfbc04e633d267167ed8d6 --- 
/dev/null +++ b/src/Python/pytnl/tnl_mpi/CMakeLists.txt @@ -0,0 +1,60 @@ +# enable C++14 for pytnl_mpi (due to py::overload_cast) +set(PYBIND11_CPP_STANDARD -std=c++14) + +set( sources + DistributedMesh.cpp + DistributedMeshReaders.cpp + DistributedMeshWriters.cpp + tnl_mpi.cpp +) +pybind11_add_module( pytnl_mpi ${sources} ) + +# rename the shared library to tnl_mpi.cpython-XXm-x86_64-linux-gnu.so +set_target_properties( pytnl_mpi PROPERTIES LIBRARY_OUTPUT_NAME tnl_mpi DEBUG_POSTFIX "_dbg" ) + +# indicate the postfix to the target so that the pybind11 module name can be set accordingly +if( CMAKE_BUILD_TYPE STREQUAL "Debug") + target_compile_options( pytnl_mpi PRIVATE -DPYTNL_MODULE_POSTFIX=_dbg ) +endif() + +# Skip -march=native -mtune=native for pytnl_mpi - optimizing python bindings for +# a specific architecture is not very useful and prevents using Python tools on +# hybrid clusters. +get_target_property( pytnl_mpi_COMPILE_OPTIONS pytnl_mpi COMPILE_OPTIONS ) +if( pytnl_mpi_COMPILE_OPTIONS ) + string( REPLACE "-march=native" "" pytnl_mpi_COMPILE_OPTIONS "${pytnl_mpi_COMPILE_OPTIONS}" ) + string( REPLACE "-mtune=native" "" pytnl_mpi_COMPILE_OPTIONS "${pytnl_mpi_COMPILE_OPTIONS}" ) + set_target_properties( pytnl_mpi PROPERTIES COMPILE_OPTIONS "${pytnl_mpi_COMPILE_OPTIONS}" ) +endif() + +# We have bindings for unsafe objects (e.g. Array::operator[]) where assertion +# is the only safeguard, so we need to translate the TNL::AssertionError to +# Python's AssertionError. +# NDEBUG is defined in the global CMAKE_CXX_FLAGS and cannot be easily removed +# per-target, so we need to undefine it by passing -U NDEBUG. 
+target_compile_options( pytnl_mpi PRIVATE -U NDEBUG -D TNL_THROW_ASSERTION_ERROR ) + +# disable errors due to -Wunused-value coming from pybind11 +if( ${WITH_CI_FLAGS} ) + if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + target_compile_options( pytnl_mpi PRIVATE -Wno-error=unused-value ) + endif() +endif() + + +# enable zlib and tinyxml2 (used by PVTUReader) +find_package( ZLIB ) +if( ZLIB_FOUND ) + target_compile_definitions(pytnl_mpi PUBLIC "-DHAVE_ZLIB") + target_include_directories(pytnl_mpi PUBLIC ${ZLIB_INCLUDE_DIRS}) + target_link_libraries(pytnl_mpi PUBLIC ${ZLIB_LIBRARIES}) +endif() + +find_package( tinyxml2 QUIET ) +if( tinyxml2_FOUND ) + target_compile_definitions(pytnl_mpi PUBLIC "-DHAVE_TINYXML2") + target_link_libraries(pytnl_mpi PUBLIC tinyxml2::tinyxml2) +endif() + + +install( TARGETS pytnl_mpi DESTINATION ${PYTHON_SITE_PACKAGES_DIR} ) diff --git a/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp b/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0af175f3cd24792a4581c33b7349a1b3f34ab07f --- /dev/null +++ b/src/Python/pytnl/tnl_mpi/DistributedMesh.cpp @@ -0,0 +1,22 @@ +// conversions have to be registered for each object file +#include "../tnl_conversions.h" + +#include "../typedefs.h" +#include "DistributedMesh.h" +#include "../tnl/Array.h" + +void export_DistributedMeshes( py::module & m ) +{ + // make sure that bindings for the local meshes are available + py::module_::import(PYTNL_STRINGIFY(PYTNL_MODULE_NAME(tnl))); + + export_DistributedMesh< DistributedMeshOfEdges >( m, "DistributedMeshOfEdges" ); + export_DistributedMesh< DistributedMeshOfTriangles >( m, "DistributedMeshOfTriangles" ); + export_DistributedMesh< DistributedMeshOfQuadrangles >( m, "DistributedMeshOfQuadrangles" ); + export_DistributedMesh< DistributedMeshOfTetrahedrons >( m, "DistributedMeshOfTetrahedrons" ); + export_DistributedMesh< DistributedMeshOfHexahedrons >( m, "DistributedMeshOfHexahedrons" ); + + // export 
VTKTypesArrayType + using VTKTypesArrayType = typename DistributedMeshOfEdges::VTKTypesArrayType; + export_Array< VTKTypesArrayType >(m, "VTKTypesArrayType"); +} diff --git a/src/Python/pytnl/tnl_mpi/DistributedMesh.h b/src/Python/pytnl/tnl_mpi/DistributedMesh.h new file mode 100644 index 0000000000000000000000000000000000000000..64afe5978dc4d82d20caa1d484640ecedeba030f --- /dev/null +++ b/src/Python/pytnl/tnl_mpi/DistributedMesh.h @@ -0,0 +1,34 @@ +#pragma once + +#include +namespace py = pybind11; + +template< typename Mesh > +void export_DistributedMesh( py::module & m, const char* name ) +{ + auto mesh = py::class_< Mesh >( m, name ) + .def(py::init<>()) + .def_static("getMeshDimension", &Mesh::getMeshDimension) +// .def("setCommunicationGroup", &Mesh::setCommunicationGroup) +// .def("getCommunicationGroup", &Mesh::getCommunicationGroup) + .def("getLocalMesh", py::overload_cast<>(&Mesh::getLocalMesh), py::return_value_policy::reference_internal) + .def("setGhostLevels", &Mesh::setGhostLevels) + .def("getGhostLevels", &Mesh::getGhostLevels) + .def("getGlobalPointIndices", []( const Mesh& mesh ) -> typename Mesh::GlobalIndexArray const& { + return mesh.template getGlobalIndices< 0 >(); + }, + py::return_value_policy::reference_internal) + .def("getGlobalCellIndices", []( const Mesh& mesh ) -> typename Mesh::GlobalIndexArray const& { + return mesh.template getGlobalIndices< Mesh::getMeshDimension() >(); + }, + py::return_value_policy::reference_internal) + .def("vtkPointGhostTypes", []( const Mesh& mesh ) -> typename Mesh::VTKTypesArrayType const& { + return mesh.vtkPointGhostTypes(); + }, + py::return_value_policy::reference_internal) + .def("vtkCellGhostTypes", []( const Mesh& mesh ) -> typename Mesh::VTKTypesArrayType const& { + return mesh.vtkCellGhostTypes(); + }, + py::return_value_policy::reference_internal) + ; +} diff --git a/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp b/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp new file mode 100644 
index 0000000000000000000000000000000000000000..c196a67cc4576b2865b9257a97c3986f028e983d --- /dev/null +++ b/src/Python/pytnl/tnl_mpi/DistributedMeshReaders.cpp @@ -0,0 +1,26 @@ +// conversions have to be registered for each object file +#include "../tnl_conversions.h" + +#include "../tnl/MeshReaders.h" +#include "../typedefs.h" + +#include + +void export_DistributedMeshReaders( py::module & m ) +{ + using XMLVTK = TNL::Meshes::Readers::XMLVTK; + using PVTUReader = TNL::Meshes::Readers::PVTUReader; + + // make sure that bindings for the parent class are available + py::module_::import(PYTNL_STRINGIFY(PYTNL_MODULE_NAME(tnl))); + + py::class_< PVTUReader, XMLVTK >( m, "PVTUReader" ) + .def(py::init()) + // loadMesh is not virtual in PVTUReader + .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfEdges >) + .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfTriangles >) + .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfQuadrangles >) + .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfTetrahedrons >) + .def("loadMesh", &PVTUReader::template loadMesh< DistributedMeshOfHexahedrons >) + ; +} diff --git a/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp b/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp new file mode 100644 index 0000000000000000000000000000000000000000..17bf57c128dabcb28793fa0bf831150929624abd --- /dev/null +++ b/src/Python/pytnl/tnl_mpi/DistributedMeshWriters.cpp @@ -0,0 +1,95 @@ +// conversions have to be registered for each object file +#include "../tnl_conversions.h" + +#include "../tnl/MeshWriters.h" +#include "../typedefs.h" + +#include + +#include + +template< template class WriterTemplate, typename LocalMesh, TNL::Meshes::VTK::FileFormat default_format > +void export_DistributedMeshWriter( py::module & m, const char* name ) +{ + using Writer = WriterTemplate< LocalMesh >; + using Mesh = TNL::Meshes::DistributedMeshes::DistributedMesh< LocalMesh >; + + // We cannot use 
MeshReader::VariantVector for Python bindings, because its variants are + // std::vector<T> for T in std::int8_t, std::uint8_t, std::int16_t, std::uint16_t, std::int32_t, + // std::uint32_t, std::int64_t, std::uint64_t, float and double. Python types do not map + // nicely to C++ types, integers even have unlimited precision, pybind11 even checks if the given + // Python value fits into the C++ type when selecting the alternative for a scalar type, and + // for containers like std::vector<T> it merely selects the first possible type. For reference, see + // https://github.com/pybind/pybind11/issues/1625#issuecomment-723499161 + using VariantVector = mpark::variant< std::vector< IndexType >, std::vector< RealType > >; + + // Binding to Writer directly is not possible, because the writer has a std::ostream attribute + // which would reference the streambuf created by the type caster from the Python file-like object. + // However, the streambuf would be destroyed as soon as the writer is constructed and control + // returned to Python, so the following invocations would use an invalid object and segfault. + // To solve this, we use a transient wrapper struct PyWriter which holds the streambuf in its own + // ostream attribute and is initialized by a py::object to avoid type casting. 
+ using PythonWriter = PyWriter< Writer, default_format >; + py::class_< PythonWriter >( m, name ) + .def(py::init(), py::keep_alive<1, 2>(), + py::arg("stream"), py::pos_only(), py::arg("format") = default_format) + .def("writeMetadata", &Writer::writeMetadata, py::kw_only(), py::arg("cycle") = -1, py::arg("time") = -1) + .def("writeVertices", static_cast< void (Writer::*)(const Mesh&) >(&Writer::template writeEntities< 0 >), + py::arg("distributedMesh")) + .def("writeVertices", static_cast< void (Writer::*)(const LocalMesh&, unsigned, unsigned) >(&Writer::template writeEntities< 0 >), + py::arg("localMesh"), py::arg("GhostLevel") = 0, py::arg("MinCommonVertices") = 0) + .def("writeCells", static_cast< void (Writer::*)(const Mesh&) >(&Writer::template writeEntities<>), + py::arg("distributedMesh")) + .def("writeCells", static_cast< void (Writer::*)(const LocalMesh&, unsigned, unsigned) >(&Writer::template writeEntities<>), + py::arg("localMesh"), py::arg("GhostLevel") = 0, py::arg("MinCommonVertices") = 0) + // INCONSISTENCY: the C++ methods writePPointData, writePCellData, writePDataArray do not + // take the whole array as parameter, only the ValueType as a template parameter. Since + // this does not map nicely to Python, we pass the whole array just like in the + // VTKWriter and VTUWriter classes. 
+ // we use the local VariantVector alias defined above (not the one from MeshReader) to select the value type + .def("writePPointData", []( PythonWriter& writer, const VariantVector& array, std::string name, int numberOfComponents = 1 ) { + using mpark::visit; + visit( [&](auto&& array) { + using value_type = typename std::decay_t::value_type; + writer.template writePPointData< value_type >( name, numberOfComponents ); + }, + array + ); + }, + py::arg("array"), py::arg("name"), py::arg("numberOfComponents") = 1) + .def("writePCellData", []( PythonWriter& writer, const VariantVector& array, std::string name, int numberOfComponents = 1 ) { + using mpark::visit; + visit( [&](auto&& array) { + using value_type = typename std::decay_t::value_type; + writer.template writePCellData< value_type >( name, numberOfComponents ); + }, + array + ); + }, + py::arg("array"), py::arg("name"), py::arg("numberOfComponents") = 1) + .def("writePDataArray", []( PythonWriter& writer, const VariantVector& array, std::string name, int numberOfComponents = 1 ) { + using mpark::visit; + visit( [&](auto&& array) { + using value_type = typename std::decay_t::value_type; + writer.template writePDataArray< value_type >( name, numberOfComponents ); + }, + array + ); + }, + py::arg("array"), py::arg("name"), py::arg("numberOfComponents") = 1) + // NOTE: only the overload intended for sequential writing is exported, because we don't + // have type casters for MPI_Comm (ideally, it would be compatible with the mpi4py objects) + .def("addPiece", static_cast< std::string (Writer::*)(const TNL::String&, unsigned) >( &Writer::addPiece ), + py::arg("mainFileName"), py::arg("subdomainIndex")) + ; +} + +void export_DistributedMeshWriters( py::module & m ) +{ + constexpr TNL::Meshes::VTK::FileFormat default_format = TNL::Meshes::VTK::FileFormat::zlib_compressed; + export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfEdges, default_format >( m, "PVTUWriter_MeshOfEdges" ); + export_DistributedMeshWriter< 
TNL::Meshes::Writers::PVTUWriter, MeshOfTriangles, default_format >( m, "PVTUWriter_MeshOfTriangles" ); + export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfQuadrangles, default_format >( m, "PVTUWriter_MeshOfQuadrangles" ); + export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfTetrahedrons, default_format >( m, "PVTUWriter_MeshOfTetrahedrons" ); + export_DistributedMeshWriter< TNL::Meshes::Writers::PVTUWriter, MeshOfHexahedrons, default_format >( m, "PVTUWriter_MeshOfHexahedrons" ); +} diff --git a/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp b/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a422795b6e8bd47a799379329c9252aefb831920 --- /dev/null +++ b/src/Python/pytnl/tnl_mpi/tnl_mpi.cpp @@ -0,0 +1,49 @@ +#include "../exceptions.h" +#include "../typedefs.h" + +// conversions have to be registered for each object file +#include "../tnl_conversions.h" +#include "TNL/MPI/Wrappers.h" + +// external functions +void export_DistributedMeshes( py::module & m ); +void export_DistributedMeshReaders( py::module & m ); +void export_DistributedMeshWriters( py::module & m ); + +#include + +// Python module definition +PYBIND11_MODULE(PYTNL_MODULE_NAME(tnl_mpi), m) +{ + register_exceptions(m); + + // MPI initialization and finalization + // https://stackoverflow.com/q/64647846 + if( ! TNL::MPI::Initialized() ) { + int argc = 0; + char** argv = nullptr; + TNL::MPI::Init( argc, argv ); + } + // https://pybind11.readthedocs.io/en/stable/advanced/misc.html#module-destructors + auto cleanup_callback = []() { + if( TNL::MPI::Initialized() && ! 
TNL::MPI::Finalized() ) + TNL::MPI::Finalize(); + }; + m.add_object("_cleanup", py::capsule(cleanup_callback)); + + // bindings for distributed data structures + export_DistributedMeshes(m); + export_DistributedMeshReaders(m); + export_DistributedMeshWriters(m); + + // bindings for functions + using TNL::Meshes::DistributedMeshes::distributeSubentities; + m.def("distributeFaces", []( DistributedMeshOfTriangles& mesh ) { + distributeSubentities< 1 >( mesh ); }); + m.def("distributeFaces", []( DistributedMeshOfQuadrangles& mesh ) { + distributeSubentities< 1 >( mesh ); }); + m.def("distributeFaces", []( DistributedMeshOfTetrahedrons& mesh ) { + distributeSubentities< 2 >( mesh ); }); + m.def("distributeFaces", []( DistributedMeshOfHexahedrons& mesh ) { + distributeSubentities< 2 >( mesh ); }); +} diff --git a/src/Python/pytnl/typedefs.h b/src/Python/pytnl/typedefs.h index 7a74237f02b6bb150a02b28e1b42bc9e343ea47c..7bc9fe0256f88c212995b987100000bbe808ce47 100644 --- a/src/Python/pytnl/typedefs.h +++ b/src/Python/pytnl/typedefs.h @@ -1,11 +1,28 @@ #pragma once +// helper macros (the _NX variants are needed to expand macros in the arguments) +#define PYTNL_STRINGIFY(U) PYTNL_STRINGIFY_NX(U) +#define PYTNL_STRINGIFY_NX(U) #U + +#define PYTNL_PPCAT(A, B) PYTNL_PPCAT_NX(A, B) +#define PYTNL_PPCAT_NX(A, B) A ## B + +// the Python module name depends on the build type, this macro can be used to concatenate with the correct suffix +#ifdef PYTNL_MODULE_POSTFIX + #define PYTNL_MODULE_NAME(name) PYTNL_PPCAT(name, PYTNL_MODULE_POSTFIX) +#else + #define PYTNL_MODULE_NAME(name) name +#endif + #include #include +#include #include #include #include +#include #include +#include using RealType = double; using DeviceType = TNL::Devices::Host; @@ -16,24 +33,22 @@ using Grid2D = TNL::Meshes::Grid<2, RealType, DeviceType, IndexType>; using Grid3D = TNL::Meshes::Grid<3, RealType, DeviceType, IndexType>; using LocalIndexType = short int; -using EdgeTopology = TNL::Meshes::Topologies::Edge; 
-using TriangleTopology = TNL::Meshes::Topologies::Triangle; -using TetrahedronTopology = TNL::Meshes::Topologies::Tetrahedron; -using MeshOfEdges = TNL::Meshes::Mesh< TNL::Meshes::DefaultConfig< - EdgeTopology, - EdgeTopology::dimension, - RealType, - IndexType, - LocalIndexType > >; -using MeshOfTriangles = TNL::Meshes::Mesh< TNL::Meshes::DefaultConfig< - TriangleTopology, - TriangleTopology::dimension, - RealType, - IndexType, - LocalIndexType > >; -using MeshOfTetrahedrons = TNL::Meshes::Mesh< TNL::Meshes::DefaultConfig< - TetrahedronTopology, - TetrahedronTopology::dimension, +template< typename Topology > +using DefaultMeshTemplate = TNL::Meshes::Mesh< TNL::Meshes::DefaultConfig< + Topology, + Topology::dimension, RealType, IndexType, LocalIndexType > >; + +using MeshOfEdges = DefaultMeshTemplate< TNL::Meshes::Topologies::Edge >; +using MeshOfTriangles = DefaultMeshTemplate< TNL::Meshes::Topologies::Triangle >; +using MeshOfQuadrangles = DefaultMeshTemplate< TNL::Meshes::Topologies::Quadrangle >; +using MeshOfTetrahedrons = DefaultMeshTemplate< TNL::Meshes::Topologies::Tetrahedron >; +using MeshOfHexahedrons = DefaultMeshTemplate< TNL::Meshes::Topologies::Hexahedron >; + +using DistributedMeshOfEdges = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfEdges >; +using DistributedMeshOfTriangles = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfTriangles >; +using DistributedMeshOfQuadrangles = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfQuadrangles >; +using DistributedMeshOfTetrahedrons = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfTetrahedrons >; +using DistributedMeshOfHexahedrons = TNL::Meshes::DistributedMeshes::DistributedMesh< MeshOfHexahedrons >; diff --git a/src/Python/pytnl/variant_caster.h b/src/Python/pytnl/variant_caster.h new file mode 100644 index 0000000000000000000000000000000000000000..c032448b598754e148e632004e16a9826ef247e3 --- /dev/null +++ b/src/Python/pytnl/variant_caster.h @@ -0,0 +1,15 @@ +#pragma once 
+ +#include +#include + +#include // backport of std::variant from C++17 + +namespace pybind11 { namespace detail { + +// add specialization for concrete variant type +// (variant_caster is implemented in pybind11 and used for C++17's std::variant casting) +template struct type_caster> + : variant_caster> {}; + +}} // namespace pybind11::detail diff --git a/src/TNL/Algorithms/DistributedScan.h b/src/TNL/Algorithms/DistributedScan.h index 742acd5ed923b4d0e0cbf14e37be8fb40866ec06..aa7c008a7b6b5ccfe1445daebdc4312976eead0b 100644 --- a/src/TNL/Algorithms/DistributedScan.h +++ b/src/TNL/Algorithms/DistributedScan.h @@ -14,6 +14,7 @@ #include #include +#include namespace TNL { namespace Algorithms { @@ -32,10 +33,9 @@ struct DistributedScan { using RealType = typename DistributedVector::RealType; using DeviceType = typename DistributedVector::DeviceType; - using CommunicatorType = typename DistributedVector::CommunicatorType; const auto group = v.getCommunicationGroup(); - if( group != CommunicatorType::NullGroup ) { + if( group != MPI::NullGroup() ) { // adjust begin and end for the local range const auto localRange = v.getLocalRange(); begin = min( max( begin, localRange.getBegin() ), localRange.getEnd() ) - localRange.getBegin(); @@ -47,18 +47,18 @@ struct DistributedScan const RealType localSum = blockShifts.getElement( blockShifts.getSize() - 1 ); // exchange local sums between ranks - const int nproc = CommunicatorType::GetSize( group ); + const int nproc = MPI::GetSize( group ); RealType dataForScatter[ nproc ]; for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = localSum; Containers::Vector< RealType, Devices::Host > rankSums( nproc ); // NOTE: exchanging general data types does not work with MPI - CommunicatorType::Alltoall( dataForScatter, 1, rankSums.getData(), 1, group ); + MPI::Alltoall( dataForScatter, 1, rankSums.getData(), 1, group ); // compute the scan of the per-rank sums Scan< Devices::Host, ScanType::Exclusive >::perform( rankSums, 0, nproc, 
reduction, zero ); // perform second phase: shift by the per-block and per-rank offsets - const int rank = CommunicatorType::GetRank( group ); + const int rank = MPI::GetRank( group ); Scan< DeviceType, Type >::performSecondPhase( localView, blockShifts, begin, end, reduction, rankSums[ rank ] ); } } diff --git a/src/TNL/Communicators/MPITypeResolver.h b/src/TNL/Communicators/MPITypeResolver.h deleted file mode 100644 index 5429d5e33c970576fac1856f3624eeef7a06a458..0000000000000000000000000000000000000000 --- a/src/TNL/Communicators/MPITypeResolver.h +++ /dev/null @@ -1,108 +0,0 @@ -/*************************************************************************** - MPITypeResolver.h - description - ------------------- - begin : Feb 4, 2019 - copyright : (C) 2019 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -namespace TNL { -namespace Communicators { - -#ifdef HAVE_MPI -template -struct MPITypeResolver -{ - static inline MPI_Datatype getType() - { - static_assert( sizeof(Type) == sizeof(char) || - sizeof(Type) == sizeof(int) || - sizeof(Type) == sizeof(short int) || - sizeof(Type) == sizeof(long int), - "Fatal Error - Unknown MPI Type"); - switch( sizeof( Type ) ) - { - case sizeof( char ): - return MPI_CHAR; - case sizeof( int ): - return MPI_INT; - case sizeof( short int ): - return MPI_SHORT; - case sizeof( long int ): - return MPI_LONG; - } - // this will never happen thanks to the static_assert above, but icpc is not that smart - // and complains about missing return statement at the end of non-void function - throw 0; - } -}; - -template<> struct MPITypeResolver< char > -{ - static inline MPI_Datatype getType(){return MPI_CHAR;}; -}; - -template<> struct MPITypeResolver< int > -{ - static inline MPI_Datatype getType(){return MPI_INT;}; -}; - -template<> struct MPITypeResolver< short int > -{ - static inline 
MPI_Datatype getType(){return MPI_SHORT;}; -}; - -template<> struct MPITypeResolver< long int > -{ - static inline MPI_Datatype getType(){return MPI_LONG;}; -}; - -template<> struct MPITypeResolver< unsigned char > -{ - static inline MPI_Datatype getType(){return MPI_UNSIGNED_CHAR;}; -}; - -template<> struct MPITypeResolver< unsigned short int > -{ - static inline MPI_Datatype getType(){return MPI_UNSIGNED_SHORT;}; -}; - -template<> struct MPITypeResolver< unsigned int > -{ - static inline MPI_Datatype getType(){return MPI_UNSIGNED;}; -}; - -template<> struct MPITypeResolver< unsigned long int > -{ - static inline MPI_Datatype getType(){return MPI_UNSIGNED_LONG;}; -}; - -template<> struct MPITypeResolver< float > -{ - static inline MPI_Datatype getType(){return MPI_FLOAT;}; -}; - -template<> struct MPITypeResolver< double > -{ - static inline MPI_Datatype getType(){return MPI_DOUBLE;}; -}; - -template<> struct MPITypeResolver< long double > -{ - static inline MPI_Datatype getType(){return MPI_LONG_DOUBLE;}; -}; - -template<> struct MPITypeResolver< bool > -{ - // sizeof(bool) is implementation-defined: https://stackoverflow.com/a/4897859 - static_assert( sizeof(bool) == 1, "The systems where sizeof(bool) != 1 are not supported by MPI." 
); - static inline MPI_Datatype getType() { return MPI_C_BOOL; }; -}; -#endif - -} // namespace Communicators -} // namespace TNL diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h index 1382fb7a6fef4877d9beb21d0a5223245ac74d16..cd51629687444ce64b2bcb3fd61c3491e3ecce9a 100644 --- a/src/TNL/Communicators/MpiCommunicator.h +++ b/src/TNL/Communicators/MpiCommunicator.h @@ -10,38 +10,10 @@ #pragma once -#include -#include -#include - -#ifdef HAVE_MPI -#include -#ifdef OMPI_MAJOR_VERSION - // header specific to OpenMPI (needed for CUDA-aware detection) - #include -#endif - -#include // getpid - -#ifdef HAVE_CUDA - #include - - typedef struct __attribute__((__packed__)) { - char name[MPI_MAX_PROCESSOR_NAME]; - } procName; -#endif - -#endif - -#include -#include -#include -#include -#include -#include -#include -#include - +#include +#include +#include +#include namespace TNL { //! \brief Namespace for TNL communicators. @@ -49,7 +21,8 @@ namespace Communicators { namespace { //! \brief MPI communicator. -class MpiCommunicator +class [[deprecated("use the functions in the TNL::MPI namespace instead")]] +MpiCommunicator { public: #ifdef HAVE_MPI @@ -71,275 +44,81 @@ class MpiCommunicator static void configSetup( Config::ConfigDescription& config, const String& prefix = "" ) { -#ifdef HAVE_MPI - config.addEntry< bool >( "redirect-mpi-output", "Only process with rank 0 prints to console. Other processes are redirected to files.", true ); - config.addEntry< bool >( "mpi-gdb-debug", "Wait for GDB to attach the master MPI process.", false ); - config.addEntry< int >( "mpi-process-to-attach", "Number of the MPI process to be attached by GDB. Set -1 for all processes.", 0 ); -#endif + MPI::configSetup( config, prefix ); } static bool setup( const Config::ParameterContainer& parameters, const String& prefix = "" ) { -#ifdef HAVE_MPI - if(IsInitialized())//i.e. 
- isUsed - { - const bool redirect = parameters.getParameter< bool >( "redirect-mpi-output" ); - if( redirect ) - setupRedirection(); -#ifdef HAVE_CUDA - int size; - MPI_Comm_size( MPI_COMM_WORLD, &size ); - if( size > 1 ) - { - #if defined( MPIX_CUDA_AWARE_SUPPORT ) && MPIX_CUDA_AWARE_SUPPORT - std::cout << "CUDA-aware MPI detected on this system ... " << std::endl; - #elif defined( MPIX_CUDA_AWARE_SUPPORT ) && !MPIX_CUDA_AWARE_SUPPORT - std::cerr << "MPI is not CUDA-aware. Please install correct version of MPI." << std::endl; - return false; - #else - std::cerr << "WARNING: TNL cannot detect if you have CUDA-aware MPI. Some problems may occur." << std::endl; - #endif - } -#endif // HAVE_CUDA - bool gdbDebug = parameters.getParameter< bool >( "mpi-gdb-debug" ); - int processToAttach = parameters.getParameter< int >( "mpi-process-to-attach" ); - - if( gdbDebug ) - { - int rank = GetRank( MPI_COMM_WORLD ); - int pid = getpid(); - - volatile int tnlMPIDebugAttached = 0; - MPI_Send( &pid, 1, MPI_INT, 0, 0, MPI_COMM_WORLD ); - MPI_Barrier( MPI_COMM_WORLD ); - if( rank == 0 ) - { - std::cout << "Attach GDB to MPI process(es) by entering:" << std::endl; - for( int i = 0; i < GetSize( MPI_COMM_WORLD ); i++ ) - { - MPI_Status status; - int recvPid; - MPI_Recv( &recvPid, 1, MPI_INT, i, 0, MPI_COMM_WORLD, &status ); - - if( i == processToAttach || processToAttach == -1 ) - { - std::cout << " For MPI process " << i << ": gdb -q -ex \"attach " << recvPid << "\"" - << " -ex \"set variable tnlMPIDebugAttached=1\"" - << " -ex \"continue\"" << std::endl; - } - } - std::cout << std::flush; - } - if( rank == processToAttach || processToAttach == -1 ) - while( ! 
tnlMPIDebugAttached ); - MPI_Barrier( MPI_COMM_WORLD ); - } - } -#endif // HAVE_MPI - return true; + return MPI::setup( parameters, prefix ); } - static void Init(int& argc, char**& argv ) + static void Init( int& argc, char**& argv, int required_thread_level = MPI_THREAD_SINGLE ) { -#ifdef HAVE_MPI - MPI_Init( &argc, &argv ); - selectGPU(); -#endif + MPI::Init( argc, argv, required_thread_level ); // silence warnings about (potentially) unused variables (void) NullGroup; - (void) NullRequest; - } - - static void setupRedirection() - { -#ifdef HAVE_MPI - if(isDistributed() ) - { - if(GetRank(AllGroup)!=0) - { - const std::string stdoutFile = std::string("./stdout_") + std::to_string(GetRank(AllGroup)) + ".txt"; - const std::string stderrFile = std::string("./stderr_") + std::to_string(GetRank(AllGroup)) + ".txt"; - std::cout << GetRank(AllGroup) << ": Redirecting stdout and stderr to files " << stdoutFile << " and " << stderrFile << std::endl; - Debugging::redirect_stdout_stderr( stdoutFile, stderrFile ); - } - } -#else - throw Exceptions::MPISupportMissing(); -#endif } static void Finalize() { -#ifdef HAVE_MPI - if(isDistributed()) - { - if(GetRank(AllGroup)!=0) - { - // restore redirection (not necessary, it uses RAII internally...) 
- Debugging::redirect_stdout_stderr( "", "", true ); - } - } - MPI_Finalize(); -#endif + MPI::Finalize(); } static bool IsInitialized() { -#ifdef HAVE_MPI - int initialized, finalized; - MPI_Initialized(&initialized); - MPI_Finalized(&finalized); - return initialized && !finalized; -#else - throw Exceptions::MPISupportMissing(); -#endif + return MPI::isInitialized(); } static int GetRank(CommunicationGroup group = AllGroup ) { -#ifdef HAVE_MPI - TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized"); - TNL_ASSERT_NE(group, NullGroup, "GetRank cannot be called with NullGroup"); - int rank; - MPI_Comm_rank(group,&rank); - return rank; -#else - throw Exceptions::MPISupportMissing(); -#endif + return MPI::GetRank( group ); } static int GetSize(CommunicationGroup group = AllGroup ) { -#ifdef HAVE_MPI - TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized"); - TNL_ASSERT_NE(group, NullGroup, "GetSize cannot be called with NullGroup"); - int size; - MPI_Comm_size(group,&size); - return size; -#else - throw Exceptions::MPISupportMissing(); -#endif - } - -#ifdef HAVE_MPI - template< typename T > - static MPI_Datatype getDataType( const T& t ) - { - return MPITypeResolver< T >::getType(); - } -#endif - - //dim-number of dimensions, distr array of guess distr - 0 for computation - //distr array will be filled by computed distribution - //more information in MPI documentation - static void DimsCreate(int nproc, int dim, int *distr) - { -#ifdef HAVE_MPI - int sum = 0, prod = 1; - for( int i = 0;i < dim; i++ ) { - sum += distr[ i ]; - prod *= distr[ i ]; - } - if( prod != 0 && prod != GetSize( AllGroup ) ) - throw Exceptions::MPIDimsCreateError(); - if(sum==0) { - for(int i=0;i static void Send( const T* data, int count, int dest, int tag, CommunicationGroup group = AllGroup ) { -#ifdef HAVE_MPI - TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized"); - TNL_ASSERT_NE(group, NullGroup, 
"Send cannot be called with NullGroup"); - MPI_Send( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType(), dest, tag, group ); -#else - throw Exceptions::MPISupportMissing(); -#endif + MPI::Send( data, count, dest, tag, group ); } template static void Recv( T* data, int count, int src, int tag, CommunicationGroup group = AllGroup ) { -#ifdef HAVE_MPI - TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized"); - TNL_ASSERT_NE(group, NullGroup, "Recv cannot be called with NullGroup"); - MPI_Status status; - MPI_Recv( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType() , src, tag, group, &status ); -#else - throw Exceptions::MPISupportMissing(); -#endif - } + MPI::Recv( data, count, src, tag, group ); + } template static Request ISend( const T* data, int count, int dest, int tag, CommunicationGroup group = AllGroup ) { -#ifdef HAVE_MPI - TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized"); - TNL_ASSERT_NE(group, NullGroup, "ISend cannot be called with NullGroup"); - Request req; - MPI_Isend( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType(), dest, tag, group, &req); - return req; -#else - throw Exceptions::MPISupportMissing(); -#endif + return MPI::Isend( data, count, dest, tag, group ); } template static Request IRecv( T* data, int count, int src, int tag, CommunicationGroup group = AllGroup ) { -#ifdef HAVE_MPI - TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized"); - TNL_ASSERT_NE(group, NullGroup, "IRecv cannot be called with NullGroup"); - Request req; - MPI_Irecv( const_cast< void* >( ( const void* ) data ), count, MPITypeResolver< T >::getType() , src, tag, group, &req); - return req; -#else - throw Exceptions::MPISupportMissing(); -#endif + return MPI::Irecv( data, count, src, tag, group ); } static void WaitAll(Request *reqs, int length) { -#ifdef HAVE_MPI - 
TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized"); - MPI_Waitall(length, reqs, MPI_STATUSES_IGNORE); -#else - throw Exceptions::MPISupportMissing(); -#endif + MPI::Waitall( reqs, length ); } template< typename T > static void Bcast( T* data, int count, int root, CommunicationGroup group) { -#ifdef HAVE_MPI - TNL_ASSERT_TRUE(IsInitialized(), "Fatal Error - MPI communicator is not initialized"); - TNL_ASSERT_NE(group, NullGroup, "BCast cannot be called with NullGroup"); - MPI_Bcast((void*) data, count, MPITypeResolver< T >::getType(), root, group); -#else - throw Exceptions::MPISupportMissing(); -#endif + MPI::Bcast( data, count, root, group ); } template< typename T > @@ -349,12 +128,7 @@ class MpiCommunicator const MPI_Op &op, CommunicationGroup group) { -#ifdef HAVE_MPI - TNL_ASSERT_NE(group, NullGroup, "Allreduce cannot be called with NullGroup"); - MPI_Allreduce( const_cast< void* >( ( void* ) data ), (void*) reduced_data,count,MPITypeResolver< T >::getType(),op,group); -#else - throw Exceptions::MPISupportMissing(); -#endif + MPI::Allreduce( data, reduced_data, count, op, group ); } // in-place variant of Allreduce @@ -364,29 +138,18 @@ class MpiCommunicator const MPI_Op &op, CommunicationGroup group) { -#ifdef HAVE_MPI - TNL_ASSERT_NE(group, NullGroup, "Allreduce cannot be called with NullGroup"); - MPI_Allreduce( MPI_IN_PLACE, (void*) data,count,MPITypeResolver< T >::getType(),op,group); -#else - throw Exceptions::MPISupportMissing(); -#endif + MPI::Allreduce( data, count, op, group ); } - template< typename T > static void Reduce( const T* data, T* reduced_data, int count, - MPI_Op &op, + const MPI_Op &op, int root, CommunicationGroup group) { -#ifdef HAVE_MPI - TNL_ASSERT_NE(group, NullGroup, "Reduce cannot be called with NullGroup"); - MPI_Reduce( const_cast< void* >( ( void*) data ), (void*) reduced_data,count,MPITypeResolver< T >::getType(),op,root,group); -#else - throw Exceptions::MPISupportMissing(); -#endif + 
MPI::Reduce( data, reduced_data, count, op, root, group ); } template< typename T > @@ -400,24 +163,7 @@ class MpiCommunicator int receiveTag, CommunicationGroup group ) { -#ifdef HAVE_MPI - TNL_ASSERT_NE(group, NullGroup, "SendReceive cannot be called with NullGroup"); - MPI_Status status; - MPI_Sendrecv( const_cast< void* >( ( void* ) sendData ), - sendCount, - MPITypeResolver< T >::getType(), - destination, - sendTag, - ( void* ) receiveData, - receiveCount, - MPITypeResolver< T >::getType(), - source, - receiveTag, - group, - &status ); -#else - throw Exceptions::MPISupportMissing(); -#endif + MPI::Sendrecv( sendData, sendCount, destination, sendTag, receiveData, receiveCount, source, receiveTag, group ); } template< typename T > @@ -427,94 +173,20 @@ class MpiCommunicator int receiveCount, CommunicationGroup group ) { -#ifdef HAVE_MPI - TNL_ASSERT_NE(group, NullGroup, "SendReceive cannot be called with NullGroup"); - MPI_Alltoall( const_cast< void* >( ( void* ) sendData ), - sendCount, - MPITypeResolver< T >::getType(), - ( void* ) receiveData, - receiveCount, - MPITypeResolver< T >::getType(), - group ); -#else - throw Exceptions::MPISupportMissing(); -#endif - } - - - static void writeProlog( Logger& logger ) - { - if( isDistributed() ) - { - logger.writeParameter( "MPI processes:", GetSize(AllGroup) ); - } - } - - static void CreateNewGroup( bool meToo, int myRank, CommunicationGroup &oldGroup, CommunicationGroup &newGroup ) - { -#ifdef HAVE_MPI - if(meToo) - MPI_Comm_split(oldGroup, 1, myRank, &newGroup); - else - MPI_Comm_split(oldGroup, MPI_UNDEFINED, GetRank(oldGroup), &newGroup); -#else - throw Exceptions::MPISupportMissing(); -#endif + MPI::Alltoall( sendData, sendCount, receiveData, receiveCount, group ); } #ifdef HAVE_MPI - static MPI_Request NullRequest; static MPI_Comm AllGroup; static MPI_Comm NullGroup; #else - static constexpr int NullRequest = -1; static constexpr int AllGroup = 1; static constexpr int NullGroup = 0; #endif private: - - static 
void selectGPU(void) - { -#ifdef HAVE_MPI - #ifdef HAVE_CUDA - const int count = GetSize(AllGroup); - const int rank = GetRank(AllGroup); - int gpuCount; - cudaGetDeviceCount(&gpuCount); - - procName names[count]; - - int i=0; - int len; - MPI_Get_processor_name(names[rank].name, &len); - - for(i=0;i -#include -#include - -namespace TNL { -namespace Communicators { - -//! \brief Dummy communicator without any distribution support. -class NoDistrCommunicator -{ - public: - using Request = int; - using CommunicationGroup = int; - static constexpr Request NullRequest = -1; - static constexpr CommunicationGroup AllGroup = 1; - static constexpr CommunicationGroup NullGroup = 0; - - static void configSetup( Config::ConfigDescription& config, const String& prefix = "" ){}; - - static bool setup( const Config::ParameterContainer& parameters, - const String& prefix = "" ) - { - return true; - } - - static void Init(int& argc, char**& argv) {} - - static void setupRedirection(){} - - static void Finalize(){} - - static bool IsInitialized() - { - return true; - } - - static bool isDistributed() - { - return false; - } - - static int GetRank(CommunicationGroup group = AllGroup ) - { - return 0; - } - - static int GetSize(CommunicationGroup group = AllGroup ) - { - return 1; - } - - static void DimsCreate(int nproc, int dim, int *distr) - { - for(int i=0;i - static Request ISend( const T *data, int count, int dest, int tag, CommunicationGroup group) - { - return 1; - } - - template - static Request IRecv( const T *data, int count, int src, int tag, CommunicationGroup group) - { - return 1; - } - - static void WaitAll(Request *reqs, int length) - { - } - - template< typename T > - static void Bcast( T* data, int count, int root, CommunicationGroup group) - { - } - - template< typename T > - static void Allreduce( const T* data, - T* reduced_data, - int count, - const MPI_Op &op, - CommunicationGroup group ) - { - memcpy( ( void* ) reduced_data, ( const void* ) data, count * 
sizeof( T ) ); - } - - // in-place variant of Allreduce - template< typename T > - static void Allreduce( T* data, - int count, - const MPI_Op &op, - CommunicationGroup group ) - { - } - - template< typename T > - static void Reduce( T* data, - T* reduced_data, - int count, - MPI_Op &op, - int root, - CommunicationGroup group ) - { - memcpy( ( void* ) reduced_data, ( void* ) data, count * sizeof( T ) ); - } - - template< typename T > - static void Alltoall( const T* sendData, - int sendCount, - T* receiveData, - int receiveCount, - CommunicationGroup group ) - { - TNL_ASSERT_EQ( sendCount, receiveCount, "sendCount must be equal to receiveCount for NoDistrCommunicator." ); - memcpy( (void*) receiveData, (const void*) sendData, sendCount * sizeof( T ) ); - } - - static void CreateNewGroup(bool meToo, int myRank, CommunicationGroup &oldGroup, CommunicationGroup &newGroup) - { - newGroup=oldGroup; - } - - static void writeProlog( Logger& logger ) - { - } -}; - -} // namespace Communicators -} // namespace TNL diff --git a/src/TNL/Containers/ByteArraySynchronizer.h b/src/TNL/Containers/ByteArraySynchronizer.h new file mode 100644 index 0000000000000000000000000000000000000000..0bfed4d92ce6c7e7ee2384a7644217537fa75887 --- /dev/null +++ b/src/TNL/Containers/ByteArraySynchronizer.h @@ -0,0 +1,147 @@ +/*************************************************************************** + ByteArraySynchronizer.h - description + ------------------- + begin : November 17, 2020 + copyright : (C) 2020 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovský + +#pragma once + +#include +// 3rd-party async library providing a thread-pool +#include + +#include +#include +#include + +namespace TNL { +namespace Containers { + +template< typename Device, typename Index > +class ByteArraySynchronizer +{ +private: + // NOTE: async::threadpool has alignment requirements, which causes problems: + // - it may become misaligned in derived classes, see e.g. + // https://stackoverflow.com/a/46475498 + // solution: specify it as the first member of the base class + // - operator new before C++17 may not support over-aligned types, see + // https://stackoverflow.com/a/53485295 + // solution: relaxed alignment requirements to not exceed the value of + // alignof(std::max_align_t), which is the strongest alignment supported + // by plain new. See https://github.com/d36u9/async/pull/2 + async::threadpool tp; + + int gpu_id = 0; + +public: + using ByteArrayView = ArrayView< std::uint8_t, Device, Index >; + using RequestsVector = std::vector< MPI_Request >; + + enum class AsyncPolicy { + synchronous, + deferred, + threadpool, + async, + }; + + ByteArraySynchronizer() : tp(1) {} + + virtual void synchronizeByteArray( ByteArrayView array, int bytesPerValue ) = 0; + + virtual RequestsVector synchronizeByteArrayAsyncWorker( ByteArrayView array, int bytesPerValue ) = 0; + + /** + * \brief An asynchronous version of \ref synchronizeByteArray. + * + * Note that this method is not thread-safe - only the thread which created + * and "owns" the instance of this object can call this method. + * + * Note that at most one async operation may be active at a time, the + * following calls will block until the pending operation is finished. 
+ */ + void synchronizeByteArrayAsync( ByteArrayView array, int bytesPerValue, AsyncPolicy policy = AsyncPolicy::synchronous ) + { + // wait for any previous synchronization (multiple objects can share the + // same synchronizer) + if( async_op.valid() ) { + async_wait_before_start_timer.start(); + async_op.wait(); + async_wait_before_start_timer.stop(); + } + + async_start_timer.start(); + + // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/ + #ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaGetDevice(&gpu_id); + #endif + + if( policy == AsyncPolicy::threadpool || policy == AsyncPolicy::async ) { + // everything offloaded to a separate thread + auto worker = [=] () { + // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/ + #ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaSetDevice(this->gpu_id); + #endif + + this->synchronizeByteArray( array, bytesPerValue ); + }; + + if( policy == AsyncPolicy::threadpool ) + async_op = tp.post( worker ); + else + async_op = std::async( std::launch::async, worker ); + } + else if( policy == AsyncPolicy::deferred ) { + // immediate start, deferred synchronization (but still in the same thread) + auto requests = synchronizeByteArrayAsyncWorker( array, bytesPerValue ); + auto worker = [requests] () mutable { + MPI::Waitall( requests.data(), requests.size() ); + }; + this->async_op = std::async( std::launch::deferred, worker ); + } + else { + // synchronous + synchronizeByteArray( array, bytesPerValue ); + } + + async_ops_count++; + async_start_timer.stop(); + } + + virtual ~ByteArraySynchronizer() = default; + + /** + * \brief Can be used for checking if a synchronization started + * asynchronously has been finished. 
+ * + * Note that derived classes *must* make this check in the destructor, + * otherwise running \ref synchronizeByteArrayAsync would lead to the error + * `pure virtual method called` when the derived object is destructed before + * the async operation finishes. This cannot be implemented in the base class + * destructor, because the derived destructor is run first. + * + * ~Derived() + * { + * if( this->async_op.valid() ) + * this->async_op.wait(); + * } + */ + std::future< void > async_op; + + // attributes for profiling + Timer async_wait_before_start_timer, async_start_timer, async_wait_timer; + std::size_t async_ops_count = 0; +}; + +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/DistributedArray.h b/src/TNL/Containers/DistributedArray.h index 66dd8a8f0a07f9ed3a60ddc6fbc0471c17008bb3..3947bfec438a31307b32241a4bebc9e6a4324ab7 100644 --- a/src/TNL/Containers/DistributedArray.h +++ b/src/TNL/Containers/DistributedArray.h @@ -21,22 +21,22 @@ namespace Containers { template< typename Value, typename Device = Devices::Host, typename Index = int, - typename Communicator = Communicators::MpiCommunicator > + typename Allocator = typename Allocators::Default< Device >::template Allocator< Value > > class DistributedArray { - using CommunicationGroup = typename Communicator::CommunicationGroup; - using LocalArrayType = Containers::Array< Value, Device, Index >; + using LocalArrayType = Containers::Array< Value, Device, Index, Allocator >; public: using ValueType = Value; using DeviceType = Device; - using CommunicatorType = Communicator; using IndexType = Index; + using AllocatorType = Allocator; using LocalRangeType = Subrange< Index >; using LocalViewType = Containers::ArrayView< Value, Device, Index >; using ConstLocalViewType = Containers::ArrayView< std::add_const_t< Value >, Device, Index >; - using ViewType = DistributedArrayView< Value, Device, Index, Communicator >; - using ConstViewType = DistributedArrayView< std::add_const_t< 
Value >, Device, Index, Communicator >; + using ViewType = DistributedArrayView< Value, Device, Index >; + using ConstViewType = DistributedArrayView< std::add_const_t< Value >, Device, Index >; + using SynchronizerType = typename ViewType::SynchronizerType; /** * \brief A template which allows to quickly obtain a \ref DistributedArray type with changed template parameters. @@ -44,52 +44,86 @@ public: template< typename _Value, typename _Device = Device, typename _Index = Index, - typename _Communicator = Communicator > - using Self = DistributedArray< _Value, _Device, _Index, _Communicator >; + typename _Allocator = typename Allocators::Default< _Device >::template Allocator< _Value > > + using Self = DistributedArray< _Value, _Device, _Index, _Allocator >; + ~DistributedArray(); + + /** + * \brief Constructs an empty array with zero size. + */ DistributedArray() = default; - DistributedArray( const DistributedArray& ) = default; + /** + * \brief Constructs an empty array and sets the provided allocator. + * + * \param allocator The allocator to be associated with this array. + */ + explicit DistributedArray( const AllocatorType& allocator ); + + /** + * \brief Copy constructor (makes a deep copy). + * + * \param array The array to be copied. + */ + explicit DistributedArray( const DistributedArray& array ); + + /** + * \brief Copy constructor with a specific allocator (makes a deep copy). + * + * \param array The array to be copied. + * \param allocator The allocator to be associated with this array. 
+ */ + explicit DistributedArray( const DistributedArray& array, const AllocatorType& allocator ); - DistributedArray( LocalRangeType localRange, Index globalSize, CommunicationGroup group = Communicator::AllGroup ); + DistributedArray( LocalRangeType localRange, Index ghosts, Index globalSize, MPI_Comm group = MPI::AllGroup(), const AllocatorType& allocator = AllocatorType() ); - void setDistribution( LocalRangeType localRange, Index globalSize, CommunicationGroup group = Communicator::AllGroup ); + void setDistribution( LocalRangeType localRange, Index ghosts, Index globalSize, MPI_Comm group = MPI::AllGroup() ); const LocalRangeType& getLocalRange() const; - CommunicationGroup getCommunicationGroup() const; + IndexType getGhosts() const; + + MPI_Comm getCommunicationGroup() const; + + AllocatorType getAllocator() const; /** * \brief Returns a modifiable view of the local part of the array. - * - * If \e begin or \e end is set to a non-zero value, a view for the - * sub-interval `[begin, end)` is returned. Otherwise a view for whole - * local part of the array view is returned. - * - * \param begin The beginning of the array view sub-interval. It is 0 by - * default. - * \param end The end of the array view sub-interval. The default value is 0 - * which is, however, replaced with the array size. */ LocalViewType getLocalView(); /** * \brief Returns a non-modifiable view of the local part of the array. - * - * If \e begin or \e end is set to a non-zero value, a view for the - * sub-interval `[begin, end)` is returned. Otherwise a view for whole - * local part of the array view is returned. - * - * \param begin The beginning of the array view sub-interval. It is 0 by - * default. - * \param end The end of the array view sub-interval. The default value is 0 - * which is, however, replaced with the array size. */ ConstLocalViewType getConstLocalView() const; + /** + * \brief Returns a modifiable view of the local part of the array, + * including ghost values. 
+ */ + LocalViewType getLocalViewWithGhosts(); + + /** + * \brief Returns a non-modifiable view of the local part of the array, + * including ghost values. + */ + ConstLocalViewType getConstLocalViewWithGhosts() const; + void copyFromGlobal( ConstLocalViewType globalArray ); + // synchronizer stuff + void setSynchronizer( std::shared_ptr< SynchronizerType > synchronizer, int valuesPerElement = 1 ); + + std::shared_ptr< SynchronizerType > getSynchronizer() const; + + int getValuesPerElement() const; + + void startSynchronization(); + + void waitForSynchronization() const; + // Usual Array methods follow below. @@ -168,10 +202,17 @@ public: // TODO: serialization (save, load) protected: - LocalRangeType localRange; - IndexType globalSize = 0; - CommunicationGroup group = Communicator::NullGroup; + ViewType view; LocalArrayType localData; + +private: + template< typename Array, std::enable_if_t< std::is_same< typename Array::DeviceType, DeviceType >::value, bool > = true > + static void setSynchronizerHelper( ViewType& view, const Array& array ) + { + view.setSynchronizer( array.getSynchronizer(), array.getValuesPerElement() ); + } + template< typename Array, std::enable_if_t< ! 
std::is_same< typename Array::DeviceType, DeviceType >::value, bool > = true > + static void setSynchronizerHelper( ViewType& view, const Array& array ) {} }; } // namespace Containers diff --git a/src/TNL/Containers/DistributedArray.hpp b/src/TNL/Containers/DistributedArray.hpp index c146bbf9f8657e6af5f38a8506d9c944a539c57a..e9ee120932070bfb7cb57e1e65ecd38da1cd01ce 100644 --- a/src/TNL/Containers/DistributedArray.hpp +++ b/src/TNL/Containers/DistributedArray.hpp @@ -15,7 +15,6 @@ #include "DistributedArray.h" #include -#include // important only when MPI is disabled namespace TNL { namespace Containers { @@ -23,94 +22,226 @@ namespace Containers { template< typename Value, typename Device, typename Index, - typename Communicator > -DistributedArray< Value, Device, Index, Communicator >:: -DistributedArray( LocalRangeType localRange, IndexType globalSize, CommunicationGroup group ) + typename Allocator > +DistributedArray< Value, Device, Index, Allocator >:: +~DistributedArray() { - setDistribution( localRange, globalSize, group ); + // Wait for pending async operation, otherwise the synchronizer would crash + // if the array goes out of scope. 
+ waitForSynchronization(); } template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > +DistributedArray< Value, Device, Index, Allocator >:: +DistributedArray( const Allocator& allocator ) +: localData( allocator ) +{ +} + +template< typename Value, + typename Device, + typename Index, + typename Allocator > +DistributedArray< Value, Device, Index, Allocator >:: +DistributedArray( const DistributedArray& array ) +{ + setLike( array ); + view = array; +} + +template< typename Value, + typename Device, + typename Index, + typename Allocator > +DistributedArray< Value, Device, Index, Allocator >:: +DistributedArray( const DistributedArray& array, const Allocator& allocator ) +: localData( allocator ) +{ + setLike( array ); + view = array; +} + +template< typename Value, + typename Device, + typename Index, + typename Allocator > +DistributedArray< Value, Device, Index, Allocator >:: +DistributedArray( LocalRangeType localRange, IndexType ghosts, IndexType globalSize, MPI_Comm group, const Allocator& allocator ) +: localData( allocator ) +{ + setDistribution( localRange, ghosts, globalSize, group ); +} + +template< typename Value, + typename Device, + typename Index, + typename Allocator > void -DistributedArray< Value, Device, Index, Communicator >:: -setDistribution( LocalRangeType localRange, IndexType globalSize, CommunicationGroup group ) +DistributedArray< Value, Device, Index, Allocator >:: +setDistribution( LocalRangeType localRange, IndexType ghosts, IndexType globalSize, MPI_Comm group ) { TNL_ASSERT_LE( localRange.getEnd(), globalSize, "end of the local range is outside of the global range" ); - this->localRange = localRange; - this->globalSize = globalSize; - this->group = group; - if( group != Communicator::NullGroup ) - localData.setSize( localRange.getSize() ); + if( group != MPI::NullGroup() ) + localData.setSize( localRange.getSize() + ghosts ); + view.bind( localRange, ghosts, globalSize, group, 
localData.getView() ); } template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > const Subrange< Index >& -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: getLocalRange() const { - return localRange; + return view.getLocalRange(); +} + +template< typename Value, + typename Device, + typename Index, + typename Allocator > +Index +DistributedArray< Value, Device, Index, Allocator >:: +getGhosts() const +{ + return view.getGhosts(); } template< typename Value, typename Device, typename Index, - typename Communicator > -typename Communicator::CommunicationGroup -DistributedArray< Value, Device, Index, Communicator >:: + typename Allocator > +MPI_Comm +DistributedArray< Value, Device, Index, Allocator >:: getCommunicationGroup() const { - return group; + return view.getCommunicationGroup(); +} + +template< typename Value, + typename Device, + typename Index, + typename Allocator > +Allocator +DistributedArray< Value, Device, Index, Allocator >:: +getAllocator() const +{ + return localData.getAllocator(); } template< typename Value, typename Device, typename Index, - typename Communicator > -typename DistributedArray< Value, Device, Index, Communicator >::LocalViewType -DistributedArray< Value, Device, Index, Communicator >:: + typename Allocator > +typename DistributedArray< Value, Device, Index, Allocator >::LocalViewType +DistributedArray< Value, Device, Index, Allocator >:: getLocalView() { - return localData.getView(); + return view.getLocalView(); } template< typename Value, typename Device, typename Index, - typename Communicator > -typename DistributedArray< Value, Device, Index, Communicator >::ConstLocalViewType -DistributedArray< Value, Device, Index, Communicator >:: + typename Allocator > +typename DistributedArray< Value, Device, Index, Allocator >::ConstLocalViewType +DistributedArray< Value, Device, Index, Allocator >:: getConstLocalView() 
const { - return localData.getConstView(); + return view.getConstLocalView(); +} + +template< typename Value, + typename Device, + typename Index, + typename Allocator > +typename DistributedArray< Value, Device, Index, Allocator >::LocalViewType +DistributedArray< Value, Device, Index, Allocator >:: +getLocalViewWithGhosts() +{ + return view.getLocalViewWithGhosts(); +} + +template< typename Value, + typename Device, + typename Index, + typename Allocator > +typename DistributedArray< Value, Device, Index, Allocator >::ConstLocalViewType +DistributedArray< Value, Device, Index, Allocator >:: +getConstLocalViewWithGhosts() const +{ + return view.getConstLocalViewWithGhosts(); } template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > void -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: copyFromGlobal( ConstLocalViewType globalArray ) { - TNL_ASSERT_EQ( getSize(), globalArray.getSize(), - "given global array has different size than the distributed array" ); + view.copyFromGlobal( globalArray ); +} + +template< typename Value, + typename Device, + typename Index, + typename Allocator > +void +DistributedArray< Value, Device, Index, Allocator >:: +setSynchronizer( std::shared_ptr< SynchronizerType > synchronizer, int valuesPerElement ) +{ + view.setSynchronizer( synchronizer, valuesPerElement ); +} + +template< typename Value, + typename Device, + typename Index, + typename Allocator > +std::shared_ptr< typename DistributedArrayView< Value, Device, Index >::SynchronizerType > +DistributedArray< Value, Device, Index, Allocator >:: +getSynchronizer() const +{ + return view.getSynchronizer(); +} - LocalViewType localView( localData ); - const LocalRangeType localRange = getLocalRange(); +template< typename Value, + typename Device, + typename Index, + typename Allocator > +int +DistributedArray< Value, Device, Index, Allocator >:: +getValuesPerElement() 
const +{ + return view.getValuesPerElement(); +} - auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable - { - localView[ i ] = globalArray[ localRange.getGlobalIndex( i ) ]; - }; +template< typename Value, + typename Device, + typename Index, + typename Allocator > +void +DistributedArray< Value, Device, Index, Allocator >:: +startSynchronization() +{ + view.startSynchronization(); +} - Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, localRange.getSize(), kernel ); +template< typename Value, + typename Device, + typename Index, + typename Allocator > +void +DistributedArray< Value, Device, Index, Allocator >:: +waitForSynchronization() const +{ + view.waitForSynchronization(); } @@ -121,30 +252,30 @@ copyFromGlobal( ConstLocalViewType globalArray ) template< typename Value, typename Device, typename Index, - typename Communicator > -typename DistributedArray< Value, Device, Index, Communicator >::ViewType -DistributedArray< Value, Device, Index, Communicator >:: + typename Allocator > +typename DistributedArray< Value, Device, Index, Allocator >::ViewType +DistributedArray< Value, Device, Index, Allocator >:: getView() { - return ViewType( getLocalRange(), getSize(), getCommunicationGroup(), getLocalView() ); + return view; } template< typename Value, typename Device, typename Index, - typename Communicator > -typename DistributedArray< Value, Device, Index, Communicator >::ConstViewType -DistributedArray< Value, Device, Index, Communicator >:: + typename Allocator > +typename DistributedArray< Value, Device, Index, Allocator >::ConstViewType +DistributedArray< Value, Device, Index, Allocator >:: getConstView() const { - return ConstViewType( getLocalRange(), getSize(), getCommunicationGroup(), getConstLocalView() ); + return view.getConstView(); } template< typename Value, typename Device, typename Index, - typename Communicator > -DistributedArray< Value, Device, Index, Communicator >:: + typename Allocator > +DistributedArray< Value, Device, 
Index, Allocator >:: operator ViewType() { return getView(); @@ -153,8 +284,8 @@ operator ViewType() template< typename Value, typename Device, typename Index, - typename Communicator > -DistributedArray< Value, Device, Index, Communicator >:: + typename Allocator > +DistributedArray< Value, Device, Index, Allocator >:: operator ConstViewType() const { return getConstView(); @@ -163,206 +294,181 @@ operator ConstViewType() const template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Array > void -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: setLike( const Array& array ) { - localRange = array.getLocalRange(); - globalSize = array.getSize(); - group = array.getCommunicationGroup(); - localData.setLike( array.getConstLocalView() ); + localData.setLike( array.getConstLocalViewWithGhosts() ); + view.bind( array.getLocalRange(), array.getGhosts(), array.getSize(), array.getCommunicationGroup(), localData.getView() ); + // set, but do not unset, the synchronizer + if( array.getSynchronizer() ) + setSynchronizerHelper( view, array ); } template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > void -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: reset() { - localRange.reset(); - globalSize = 0; - group = Communicator::NullGroup; + view.reset(); localData.reset(); } template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > bool -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: empty() const { - return getSize() == 0; + return view.empty(); } template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > Index -DistributedArray< Value, Device, Index, Communicator >:: 
+DistributedArray< Value, Device, Index, Allocator >:: getSize() const { - return globalSize; + return view.getSize(); } template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > void -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: setValue( ValueType value ) { - localData.setValue( value ); + view.setValue( value ); } template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > void -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: setElement( IndexType i, ValueType value ) { - const IndexType li = localRange.getLocalIndex( i ); - localData.setElement( li, value ); + view.setElement( i, value ); } template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > Value -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: getElement( IndexType i ) const { - const IndexType li = localRange.getLocalIndex( i ); - return localData.getElement( li ); + return view.getElement( i ); } template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > __cuda_callable__ Value& -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: operator[]( IndexType i ) { - const IndexType li = localRange.getLocalIndex( i ); - return localData[ li ]; + return view[ i ]; } template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > __cuda_callable__ const Value& -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: operator[]( IndexType i ) const { - const IndexType li = localRange.getLocalIndex( i ); - return localData[ li ]; + return view[ i ]; } template< typename 
Value, typename Device, typename Index, - typename Communicator > -DistributedArray< Value, Device, Index, Communicator >& -DistributedArray< Value, Device, Index, Communicator >:: + typename Allocator > +DistributedArray< Value, Device, Index, Allocator >& +DistributedArray< Value, Device, Index, Allocator >:: operator=( const DistributedArray& array ) { setLike( array ); - localData = array.getConstLocalView(); + view = array; return *this; } template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Array, typename..., typename > -DistributedArray< Value, Device, Index, Communicator >& -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >& +DistributedArray< Value, Device, Index, Allocator >:: operator=( const Array& array ) { setLike( array ); - localData = array.getConstLocalView(); + view = array; return *this; } template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Array > bool -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: operator==( const Array& array ) const { - // we can't run allreduce if the communication groups are different - if( group != array.getCommunicationGroup() ) - return false; - const bool localResult = - localRange == array.getLocalRange() && - globalSize == array.getSize() && - localData == array.getConstLocalView(); - bool result = true; - if( group != CommunicatorType::NullGroup ) - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group ); - return result; + return view == array; } template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Array > bool -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: operator!=( const Array& array ) 
const { - return ! (*this == array); + return view != array; } template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > bool -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: containsValue( ValueType value ) const { - bool result = false; - if( group != CommunicatorType::NullGroup ) { - const bool localResult = localData.containsValue( value ); - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LOR, group ); - } - return result; + return view.containsValue( value ); } template< typename Value, typename Device, typename Index, - typename Communicator > + typename Allocator > bool -DistributedArray< Value, Device, Index, Communicator >:: +DistributedArray< Value, Device, Index, Allocator >:: containsOnlyValue( ValueType value ) const { - bool result = true; - if( group != CommunicatorType::NullGroup ) { - const bool localResult = localData.containsOnlyValue( value ); - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group ); - } - return result; + return view.containsOnlyValue( value ); } } // namespace Containers diff --git a/src/TNL/Containers/DistributedArrayView.h b/src/TNL/Containers/DistributedArrayView.h index e17467befa5850d3f5c1d48723c50526f1ee7a39..cb3235ddbb746acf1149a697beaf49b16b39b1aa 100644 --- a/src/TNL/Containers/DistributedArrayView.h +++ b/src/TNL/Containers/DistributedArrayView.h @@ -12,117 +12,120 @@ #pragma once +#include + #include -#include #include +#include +#include namespace TNL { namespace Containers { template< typename Value, typename Device = Devices::Host, - typename Index = int, - typename Communicator = Communicators::MpiCommunicator > + typename Index = int > class DistributedArrayView { - using CommunicationGroup = typename Communicator::CommunicationGroup; public: using ValueType = Value; using DeviceType = Device; - using CommunicatorType = Communicator; using IndexType = Index; using 
LocalRangeType = Subrange< Index >; using LocalViewType = Containers::ArrayView< Value, Device, Index >; using ConstLocalViewType = Containers::ArrayView< std::add_const_t< Value >, Device, Index >; - using ViewType = DistributedArrayView< Value, Device, Index, Communicator >; - using ConstViewType = DistributedArrayView< std::add_const_t< Value >, Device, Index, Communicator >; + using ViewType = DistributedArrayView< Value, Device, Index >; + using ConstViewType = DistributedArrayView< std::add_const_t< Value >, Device, Index >; + using SynchronizerType = ByteArraySynchronizer< DeviceType, IndexType >; /** * \brief A template which allows to quickly obtain a \ref DistributedArrayView type with changed template parameters. */ template< typename _Value, typename _Device = Device, - typename _Index = Index, - typename _Communicator = Communicator > - using Self = DistributedArrayView< _Value, _Device, _Index, _Communicator >; + typename _Index = Index > + using Self = DistributedArrayView< _Value, _Device, _Index >; + + ~DistributedArrayView(); // Initialization by raw data - __cuda_callable__ - DistributedArrayView( const LocalRangeType& localRange, IndexType globalSize, CommunicationGroup group, LocalViewType localData ) - : localRange(localRange), globalSize(globalSize), group(group), localData(localData) + DistributedArrayView( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, MPI_Comm group, LocalViewType localData ) + : localRange(localRange), ghosts(ghosts), globalSize(globalSize), group(group), localData(localData) { - TNL_ASSERT_EQ( localData.getSize(), localRange.getSize(), + TNL_ASSERT_EQ( localData.getSize(), localRange.getSize() + ghosts, "The local array size does not match the local range of the distributed array." ); + TNL_ASSERT_GE( ghosts, 0, "The ghosts count must be non-negative." 
); } - __cuda_callable__ DistributedArrayView() = default; - // Copy-constructor does shallow copy, so views can be passed-by-value into - // CUDA kernels and they can be captured-by-value in __cuda_callable__ - // lambda functions. - __cuda_callable__ + // Copy-constructor does shallow copy. DistributedArrayView( const DistributedArrayView& ) = default; // "Templated copy-constructor" accepting any cv-qualification of Value template< typename Value_ > - __cuda_callable__ - DistributedArrayView( const DistributedArrayView< Value_, Device, Index, Communicator >& ); + DistributedArrayView( const DistributedArrayView< Value_, Device, Index >& ); // default move-constructor - __cuda_callable__ DistributedArrayView( DistributedArrayView&& ) = default; - // method for rebinding (reinitialization) - // Note that you can also bind directly to Array and other types implicitly - // convertible to ArrayView. - __cuda_callable__ + // method for rebinding (reinitialization) to raw data + void bind( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, MPI_Comm group, LocalViewType localData ); + + // Note that you can also bind directly to DistributedArray and other types implicitly + // convertible to DistributedArrayView. void bind( DistributedArrayView view ); // binding to local array via raw pointer - // (local range, global size and communication group are preserved) + // (local range, ghosts, global size and communication group are preserved) template< typename Value_ > void bind( Value_* data, IndexType localSize ); - /** - * \brief Returns a modifiable view of the array view. - */ - __cuda_callable__ - ViewType getView(); + const LocalRangeType& getLocalRange() const; - /** - * \brief Returns a non-modifiable view of the array view. 
- */ - __cuda_callable__ - ConstViewType getConstView() const; + IndexType getGhosts() const; + MPI_Comm getCommunicationGroup() const; - // Copy-assignment does deep copy, just like regular array, but the sizes - // must match (i.e. copy-assignment cannot resize). - DistributedArrayView& operator=( const DistributedArrayView& view ); + LocalViewType getLocalView(); - template< typename Array, - typename..., - typename = std::enable_if_t< HasSubscriptOperator::value > > - DistributedArrayView& operator=( const Array& array ); + ConstLocalViewType getConstLocalView() const; + LocalViewType getLocalViewWithGhosts(); - const LocalRangeType& getLocalRange() const; + ConstLocalViewType getConstLocalViewWithGhosts() const; - CommunicationGroup getCommunicationGroup() const; + void copyFromGlobal( ConstLocalViewType globalArray ); - LocalViewType getLocalView(); + // synchronizer stuff + void setSynchronizer( std::shared_ptr< SynchronizerType > synchronizer, int valuesPerElement = 1 ); - ConstLocalViewType getConstLocalView() const; + std::shared_ptr< SynchronizerType > getSynchronizer() const; - void copyFromGlobal( ConstLocalViewType globalArray ); + int getValuesPerElement() const; + + // Note that this method is not thread-safe - only the thread which created + // and "owns" the instance of this object can call this method. + void startSynchronization(); + + void waitForSynchronization() const; /* * Usual ArrayView methods follow below. */ + /** + * \brief Returns a modifiable view of the array view. + */ + ViewType getView(); + + /** + * \brief Returns a non-modifiable view of the array view. + */ + ConstViewType getConstView() const; + // Resets the array view to the empty state. void reset(); @@ -151,6 +154,15 @@ public: __cuda_callable__ const ValueType& operator[]( IndexType i ) const; + // Copy-assignment does deep copy, just like regular array, but the sizes + // must match (i.e. copy-assignment cannot resize). 
+ DistributedArrayView& operator=( const DistributedArrayView& view ); + + template< typename Array, + typename..., + typename = std::enable_if_t< HasSubscriptOperator::value > > + DistributedArrayView& operator=( const Array& array ); + // Comparison operators template< typename Array > bool operator==( const Array& array ) const; @@ -166,9 +178,13 @@ public: protected: LocalRangeType localRange; + IndexType ghosts = 0; IndexType globalSize = 0; - CommunicationGroup group = Communicator::NullGroup; + MPI_Comm group = MPI::NullGroup(); LocalViewType localData; + + std::shared_ptr< SynchronizerType > synchronizer = nullptr; + int valuesPerElement = 1; }; } // namespace Containers diff --git a/src/TNL/Containers/DistributedArrayView.hpp b/src/TNL/Containers/DistributedArrayView.hpp index 0199229d48cab585b78d6618437d9fbcf275092a..65ecc4101fc0258bec0635aa07b202e83c9f178d 100644 --- a/src/TNL/Containers/DistributedArrayView.hpp +++ b/src/TNL/Containers/DistributedArrayView.hpp @@ -19,160 +19,161 @@ namespace Containers { template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > +DistributedArrayView< Value, Device, Index >:: +~DistributedArrayView() +{ + // Wait for pending async operation, otherwise the synchronizer might crash + // if the view goes out of scope. + // (The same thing is done even in DistributedArray, but there might be views + // bound to an array without a synchronizer, in which case this helps.) 
+ waitForSynchronization(); +} + +template< typename Value, + typename Device, + typename Index > template< typename Value_ > -__cuda_callable__ -DistributedArrayView< Value, Device, Index, Communicator >:: -DistributedArrayView( const DistributedArrayView< Value_, Device, Index, Communicator >& view ) +DistributedArrayView< Value, Device, Index >:: +DistributedArrayView( const DistributedArrayView< Value_, Device, Index >& view ) : localRange( view.getLocalRange() ), + ghosts( view.getGhosts() ), globalSize( view.getSize() ), group( view.getCommunicationGroup() ), - localData( view.getConstLocalView() ) + localData( view.getConstLocalViewWithGhosts() ), + synchronizer( view.getSynchronizer() ), + valuesPerElement( view.getValuesPerElement() ) {} template< typename Value, typename Device, - typename Index, - typename Communicator > -__cuda_callable__ + typename Index > +void +DistributedArrayView< Value, Device, Index >:: +bind( const LocalRangeType& localRange, IndexType ghosts, IndexType globalSize, MPI_Comm group, LocalViewType localData ) +{ + TNL_ASSERT_EQ( localData.getSize(), localRange.getSize() + ghosts, + "The local array size does not match the local range of the distributed array." ); + TNL_ASSERT_GE( ghosts, 0, "The ghosts count must be non-negative." 
); + + this->localRange = localRange; + this->ghosts = ghosts; + this->globalSize = globalSize; + this->group = group; + this->localData.bind( localData ); +} + +template< typename Value, + typename Device, + typename Index > void -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: bind( DistributedArrayView view ) { localRange = view.getLocalRange(); + ghosts = view.getGhosts(); globalSize = view.getSize(); group = view.getCommunicationGroup(); - localData.bind( view.getLocalView() ); + localData.bind( view.getLocalViewWithGhosts() ); + // set, but do not unset, the synchronizer + if( view.getSynchronizer() ) + setSynchronizer( view.getSynchronizer(), view.getValuesPerElement() ); } template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Value_ > void -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: bind( Value_* data, IndexType localSize ) { - TNL_ASSERT_EQ( localSize, localRange.getSize(), + TNL_ASSERT_EQ( localSize, localRange.getSize() + ghosts, "The local array size does not match the local range of the distributed array." 
); localData.bind( data, localSize ); } template< typename Value, typename Device, - typename Index, - typename Communicator > -__cuda_callable__ -typename DistributedArrayView< Value, Device, Index, Communicator >::ViewType -DistributedArrayView< Value, Device, Index, Communicator >:: -getView() -{ - return *this; -} - -template< typename Value, - typename Device, - typename Index, - typename Communicator > -__cuda_callable__ -typename DistributedArrayView< Value, Device, Index, Communicator >::ConstViewType -DistributedArrayView< Value, Device, Index, Communicator >:: -getConstView() const + typename Index > +const Subrange< Index >& +DistributedArrayView< Value, Device, Index >:: +getLocalRange() const { - return *this; + return localRange; } - template< typename Value, typename Device, - typename Index, - typename Communicator > -DistributedArrayView< Value, Device, Index, Communicator >& -DistributedArrayView< Value, Device, Index, Communicator >:: -operator=( const DistributedArrayView& view ) + typename Index > +Index +DistributedArrayView< Value, Device, Index >:: +getGhosts() const { - TNL_ASSERT_EQ( getSize(), view.getSize(), "The sizes of the array views must be equal, views are not resizable." ); - TNL_ASSERT_EQ( getLocalRange(), view.getLocalRange(), "The local ranges must be equal, views are not resizable." ); - TNL_ASSERT_EQ( getCommunicationGroup(), view.getCommunicationGroup(), "The communication groups of the array views must be equal." 
); - localData = view.getConstLocalView(); - return *this; + return ghosts; } template< typename Value, typename Device, - typename Index, - typename Communicator > - template< typename Array, typename..., typename > -DistributedArrayView< Value, Device, Index, Communicator >& -DistributedArrayView< Value, Device, Index, Communicator >:: -operator=( const Array& array ) + typename Index > +MPI_Comm +DistributedArrayView< Value, Device, Index >:: +getCommunicationGroup() const { - TNL_ASSERT_EQ( getSize(), array.getSize(), "The global sizes must be equal, views are not resizable." ); - TNL_ASSERT_EQ( getLocalRange(), array.getLocalRange(), "The local ranges must be equal, views are not resizable." ); - TNL_ASSERT_EQ( getCommunicationGroup(), array.getCommunicationGroup(), "The communication groups must be equal." ); - localData = array.getConstLocalView(); - return *this; + return group; } - template< typename Value, typename Device, - typename Index, - typename Communicator > -const Subrange< Index >& -DistributedArrayView< Value, Device, Index, Communicator >:: -getLocalRange() const + typename Index > +typename DistributedArrayView< Value, Device, Index >::LocalViewType +DistributedArrayView< Value, Device, Index >:: +getLocalView() { - return localRange; + return LocalViewType( localData.getData(), localRange.getSize() ); } template< typename Value, typename Device, - typename Index, - typename Communicator > -typename Communicator::CommunicationGroup -DistributedArrayView< Value, Device, Index, Communicator >:: -getCommunicationGroup() const + typename Index > +typename DistributedArrayView< Value, Device, Index >::ConstLocalViewType +DistributedArrayView< Value, Device, Index >:: +getConstLocalView() const { - return group; + return ConstLocalViewType( localData.getData(), localRange.getSize() ); } template< typename Value, typename Device, - typename Index, - typename Communicator > -typename DistributedArrayView< Value, Device, Index, Communicator 
>::LocalViewType -DistributedArrayView< Value, Device, Index, Communicator >:: -getLocalView() + typename Index > +typename DistributedArrayView< Value, Device, Index >::LocalViewType +DistributedArrayView< Value, Device, Index >:: +getLocalViewWithGhosts() { return localData; } template< typename Value, typename Device, - typename Index, - typename Communicator > -typename DistributedArrayView< Value, Device, Index, Communicator >::ConstLocalViewType -DistributedArrayView< Value, Device, Index, Communicator >:: -getConstLocalView() const + typename Index > +typename DistributedArrayView< Value, Device, Index >::ConstLocalViewType +DistributedArrayView< Value, Device, Index >:: +getConstLocalViewWithGhosts() const { return localData; } template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > void -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: copyFromGlobal( ConstLocalViewType globalArray ) { TNL_ASSERT_EQ( getSize(), globalArray.getSize(), "given global array has different size than the distributed array view" ); - LocalViewType localView( localData ); + LocalViewType localView = getLocalView(); const LocalRangeType localRange = getLocalRange(); auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable @@ -181,29 +182,114 @@ copyFromGlobal( ConstLocalViewType globalArray ) }; Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, localRange.getSize(), kernel ); + startSynchronization(); +} + +template< typename Value, + typename Device, + typename Index > +void +DistributedArrayView< Value, Device, Index >:: +setSynchronizer( std::shared_ptr< SynchronizerType > synchronizer, int valuesPerElement ) +{ + this->synchronizer = synchronizer; + this->valuesPerElement = valuesPerElement; +} + +template< typename Value, + typename Device, + typename Index > +std::shared_ptr< typename DistributedArrayView< Value, Device, Index >::SynchronizerType > 
+DistributedArrayView< Value, Device, Index >:: +getSynchronizer() const +{ + return synchronizer; } +template< typename Value, + typename Device, + typename Index > +int +DistributedArrayView< Value, Device, Index >:: +getValuesPerElement() const +{ + return valuesPerElement; +} + +template< typename Value, + typename Device, + typename Index > +void +DistributedArrayView< Value, Device, Index >:: +startSynchronization() +{ + if( ghosts == 0 ) + return; + // TODO: assert does not play very nice with automatic synchronizations from operations like + // assignment of scalars + // (Maybe we should just drop all automatic syncs? But that's not nice for high-level codes + // like linear solvers...) + TNL_ASSERT_TRUE( synchronizer, "the synchronizer was not set" ); + + typename SynchronizerType::ByteArrayView bytes; + bytes.bind( reinterpret_cast( localData.getData() ), sizeof(ValueType) * localData.getSize() ); + synchronizer->synchronizeByteArrayAsync( bytes, sizeof(ValueType) * valuesPerElement ); +} template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > void -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: +waitForSynchronization() const +{ + if( synchronizer && synchronizer->async_op.valid() ) { + synchronizer->async_wait_timer.start(); + synchronizer->async_op.wait(); + synchronizer->async_wait_timer.stop(); + } +} + + +template< typename Value, + typename Device, + typename Index > +typename DistributedArrayView< Value, Device, Index >::ViewType +DistributedArrayView< Value, Device, Index >:: +getView() +{ + return *this; +} + +template< typename Value, + typename Device, + typename Index > +typename DistributedArrayView< Value, Device, Index >::ConstViewType +DistributedArrayView< Value, Device, Index >:: +getConstView() const +{ + return *this; +} + +template< typename Value, + typename Device, + typename Index > +void +DistributedArrayView< Value, 
Device, Index >:: reset() { localRange.reset(); + ghosts = 0; globalSize = 0; - group = Communicator::NullGroup; + group = MPI::NullGroup(); localData.reset(); } template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > bool -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: empty() const { return getSize() == 0; @@ -213,10 +299,9 @@ empty() const template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > Index -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: getSize() const { return globalSize; @@ -224,21 +309,20 @@ getSize() const template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > void -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: setValue( ValueType value ) { localData.setValue( value ); + startSynchronization(); } template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > void -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: setElement( IndexType i, ValueType value ) { const IndexType li = localRange.getLocalIndex( i ); @@ -247,10 +331,9 @@ setElement( IndexType i, ValueType value ) template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > Value -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: getElement( IndexType i ) const { const IndexType li = localRange.getLocalIndex( i ); @@ -259,11 +342,10 @@ getElement( IndexType i ) const template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > __cuda_callable__ Value& -DistributedArrayView< Value, Device, Index, Communicator >:: 
+DistributedArrayView< Value, Device, Index >:: operator[]( IndexType i ) { const IndexType li = localRange.getLocalIndex( i ); @@ -272,11 +354,10 @@ operator[]( IndexType i ) template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > __cuda_callable__ const Value& -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: operator[]( IndexType i ) const { const IndexType li = localRange.getLocalIndex( i ); @@ -285,11 +366,47 @@ operator[]( IndexType i ) const template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > +DistributedArrayView< Value, Device, Index >& +DistributedArrayView< Value, Device, Index >:: +operator=( const DistributedArrayView& view ) +{ + TNL_ASSERT_EQ( getSize(), view.getSize(), "The sizes of the array views must be equal, views are not resizable." ); + TNL_ASSERT_EQ( getLocalRange(), view.getLocalRange(), "The local ranges must be equal, views are not resizable." ); + TNL_ASSERT_EQ( getGhosts(), view.getGhosts(), "Ghosts must be equal, views are not resizable." ); + TNL_ASSERT_EQ( getCommunicationGroup(), view.getCommunicationGroup(), "The communication groups of the array views must be equal." ); + localData = view.getConstLocalViewWithGhosts(); + // set, but do not unset, the synchronizer + if( view.getSynchronizer() ) + setSynchronizer( view.getSynchronizer(), view.getValuesPerElement() ); + return *this; +} + +template< typename Value, + typename Device, + typename Index > + template< typename Array, typename..., typename > +DistributedArrayView< Value, Device, Index >& +DistributedArrayView< Value, Device, Index >:: +operator=( const Array& array ) +{ + TNL_ASSERT_EQ( getSize(), array.getSize(), "The global sizes must be equal, views are not resizable." ); + TNL_ASSERT_EQ( getLocalRange(), array.getLocalRange(), "The local ranges must be equal, views are not resizable." 
); + TNL_ASSERT_EQ( getGhosts(), array.getGhosts(), "Ghosts must be equal, views are not resizable." ); + TNL_ASSERT_EQ( getCommunicationGroup(), array.getCommunicationGroup(), "The communication groups must be equal." ); + localData = array.getConstLocalViewWithGhosts(); + // set, but do not unset, the synchronizer + if( array.getSynchronizer() ) + setSynchronizer( array.getSynchronizer(), array.getValuesPerElement() ); + return *this; +} + +template< typename Value, + typename Device, + typename Index > template< typename Array > bool -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: operator==( const Array& array ) const { // we can't run allreduce if the communication groups are different @@ -297,21 +414,22 @@ operator==( const Array& array ) const return false; const bool localResult = localRange == array.getLocalRange() && + ghosts == array.getGhosts() && globalSize == array.getSize() && - localData == array.getConstLocalView(); + // compare without ghosts + getConstLocalView() == array.getConstLocalView(); bool result = true; - if( group != CommunicatorType::NullGroup ) - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group ); + if( group != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, group ); return result; } template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Array > bool -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: operator!=( const Array& array ) const { return ! 
(*this == array); @@ -319,32 +437,30 @@ operator!=( const Array& array ) const template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > bool -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: containsValue( ValueType value ) const { bool result = false; - if( group != CommunicatorType::NullGroup ) { + if( group != MPI::NullGroup() ) { const bool localResult = localData.containsValue( value ); - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LOR, group ); + MPI::Allreduce( &localResult, &result, 1, MPI_LOR, group ); } return result; } template< typename Value, typename Device, - typename Index, - typename Communicator > + typename Index > bool -DistributedArrayView< Value, Device, Index, Communicator >:: +DistributedArrayView< Value, Device, Index >:: containsOnlyValue( ValueType value ) const { bool result = true; - if( group != CommunicatorType::NullGroup ) { + if( group != MPI::NullGroup() ) { const bool localResult = localData.containsOnlyValue( value ); - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group ); + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, group ); } return result; } diff --git a/src/TNL/Containers/DistributedNDArray.h b/src/TNL/Containers/DistributedNDArray.h index 57b94a34b1bd7c210d24462aa1859cc68f087f15..c49e9e31b0250a333bf430e60612214e4d1585d0 100644 --- a/src/TNL/Containers/DistributedNDArray.h +++ b/src/TNL/Containers/DistributedNDArray.h @@ -12,34 +12,30 @@ #pragma once -#include #include -#include #include namespace TNL { namespace Containers { template< typename NDArray, - typename Communicator = Communicators::MpiCommunicator, typename Overlaps = __ndarray_impl::make_constant_index_sequence< NDArray::getDimension(), 0 > > class DistributedNDArray { - using CommunicationGroup = typename Communicator::CommunicationGroup; public: using ValueType = typename NDArray::ValueType; using DeviceType = 
typename NDArray::DeviceType; using IndexType = typename NDArray::IndexType; + using AllocatorType = typename NDArray::AllocatorType; using SizesHolderType = typename NDArray::SizesHolderType; using PermutationType = typename NDArray::PermutationType; - using CommunicatorType = Communicator; using LocalBeginsType = __ndarray_impl::LocalBeginsHolder< typename NDArray::SizesHolderType >; using LocalRangeType = Subrange< IndexType >; using OverlapsType = Overlaps; using LocalIndexerType = NDArrayIndexer< SizesHolderType, PermutationType, typename NDArray::NDBaseType, typename NDArray::StridesHolderType, Overlaps >; - using ViewType = DistributedNDArrayView< typename NDArray::ViewType, Communicator, Overlaps >; - using ConstViewType = DistributedNDArrayView< typename NDArray::ConstViewType, Communicator, Overlaps >; + using ViewType = DistributedNDArrayView< typename NDArray::ViewType, Overlaps >; + using ConstViewType = DistributedNDArrayView< typename NDArray::ConstViewType, Overlaps >; using LocalViewType = typename NDArray::ViewType; using ConstLocalViewType = typename NDArray::ConstViewType; @@ -49,10 +45,17 @@ public: DistributedNDArray() = default; - // The copy-constructor of TNL::Containers::Array makes shallow copy so our - // copy-constructor cannot be default. Actually, we most likely don't need - // it anyway, so let's just delete it. - DistributedNDArray( const DistributedNDArray& ) = delete; + DistributedNDArray( const AllocatorType& allocator ); + + // Copy constructor (makes a deep copy). + explicit DistributedNDArray( const DistributedNDArray& ) = default; + + // Copy constructor with a specific allocator (makes a deep copy). + explicit DistributedNDArray( const DistributedNDArray& other, const AllocatorType& allocator ) + : localArray( allocator ) + { + *this = other; + } // Standard copy-semantics with deep copy, just like regular 1D array. // Mismatched sizes cause reallocations. 
@@ -79,8 +82,13 @@ public: return NDArray::getDimension(); } + AllocatorType getAllocator() const + { + return localArray.getAllocator(); + } + __cuda_callable__ - CommunicationGroup getCommunicationGroup() const + MPI_Comm getCommunicationGroup() const { return group; } @@ -232,8 +240,8 @@ public: localEnds == other.localEnds && localArray == other.localArray; bool result = true; - if( group != CommunicatorType::NullGroup ) - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group ); + if( group != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, group ); return result; } @@ -375,7 +383,7 @@ public: } template< std::size_t level > - void setDistribution( IndexType begin, IndexType end, CommunicationGroup group = Communicator::AllGroup ) + void setDistribution( IndexType begin, IndexType end, MPI_Comm group = MPI::AllGroup() ) { static_assert( SizesHolderType::template getStaticSize< level >() == 0, "NDArray cannot be distributed in static dimensions." 
); TNL_ASSERT_GE( begin, 0, "begin must be non-negative" ); @@ -383,7 +391,7 @@ public: TNL_ASSERT_LT( begin, end, "begin must be lesser than end" ); localBegins.template setSize< level >( begin ); localEnds.template setSize< level >( end ); - TNL_ASSERT( this->group == Communicator::NullGroup || this->group == group, + TNL_ASSERT( this->group == MPI::NullGroup() || this->group == group, std::cerr << "different groups cannot be combined for different dimensions" ); this->group = group; } @@ -408,7 +416,7 @@ public: void reset() { localArray.reset(); - group = CommunicatorType::NullGroup; + group = MPI::NullGroup(); globalSizes = SizesHolderType{}; localBegins = LocalBeginsType{}; localEnds = SizesHolderType{}; @@ -435,7 +443,7 @@ public: protected: NDArray localArray; - CommunicationGroup group = Communicator::NullGroup; + MPI_Comm group = MPI::NullGroup(); SizesHolderType globalSizes; // static sizes should have different type: localBegin is always 0, localEnd is always the full size LocalBeginsType localBegins; diff --git a/src/TNL/Containers/DistributedNDArraySynchronizer.h b/src/TNL/Containers/DistributedNDArraySynchronizer.h index bcec4a7b4760d9b864528f401f6ce68c7f3579f2..cea40bc21c3ec0c71ad891aafcfe620c9582754b 100644 --- a/src/TNL/Containers/DistributedNDArraySynchronizer.h +++ b/src/TNL/Containers/DistributedNDArraySynchronizer.h @@ -15,6 +15,7 @@ #include #include +#include namespace TNL { namespace Containers { @@ -69,7 +70,6 @@ public: protected: using DistributedNDArrayView = typename DistributedNDArray::ViewType; - using Communicator = typename DistributedNDArray::CommunicatorType; using Buffers = __ndarray_impl::SynchronizerBuffers< DistributedNDArray >; DistributedNDArrayView array_view; @@ -88,12 +88,12 @@ protected: Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), CopyHelper >::execHost( buffers, array_view, true ); // issue all send and receive async operations - std::vector< typename Communicator::Request > 
requests; - const typename Communicator::CommunicationGroup group = array_view.getCommunicationGroup(); + std::vector< MPI_Request > requests; + const MPI_Comm group = array_view.getCommunicationGroup(); Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), SendHelper >::execHost( buffers, requests, group ); // wait until send is done - Communicator::WaitAll( requests.data(), requests.size() ); + MPI::Waitall( requests.data(), requests.size() ); // copy data from receive buffers Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), CopyHelper >::execHost( buffers, array_view, false ); @@ -152,9 +152,9 @@ protected: dim_buffers.right_recv_offsets.template setSize< dim >( localEnds.template getSize< dim >() ); // FIXME: set proper neighbor IDs !!! - const typename Communicator::CommunicationGroup group = array_view.getCommunicationGroup(); - const int rank = Communicator::GetRank(group); - const int nproc = Communicator::GetSize(group); + const MPI_Comm group = array_view.getCommunicationGroup(); + const int rank = MPI::GetRank(group); + const int nproc = MPI::GetSize(group); dim_buffers.left_neighbor = (rank + nproc - 1) % nproc; dim_buffers.right_neighbor = (rank + 1) % nproc; } @@ -221,32 +221,32 @@ protected: auto& dim_buffers = buffers.template getDimBuffers< dim >(); if( LBM_HACK == false ) { - requests.push_back( Communicator::ISend( dim_buffers.left_send_view.getData(), - dim_buffers.left_send_view.getStorageSize(), - dim_buffers.left_neighbor, 0, group ) ); - requests.push_back( Communicator::IRecv( dim_buffers.left_recv_view.getData(), - dim_buffers.left_recv_view.getStorageSize(), - dim_buffers.left_neighbor, 1, group ) ); - requests.push_back( Communicator::ISend( dim_buffers.right_send_view.getData(), - dim_buffers.right_send_view.getStorageSize(), - dim_buffers.right_neighbor, 1, group ) ); - requests.push_back( Communicator::IRecv( dim_buffers.right_recv_view.getData(), - 
dim_buffers.right_recv_view.getStorageSize(), - dim_buffers.right_neighbor, 0, group ) ); + requests.push_back( MPI::Isend( dim_buffers.left_send_view.getData(), + dim_buffers.left_send_view.getStorageSize(), + dim_buffers.left_neighbor, 0, group ) ); + requests.push_back( MPI::Irecv( dim_buffers.left_recv_view.getData(), + dim_buffers.left_recv_view.getStorageSize(), + dim_buffers.left_neighbor, 1, group ) ); + requests.push_back( MPI::Isend( dim_buffers.right_send_view.getData(), + dim_buffers.right_send_view.getStorageSize(), + dim_buffers.right_neighbor, 1, group ) ); + requests.push_back( MPI::Irecv( dim_buffers.right_recv_view.getData(), + dim_buffers.right_recv_view.getStorageSize(), + dim_buffers.right_neighbor, 0, group ) ); } else { - requests.push_back( Communicator::ISend( dim_buffers.left_send_view.getData() + 0, - dim_buffers.left_send_view.getStorageSize() / 27 * 9, - dim_buffers.left_neighbor, 0, group ) ); - requests.push_back( Communicator::IRecv( dim_buffers.left_recv_view.getData() + dim_buffers.left_recv_view.getStorageSize() / 27 * 18, - dim_buffers.left_recv_view.getStorageSize() / 27 * 9, - dim_buffers.left_neighbor, 1, group ) ); - requests.push_back( Communicator::ISend( dim_buffers.right_send_view.getData() + dim_buffers.left_recv_view.getStorageSize() / 27 * 18, - dim_buffers.right_send_view.getStorageSize() / 27 * 9, - dim_buffers.right_neighbor, 1, group ) ); - requests.push_back( Communicator::IRecv( dim_buffers.right_recv_view.getData() + 0, - dim_buffers.right_recv_view.getStorageSize() / 27 * 9, - dim_buffers.right_neighbor, 0, group ) ); + requests.push_back( MPI::Isend( dim_buffers.left_send_view.getData() + 0, + dim_buffers.left_send_view.getStorageSize() / 27 * 9, + dim_buffers.left_neighbor, 0, group ) ); + requests.push_back( MPI::Irecv( dim_buffers.left_recv_view.getData() + dim_buffers.left_recv_view.getStorageSize() / 27 * 18, + dim_buffers.left_recv_view.getStorageSize() / 27 * 9, + dim_buffers.left_neighbor, 1, group ) 
); + requests.push_back( MPI::Isend( dim_buffers.right_send_view.getData() + dim_buffers.left_recv_view.getStorageSize() / 27 * 18, + dim_buffers.right_send_view.getStorageSize() / 27 * 9, + dim_buffers.right_neighbor, 1, group ) ); + requests.push_back( MPI::Irecv( dim_buffers.right_recv_view.getData() + 0, + dim_buffers.right_recv_view.getStorageSize() / 27 * 9, + dim_buffers.right_neighbor, 0, group ) ); } } }; diff --git a/src/TNL/Containers/DistributedNDArrayView.h b/src/TNL/Containers/DistributedNDArrayView.h index 102985e9c15e4ff0d058dc79c04ff14b7ae2194b..4812bf5c006b24dc7ab201901338fcd8ae68337b 100644 --- a/src/TNL/Containers/DistributedNDArrayView.h +++ b/src/TNL/Containers/DistributedNDArrayView.h @@ -12,33 +12,30 @@ #pragma once -#include #include #include +#include namespace TNL { namespace Containers { template< typename NDArrayView, - typename Communicator = Communicators::MpiCommunicator, typename Overlaps = __ndarray_impl::make_constant_index_sequence< NDArrayView::getDimension(), 0 > > class DistributedNDArrayView { - using CommunicationGroup = typename Communicator::CommunicationGroup; public: using ValueType = typename NDArrayView::ValueType; using DeviceType = typename NDArrayView::DeviceType; using IndexType = typename NDArrayView::IndexType; using SizesHolderType = typename NDArrayView::SizesHolderType; using PermutationType = typename NDArrayView::PermutationType; - using CommunicatorType = Communicator; using LocalBeginsType = __ndarray_impl::LocalBeginsHolder< typename NDArrayView::SizesHolderType >; using LocalRangeType = Subrange< IndexType >; using OverlapsType = Overlaps; using LocalIndexerType = NDArrayIndexer< SizesHolderType, PermutationType, typename NDArrayView::NDBaseType, typename NDArrayView::StridesHolderType, Overlaps >; - using ViewType = DistributedNDArrayView< NDArrayView, Communicator, Overlaps >; - using ConstViewType = DistributedNDArrayView< typename NDArrayView::ConstViewType, Communicator, Overlaps >; + using ViewType 
= DistributedNDArrayView< NDArrayView, Overlaps >; + using ConstViewType = DistributedNDArrayView< typename NDArrayView::ConstViewType, Overlaps >; using LocalViewType = NDArrayView; using ConstLocalViewType = typename NDArrayView::ConstViewType; @@ -49,7 +46,7 @@ public: // explicit initialization by local array view, global sizes and local begins and ends __cuda_callable__ - DistributedNDArrayView( NDArrayView localView, SizesHolderType globalSizes, LocalBeginsType localBegins, SizesHolderType localEnds, CommunicationGroup group ) + DistributedNDArrayView( NDArrayView localView, SizesHolderType globalSizes, LocalBeginsType localBegins, SizesHolderType localEnds, MPI_Comm group ) : localView(localView), group(group), globalSizes(globalSizes), localBegins(localBegins), localEnds(localEnds) {} // Copy-constructor does shallow copy, so views can be passed-by-value into @@ -112,7 +109,7 @@ public: void reset() { localView.reset(); - group = CommunicatorType::NullGroup; + group = MPI::NullGroup(); globalSizes = SizesHolderType{}; localBegins = LocalBeginsType{}; localEnds = SizesHolderType{}; @@ -124,7 +121,7 @@ public: } __cuda_callable__ - CommunicationGroup getCommunicationGroup() const + MPI_Comm getCommunicationGroup() const { return group; } @@ -276,8 +273,8 @@ public: localEnds == other.localEnds && localView == other.localView; bool result = true; - if( group != CommunicatorType::NullGroup ) - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group ); + if( group != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, group ); return result; } @@ -406,7 +403,7 @@ public: protected: NDArrayView localView; - CommunicationGroup group = Communicator::NullGroup; + MPI_Comm group = MPI::NullGroup(); SizesHolderType globalSizes; // static sizes should have different type: localBegin is always 0, localEnd is always the full size LocalBeginsType localBegins; diff --git a/src/TNL/Containers/DistributedVector.h 
b/src/TNL/Containers/DistributedVector.h index 5d5f8303f520ac23171797f2cd240510e42f140c..8d737e3a975b5d4c91451bff93357f59b5864bbe 100644 --- a/src/TNL/Containers/DistributedVector.h +++ b/src/TNL/Containers/DistributedVector.h @@ -21,21 +21,20 @@ namespace Containers { template< typename Real, typename Device = Devices::Host, typename Index = int, - typename Communicator = Communicators::MpiCommunicator > + typename Allocator = typename Allocators::Default< Device >::template Allocator< Real > > class DistributedVector -: public DistributedArray< Real, Device, Index, Communicator > +: public DistributedArray< Real, Device, Index, Allocator > { - using CommunicationGroup = typename Communicator::CommunicationGroup; - using BaseType = DistributedArray< Real, Device, Index, Communicator >; + using BaseType = DistributedArray< Real, Device, Index, Allocator >; public: using RealType = Real; using DeviceType = Device; - using CommunicatorType = Communicator; using IndexType = Index; + using AllocatorType = Allocator; using LocalViewType = Containers::VectorView< Real, Device, Index >; using ConstLocalViewType = Containers::VectorView< std::add_const_t< Real >, Device, Index >; - using ViewType = DistributedVectorView< Real, Device, Index, Communicator >; - using ConstViewType = DistributedVectorView< std::add_const_t< Real >, Device, Index, Communicator >; + using ViewType = DistributedVectorView< Real, Device, Index >; + using ConstViewType = DistributedVectorView< std::add_const_t< Real >, Device, Index >; /** * \brief A template which allows to quickly obtain a \ref Vector type with changed template parameters. 
@@ -43,8 +42,8 @@ public: template< typename _Real, typename _Device = Device, typename _Index = Index, - typename _Communicator = Communicator > - using Self = DistributedVector< _Real, _Device, _Index, _Communicator >; + typename _Allocator = typename Allocators::Default< _Device >::template Allocator< _Real > > + using Self = DistributedVector< _Real, _Device, _Index, _Allocator >; // inherit all constructors and assignment operators from Array @@ -60,6 +59,11 @@ public: */ explicit DistributedVector( const DistributedVector& ) = default; + /** + * \brief Copy constructor with a specific allocator (makes a deep copy). + */ + explicit DistributedVector( const DistributedVector& vector, const AllocatorType& allocator ); + /** * \brief Default move constructor. */ @@ -75,11 +79,28 @@ public: */ DistributedVector& operator=( DistributedVector&& ) = default; - // we return only the view so that the user cannot resize it + /** + * \brief Returns a modifiable view of the local part of the vector. + */ LocalViewType getLocalView(); + /** + * \brief Returns a non-modifiable view of the local part of the vector. + */ ConstLocalViewType getConstLocalView() const; + /** + * \brief Returns a modifiable view of the local part of the vector, + * including ghost values. + */ + LocalViewType getLocalViewWithGhosts(); + + /** + * \brief Returns a non-modifiable view of the local part of the vector, + * including ghost values. + */ + ConstLocalViewType getConstLocalViewWithGhosts() const; + /** * \brief Returns a modifiable view of the vector. 
*/ @@ -160,8 +181,8 @@ public: // Enable expression templates for DistributedVector namespace Expressions { - template< typename Real, typename Device, typename Index, typename Communicator > - struct HasEnabledDistributedExpressionTemplates< DistributedVector< Real, Device, Index, Communicator > > + template< typename Real, typename Device, typename Index, typename Allocator > + struct HasEnabledDistributedExpressionTemplates< DistributedVector< Real, Device, Index, Allocator > > : std::true_type {}; } // namespace Expressions diff --git a/src/TNL/Containers/DistributedVector.hpp b/src/TNL/Containers/DistributedVector.hpp index fa49591e8ae53ffd06214772491c656b91601413..044b747d9f42d148b17b1acb30917b5cdf04887c 100644 --- a/src/TNL/Containers/DistributedVector.hpp +++ b/src/TNL/Containers/DistributedVector.hpp @@ -21,9 +21,19 @@ namespace Containers { template< typename Real, typename Device, typename Index, - typename Communicator > -typename DistributedVector< Real, Device, Index, Communicator >::LocalViewType -DistributedVector< Real, Device, Index, Communicator >:: + typename Allocator > +DistributedVector< Real, Device, Index, Allocator >:: +DistributedVector( const DistributedVector& vector, const AllocatorType& allocator ) +: BaseType::DistributedArray( vector, allocator ) +{ +} + +template< typename Real, + typename Device, + typename Index, + typename Allocator > +typename DistributedVector< Real, Device, Index, Allocator >::LocalViewType +DistributedVector< Real, Device, Index, Allocator >:: getLocalView() { return BaseType::getLocalView(); @@ -32,41 +42,63 @@ getLocalView() template< typename Real, typename Device, typename Index, - typename Communicator > -typename DistributedVector< Real, Device, Index, Communicator >::ConstLocalViewType -DistributedVector< Real, Device, Index, Communicator >:: + typename Allocator > +typename DistributedVector< Real, Device, Index, Allocator >::ConstLocalViewType +DistributedVector< Real, Device, Index, Allocator >:: 
getConstLocalView() const { return BaseType::getConstLocalView(); } +template< typename Real, + typename Device, + typename Index, + typename Allocator > +typename DistributedVector< Real, Device, Index, Allocator >::LocalViewType +DistributedVector< Real, Device, Index, Allocator >:: +getLocalViewWithGhosts() +{ + return BaseType::getLocalViewWithGhosts(); +} + +template< typename Real, + typename Device, + typename Index, + typename Allocator > +typename DistributedVector< Real, Device, Index, Allocator >::ConstLocalViewType +DistributedVector< Real, Device, Index, Allocator >:: +getConstLocalViewWithGhosts() const +{ + return BaseType::getConstLocalViewWithGhosts(); +} + template< typename Value, typename Device, typename Index, - typename Communicator > -typename DistributedVector< Value, Device, Index, Communicator >::ViewType -DistributedVector< Value, Device, Index, Communicator >:: + typename Allocator > +typename DistributedVector< Value, Device, Index, Allocator >::ViewType +DistributedVector< Value, Device, Index, Allocator >:: getView() { - return ViewType( this->getLocalRange(), this->getSize(), this->getCommunicationGroup(), this->getLocalView() ); + return BaseType::getView(); } template< typename Value, typename Device, typename Index, - typename Communicator > -typename DistributedVector< Value, Device, Index, Communicator >::ConstViewType -DistributedVector< Value, Device, Index, Communicator >:: + typename Allocator > +typename DistributedVector< Value, Device, Index, Allocator >::ConstViewType +DistributedVector< Value, Device, Index, Allocator >:: getConstView() const { - return ConstViewType( this->getLocalRange(), this->getSize(), this->getCommunicationGroup(), this->getConstLocalView() ); + return BaseType::getConstView(); } template< typename Value, typename Device, typename Index, - typename Communicator > -DistributedVector< Value, Device, Index, Communicator >:: + typename Allocator > +DistributedVector< Value, Device, Index, Allocator 
>:: operator ViewType() { return getView(); @@ -75,8 +107,8 @@ operator ViewType() template< typename Value, typename Device, typename Index, - typename Communicator > -DistributedVector< Value, Device, Index, Communicator >:: + typename Allocator > +DistributedVector< Value, Device, Index, Allocator >:: operator ConstViewType() const { return getConstView(); @@ -90,194 +122,144 @@ operator ConstViewType() const template< typename Real, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Vector, typename..., typename > -DistributedVector< Real, Device, Index, Communicator >& -DistributedVector< Real, Device, Index, Communicator >:: +DistributedVector< Real, Device, Index, Allocator >& +DistributedVector< Real, Device, Index, Allocator >:: operator=( const Vector& vector ) { this->setLike( vector ); - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() = vector.getConstLocalView(); - } + getView() = vector; return *this; } template< typename Real, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Vector, typename..., typename > -DistributedVector< Real, Device, Index, Communicator >& -DistributedVector< Real, Device, Index, Communicator >:: +DistributedVector< Real, Device, Index, Allocator >& +DistributedVector< Real, Device, Index, Allocator >:: operator+=( const Vector& vector ) { - TNL_ASSERT_EQ( this->getSize(), vector.getSize(), - "Vector sizes must be equal." ); - TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(), - "Multiary operations are supported only on vectors which are distributed the same way." ); - TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(), - "Multiary operations are supported only on vectors within the same communication group." 
); - - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() += vector.getConstLocalView(); - } + getView() += vector; return *this; } template< typename Real, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Vector, typename..., typename > -DistributedVector< Real, Device, Index, Communicator >& -DistributedVector< Real, Device, Index, Communicator >:: +DistributedVector< Real, Device, Index, Allocator >& +DistributedVector< Real, Device, Index, Allocator >:: operator-=( const Vector& vector ) { - TNL_ASSERT_EQ( this->getSize(), vector.getSize(), - "Vector sizes must be equal." ); - TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(), - "Multiary operations are supported only on vectors which are distributed the same way." ); - TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(), - "Multiary operations are supported only on vectors within the same communication group." ); - - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() -= vector.getConstLocalView(); - } + getView() -= vector; return *this; } template< typename Real, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Vector, typename..., typename > -DistributedVector< Real, Device, Index, Communicator >& -DistributedVector< Real, Device, Index, Communicator >:: +DistributedVector< Real, Device, Index, Allocator >& +DistributedVector< Real, Device, Index, Allocator >:: operator*=( const Vector& vector ) { - TNL_ASSERT_EQ( this->getSize(), vector.getSize(), - "Vector sizes must be equal." ); - TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(), - "Multiary operations are supported only on vectors which are distributed the same way." 
); - TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(), - "Multiary operations are supported only on vectors within the same communication group." ); - - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() *= vector.getConstLocalView(); - } + getView() *= vector; return *this; } template< typename Real, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Vector, typename..., typename > -DistributedVector< Real, Device, Index, Communicator >& -DistributedVector< Real, Device, Index, Communicator >:: +DistributedVector< Real, Device, Index, Allocator >& +DistributedVector< Real, Device, Index, Allocator >:: operator/=( const Vector& vector ) { - TNL_ASSERT_EQ( this->getSize(), vector.getSize(), - "Vector sizes must be equal." ); - TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(), - "Multiary operations are supported only on vectors which are distributed the same way." ); - TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(), - "Multiary operations are supported only on vectors within the same communication group." 
); - - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() /= vector.getConstLocalView(); - } + getView() /= vector; return *this; } template< typename Real, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Scalar, typename..., typename > -DistributedVector< Real, Device, Index, Communicator >& -DistributedVector< Real, Device, Index, Communicator >:: +DistributedVector< Real, Device, Index, Allocator >& +DistributedVector< Real, Device, Index, Allocator >:: operator=( Scalar c ) { - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() = c; - } + getView() = c; return *this; } template< typename Real, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Scalar, typename..., typename > -DistributedVector< Real, Device, Index, Communicator >& -DistributedVector< Real, Device, Index, Communicator >:: +DistributedVector< Real, Device, Index, Allocator >& +DistributedVector< Real, Device, Index, Allocator >:: operator+=( Scalar c ) { - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() += c; - } + getView() += c; return *this; } template< typename Real, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Scalar, typename..., typename > -DistributedVector< Real, Device, Index, Communicator >& -DistributedVector< Real, Device, Index, Communicator >:: +DistributedVector< Real, Device, Index, Allocator >& +DistributedVector< Real, Device, Index, Allocator >:: operator-=( Scalar c ) { - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() -= c; - } + getView() -= c; return *this; } template< typename Real, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Scalar, typename..., typename > -DistributedVector< Real, Device, Index, Communicator >& 
-DistributedVector< Real, Device, Index, Communicator >:: +DistributedVector< Real, Device, Index, Allocator >& +DistributedVector< Real, Device, Index, Allocator >:: operator*=( Scalar c ) { - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() *= c; - } + getView() *= c; return *this; } template< typename Real, typename Device, typename Index, - typename Communicator > + typename Allocator > template< typename Scalar, typename..., typename > -DistributedVector< Real, Device, Index, Communicator >& -DistributedVector< Real, Device, Index, Communicator >:: +DistributedVector< Real, Device, Index, Allocator >& +DistributedVector< Real, Device, Index, Allocator >:: operator/=( Scalar c ) { - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() /= c; - } + getView() /= c; return *this; } template< typename Real, typename Device, typename Index, - typename Communicator > + typename Allocator > template< Algorithms::ScanType Type > void -DistributedVector< Real, Device, Index, Communicator >:: +DistributedVector< Real, Device, Index, Allocator >:: scan( IndexType begin, IndexType end ) { - if( end == 0 ) - end = this->getSize(); - Algorithms::DistributedScan< Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 ); + getView().template scan< Type >( begin, end ); } } // namespace Containers diff --git a/src/TNL/Containers/DistributedVectorView.h b/src/TNL/Containers/DistributedVectorView.h index 157a64b94d64da3ccad3ce81606f0708faa608c2..4a46a47cec4eba56de0e6078ccc4557a52d37177 100644 --- a/src/TNL/Containers/DistributedVectorView.h +++ b/src/TNL/Containers/DistributedVectorView.h @@ -21,32 +21,28 @@ namespace Containers { template< typename Real, typename Device = Devices::Host, - typename Index = int, - typename Communicator = Communicators::MpiCommunicator > + typename Index = int > class DistributedVectorView -: public DistributedArrayView< Real, Device, Index, Communicator > +: public 
DistributedArrayView< Real, Device, Index > { - using CommunicationGroup = typename Communicator::CommunicationGroup; - using BaseType = DistributedArrayView< Real, Device, Index, Communicator >; + using BaseType = DistributedArrayView< Real, Device, Index >; using NonConstReal = typename std::remove_const< Real >::type; public: using RealType = Real; using DeviceType = Device; - using CommunicatorType = Communicator; using IndexType = Index; using LocalViewType = Containers::VectorView< Real, Device, Index >; using ConstLocalViewType = Containers::VectorView< std::add_const_t< Real >, Device, Index >; - using ViewType = DistributedVectorView< Real, Device, Index, Communicator >; - using ConstViewType = DistributedVectorView< std::add_const_t< Real >, Device, Index, Communicator >; + using ViewType = DistributedVectorView< Real, Device, Index >; + using ConstViewType = DistributedVectorView< std::add_const_t< Real >, Device, Index >; /** * \brief A template which allows to quickly obtain a \ref VectorView type with changed template parameters. */ template< typename _Real, typename _Device = Device, - typename _Index = Index, - typename _Communicator = Communicator > - using Self = DistributedVectorView< _Real, _Device, _Index, _Communicator >; + typename _Index = Index > + using Self = DistributedVectorView< _Real, _Device, _Index >; // inherit all constructors and assignment operators from ArrayView @@ -58,29 +54,43 @@ public: // In C++14, default constructors cannot be inherited, although Clang // and GCC since version 7.0 inherit them. 
// https://stackoverflow.com/a/51854172 - __cuda_callable__ DistributedVectorView() = default; // initialization by base class is not a copy constructor so it has to be explicit template< typename Real_ > // template catches both const and non-const qualified Element - __cuda_callable__ - DistributedVectorView( const Containers::DistributedArrayView< Real_, Device, Index, Communicator >& view ) + DistributedVectorView( const Containers::DistributedArrayView< Real_, Device, Index >& view ) : BaseType( view ) {} + /** + * \brief Returns a modifiable view of the local part of the vector. + */ LocalViewType getLocalView(); + /** + * \brief Returns a non-modifiable view of the local part of the vector. + */ ConstLocalViewType getConstLocalView() const; + /** + * \brief Returns a modifiable view of the local part of the vector, + * including ghost values. + */ + LocalViewType getLocalViewWithGhosts(); + + /** + * \brief Returns a non-modifiable view of the local part of the vector, + * including ghost values. + */ + ConstLocalViewType getConstLocalViewWithGhosts() const; + /** * \brief Returns a modifiable view of the array view. */ - __cuda_callable__ ViewType getView(); /** * \brief Returns a non-modifiable view of the array view. 
*/ - __cuda_callable__ ConstViewType getConstView() const; /* @@ -142,8 +152,8 @@ public: // Enable expression templates for DistributedVector namespace Expressions { - template< typename Real, typename Device, typename Index, typename Communicator > - struct HasEnabledDistributedExpressionTemplates< DistributedVectorView< Real, Device, Index, Communicator > > + template< typename Real, typename Device, typename Index > + struct HasEnabledDistributedExpressionTemplates< DistributedVectorView< Real, Device, Index > > : std::true_type {}; } // namespace Expressions diff --git a/src/TNL/Containers/DistributedVectorView.hpp b/src/TNL/Containers/DistributedVectorView.hpp index 70f61979fd44fb8d3f9d1878eb2c4a6ecd5c169b..2f9222f94efb579d3a39c803d5685283fee03b33 100644 --- a/src/TNL/Containers/DistributedVectorView.hpp +++ b/src/TNL/Containers/DistributedVectorView.hpp @@ -20,10 +20,9 @@ namespace Containers { template< typename Real, typename Device, - typename Index, - typename Communicator > -typename DistributedVectorView< Real, Device, Index, Communicator >::LocalViewType -DistributedVectorView< Real, Device, Index, Communicator >:: + typename Index > +typename DistributedVectorView< Real, Device, Index >::LocalViewType +DistributedVectorView< Real, Device, Index >:: getLocalView() { return BaseType::getLocalView(); @@ -31,22 +30,39 @@ getLocalView() template< typename Real, typename Device, - typename Index, - typename Communicator > -typename DistributedVectorView< Real, Device, Index, Communicator >::ConstLocalViewType -DistributedVectorView< Real, Device, Index, Communicator >:: + typename Index > +typename DistributedVectorView< Real, Device, Index >::ConstLocalViewType +DistributedVectorView< Real, Device, Index >:: getConstLocalView() const { return BaseType::getConstLocalView(); } +template< typename Real, + typename Device, + typename Index > +typename DistributedVectorView< Real, Device, Index >::LocalViewType +DistributedVectorView< Real, Device, Index >:: 
+getLocalViewWithGhosts() +{ + return BaseType::getLocalViewWithGhosts(); +} + +template< typename Real, + typename Device, + typename Index > +typename DistributedVectorView< Real, Device, Index >::ConstLocalViewType +DistributedVectorView< Real, Device, Index >:: +getConstLocalViewWithGhosts() const +{ + return BaseType::getConstLocalViewWithGhosts(); +} + template< typename Value, typename Device, - typename Index, - typename Communicator > -__cuda_callable__ -typename DistributedVectorView< Value, Device, Index, Communicator >::ViewType -DistributedVectorView< Value, Device, Index, Communicator >:: + typename Index > +typename DistributedVectorView< Value, Device, Index >::ViewType +DistributedVectorView< Value, Device, Index >:: getView() { return *this; @@ -54,11 +70,9 @@ getView() template< typename Value, typename Device, - typename Index, - typename Communicator > -__cuda_callable__ -typename DistributedVectorView< Value, Device, Index, Communicator >::ConstViewType -DistributedVectorView< Value, Device, Index, Communicator >:: + typename Index > +typename DistributedVectorView< Value, Device, Index >::ConstViewType +DistributedVectorView< Value, Device, Index >:: getConstView() const { return *this; @@ -71,201 +85,221 @@ getConstView() const template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Vector, typename..., typename > -DistributedVectorView< Real, Device, Index, Communicator >& -DistributedVectorView< Real, Device, Index, Communicator >:: +DistributedVectorView< Real, Device, Index >& +DistributedVectorView< Real, Device, Index >:: operator=( const Vector& vector ) { TNL_ASSERT_EQ( this->getSize(), vector.getSize(), "The sizes of the array views must be equal, views are not resizable." ); TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(), "The local ranges must be equal, views are not resizable." 
); + TNL_ASSERT_EQ( this->getGhosts(), vector.getGhosts(), + "Ghosts must be equal, views are not resizable." ); TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(), "The communication groups of the array views must be equal." ); - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() = vector.getConstLocalView(); + if( this->getCommunicationGroup() != MPI::NullGroup() ) { + // TODO: it might be better to split the local and ghost parts and synchronize in the middle + this->waitForSynchronization(); + vector.waitForSynchronization(); + getLocalViewWithGhosts() = vector.getConstLocalViewWithGhosts(); } return *this; } template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Vector, typename..., typename > -DistributedVectorView< Real, Device, Index, Communicator >& -DistributedVectorView< Real, Device, Index, Communicator >:: +DistributedVectorView< Real, Device, Index >& +DistributedVectorView< Real, Device, Index >:: operator+=( const Vector& vector ) { TNL_ASSERT_EQ( this->getSize(), vector.getSize(), "Vector sizes must be equal." ); TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(), "Multiary operations are supported only on vectors which are distributed the same way." ); + TNL_ASSERT_EQ( this->getGhosts(), vector.getGhosts(), + "Ghosts must be equal, views are not resizable." ); TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(), "Multiary operations are supported only on vectors within the same communication group." 
); - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() += vector.getConstLocalView(); + if( this->getCommunicationGroup() != MPI::NullGroup() ) { + // TODO: it might be better to split the local and ghost parts and synchronize in the middle + this->waitForSynchronization(); + vector.waitForSynchronization(); + getLocalViewWithGhosts() += vector.getConstLocalViewWithGhosts(); } return *this; } template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Vector, typename..., typename > -DistributedVectorView< Real, Device, Index, Communicator >& -DistributedVectorView< Real, Device, Index, Communicator >:: +DistributedVectorView< Real, Device, Index >& +DistributedVectorView< Real, Device, Index >:: operator-=( const Vector& vector ) { TNL_ASSERT_EQ( this->getSize(), vector.getSize(), "Vector sizes must be equal." ); TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(), "Multiary operations are supported only on vectors which are distributed the same way." ); + TNL_ASSERT_EQ( this->getGhosts(), vector.getGhosts(), + "Ghosts must be equal, views are not resizable." ); TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(), "Multiary operations are supported only on vectors within the same communication group." 
); - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() -= vector.getConstLocalView(); + if( this->getCommunicationGroup() != MPI::NullGroup() ) { + // TODO: it might be better to split the local and ghost parts and synchronize in the middle + this->waitForSynchronization(); + vector.waitForSynchronization(); + getLocalViewWithGhosts() -= vector.getConstLocalViewWithGhosts(); } return *this; } template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Vector, typename..., typename > -DistributedVectorView< Real, Device, Index, Communicator >& -DistributedVectorView< Real, Device, Index, Communicator >:: +DistributedVectorView< Real, Device, Index >& +DistributedVectorView< Real, Device, Index >:: operator*=( const Vector& vector ) { TNL_ASSERT_EQ( this->getSize(), vector.getSize(), "Vector sizes must be equal." ); TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(), "Multiary operations are supported only on vectors which are distributed the same way." ); + TNL_ASSERT_EQ( this->getGhosts(), vector.getGhosts(), + "Ghosts must be equal, views are not resizable." ); TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(), "Multiary operations are supported only on vectors within the same communication group." 
); - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() *= vector.getConstLocalView(); + if( this->getCommunicationGroup() != MPI::NullGroup() ) { + // TODO: it might be better to split the local and ghost parts and synchronize in the middle + this->waitForSynchronization(); + vector.waitForSynchronization(); + getLocalViewWithGhosts() *= vector.getConstLocalViewWithGhosts(); } return *this; } template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Vector, typename..., typename > -DistributedVectorView< Real, Device, Index, Communicator >& -DistributedVectorView< Real, Device, Index, Communicator >:: +DistributedVectorView< Real, Device, Index >& +DistributedVectorView< Real, Device, Index >:: operator/=( const Vector& vector ) { TNL_ASSERT_EQ( this->getSize(), vector.getSize(), "Vector sizes must be equal." ); TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(), "Multiary operations are supported only on vectors which are distributed the same way." ); + TNL_ASSERT_EQ( this->getGhosts(), vector.getGhosts(), + "Ghosts must be equal, views are not resizable." ); TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(), "Multiary operations are supported only on vectors within the same communication group." 
); - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { - getLocalView() /= vector.getConstLocalView(); + if( this->getCommunicationGroup() != MPI::NullGroup() ) { + // TODO: it might be better to split the local and ghost parts and synchronize in the middle + this->waitForSynchronization(); + vector.waitForSynchronization(); + getLocalViewWithGhosts() /= vector.getConstLocalViewWithGhosts(); } return *this; } template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Scalar, typename..., typename > -DistributedVectorView< Real, Device, Index, Communicator >& -DistributedVectorView< Real, Device, Index, Communicator >:: +DistributedVectorView< Real, Device, Index >& +DistributedVectorView< Real, Device, Index >:: operator=( Scalar c ) { - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( this->getCommunicationGroup() != MPI::NullGroup() ) { getLocalView() = c; + this->startSynchronization(); } return *this; } template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Scalar, typename..., typename > -DistributedVectorView< Real, Device, Index, Communicator >& -DistributedVectorView< Real, Device, Index, Communicator >:: +DistributedVectorView< Real, Device, Index >& +DistributedVectorView< Real, Device, Index >:: operator+=( Scalar c ) { - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( this->getCommunicationGroup() != MPI::NullGroup() ) { getLocalView() += c; + this->startSynchronization(); } return *this; } template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Scalar, typename..., typename > -DistributedVectorView< Real, Device, Index, Communicator >& -DistributedVectorView< Real, Device, Index, Communicator >:: +DistributedVectorView< Real, Device, Index >& +DistributedVectorView< Real, Device, 
Index >:: operator-=( Scalar c ) { - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( this->getCommunicationGroup() != MPI::NullGroup() ) { getLocalView() -= c; + this->startSynchronization(); } return *this; } template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Scalar, typename..., typename > -DistributedVectorView< Real, Device, Index, Communicator >& -DistributedVectorView< Real, Device, Index, Communicator >:: +DistributedVectorView< Real, Device, Index >& +DistributedVectorView< Real, Device, Index >:: operator*=( Scalar c ) { - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( this->getCommunicationGroup() != MPI::NullGroup() ) { getLocalView() *= c; + this->startSynchronization(); } return *this; } template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > template< typename Scalar, typename..., typename > -DistributedVectorView< Real, Device, Index, Communicator >& -DistributedVectorView< Real, Device, Index, Communicator >:: +DistributedVectorView< Real, Device, Index >& +DistributedVectorView< Real, Device, Index >:: operator/=( Scalar c ) { - if( this->getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( this->getCommunicationGroup() != MPI::NullGroup() ) { getLocalView() /= c; + this->startSynchronization(); } return *this; } template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > template< Algorithms::ScanType Type > void -DistributedVectorView< Real, Device, Index, Communicator >:: +DistributedVectorView< Real, Device, Index >:: scan( IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); Algorithms::DistributedScan< Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 ); + this->startSynchronization(); } } // namespace Containers diff --git 
a/src/TNL/Containers/Expressions/DistributedComparison.h b/src/TNL/Containers/Expressions/DistributedComparison.h index 4cecc92bb9b3823db92df893c289c0233d26bb14..10bf2d117ab3e740b6bc3aeebd1b1506688851a3 100644 --- a/src/TNL/Containers/Expressions/DistributedComparison.h +++ b/src/TNL/Containers/Expressions/DistributedComparison.h @@ -11,7 +11,7 @@ #pragma once #include -#include +#include namespace TNL { namespace Containers { @@ -38,11 +38,13 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression return false; const bool localResult = a.getLocalRange() == b.getLocalRange() && + a.getGhosts() == b.getGhosts() && a.getSize() == b.getSize() && + // compare without ghosts a.getConstLocalView() == b.getConstLocalView(); bool result = true; - if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup ) - T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); + if( a.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); return result; } @@ -55,14 +57,15 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression { TNL_ASSERT_EQ( a.getSize(), b.getSize(), "Sizes of expressions to be compared do not match." ); TNL_ASSERT_EQ( a.getLocalRange(), b.getLocalRange(), "Local ranges of expressions to be compared do not match." ); + TNL_ASSERT_EQ( a.getGhosts(), b.getGhosts(), "Ghosts of expressions to be compared do not match." 
); // we can't run allreduce if the communication groups are different if( a.getCommunicationGroup() != b.getCommunicationGroup() ) return false; const bool localResult = a.getConstLocalView() < b.getConstLocalView(); bool result = true; - if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup ) - T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); + if( a.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); return result; } @@ -70,14 +73,15 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression { TNL_ASSERT_EQ( a.getSize(), b.getSize(), "Sizes of expressions to be compared do not match." ); TNL_ASSERT_EQ( a.getLocalRange(), b.getLocalRange(), "Local ranges of expressions to be compared do not match." ); + TNL_ASSERT_EQ( a.getGhosts(), b.getGhosts(), "Ghosts of expressions to be compared do not match." ); // we can't run allreduce if the communication groups are different if( a.getCommunicationGroup() != b.getCommunicationGroup() ) return false; const bool localResult = a.getConstLocalView() <= b.getConstLocalView(); bool result = true; - if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup ) - T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); + if( a.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); return result; } @@ -85,14 +89,15 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression { TNL_ASSERT_EQ( a.getSize(), b.getSize(), "Sizes of expressions to be compared do not match." ); TNL_ASSERT_EQ( a.getLocalRange(), b.getLocalRange(), "Local ranges of expressions to be compared do not match." ); + TNL_ASSERT_EQ( a.getGhosts(), b.getGhosts(), "Ghosts of expressions to be compared do not match." 
); // we can't run allreduce if the communication groups are different if( a.getCommunicationGroup() != b.getCommunicationGroup() ) return false; const bool localResult = a.getConstLocalView() > b.getConstLocalView(); bool result = true; - if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup ) - T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); + if( a.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); return result; } @@ -100,14 +105,15 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, VectorExpression { TNL_ASSERT_EQ( a.getSize(), b.getSize(), "Sizes of expressions to be compared do not match." ); TNL_ASSERT_EQ( a.getLocalRange(), b.getLocalRange(), "Local ranges of expressions to be compared do not match." ); + TNL_ASSERT_EQ( a.getGhosts(), b.getGhosts(), "Ghosts of expressions to be compared do not match." ); // we can't run allreduce if the communication groups are different if( a.getCommunicationGroup() != b.getCommunicationGroup() ) return false; const bool localResult = a.getConstLocalView() >= b.getConstLocalView(); bool result = true; - if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup ) - T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); + if( a.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); return result; } }; @@ -122,8 +128,8 @@ struct DistributedComparison< T1, T2, ArithmeticVariable, VectorExpressionVariab { const bool localResult = a == b.getConstLocalView(); bool result = true; - if( b.getCommunicationGroup() != T2::CommunicatorType::NullGroup ) - T2::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() ); + if( b.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, 
b.getCommunicationGroup() ); return result; } @@ -136,8 +142,8 @@ struct DistributedComparison< T1, T2, ArithmeticVariable, VectorExpressionVariab { const bool localResult = a < b.getConstLocalView(); bool result = true; - if( b.getCommunicationGroup() != T2::CommunicatorType::NullGroup ) - T2::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() ); + if( b.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() ); return result; } @@ -145,8 +151,8 @@ struct DistributedComparison< T1, T2, ArithmeticVariable, VectorExpressionVariab { const bool localResult = a <= b.getConstLocalView(); bool result = true; - if( b.getCommunicationGroup() != T2::CommunicatorType::NullGroup ) - T2::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() ); + if( b.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() ); return result; } @@ -154,8 +160,8 @@ struct DistributedComparison< T1, T2, ArithmeticVariable, VectorExpressionVariab { const bool localResult = a > b.getConstLocalView(); bool result = true; - if( b.getCommunicationGroup() != T2::CommunicatorType::NullGroup ) - T2::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() ); + if( b.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() ); return result; } @@ -163,8 +169,8 @@ struct DistributedComparison< T1, T2, ArithmeticVariable, VectorExpressionVariab { const bool localResult = a >= b.getConstLocalView(); bool result = true; - if( b.getCommunicationGroup() != T2::CommunicatorType::NullGroup ) - T2::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, b.getCommunicationGroup() ); + if( b.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, 
b.getCommunicationGroup() ); return result; } }; @@ -179,8 +185,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, ArithmeticVariab { const bool localResult = a.getConstLocalView() == b; bool result = true; - if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup ) - T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); + if( a.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); return result; } @@ -193,8 +199,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, ArithmeticVariab { const bool localResult = a.getConstLocalView() < b; bool result = true; - if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup ) - T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); + if( a.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); return result; } @@ -202,8 +208,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, ArithmeticVariab { const bool localResult = a.getConstLocalView() <= b; bool result = true; - if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup ) - T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); + if( a.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); return result; } @@ -211,8 +217,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, ArithmeticVariab { const bool localResult = a.getConstLocalView() > b; bool result = true; - if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup ) - T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); + if( a.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, 
a.getCommunicationGroup() ); return result; } @@ -220,8 +226,8 @@ struct DistributedComparison< T1, T2, VectorExpressionVariable, ArithmeticVariab { const bool localResult = a.getConstLocalView() >= b; bool result = true; - if( a.getCommunicationGroup() != T1::CommunicatorType::NullGroup ) - T1::CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); + if( a.getCommunicationGroup() != MPI::NullGroup() ) + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, a.getCommunicationGroup() ); return result; } }; diff --git a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h index 1802dcc9553e81c7b31c0e792d6d368a4e743c8d..5f67084fd8f3e21dd84ff165625cc1186386dd9b 100644 --- a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h +++ b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h @@ -10,6 +10,7 @@ #pragma once #include +#include #include #include @@ -58,12 +59,11 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV using RealType = decltype( Operation::evaluate( std::declval()[0], std::declval()[0] ) ); using DeviceType = typename T1::DeviceType; using IndexType = typename T1::IndexType; - using CommunicatorType = typename T1::CommunicatorType; - using CommunicationGroup = typename CommunicatorType::CommunicationGroup; using LocalRangeType = typename T1::LocalRangeType; using ConstLocalViewType = BinaryExpressionTemplate< typename T1::ConstLocalViewType, typename T2::ConstLocalViewType, Operation >; + using SynchronizerType = typename T1::SynchronizerType; static_assert( HasEnabledDistributedExpressionTemplates< T1 >::value, "Invalid operand in distributed binary expression templates - distributed expression templates are not enabled for the left operand." 
); @@ -79,13 +79,16 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV "Attempt to mix operands with different sizes." ); TNL_ASSERT_EQ( op1.getLocalRange(), op2.getLocalRange(), "Distributed expressions are supported only on vectors which are distributed the same way." ); + TNL_ASSERT_EQ( op1.getGhosts(), op2.getGhosts(), + "Distributed expressions are supported only on vectors which are distributed the same way." ); TNL_ASSERT_EQ( op1.getCommunicationGroup(), op2.getCommunicationGroup(), "Distributed expressions are supported only on vectors within the same communication group." ); } RealType getElement( const IndexType i ) const { - return getConstLocalView().getElement( i ); + const IndexType li = getLocalRange().getLocalIndex( i ); + return getConstLocalView().getElement( li ); } // this is actually never executed, but needed for proper ExpressionVariableTypeGetter @@ -105,7 +108,12 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV return op1.getLocalRange(); } - CommunicationGroup getCommunicationGroup() const + IndexType getGhosts() const + { + return op1.getGhosts(); + } + + MPI_Comm getCommunicationGroup() const { return op1.getCommunicationGroup(); } @@ -115,6 +123,27 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV return ConstLocalViewType( op1.getConstLocalView(), op2.getConstLocalView() ); } + ConstLocalViewType getConstLocalViewWithGhosts() const + { + return ConstLocalViewType( op1.getConstLocalViewWithGhosts(), op2.getConstLocalViewWithGhosts() ); + } + + std::shared_ptr< SynchronizerType > getSynchronizer() const + { + return op1.getSynchronizer(); + } + + int getValuesPerElement() const + { + return op1.getValuesPerElement(); + } + + void waitForSynchronization() const + { + op1.waitForSynchronization(); + op2.waitForSynchronization(); + } + protected: const T1& op1; const T2& op2; @@ -128,10 +157,9 @@ struct DistributedBinaryExpressionTemplate< 
T1, T2, Operation, VectorExpressionV using RealType = decltype( Operation::evaluate( std::declval()[0], std::declval() ) ); using DeviceType = typename T1::DeviceType; using IndexType = typename T1::IndexType; - using CommunicatorType = typename T1::CommunicatorType; - using CommunicationGroup = typename CommunicatorType::CommunicationGroup; using LocalRangeType = typename T1::LocalRangeType; using ConstLocalViewType = BinaryExpressionTemplate< typename T1::ConstLocalViewType, T2, Operation >; + using SynchronizerType = typename T1::SynchronizerType; static_assert( HasEnabledDistributedExpressionTemplates< T1 >::value, "Invalid operand in distributed binary expression templates - distributed expression templates are not enabled for the left operand." ); @@ -141,7 +169,8 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV RealType getElement( const IndexType i ) const { - return getConstLocalView().getElement( i ); + const IndexType li = getLocalRange().getLocalIndex( i ); + return getConstLocalView().getElement( li ); } // this is actually never executed, but needed for proper ExpressionVariableTypeGetter @@ -161,7 +190,12 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV return op1.getLocalRange(); } - CommunicationGroup getCommunicationGroup() const + IndexType getGhosts() const + { + return op1.getGhosts(); + } + + MPI_Comm getCommunicationGroup() const { return op1.getCommunicationGroup(); } @@ -171,6 +205,26 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionV return ConstLocalViewType( op1.getConstLocalView(), op2 ); } + ConstLocalViewType getConstLocalViewWithGhosts() const + { + return ConstLocalViewType( op1.getConstLocalViewWithGhosts(), op2 ); + } + + std::shared_ptr< SynchronizerType > getSynchronizer() const + { + return op1.getSynchronizer(); + } + + int getValuesPerElement() const + { + return op1.getValuesPerElement(); + } + + void 
waitForSynchronization() const + { + op1.waitForSynchronization(); + } + protected: const T1& op1; const T2& op2; @@ -184,10 +238,9 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariabl using RealType = decltype( Operation::evaluate( std::declval(), std::declval()[0] ) ); using DeviceType = typename T2::DeviceType; using IndexType = typename T2::IndexType; - using CommunicatorType = typename T2::CommunicatorType; - using CommunicationGroup = typename CommunicatorType::CommunicationGroup; using LocalRangeType = typename T2::LocalRangeType; using ConstLocalViewType = BinaryExpressionTemplate< T1, typename T2::ConstLocalViewType, Operation >; + using SynchronizerType = typename T2::SynchronizerType; static_assert( HasEnabledDistributedExpressionTemplates< T2 >::value, "Invalid operand in distributed binary expression templates - distributed expression templates are not enabled for the right operand." ); @@ -197,7 +250,8 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariabl RealType getElement( const IndexType i ) const { - return getConstLocalView().getElement( i ); + const IndexType li = getLocalRange().getLocalIndex( i ); + return getConstLocalView().getElement( li ); } // this is actually never executed, but needed for proper ExpressionVariableTypeGetter @@ -217,7 +271,12 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariabl return op2.getLocalRange(); } - CommunicationGroup getCommunicationGroup() const + IndexType getGhosts() const + { + return op2.getGhosts(); + } + + MPI_Comm getCommunicationGroup() const { return op2.getCommunicationGroup(); } @@ -227,6 +286,26 @@ struct DistributedBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariabl return ConstLocalViewType( op1, op2.getConstLocalView() ); } + ConstLocalViewType getConstLocalViewWithGhosts() const + { + return ConstLocalViewType( op1, op2.getConstLocalViewWithGhosts() ); + } + + std::shared_ptr< 
SynchronizerType > getSynchronizer() const + { + return op2.getSynchronizer(); + } + + int getValuesPerElement() const + { + return op2.getValuesPerElement(); + } + + void waitForSynchronization() const + { + op2.waitForSynchronization(); + } + protected: const T1& op1; const T2& op2; @@ -241,10 +320,9 @@ struct DistributedUnaryExpressionTemplate using RealType = decltype( Operation::evaluate( std::declval()[0] ) ); using DeviceType = typename T1::DeviceType; using IndexType = typename T1::IndexType; - using CommunicatorType = typename T1::CommunicatorType; - using CommunicationGroup = typename CommunicatorType::CommunicationGroup; using LocalRangeType = typename T1::LocalRangeType; using ConstLocalViewType = UnaryExpressionTemplate< typename T1::ConstLocalViewType, Operation >; + using SynchronizerType = typename T1::SynchronizerType; static_assert( HasEnabledDistributedExpressionTemplates< T1 >::value, "Invalid operand in distributed unary expression templates - distributed expression templates are not enabled for the operand." 
); @@ -254,7 +332,8 @@ struct DistributedUnaryExpressionTemplate RealType getElement( const IndexType i ) const { - return getConstLocalView().getElement( i ); + const IndexType li = getLocalRange().getLocalIndex( i ); + return getConstLocalView().getElement( li ); } // this is actually never executed, but needed for proper ExpressionVariableTypeGetter @@ -274,7 +353,12 @@ struct DistributedUnaryExpressionTemplate return operand.getLocalRange(); } - CommunicationGroup getCommunicationGroup() const + IndexType getGhosts() const + { + return operand.getGhosts(); + } + + MPI_Comm getCommunicationGroup() const { return operand.getCommunicationGroup(); } @@ -284,6 +368,26 @@ struct DistributedUnaryExpressionTemplate return ConstLocalViewType( operand.getConstLocalView() ); } + ConstLocalViewType getConstLocalViewWithGhosts() const + { + return ConstLocalViewType( operand.getConstLocalViewWithGhosts() ); + } + + std::shared_ptr< SynchronizerType > getSynchronizer() const + { + return operand.getSynchronizer(); + } + + int getValuesPerElement() const + { + return operand.getValuesPerElement(); + } + + void waitForSynchronization() const + { + operand.waitForSynchronization(); + } + protected: const T1& operand; }; @@ -812,10 +916,19 @@ template< typename T1, typename Operation > std::ostream& operator<<( std::ostream& str, const DistributedBinaryExpressionTemplate< T1, T2, Operation >& expression ) { + const auto localRange = expression.getLocalRange(); str << "[ "; - for( int i = 0; i < expression.getSize() - 1; i++ ) + for( int i = localRange.getBegin(); i < localRange.getEnd() - 1; i++ ) str << expression.getElement( i ) << ", "; - str << expression.getElement( expression.getSize() - 1 ) << " ]"; + str << expression.getElement( localRange.getEnd() - 1 ); + if( expression.getGhosts() > 0 ) { + str << " | "; + const auto localView = expression.getConstLocalViewWithGhosts(); + for( int i = localRange.getSize(); i < localView.getSize() - 1; i++ ) + str << 
localView.getElement( i ) << ", "; + str << localView.getElement( localView.getSize() - 1 ); + } + str << " ]"; return str; } @@ -823,10 +936,19 @@ template< typename T, typename Operation > std::ostream& operator<<( std::ostream& str, const DistributedUnaryExpressionTemplate< T, Operation >& expression ) { + const auto localRange = expression.getLocalRange(); str << "[ "; - for( int i = 0; i < expression.getSize() - 1; i++ ) + for( int i = localRange.getBegin(); i < localRange.getEnd() - 1; i++ ) str << expression.getElement( i ) << ", "; - str << expression.getElement( expression.getSize() - 1 ) << " ]"; + str << expression.getElement( localRange.getEnd() - 1 ); + if( expression.getGhosts() > 0 ) { + str << " | "; + const auto localView = expression.getConstLocalViewWithGhosts(); + for( int i = localRange.getSize(); i < localView.getSize() - 1; i++ ) + str << localView.getElement( i ) << ", "; + str << localView.getElement( localView.getSize() - 1 ); + } + str << " ]"; return str; } diff --git a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h index b525e8a5398001998955f2cdcfa40fdcb0891b05..903df1e1dd23ac9e9d0b5193f57760c3d3a9d710 100644 --- a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h +++ b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h @@ -11,7 +11,7 @@ #pragma once #include -#include +#include namespace TNL { namespace Containers { @@ -21,14 +21,13 @@ template< typename Expression > auto DistributedExpressionMin( const Expression& expression ) -> std::decay_t< decltype( expression[0] ) > { using ResultType = std::decay_t< decltype( expression[0] ) >; - using CommunicatorType = typename Expression::CommunicatorType; static_assert( std::numeric_limits< ResultType >::is_specialized, "std::numeric_limits is not specialized for the reduction's result type" ); ResultType result = std::numeric_limits< ResultType >::max(); - if( 
expression.getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( expression.getCommunicationGroup() != MPI::NullGroup() ) { const ResultType localResult = ExpressionMin( expression.getConstLocalView() ); - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_MIN, expression.getCommunicationGroup() ); + MPI::Allreduce( &localResult, &result, 1, MPI_MIN, expression.getCommunicationGroup() ); } return result; } @@ -40,26 +39,25 @@ auto DistributedExpressionArgMin( const Expression& expression ) using RealType = std::decay_t< decltype( expression[0] ) >; using IndexType = typename Expression::IndexType; using ResultType = std::pair< RealType, IndexType >; - using CommunicatorType = typename Expression::CommunicatorType; static_assert( std::numeric_limits< RealType >::is_specialized, "std::numeric_limits is not specialized for the reduction's real type" ); ResultType result( -1, std::numeric_limits< RealType >::max() ); const auto group = expression.getCommunicationGroup(); - if( group != CommunicatorType::NullGroup ) { + if( group != MPI::NullGroup() ) { // compute local argMin ResultType localResult = ExpressionArgMin( expression.getConstLocalView() ); // transform local index to global index localResult.second += expression.getLocalRange().getBegin(); // scatter local result to all processes and gather their results - const int nproc = CommunicatorType::GetSize( group ); + const int nproc = MPI::GetSize( group ); ResultType dataForScatter[ nproc ]; for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = localResult; ResultType gatheredResults[ nproc ]; // NOTE: exchanging general data types does not work with MPI - //CommunicatorType::Alltoall( dataForScatter, 1, gatheredResults, 1, group ); - CommunicatorType::Alltoall( (char*) dataForScatter, sizeof(ResultType), (char*) gatheredResults, sizeof(ResultType), group ); + //MPI::Alltoall( dataForScatter, 1, gatheredResults, 1, group ); + MPI::Alltoall( (char*) dataForScatter, sizeof(ResultType), (char*) 
gatheredResults, sizeof(ResultType), group ); // reduce the gathered data const auto* _data = gatheredResults; // workaround for nvcc which does not allow to capture variable-length arrays (even in pure host code!) @@ -82,14 +80,13 @@ template< typename Expression > auto DistributedExpressionMax( const Expression& expression ) -> std::decay_t< decltype( expression[0] ) > { using ResultType = std::decay_t< decltype( expression[0] ) >; - using CommunicatorType = typename Expression::CommunicatorType; static_assert( std::numeric_limits< ResultType >::is_specialized, "std::numeric_limits is not specialized for the reduction's result type" ); ResultType result = std::numeric_limits< ResultType >::lowest(); - if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( expression.getCommunicationGroup() != MPI::NullGroup() ) { const ResultType localResult = ExpressionMax( expression.getConstLocalView() ); - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_MAX, expression.getCommunicationGroup() ); + MPI::Allreduce( &localResult, &result, 1, MPI_MAX, expression.getCommunicationGroup() ); } return result; } @@ -101,26 +98,25 @@ auto DistributedExpressionArgMax( const Expression& expression ) using RealType = std::decay_t< decltype( expression[0] ) >; using IndexType = typename Expression::IndexType; using ResultType = std::pair< RealType, IndexType >; - using CommunicatorType = typename Expression::CommunicatorType; static_assert( std::numeric_limits< RealType >::is_specialized, "std::numeric_limits is not specialized for the reduction's real type" ); ResultType result( -1, std::numeric_limits< RealType >::lowest() ); const auto group = expression.getCommunicationGroup(); - if( group != CommunicatorType::NullGroup ) { + if( group != MPI::NullGroup() ) { // compute local argMax ResultType localResult = ExpressionArgMax( expression.getConstLocalView() ); // transform local index to global index localResult.second += 
expression.getLocalRange().getBegin(); // scatter local result to all processes and gather their results - const int nproc = CommunicatorType::GetSize( group ); + const int nproc = MPI::GetSize( group ); ResultType dataForScatter[ nproc ]; for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = localResult; ResultType gatheredResults[ nproc ]; // NOTE: exchanging general data types does not work with MPI - //CommunicatorType::Alltoall( dataForScatter, 1, gatheredResults, 1, group ); - CommunicatorType::Alltoall( (char*) dataForScatter, sizeof(ResultType), (char*) gatheredResults, sizeof(ResultType), group ); + //MPI::Alltoall( dataForScatter, 1, gatheredResults, 1, group ); + MPI::Alltoall( (char*) dataForScatter, sizeof(ResultType), (char*) gatheredResults, sizeof(ResultType), group ); // reduce the gathered data const auto* _data = gatheredResults; // workaround for nvcc which does not allow to capture variable-length arrays (even in pure host code!) @@ -143,12 +139,11 @@ template< typename Expression > auto DistributedExpressionSum( const Expression& expression ) -> std::decay_t< decltype( expression[0] ) > { using ResultType = std::decay_t< decltype( expression[0] ) >; - using CommunicatorType = typename Expression::CommunicatorType; ResultType result = 0; - if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( expression.getCommunicationGroup() != MPI::NullGroup() ) { const ResultType localResult = ExpressionSum( expression.getConstLocalView() ); - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_SUM, expression.getCommunicationGroup() ); + MPI::Allreduce( &localResult, &result, 1, MPI_SUM, expression.getCommunicationGroup() ); } return result; } @@ -157,12 +152,11 @@ template< typename Expression > auto DistributedExpressionProduct( const Expression& expression ) -> std::decay_t< decltype( expression[0] * expression[0] ) > { using ResultType = std::decay_t< decltype( expression[0] ) >; - using CommunicatorType = typename 
Expression::CommunicatorType; ResultType result = 1; - if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( expression.getCommunicationGroup() != MPI::NullGroup() ) { const ResultType localResult = ExpressionProduct( expression.getConstLocalView() ); - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_PROD, expression.getCommunicationGroup() ); + MPI::Allreduce( &localResult, &result, 1, MPI_PROD, expression.getCommunicationGroup() ); } return result; } @@ -171,14 +165,13 @@ template< typename Expression > auto DistributedExpressionLogicalAnd( const Expression& expression ) -> std::decay_t< decltype( expression[0] && expression[0] ) > { using ResultType = std::decay_t< decltype( expression[0] && expression[0] ) >; - using CommunicatorType = typename Expression::CommunicatorType; static_assert( std::numeric_limits< ResultType >::is_specialized, "std::numeric_limits is not specialized for the reduction's result type" ); ResultType result = std::numeric_limits< ResultType >::max(); - if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( expression.getCommunicationGroup() != MPI::NullGroup() ) { const ResultType localResult = ExpressionLogicalAnd( expression.getConstLocalView() ); - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, expression.getCommunicationGroup() ); + MPI::Allreduce( &localResult, &result, 1, MPI_LAND, expression.getCommunicationGroup() ); } return result; } @@ -187,12 +180,11 @@ template< typename Expression > auto DistributedExpressionLogicalOr( const Expression& expression ) -> std::decay_t< decltype( expression[0] || expression[0] ) > { using ResultType = std::decay_t< decltype( expression[0] || expression[0] ) >; - using CommunicatorType = typename Expression::CommunicatorType; ResultType result = 0; - if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( expression.getCommunicationGroup() != MPI::NullGroup() ) { const ResultType localResult = 
ExpressionLogicalOr( expression.getConstLocalView() ); - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LOR, expression.getCommunicationGroup() ); + MPI::Allreduce( &localResult, &result, 1, MPI_LOR, expression.getCommunicationGroup() ); } return result; } @@ -201,14 +193,13 @@ template< typename Expression > auto DistributedExpressionBinaryAnd( const Expression& expression ) -> std::decay_t< decltype( expression[0] | expression[0] ) > { using ResultType = std::decay_t< decltype( expression[0] & expression[0] ) >; - using CommunicatorType = typename Expression::CommunicatorType; static_assert( std::numeric_limits< ResultType >::is_specialized, "std::numeric_limits is not specialized for the reduction's result type" ); ResultType result = std::numeric_limits< ResultType >::max(); - if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( expression.getCommunicationGroup() != MPI::NullGroup() ) { const ResultType localResult = ExpressionLogicalBinaryAnd( expression.getConstLocalView() ); - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_BAND, expression.getCommunicationGroup() ); + MPI::Allreduce( &localResult, &result, 1, MPI_BAND, expression.getCommunicationGroup() ); } return result; } @@ -217,12 +208,11 @@ template< typename Expression > auto DistributedExpressionBinaryOr( const Expression& expression ) -> std::decay_t< decltype( expression[0] | expression[0] ) > { using ResultType = std::decay_t< decltype( expression[0] | expression[0] ) >; - using CommunicatorType = typename Expression::CommunicatorType; ResultType result = 0; - if( expression.getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( expression.getCommunicationGroup() != MPI::NullGroup() ) { const ResultType localResult = ExpressionBinaryOr( expression.getConstLocalView() ); - CommunicatorType::Allreduce( &localResult, &result, 1, MPI_BOR, expression.getCommunicationGroup() ); + MPI::Allreduce( &localResult, &result, 1, MPI_BOR, 
expression.getCommunicationGroup() ); } return result; } diff --git a/src/TNL/Containers/NDArray.h b/src/TNL/Containers/NDArray.h index 7b8a2f31c388f3e1836a86a8579b0a463cb710a7..f8ba157ba6ce1e8fc85c4b9a28526808e8bb2597 100644 --- a/src/TNL/Containers/NDArray.h +++ b/src/TNL/Containers/NDArray.h @@ -59,10 +59,8 @@ public: NDArrayStorage() = default; - // The copy-constructor of TNL::Containers::Array makes shallow copy so our - // copy-constructor cannot be default. Actually, we most likely don't need - // it anyway, so let's just delete it. - NDArrayStorage( const NDArrayStorage& ) = delete; + // Copy constructor (makes a deep copy). + explicit NDArrayStorage( const NDArrayStorage& ) = default; // Standard copy-semantics with deep copy, just like regular 1D array. // Mismatched sizes cause reallocations. @@ -326,21 +324,49 @@ template< typename Value, typename SizesHolder, typename Permutation = std::make_index_sequence< SizesHolder::getDimension() >, // identity by default typename Device = Devices::Host, - typename Index = typename SizesHolder::IndexType > + typename Index = typename SizesHolder::IndexType, + typename Allocator = typename Allocators::Default< Device >::template Allocator< Value > > class NDArray -: public NDArrayStorage< Array< Value, Device, Index >, +: public NDArrayStorage< Array< Value, Device, Index, Allocator >, SizesHolder, Permutation, __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > > { - using Base = NDArrayStorage< Array< Value, Device, Index >, + using Base = NDArrayStorage< Array< Value, Device, Index, Allocator >, SizesHolder, Permutation, __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > >; public: - // inherit all assignment operators + // inherit all constructors and assignment operators + using Base::Base; using Base::operator=; + + // default constructor + NDArray() = default; + + // implement dynamic array interface + using AllocatorType = Allocator; + + NDArray( const NDArray& allocator ) + { + // set empty array containing 
the specified allocator + this->getStorageArray() = Array< Value, Device, Index, Allocator >( allocator ); + } + + // Copy constructor with a specific allocator (makes a deep copy). + explicit NDArray( const NDArray& other, const AllocatorType& allocator ) + { + // set empty array containing the specified allocator + this->array = Array< Value, Device, Index, Allocator >( allocator ); + // copy the data + *this = other; + } + + AllocatorType getAllocator() const + { + return this->array.getAllocator(); + } }; template< typename Value, @@ -372,21 +398,49 @@ template< typename Value, typename Permutation = std::make_index_sequence< SizesHolder::getDimension() >, // identity by default typename SliceInfo = SliceInfo<>, // no slicing by default typename Device = Devices::Host, - typename Index = typename SizesHolder::IndexType > + typename Index = typename SizesHolder::IndexType, + typename Allocator = typename Allocators::Default< Device >::template Allocator< Value > > class SlicedNDArray -: public NDArrayStorage< Array< Value, Device, Index >, +: public NDArrayStorage< Array< Value, Device, Index, Allocator >, SizesHolder, Permutation, __ndarray_impl::SlicedNDArrayBase< SliceInfo > > { - using Base = NDArrayStorage< Array< Value, Device, Index >, + using Base = NDArrayStorage< Array< Value, Device, Index, Allocator >, SizesHolder, Permutation, __ndarray_impl::SlicedNDArrayBase< SliceInfo > >; public: - // inherit all assignment operators + // inherit all constructors and assignment operators + using Base::Base; using Base::operator=; + + // default constructor + SlicedNDArray() = default; + + // implement dynamic array interface + using AllocatorType = Allocator; + + SlicedNDArray( const SlicedNDArray& allocator ) + { + // set empty array containing the specified allocator + this->getStorageArray() = Array< Value, Device, Index, Allocator >( allocator ); + } + + // Copy constructor with a specific allocator (makes a deep copy). 
+ explicit SlicedNDArray( const SlicedNDArray& other, const AllocatorType& allocator ) + { + // set empty array containing the specified allocator + this->array = Array< Value, Device, Index, Allocator >( allocator ); + // copy the data + *this = other; + } + + AllocatorType getAllocator() const + { + return this->array.getAllocator(); + } }; } // namespace Containers diff --git a/src/TNL/Containers/Partitioner.h b/src/TNL/Containers/Partitioner.h index f0b50747599fad82e9dff3ca21f3cf4944782cf1..6d3605b5a7449faa9664a2e641d6a32cf92e5659 100644 --- a/src/TNL/Containers/Partitioner.h +++ b/src/TNL/Containers/Partitioner.h @@ -12,25 +12,27 @@ #pragma once +#include + #include "Subrange.h" +#include "ByteArraySynchronizer.h" #include namespace TNL { namespace Containers { -template< typename Index, typename Communicator > +template< typename Index > class Partitioner { - using CommunicationGroup = typename Communicator::CommunicationGroup; public: using SubrangeType = Subrange< Index >; - static SubrangeType splitRange( Index globalSize, CommunicationGroup group ) + static SubrangeType splitRange( Index globalSize, MPI_Comm group ) { - if( group != Communicator::NullGroup ) { - const int rank = Communicator::GetRank( group ); - const int partitions = Communicator::GetSize( group ); + if( group != MPI::NullGroup() ) { + const int rank = MPI::GetRank( group ); + const int partitions = MPI::GetSize( group ); const Index begin = TNL::min( globalSize, rank * globalSize / partitions ); const Index end = TNL::min( globalSize, (rank + 1) * globalSize / partitions ); return SubrangeType( begin, end ); @@ -66,13 +68,77 @@ public: const Index end = min( globalSize, (rank + 1) * globalSize / partitions ); return end - begin; } -}; -// TODO: -// - partitioner in deal.II stores also ghost indices: -// https://www.dealii.org/8.4.0/doxygen/deal.II/classUtilities_1_1MPI_1_1Partitioner.html -// - ghost indices are stored in a general IndexMap class (based on collection of subranges): -// 
https://www.dealii.org/8.4.0/doxygen/deal.II/classIndexSet.html + template< typename Device > + class ArraySynchronizer + : public ByteArraySynchronizer< Device, Index > + { + using Base = ByteArraySynchronizer< Device, Index >; + + SubrangeType localRange; + int overlaps; + MPI_Comm group; + + public: + using ByteArrayView = typename Base::ByteArrayView; + using RequestsVector = typename Base::RequestsVector; + + ~ArraySynchronizer() + { + // wait for pending async operation, otherwise it would crash + if( this->async_op.valid() ) + this->async_op.wait(); + } + + ArraySynchronizer() = delete; + + ArraySynchronizer( SubrangeType localRange, int overlaps, MPI_Comm group ) + : localRange(localRange), overlaps(overlaps), group(group) + {} + + virtual void synchronizeByteArray( ByteArrayView array, int bytesPerValue ) override + { + auto requests = synchronizeByteArrayAsyncWorker( array, bytesPerValue ); + MPI::Waitall( requests.data(), requests.size() ); + } + + virtual RequestsVector synchronizeByteArrayAsyncWorker( ByteArrayView array, int bytesPerValue ) override + { + TNL_ASSERT_EQ( array.getSize(), bytesPerValue * (localRange.getSize() + 2 * overlaps), + "unexpected array size" ); + + const int rank = MPI::GetRank( group ); + const int nproc = MPI::GetSize( group ); + const int left = (rank > 0) ? rank - 1 : nproc - 1; + const int right = (rank < nproc - 1) ? 
rank + 1 : 0; + + // buffer for asynchronous communication requests + std::vector< MPI_Request > requests; + + // issue all async receive operations + requests.push_back( MPI::Irecv( + array.getData() + bytesPerValue * localRange.getSize(), + bytesPerValue * overlaps, + left, 0, group ) ); + requests.push_back( MPI::Irecv( + array.getData() + bytesPerValue * (localRange.getSize() + overlaps), + bytesPerValue * overlaps, + right, 0, group ) ); + + // issue all async send operations + requests.push_back( MPI::Isend( + array.getData(), + bytesPerValue * overlaps, + left, 0, group ) ); + requests.push_back( MPI::Isend( + array.getData() + bytesPerValue * (localRange.getSize() - overlaps), + bytesPerValue * overlaps, + right, 0, group ) ); + + return requests; + } + }; +}; } // namespace Containers } // namespace TNL diff --git a/src/TNL/Exceptions/MPIDimsCreateError.h b/src/TNL/Exceptions/MPIDimsCreateError.h deleted file mode 100644 index 1cb1a8f2e61abedb6b7798828918a5f118d9fd89..0000000000000000000000000000000000000000 --- a/src/TNL/Exceptions/MPIDimsCreateError.h +++ /dev/null @@ -1,28 +0,0 @@ -/*************************************************************************** - MPIDimsCreateError.h - description - ------------------- - begin : Jan 30, 2019 - copyright : (C) 2019 by Tomas Oberhuber et al. - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include - -namespace TNL { -namespace Exceptions { - -struct MPIDimsCreateError - : public std::runtime_error -{ - MPIDimsCreateError() - : std::runtime_error( "The program tries to call MPI_Dims_create with wrong dimensions." - "Non of the dimensions is zero and product of all dimensions does not fit with number of MPI processes." 
) - {} -}; - -} // namespace Exceptions -} // namespace TNL diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h index 7bfeb4976f46bc98e70454e16b63b94a26bd6dab..3e1ea757b9dd656de17a5fe224695b99e3791e6d 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalProblem_impl.h @@ -4,7 +4,7 @@ * and open the template in the editor. */ -/* +/* * File: tnlFastSweepingMethod_impl.h * Author: oberhuber * @@ -25,7 +25,7 @@ String tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >:: getType() { - return String( "DirectEikonalProblem< " + + return String( "DirectEikonalProblem< " + Mesh::getType() + ", " + Anisotropy::getType() + ", " + Real::getType() + ", " + @@ -54,7 +54,7 @@ tnlDirectEikonalProblem< Mesh, Communicator, Anisotropy, Real, Index >:: writeProlog( Logger& logger, const Config::ParameterContainer& parameters ) const { - + } template< typename Mesh, @@ -123,7 +123,7 @@ setInitialCondition( const Config::ParameterContainer& parameters, { this->bindDofs( dofs ); String inputFile = parameters.getParameter< String >( "input-file" ); - this->initialData->setMesh( this->getMesh() ); + this->initialData->setMesh( this->getMesh() ); if( CommunicatorType::isDistributed() ) { std::cout<<"Nodes Distribution: " << initialData->getMesh().getDistributedMesh()->printProcessDistr() << std::endl; @@ -132,7 +132,7 @@ setInitialCondition( const Config::ParameterContainer& parameters, if(distributedIOType==Meshes::DistributedMeshes::LocalCopy) Meshes::DistributedMeshes::DistributedGridIO ::load(inputFile, *initialData ); synchronizer.setDistributedGrid( initialData->getMesh().getDistributedMesh() ); - synchronizer.template synchronize( *initialData ); + synchronizer.synchronize( *initialData 
); } else { @@ -190,7 +190,7 @@ solve( DofVectorPointer& dofs ) { FastSweepingMethod< MeshType, Communicator,AnisotropyType > fsm; fsm.solve( this->getMesh(), u, anisotropy, initialData ); - + makeSnapshot(); return true; } diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h index a1ca740e4ba7743935ca34797ae21a532a47ac8f..14a52ec40cb5349b741e2880a9474b5ff2b210d9 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h @@ -4,7 +4,7 @@ * and open the template in the editor. */ -/* +/* * File: tnlFastSweepingMethod2D_impl.h * Author: oberhuber * @@ -24,7 +24,7 @@ FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisot FastSweepingMethod() : maxIterations( 1 ) { - + } template< typename Real, @@ -36,7 +36,7 @@ const Index& FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: getMaxIterations() const { - + } template< typename Real, @@ -48,68 +48,68 @@ void FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: setMaxIterations( const IndexType& maxIterations ) { - + } template< typename Real, typename Device, typename Index, typename Communicator, - typename Anisotropy > + typename Anisotropy > void FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: solve( const MeshPointer& mesh, MeshFunctionPointer& Aux, const AnisotropyPointer& anisotropy, const MeshFunctionPointer& u ) -{ +{ MeshFunctionPointer auxPtr; InterfaceMapPointer interfaceMapPtr; auxPtr->setMesh( mesh ); interfaceMapPtr->setMesh( mesh ); - + // Setting overlaps ( WITHOUT MPI SHOULD BE 0 ) StaticVector vecLowerOverlaps, vecUpperOverlaps; setOverlaps( vecLowerOverlaps, vecUpperOverlaps, 
mesh ); - + std::cout << "Initiating the interface cells ..." << std::endl; BaseType::initInterface( u, auxPtr, interfaceMapPtr, vecLowerOverlaps, vecUpperOverlaps ); - + //auxPtr->save( "aux-ini.tnl" ); - + typename MeshType::Cell cell( *mesh ); - + IndexType iteration( 0 ); InterfaceMapType interfaceMap = *interfaceMapPtr; MeshFunctionType aux = *auxPtr; synchronizer.setDistributedGrid( aux.getMesh().getDistributedMesh() ); - synchronizer.template synchronize< Communicator >( aux ); //synchronize initialized overlaps - - std::cout << "Calculating the values ..." << std::endl; + synchronizer.synchronize( aux ); //synchronize initialized overlaps + + std::cout << "Calculating the values ..." << std::endl; while( iteration < this->maxIterations ) { - // calculatedBefore indicates weather we calculated in the last passage of the while cycle - // calculatedBefore is same for all ranks + // calculatedBefore indicates weather we calculated in the last passage of the while cycle + // calculatedBefore is same for all ranks // without MPI should be FALSE at the end of while cycle body int calculatedBefore = 1; - + // calculateMPIAgain indicates if the thread should calculate again in upcoming passage of while cycle // calculateMPIAgain is a value that can differ in every rank // without MPI should be FALSE at the end of while cycle body - int calculateMPIAgain = 1; - + int calculateMPIAgain = 1; + while( calculatedBefore ) { calculatedBefore = 0; - + if( std::is_same< DeviceType, Devices::Host >::value && calculateMPIAgain ) // should we calculate in Host? { calculateMPIAgain = 0; - + /**--HERE-IS-PARALLEL-OMP-CODE--!!!WITHOUT MPI!!!--------------------**/ /* int numThreadsPerBlock = -1; - + numThreadsPerBlock = ( mesh->getDimensions().x()/2 + (mesh->getDimensions().x() % 2 != 0 ? 
1:0)); //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock); if( numThreadsPerBlock <= 16 ) @@ -127,28 +127,28 @@ solve( const MeshPointer& mesh, else numThreadsPerBlock = 1024; //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock); - + if( numThreadsPerBlock == -1 ){ printf("Fail in setting numThreadsPerBlock.\n"); break; } - - - + + + int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0); int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0); - + //std::cout << "numBlocksX = " << numBlocksX << std::endl; - + //Real **sArray = new Real*[numBlocksX*numBlocksY]; //for( int i = 0; i < numBlocksX * numBlocksY; i++ ) // sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)]; - + ArrayContainer BlockIterHost; BlockIterHost.setSize( numBlocksX * numBlocksY ); BlockIterHost.setValue( 1 ); int IsCalculationDone = 1; - + MeshFunctionPointer helpFunc( mesh ); MeshFunctionPointer helpFunc1( mesh ); helpFunc1 = auxPtr; @@ -164,7 +164,7 @@ solve( const MeshPointer& mesh, // std::cout<template updateBlocks< 1028 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); } - - - //Reduction + + + //Reduction for( int i = 0; i < BlockIterHost.getSize(); i++ ){ if( IsCalculationDone == 0 ){ IsCalculationDone = IsCalculationDone || BlockIterHost[ i ]; @@ -196,16 +196,16 @@ solve( const MeshPointer& mesh, } numWhile++; //std::cout <<"numWhile = "<< numWhile <-1; j-- ){ // for( int i = 0; i < numBlocksX; i++ ) // std::cout << BlockIterHost[ j * numBlocksX + i ]; // std::cout << std::endl; // } // std::cout << std::endl; - + this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY ); - + //std::cout<getDimensions().x() - vecUpperOverlaps[0]; calculatedBefore = goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); //aux.save("aux-1.tnl"); - + // UP and LEFL boundsFrom[1] = 
vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1]; boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = -1 + vecLowerOverlaps[0]; goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); //aux.save( "aux-2.tnl" ); - + // DOWN and RIGHT boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1]; boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); //aux.save( "aux-3.tnl" ); - + // DOWN and LEFT boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1]; boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - + } if( std::is_same< DeviceType, Devices::Cuda >::value && calculateMPIAgain ) // should we calculate on CUDA? { calculateMPIAgain = 0; - + #ifdef HAVE_CUDA TNL_CHECK_CUDA_DEVICE; // Maximum cudaBlockSite is 32. Because of maximum num. of threads in kernel. // IF YOU CHANGE THIS, YOU NEED TO CHANGE THE TEMPLATE PARAMETER IN CudaUpdateCellCaller (The Number + 2) const int cudaBlockSize( 16 ); - + // Setting number of threads and blocks for kernel int numBlocksX = Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize ); int numBlocksY = Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize ); dim3 blockSize( cudaBlockSize, cudaBlockSize ); dim3 gridSize( numBlocksX, numBlocksY ); - + // Need for calling functions from kernel BaseType ptr; - + // True if we should calculate again. int calculateCudaBlocksAgain = 1; - + // Array that identifies which blocks should be calculated. 
// All blocks should calculate in first passage ( setValue(1) ) TNL::Containers::Array< int, Devices::Cuda, IndexType > blockCalculationIndicator( numBlocksX * numBlocksY ); blockCalculationIndicator.setValue( 1 ); TNL_CHECK_CUDA_DEVICE; - + // Array into which we identify the neighbours and then copy it into blockCalculationIndicator TNL::Containers::Array< int, Devices::Cuda, IndexType > blockCalculationIndicatorHelp(numBlocksX * numBlocksY ); blockCalculationIndicatorHelp.setValue( 0 ); - + // number of Blocks for kernel that calculates neighbours. int nBlocksNeigh = ( numBlocksX * numBlocksY )/1024 + ((( numBlocksX * numBlocksY )%1024 != 0) ? 1:0); - + // Helping meshFunction that switches with AuxPtr in every calculation of CudaUpdateCellCaller<<<>>>() Containers::Vector< RealType, DeviceType, IndexType > helpVec; helpVec.setLike( auxPtr.template getData().getData() ); MeshFunctionPointer helpFunc; helpFunc->bind( mesh, helpVec ); - helpFunc.template modifyData() = auxPtr.template getData(); - + helpFunc.template modifyData() = auxPtr.template getData(); + // number of iterations of while calculateCudaBlocksAgain int numIter = 0; - + //int oddEvenBlock = 0; while( calculateCudaBlocksAgain ) { /** HERE IS CHESS METHOD (NO MPI) **/ - + /* CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), @@ -302,25 +302,25 @@ solve( const MeshPointer& mesh, oddEvenBlock ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; - + oddEvenBlock= (oddEvenBlock == 0) ? 1: 0; - + CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), helpFunc.template getData< Device>(), auxPtr.template modifyData< Device>(), - blockCalculationIndicator, vecLowerOverlaps, vecUpperOverlaps, + blockCalculationIndicator, vecLowerOverlaps, vecUpperOverlaps, oddEvenBlock ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; - + oddEvenBlock= (oddEvenBlock == 0) ? 
1: 0; - + calculateCudaBlocksAgain = blockCalculationIndicator.containsValue(1); */ /**------------------------------------------------------------------------------------------------*/ - - + + /** HERE IS FIM FOR MPI AND WITHOUT MPI **/ Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr, interfaceMapPtr.template getData< Device >(), @@ -328,10 +328,10 @@ solve( const MeshPointer& mesh, blockCalculationIndicator.getView(), vecLowerOverlaps, vecUpperOverlaps ); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; - + // Switching helpFunc and auxPtr. auxPtr.swap( helpFunc ); - + // Getting blocks that should calculate in next passage. These blocks are neighbours of those that were calculated now. Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); GetNeighbours<<< nBlocksNeigh, 1024 >>>( blockCalculationIndicator.getView(), blockCalculationIndicatorHelp.getView(), numBlocksX, numBlocksY ); @@ -340,15 +340,15 @@ solve( const MeshPointer& mesh, blockCalculationIndicator = blockCalculationIndicatorHelp; cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; - + // "Parallel reduction" to see if we should calculate again calculateCudaBlocksAgain calculateCudaBlocksAgain = blockCalculationIndicator.containsValue(1); - + // When we change something then we should caclucate again in the next passage of MPI ( calculated = true ) if( calculateCudaBlocksAgain ){ calculatedBefore = 1; } - + /**-----------------------------------------------------------------------------------------------------------*/ numIter ++; } @@ -364,13 +364,13 @@ solve( const MeshPointer& mesh, #endif } - -/**----------------------MPI-TO-DO---------------------------------------------**/ + +/**----------------------MPI-TO-DO---------------------------------------------**/ #ifdef HAVE_MPI if( CommunicatorType::isDistributed() ){ getInfoFromNeighbours( calculatedBefore, calculateMPIAgain, mesh ); - - synchronizer.template synchronize< 
Communicator >( aux ); + + synchronizer.synchronize( aux ); } #endif if( !CommunicatorType::isDistributed() ) // If we start the solver without MPI, we need calculated 0! @@ -384,9 +384,9 @@ solve( const MeshPointer& mesh, // PROTECTED FUNCTIONS: -template< typename Real, typename Device, typename Index, +template< typename Real, typename Device, typename Index, typename Communicator, typename Anisotropy > -void +void FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps, const MeshPointer& mesh) @@ -406,11 +406,11 @@ setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps, -template< typename Real, typename Device, typename Index, +template< typename Real, typename Device, typename Index, typename Communicator, typename Anisotropy > -bool +bool FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: -goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, +goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, MeshFunctionType& aux, const InterfaceMapType& interfaceMap, const AnisotropyPointer& anisotropy ) { @@ -418,10 +418,10 @@ goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, const MeshType& mesh = aux.getMesh(); const IndexType stepX = boundsFrom[0] < boundsTo[0]? 1 : -1; const IndexType stepY = boundsFrom[1] < boundsTo[1]? 
1 : -1; - + typename MeshType::Cell cell( mesh ); cell.refresh(); - + for( cell.getCoordinates().y() = boundsFrom[1]; TNL::abs( cell.getCoordinates().y() - boundsTo[1] ) > 0; cell.getCoordinates().y() += stepY ) @@ -444,54 +444,54 @@ goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, #ifdef HAVE_MPI -template< typename Real, typename Device, typename Index, +template< typename Real, typename Device, typename Index, typename Communicator, typename Anisotropy > -void +void FastSweepingMethod< Meshes::Grid< 2, Real, Device, Index >, Communicator, Anisotropy >:: getInfoFromNeighbours( int& calculatedBefore, int& calculateMPIAgain, const MeshPointer& mesh ) { Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh(); - + int calculateFromNeighbours[4] = {0,0,0,0}; const int *neighbours = meshDistr->getNeighbors(); // Getting neighbors of distributed mesh MPI::Request *requestsInformation; - requestsInformation = new MPI::Request[ meshDistr->getNeighborsCount() ]; - + requestsInformation = new MPI::Request[ meshDistr->getNeighborsCount() ]; + int neighCount = 0; // should this thread calculate again? 
- + if( neighbours[0] != -1 ) // LEFT { requestsInformation[neighCount++] = MPI::ISend( &calculatedBefore, 1, neighbours[0], 0, MPI::AllGroup ); - requestsInformation[neighCount++] = + requestsInformation[neighCount++] = MPI::IRecv( &calculateFromNeighbours[0], 1, neighbours[0], 0, MPI::AllGroup ); } - + if( neighbours[1] != -1 ) // RIGHT { requestsInformation[neighCount++] = - MPI::ISend( &calculatedBefore, 1, neighbours[1], 0, MPI::AllGroup ); - requestsInformation[neighCount++] = + MPI::ISend( &calculatedBefore, 1, neighbours[1], 0, MPI::AllGroup ); + requestsInformation[neighCount++] = MPI::IRecv( &calculateFromNeighbours[1], 1, neighbours[1], 0, MPI::AllGroup ); } - + if( neighbours[2] != -1 ) //UP { - requestsInformation[neighCount++] = + requestsInformation[neighCount++] = MPI::ISend( &calculatedBefore, 1, neighbours[2], 0, MPI::AllGroup ); requestsInformation[neighCount++] = MPI::IRecv( &calculateFromNeighbours[2], 1, neighbours[2], 0, MPI::AllGroup ); } - + if( neighbours[5] != -1 ) //DOWN { - requestsInformation[neighCount++] = + requestsInformation[neighCount++] = MPI::ISend( &calculatedBefore, 1, neighbours[5], 0, MPI::AllGroup ); - requestsInformation[neighCount++] = + requestsInformation[neighCount++] = MPI::IRecv( &calculateFromNeighbours[3], 1, neighbours[5], 0, MPI::AllGroup ); } MPI::WaitAll( requestsInformation, neighCount ); - + MPI::Allreduce( &calculatedBefore, &calculatedBefore, 1, MPI_LOR, MPI::AllGroup ); calculateMPIAgain = calculateFromNeighbours[0] || calculateFromNeighbours[1] || calculateFromNeighbours[2] || calculateFromNeighbours[3]; diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h index add4d9610b1fff9bd08d8901a3a708f94d66f1b1..9468ff1db32fe86e2546052dfa77f9dc268d1182 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +++ 
b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h @@ -4,7 +4,7 @@ * and open the template in the editor. */ -/* +/* * File: tnlFastSweepingMethod2D_impl.h * Author: oberhuber * @@ -24,7 +24,7 @@ FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisot FastSweepingMethod() : maxIterations( 1 ) { - + } template< typename Real, @@ -36,7 +36,7 @@ const Index& FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >:: getMaxIterations() const { - + } template< typename Real, @@ -48,7 +48,7 @@ void FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >:: setMaxIterations( const IndexType& maxIterations ) { - + } template< typename Real, @@ -67,46 +67,46 @@ solve( const MeshPointer& mesh, InterfaceMapPointer interfaceMapPtr; auxPtr->setMesh( mesh ); interfaceMapPtr->setMesh( mesh ); - + // getting overlaps ( WITHOUT MPI SHOULD BE 0 ) Containers::StaticVector< 3, IndexType > vecLowerOverlaps, vecUpperOverlaps; setOverlaps( vecLowerOverlaps, vecUpperOverlaps, mesh ); - + std::cout << "Initiating the interface cells ..." 
<< std::endl; BaseType::initInterface( u, auxPtr, interfaceMapPtr, vecLowerOverlaps, vecUpperOverlaps ); - auxPtr->save( "aux-ini.tnl" ); - + auxPtr->save( "aux-ini.tnl" ); + typename MeshType::Cell cell( *mesh ); - + IndexType iteration( 0 ); MeshFunctionType aux = *auxPtr; InterfaceMapType interfaceMap = * interfaceMapPtr; synchronizer.setDistributedGrid( aux.getMesh().getDistributedMesh() ); - synchronizer.template synchronize< Communicator >( aux ); //synchronization of intial conditions - + synchronizer.synchronize( aux ); //synchronization of intial conditions + while( iteration < this->maxIterations ) { - // indicates weather we calculated in the last passage of the while cycle - // calculatedBefore is same for all ranks + // indicates weather we calculated in the last passage of the while cycle + // calculatedBefore is same for all ranks // without MPI should be FALSE at the end of while cycle body - int calculatedBefore = 1; - + int calculatedBefore = 1; + // indicates if the MPI process should calculate again in upcoming passage of cycle // calculateMPIAgain is a value that can differ in every rank // without MPI should be FALSE at the end of while cycle body - int calculateMPIAgain = 1; - + int calculateMPIAgain = 1; + while( calculatedBefore ) { calculatedBefore = 0; - + if( std::is_same< DeviceType, Devices::Host >::value && calculateMPIAgain ) // should we calculate in Host? { calculateMPIAgain = 0; - + /** HERE IS FSM FOR OPENMP (NO MPI) - isnt worthy */ /*int numThreadsPerBlock = -1; - + numThreadsPerBlock = ( mesh->getDimensions().x()/2 + (mesh->getDimensions().x() % 2 != 0 ? 
1:0)); //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock); if( numThreadsPerBlock <= 16 ) @@ -124,26 +124,26 @@ solve( const MeshPointer& mesh, else numThreadsPerBlock = 1024; //printf("numThreadsPerBlock = %d\n", numThreadsPerBlock); - + if( numThreadsPerBlock == -1 ){ printf("Fail in setting numThreadsPerBlock.\n"); break; } - + int numBlocksX = mesh->getDimensions().x() / numThreadsPerBlock + (mesh->getDimensions().x() % numThreadsPerBlock != 0 ? 1:0); int numBlocksY = mesh->getDimensions().y() / numThreadsPerBlock + (mesh->getDimensions().y() % numThreadsPerBlock != 0 ? 1:0); int numBlocksZ = mesh->getDimensions().z() / numThreadsPerBlock + (mesh->getDimensions().z() % numThreadsPerBlock != 0 ? 1:0); //std::cout << "numBlocksX = " << numBlocksX << std::endl; - + //Real **sArray = new Real*[numBlocksX*numBlocksY]; // for( int i = 0; i < numBlocksX * numBlocksY; i++ ) // sArray[ i ] = new Real [ (numThreadsPerBlock + 2)*(numThreadsPerBlock + 2)]; - + ArrayContainer BlockIterHost; BlockIterHost.setSize( numBlocksX * numBlocksY * numBlocksZ ); BlockIterHost.setValue( 1 ); int IsCalculationDone = 1; - + MeshFunctionPointer helpFunc( mesh ); MeshFunctionPointer helpFunc1( mesh ); helpFunc1 = auxPtr; @@ -159,7 +159,7 @@ solve( const MeshPointer& mesh, // std::cout<template updateBlocks< 1028 >( interfaceMap, *auxPtr, *helpFunc, BlockIterHost, numThreadsPerBlock ); } - //Reduction + //Reduction for( int i = 0; i < BlockIterHost.getSize(); i++ ){ if( IsCalculationDone == 0 ){ IsCalculationDone = IsCalculationDone || BlockIterHost[ i ]; @@ -188,10 +188,10 @@ solve( const MeshPointer& mesh, } } numWhile++; - - + + this->getNeighbours( BlockIterHost, numBlocksX, numBlocksY, numBlocksZ ); - + //string s( "aux-"+ std::to_string(numWhile) + ".tnl"); //aux.save( s ); } @@ -200,60 +200,60 @@ solve( const MeshPointer& mesh, } aux = *auxPtr;*/ /**------------------------------------------------------------------------------*/ - - + + /** HERE IS FSM WITH MPI AND WITHOUT 
MPI */ StaticVector boundsFrom; StaticVector boundsTo; - - // TOP, NORTH and EAST + + // TOP, NORTH and EAST boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2]; boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1]; boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; calculatedBefore = goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - - // TOP, NORTH and WEST + + // TOP, NORTH and WEST boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2]; boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1]; boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - - // TOP, SOUTH and EAST + + // TOP, SOUTH and EAST boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2]; boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1]; boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - - // TOP, SOUTH and WEST + + // TOP, SOUTH and WEST boundsFrom[2] = vecLowerOverlaps[2]; boundsTo[2] = mesh->getDimensions().z() - vecUpperOverlaps[2]; boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1]; - boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; + boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - - // BOTTOM, NOTH and EAST + + // BOTTOM, NOTH and EAST boundsFrom[2] 
= mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2]; boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1]; boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; - goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - - // BOTTOM, NOTH and WEST + goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); + + // BOTTOM, NOTH and WEST boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2]; boundsFrom[1] = vecLowerOverlaps[1]; boundsTo[1] = mesh->getDimensions().y() - vecUpperOverlaps[1]; - boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; + boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - - // BOTTOM, SOUTH and EAST + + // BOTTOM, SOUTH and EAST boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2]; boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1]; boundsFrom[0] = vecLowerOverlaps[0]; boundsTo[0] = mesh->getDimensions().x() - vecUpperOverlaps[0]; - goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - - // BOTTOM, SOUTH and WEST + goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); + + // BOTTOM, SOUTH and WEST boundsFrom[2] = mesh->getDimensions().z() - 1 - vecUpperOverlaps[2]; boundsTo[2] = - 1 + vecLowerOverlaps[2]; boundsFrom[1] = mesh->getDimensions().y() - 1 - vecUpperOverlaps[1]; boundsTo[1] = - 1 + vecLowerOverlaps[1]; boundsFrom[0] = mesh->getDimensions().x() - 1 - vecUpperOverlaps[0]; boundsTo[0] = - 1 + vecLowerOverlaps[0]; - goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); - - + 
goThroughSweep( boundsFrom, boundsTo, aux, interfaceMap, anisotropy ); + + /**----------------------------------------------------------------------------------*/ } if( std::is_same< DeviceType, Devices::Cuda >::value && calculateMPIAgain ) @@ -263,50 +263,50 @@ solve( const MeshPointer& mesh, // the number should be less than 10^3 (num of threads in one grid is maximally 1024) // IF YOU CHANGE THIS, YOU NEED TO CHANGE THE TEMPLATE PARAMETER IN CudaUpdateCellCaller (The Number + 2) const int cudaBlockSize( 8 ); - + // Getting the number of blocks in grid in each direction (without overlaps bcs we dont calculate on overlaps) int numBlocksX = Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize ); int numBlocksY = Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize ); - int numBlocksZ = Cuda::getNumberOfBlocks( mesh->getDimensions().z() - vecLowerOverlaps[2] - vecUpperOverlaps[2], cudaBlockSize ); + int numBlocksZ = Cuda::getNumberOfBlocks( mesh->getDimensions().z() - vecLowerOverlaps[2] - vecUpperOverlaps[2], cudaBlockSize ); if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 ) std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl; - + // Making the variables for global function CudaUpdateCellCaller. dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize ); dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ ); - + BaseType ptr; // tnlDirectEikonalMethodBase type for calling of function inside CudaUpdateCellCaller - - + + int BlockIterD = 1; //variable that tells us weather we should calculate the main cuda body again - + // Array containing information about each block in grid, answering question (Have we calculated in this block?) 
TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterDevice( numBlocksX * numBlocksY * numBlocksZ ); BlockIterDevice.setValue( 1 ); // calculate all in the first passage - + // Helping Array for GetNeighbours3D TNL::Containers::Array< int, Devices::Cuda, IndexType > BlockIterPom( numBlocksX * numBlocksY * numBlocksZ ); BlockIterPom.setValue( 0 ); //doesnt matter what number - - - + + + // number of neighbours in one block (1024 threads) for GetNeighbours3D int nBlocksNeigh = ( numBlocksX * numBlocksY * numBlocksZ )/1024 + ((( numBlocksX * numBlocksY * numBlocksZ )%1024 != 0) ? 1:0); - - - //MeshFunctionPointer helpFunc1( mesh ); + + + //MeshFunctionPointer helpFunc1( mesh ); Containers::Vector< RealType, DeviceType, IndexType > helpVec; helpVec.setLike( auxPtr.template getData().getData() ); MeshFunctionPointer helpFunc; helpFunc->bind( mesh, helpVec ); helpFunc.template modifyData() = auxPtr.template getData(); Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); - + int numIter = 0; // number of passages of following while cycle - + while( BlockIterD ) //main body of cuda code { - + Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); // main function that calculates all values in each blocks // calculated values are in helpFunc @@ -319,7 +319,7 @@ solve( const MeshPointer& mesh, TNL_CHECK_CUDA_DEVICE; // Switching pointers to helpFunc and auxPtr so real results are in memory of helpFunc but here under variable auxPtr auxPtr.swap( helpFunc ); - + Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); // Neighbours of blocks that calculatedBefore in this passage should calculate in the next! 
// BlockIterDevice contains blocks that calculatedBefore in this passage and BlockIterPom those that should calculate in next (are neighbours) @@ -328,23 +328,23 @@ solve( const MeshPointer& mesh, TNL_CHECK_CUDA_DEVICE; BlockIterDevice = BlockIterPom; Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); - + // .containsValue(1) is actually parallel reduction implemented in TNL BlockIterD = BlockIterDevice.containsValue(1); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; - + numIter++; - if( BlockIterD ){ + if( BlockIterD ){ // if we calculated in this passage, we should send the info via MPI so neighbours should calculate after synchronization calculatedBefore = 1; } } if( numIter%2 == 1 ){ - + // We need auxPtr to point on memory of original auxPtr (not to helpFunc) // last passage of previous while cycle didnt calculate any number anyway so switching names doesnt effect values - auxPtr.swap( helpFunc ); + auxPtr.swap( helpFunc ); Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); } cudaDeviceSynchronize(); @@ -353,35 +353,35 @@ solve( const MeshPointer& mesh, interfaceMap = *interfaceMapPtr; #endif } - + #ifdef HAVE_MPI if( CommunicatorType::isDistributed() ) { getInfoFromNeighbours( calculatedBefore, calculateMPIAgain, mesh ); - // synchronizate the overlaps - synchronizer.template synchronize< Communicator >( aux ); + // synchronizate the overlaps + synchronizer.synchronize( aux ); } #endif - + if( !CommunicatorType::isDistributed() ) // If we start the solver without MPI, we need calculatedBefore 0! calculatedBefore = 0; //otherwise we would go throw the FSM code and CUDA FSM code again uselessly } //aux.save( "aux-8.tnl" ); iteration++; - + } // Saving the results into Aux for MakeSnapshot function. 
- Aux = auxPtr; + Aux = auxPtr; aux.save("aux-final.tnl"); } // PROTECTED FUNCTIONS: -template< typename Real, typename Device, typename Index, +template< typename Real, typename Device, typename Index, typename Communicator, typename Anisotropy > -void +void FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >:: setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps, const MeshPointer& mesh) @@ -402,11 +402,11 @@ setOverlaps( StaticVector& vecLowerOverlaps, StaticVector& vecUpperOverlaps, -template< typename Real, typename Device, typename Index, +template< typename Real, typename Device, typename Index, typename Communicator, typename Anisotropy > -bool +bool FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >:: -goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, +goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, MeshFunctionType& aux, const InterfaceMapType& interfaceMap, const AnisotropyPointer& anisotropy ) { @@ -415,10 +415,10 @@ goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, const IndexType stepX = boundsFrom[0] < boundsTo[0]? 1 : -1; const IndexType stepY = boundsFrom[1] < boundsTo[1]? 1 : -1; const IndexType stepZ = boundsFrom[2] < boundsTo[2]? 
1 : -1; - + typename MeshType::Cell cell( mesh ); cell.refresh(); - + for( cell.getCoordinates().z() = boundsFrom[2]; TNL::abs( cell.getCoordinates().z() - boundsTo[2] ) > 0; cell.getCoordinates().z() += stepZ ) @@ -446,72 +446,72 @@ goThroughSweep( const StaticVector boundsFrom, const StaticVector boundsTo, #ifdef HAVE_MPI -template< typename Real, typename Device, typename Index, +template< typename Real, typename Device, typename Index, typename Communicator, typename Anisotropy > -void +void FastSweepingMethod< Meshes::Grid< 3, Real, Device, Index >, Communicator, Anisotropy >:: getInfoFromNeighbours( int& calculatedBefore, int& calculateMPIAgain, const MeshPointer& mesh ) { Meshes::DistributedMeshes::DistributedMesh< MeshType >* meshDistr = mesh->getDistributedMesh(); - + int calculateFromNeighbours[6] = {0,0,0,0,0,0}; - + const int *neighbours = meshDistr->getNeighbors(); // Getting neighbors of distributed mesh MPI::Request *requestsInformation; - requestsInformation = new MPI::Request[ meshDistr->getNeighborsCount() ]; - + requestsInformation = new MPI::Request[ meshDistr->getNeighborsCount() ]; + int neighCount = 0; // should this thread calculate again? 
- + if( neighbours[0] != -1 ) // WEST { requestsInformation[neighCount++] = MPI::ISend( &calculatedBefore, 1, neighbours[0], 0, MPI::AllGroup ); - requestsInformation[neighCount++] = + requestsInformation[neighCount++] = MPI::IRecv( &calculateFromNeighbours[0], 1, neighbours[0], 0, MPI::AllGroup ); } - + if( neighbours[1] != -1 ) // EAST { requestsInformation[neighCount++] = - MPI::ISend( &calculatedBefore, 1, neighbours[1], 0, MPI::AllGroup ); - requestsInformation[neighCount++] = + MPI::ISend( &calculatedBefore, 1, neighbours[1], 0, MPI::AllGroup ); + requestsInformation[neighCount++] = MPI::IRecv( &calculateFromNeighbours[1], 1, neighbours[1], 0, MPI::AllGroup ); } - + if( neighbours[2] != -1 ) //NORTH { - requestsInformation[neighCount++] = + requestsInformation[neighCount++] = MPI::ISend( &calculatedBefore, 1, neighbours[2], 0, MPI::AllGroup ); requestsInformation[neighCount++] = MPI::IRecv( &calculateFromNeighbours[2], 1, neighbours[2], 0, MPI::AllGroup ); } - + if( neighbours[5] != -1 ) //SOUTH { - requestsInformation[neighCount++] = + requestsInformation[neighCount++] = MPI::ISend( &calculatedBefore, 1, neighbours[5], 0, MPI::AllGroup ); - requestsInformation[neighCount++] = + requestsInformation[neighCount++] = MPI::IRecv( &calculateFromNeighbours[3], 1, neighbours[5], 0, MPI::AllGroup ); } - - if( neighbours[8] != -1 ) // TOP + + if( neighbours[8] != -1 ) // TOP { - requestsInformation[neighCount++] = + requestsInformation[neighCount++] = MPI::ISend( &calculatedBefore, 1, neighbours[8], 0, MPI::AllGroup ); - requestsInformation[neighCount++] = + requestsInformation[neighCount++] = MPI::IRecv( &calculateFromNeighbours[4], 1, neighbours[8], 0, MPI::AllGroup ); } - + if( neighbours[17] != -1 ) //BOTTOM { requestsInformation[neighCount++] = MPI::ISend( &calculatedBefore, 1, neighbours[17], 0, MPI::AllGroup ); - requestsInformation[neighCount++] = + requestsInformation[neighCount++] = MPI::IRecv( &calculateFromNeighbours[5], 1, neighbours[17], 0, MPI::AllGroup 
); } - + MPI::WaitAll( requestsInformation, neighCount ); - + MPI::Allreduce( &calculatedBefore, &calculatedBefore, 1, MPI_LOR, MPI::AllGroup ); calculateMPIAgain = calculateFromNeighbours[0] || calculateFromNeighbours[1] || calculateFromNeighbours[2] || calculateFromNeighbours[3] || diff --git a/src/TNL/Functions/CutMeshFunction.h b/src/TNL/Functions/CutMeshFunction.h index 3cc0af53ae824a9fa8f79bf6f505dbeaa26465cf..b9ec101cf60d2bb3d266cb92d07bfbccac1eb28f 100644 --- a/src/TNL/Functions/CutMeshFunction.h +++ b/src/TNL/Functions/CutMeshFunction.h @@ -14,9 +14,8 @@ #include namespace TNL { -namespace Functions { -template < typename CommunicatorType, - typename MeshFunctionType, +namespace Functions { +template < typename MeshFunctionType, typename OutMesh, typename OutDof, int outDimension=OutMesh::getMeshDimension(), @@ -25,10 +24,10 @@ class CutMeshFunction { public: static bool Cut(MeshFunctionType &inputMeshFunction, - OutMesh &outMesh, + OutMesh &outMesh, OutDof &outData, - Containers::StaticVector savedDimensions, - Containers::StaticVector reducedDimensions, + Containers::StaticVector savedDimensions, + Containers::StaticVector reducedDimensions, Containers::StaticVector fixedIndexs ) { bool inCut; @@ -44,7 +43,7 @@ class CutMeshFunction auto toDistributedGrid=outMesh.getDistributedMesh(); TNL_ASSERT_TRUE(toDistributedGrid!=nullptr,"You are trying cut distributed meshfunction, but output grid is not set up for distribution"); - inCut=toDistributedGrid-> template SetupByCut(*fromDistributedGrid,savedDimensions,reducedDimensions,fixedIndexs); + inCut=toDistributedGrid->SetupByCut(*fromDistributedGrid,savedDimensions,reducedDimensions,fixedIndexs); if(inCut) { toDistributedGrid->setupGrid(outMesh); @@ -56,7 +55,7 @@ class CutMeshFunction { typename OutMesh::PointType outOrigin; typename OutMesh::PointType outProportions; - typename OutMesh::CoordinatesType outDimensions; + typename OutMesh::CoordinatesType outDimensions; for(int i=0; i + +#ifdef HAVE_MPI +#ifdef 
OMPI_MAJOR_VERSION + // header specific to OpenMPI (needed for CUDA-aware detection) + #include +#endif + +#include // getpid +#endif + +#include +#include +#include "Utils.h" + +namespace TNL { +namespace MPI { + +inline void configSetup( Config::ConfigDescription& config, const String& prefix = "" ) +{ +#ifdef HAVE_MPI + config.addEntry< bool >( "redirect-mpi-output", "Only process with rank 0 prints to console. Other processes are redirected to files.", true ); + config.addEntry< String >( "redirect-mpi-output-dir", "Directory where ranks will store the files if their output is redirected.", "." ); + config.addEntry< bool >( "mpi-gdb-debug", "Wait for GDB to attach the master MPI process.", false ); + config.addEntry< int >( "mpi-process-to-attach", "Number of the MPI process to be attached by GDB. Set -1 for all processes.", 0 ); +#endif +} + +inline bool setup( const Config::ParameterContainer& parameters, + const String& prefix = "" ) +{ +#ifdef HAVE_MPI + if( Initialized() && ! Finalized() ) + { + const bool redirect = parameters.getParameter< bool >( "redirect-mpi-output" ); + const String outputDirectory = parameters.getParameter< String >( "redirect-mpi-output-dir" ); + if( redirect ) + MPI::setupRedirection( outputDirectory ); +#ifdef HAVE_CUDA + if( GetSize() > 1 ) + { +#if defined( MPIX_CUDA_AWARE_SUPPORT ) && MPIX_CUDA_AWARE_SUPPORT + std::cout << "CUDA-aware MPI detected on this system ... " << std::endl; +#elif defined( MPIX_CUDA_AWARE_SUPPORT ) && !MPIX_CUDA_AWARE_SUPPORT + std::cerr << "MPI is not CUDA-aware. Please install correct version of MPI." << std::endl; + return false; +#else + std::cerr << "WARNING: TNL cannot detect if you have CUDA-aware MPI. Some problems may occur." 
<< std::endl; +#endif + } +#endif // HAVE_CUDA + bool gdbDebug = parameters.getParameter< bool >( "mpi-gdb-debug" ); + int processToAttach = parameters.getParameter< int >( "mpi-process-to-attach" ); + + if( gdbDebug ) + { + int rank = GetRank( MPI_COMM_WORLD ); + int pid = getpid(); + + volatile int tnlMPIDebugAttached = 0; + MPI_Send( &pid, 1, MPI_INT, 0, 0, MPI_COMM_WORLD ); + MPI_Barrier( MPI_COMM_WORLD ); + if( rank == 0 ) + { + std::cout << "Attach GDB to MPI process(es) by entering:" << std::endl; + for( int i = 0; i < GetSize(); i++ ) + { + MPI_Status status; + int recvPid; + MPI_Recv( &recvPid, 1, MPI_INT, i, 0, MPI_COMM_WORLD, &status ); + + if( i == processToAttach || processToAttach == -1 ) + { + std::cout << " For MPI process " << i << ": gdb -q -ex \"attach " << recvPid << "\"" + << " -ex \"set variable tnlMPIDebugAttached=1\"" + << " -ex \"continue\"" << std::endl; + } + } + std::cout << std::flush; + } + if( rank == processToAttach || processToAttach == -1 ) + while( ! tnlMPIDebugAttached ); + MPI_Barrier( MPI_COMM_WORLD ); + } + } +#endif // HAVE_MPI + return true; +} + +} // namespace MPI +} // namespace TNL diff --git a/src/TNL/MPI/DummyDefs.h b/src/TNL/MPI/DummyDefs.h new file mode 100644 index 0000000000000000000000000000000000000000..578e46dfef428084937b7be0d034a0cd5bc4a840 --- /dev/null +++ b/src/TNL/MPI/DummyDefs.h @@ -0,0 +1,51 @@ +/*************************************************************************** + MPI/DummyDefs.h - description + ------------------- + begin : Dec 29, 2020 + copyright : (C) 2020 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#ifndef HAVE_MPI +using MPI_Request = int; +using MPI_Comm = int; + +enum MPI_Op { + MPI_MAX, + MPI_MIN, + MPI_SUM, + MPI_PROD, + MPI_LAND, + MPI_BAND, + MPI_LOR, + MPI_BOR, + MPI_LXOR, + MPI_BXOR, + MPI_MINLOC, + MPI_MAXLOC, +}; + +// MPI_Init_thread constants +enum { + MPI_THREAD_SINGLE, + MPI_THREAD_FUNNELED, + MPI_THREAD_SERIALIZED, + MPI_THREAD_MULTIPLE +}; + +// Miscellaneous constants +#define MPI_ANY_SOURCE -1 /* match any source rank */ +#define MPI_PROC_NULL -2 /* rank of null process */ +#define MPI_ROOT -4 /* special value for intercomms */ +#define MPI_ANY_TAG -1 /* match any message tag */ +#define MPI_UNDEFINED -32766 /* undefined stuff */ +#define MPI_DIST_GRAPH 3 /* dist graph topology */ +#define MPI_CART 1 /* cartesian topology */ +#define MPI_GRAPH 2 /* graph topology */ +#define MPI_KEYVAL_INVALID -1 /* invalid key value */ + +#endif diff --git a/src/TNL/Communicators/MPIPrint.h b/src/TNL/MPI/Print.h similarity index 75% rename from src/TNL/Communicators/MPIPrint.h rename to src/TNL/MPI/Print.h index 6d78eafaf8c67c1c770faf01fa879d4b31b4032a..5cd4819a2951cf46093eecb6ab5052a7b278e155 100644 --- a/src/TNL/Communicators/MPIPrint.h +++ b/src/TNL/MPI/Print.h @@ -1,8 +1,8 @@ /*************************************************************************** - MPIPrint.h - description + MPI/Print.h - description ------------------- begin : Feb 7, 2019 - copyright : (C) 2019 by Tomas Oberhuber + copyright : (C) 2019 by Tomas Oberhuber et al. email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ @@ -10,34 +10,35 @@ #pragma once +#include #include -#include + +#include +#include #ifdef HAVE_MPI #define TNL_MPI_PRINT( message ) \ -if( ! TNL::Communicators::MpiCommunicator::IsInitialized() ) \ +if( ! 
TNL::MPI::Initialized() || TNL::MPI::Finalized() ) \ std::cerr << message << std::endl; \ else \ { \ - if( TNL::Communicators::MpiCommunicator::GetRank() > 0 ) \ + if( TNL::MPI::GetRank() > 0 ) \ { \ std::stringstream __tnl_mpi_print_stream_; \ - __tnl_mpi_print_stream_ << "Node " << TNL::Communicators::MpiCommunicator::GetRank() << " of " \ - << TNL::Communicators::MpiCommunicator::GetSize() << " : " << message << std::endl; \ + __tnl_mpi_print_stream_ << "Node " << TNL::MPI::GetRank() << " of " << TNL::MPI::GetSize() << " : " \ + << message << std::endl; \ TNL::String __tnl_mpi_print_string_( __tnl_mpi_print_stream_.str() ); \ mpiSend( __tnl_mpi_print_string_, 0, std::numeric_limits< int >::max() ); \ } \ else \ { \ - std::cerr << "Node 0 of " << TNL::Communicators::MpiCommunicator::GetSize() << " : " << message << std::endl; \ - for( int __tnl_mpi_print_j = 1; \ - __tnl_mpi_print_j < TNL::Communicators::MpiCommunicator::GetSize(); \ - __tnl_mpi_print_j++ ) \ - { \ - TNL::String __tnl_mpi_print_string_; \ - mpiReceive( __tnl_mpi_print_string_, __tnl_mpi_print_j, std::numeric_limits< int >::max() ); \ - std::cerr << __tnl_mpi_print_string_; \ - } \ + std::cerr << "Node 0 of " << TNL::MPI::GetSize() << " : " << message << std::endl; \ + for( int __tnl_mpi_print_j = 1; __tnl_mpi_print_j < TNL::MPI::GetSize(); __tnl_mpi_print_j++ ) \ + { \ + TNL::String __tnl_mpi_print_string_; \ + mpiReceive( __tnl_mpi_print_string_, __tnl_mpi_print_j, std::numeric_limits< int >::max() ); \ + std::cerr << __tnl_mpi_print_string_; \ + } \ } \ } #else @@ -47,11 +48,11 @@ else #ifdef HAVE_MPI #define TNL_MPI_PRINT_MASTER( message ) \ -if( ! TNL::Communicators::MpiCommunicator::IsInitialized() ) \ +if( ! 
TNL::MPI::Initialized() || TNL::MPI::Finalized() ) \ std::cerr << message << std::endl; \ else \ { \ - if( TNL::Communicators::MpiCommunicator::GetRank() == 0 ) \ + if( TNL::MPI::GetRank() == 0 ) \ { \ std::cerr << "Master node : " << message << std::endl; \ } \ @@ -63,20 +64,20 @@ else #ifdef HAVE_MPI #define TNL_MPI_PRINT_COND( condition, message ) \ -if( ! TNL::Communicators::MpiCommunicator::IsInitialized() ) \ +if( ! TNL::MPI::Initialized() || TNL::MPI::Finalized() ) \ { \ if( condition) std::cerr << message << std::endl; \ } \ else \ { \ - if( TNL::Communicators::MpiCommunicator::GetRank() > 0 ) \ + if( TNL::MPI::GetRank() > 0 ) \ { \ int __tnl_mpi_print_cnd = ( condition ); \ - TNL::Communicators::MpiCommunicator::Send( &__tnl_mpi_print_cnd, 1, 0, 0 ); \ + TNL::MPI::Send( &__tnl_mpi_print_cnd, 1, 0, 0 ); \ if( condition ) { \ std::stringstream __tnl_mpi_print_stream_; \ - __tnl_mpi_print_stream_ << "Node " << TNL::Communicators::MpiCommunicator::GetRank() << " of " \ - << TNL::Communicators::MpiCommunicator::GetSize() << " : " << message << std::endl; \ + __tnl_mpi_print_stream_ << "Node " << TNL::MPI::GetRank() << " of " << TNL::MPI::GetSize() << " : " \ + << message << std::endl; \ TNL::String __tnl_mpi_print_string_( __tnl_mpi_print_stream_.str() ); \ mpiSend( __tnl_mpi_print_string_, 0, std::numeric_limits< int >::max() ); \ } \ @@ -84,13 +85,11 @@ else else \ { \ if( condition ) \ - std::cerr << "Node 0 of " << TNL::Communicators::MpiCommunicator::GetSize() << " : " << message << std::endl; \ - for( int __tnl_mpi_print_j = 1; \ - __tnl_mpi_print_j < TNL::Communicators::MpiCommunicator::GetSize(); \ - __tnl_mpi_print_j++ ) \ + std::cerr << "Node 0 of " << TNL::MPI::GetSize() << " : " << message << std::endl; \ + for( int __tnl_mpi_print_j = 1; __tnl_mpi_print_j < TNL::MPI::GetSize(); __tnl_mpi_print_j++ ) \ { \ int __tnl_mpi_print_cond; \ - TNL::Communicators::MpiCommunicator::Recv( &__tnl_mpi_print_cond, 1, __tnl_mpi_print_j, 0 ); \ + TNL::MPI::Recv( 
&__tnl_mpi_print_cond, 1, __tnl_mpi_print_j, 0 ); \ if( __tnl_mpi_print_cond ) \ { \ TNL::String __tnl_mpi_print_string_; \ diff --git a/src/TNL/MPI/Profiling.h b/src/TNL/MPI/Profiling.h new file mode 100644 index 0000000000000000000000000000000000000000..d50427c16b2f3210ded666cc36d564547a206e03 --- /dev/null +++ b/src/TNL/MPI/Profiling.h @@ -0,0 +1,25 @@ +/*************************************************************************** + MPI/Profiling.h - description + ------------------- + begin : Jan 1, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include + +namespace TNL { +namespace MPI { + +inline Timer& getTimerAllreduce() +{ + static Timer t; + return t; +} + +} // namespace MPI +} // namespace TNL diff --git a/src/TNL/Communicators/ScopedInitializer.h b/src/TNL/MPI/ScopedInitializer.h similarity index 72% rename from src/TNL/Communicators/ScopedInitializer.h rename to src/TNL/MPI/ScopedInitializer.h index 2970bc628319bdf9d4c40d7a2cb32694a8148f7d..82ba02bc5743611bfb4af7395142de730672d548 100644 --- a/src/TNL/Communicators/ScopedInitializer.h +++ b/src/TNL/MPI/ScopedInitializer.h @@ -12,22 +12,25 @@ #pragma once +#include "Wrappers.h" +#include "Utils.h" + namespace TNL { -namespace Communicators { +namespace MPI { -template< typename Communicator > struct ScopedInitializer { - ScopedInitializer( int& argc, char**& argv ) + ScopedInitializer( int& argc, char**& argv, int required_thread_level = MPI_THREAD_SINGLE ) { - Communicator::Init( argc, argv ); + Init( argc, argv, required_thread_level ); /* forward the requested thread level instead of silently using the default */ } ~ScopedInitializer() { - Communicator::Finalize(); + restoreRedirection(); + Finalize(); } }; -} // namespace Communicators +} // namespace MPI } // namespace TNL diff --git a/src/TNL/MPI/Utils.h b/src/TNL/MPI/Utils.h new file mode 100644 index 
0000000000000000000000000000000000000000..d334aaf5bc545ea1953aa7369dcf3669dc16ae0b --- /dev/null +++ b/src/TNL/MPI/Utils.h @@ -0,0 +1,76 @@ +/*************************************************************************** + MPI/Wrappers.h - description + ------------------- + begin : Apr 23, 2005 + copyright : (C) 2005 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include + +#include "Wrappers.h" + +namespace TNL { +namespace MPI { + +inline bool isInitialized() +{ + return Initialized() && ! Finalized(); +} + +inline void setupRedirection( std::string outputDirectory ) +{ +#ifdef HAVE_MPI + if( GetSize() > 1 && GetRank() != 0 ) { + const std::string stdoutFile = outputDirectory + "/stdout_" + std::to_string(GetRank()) + ".txt"; + const std::string stderrFile = outputDirectory + "/stderr_" + std::to_string(GetRank()) + ".txt"; + std::cout << GetRank() << ": Redirecting stdout and stderr to files " << stdoutFile << " and " << stderrFile << std::endl; + Debugging::redirect_stdout_stderr( stdoutFile, stderrFile ); + } +#endif +} + +// restore redirection (usually not necessary, it uses RAII internally...) +inline void restoreRedirection() +{ + if( GetSize() > 1 && GetRank() != 0 ) { + Debugging::redirect_stdout_stderr( "", "", true ); + } +} + +/** + * \brief Returns a local rank ID of the current process within a group of + * processes running on a shared-memory node. + * + * The given MPI communicator is split into groups according to the + * `MPI_COMM_TYPE_SHARED` type (from MPI-3) and the rank ID of the process + * within the group is returned. 
+ */ +inline int getRankOnNode( MPI_Comm group = AllGroup() ) +{ +#ifdef HAVE_MPI + const int rank = GetRank(group); + + MPI_Info info; + MPI_Info_create( &info ); + + MPI_Comm local_comm; + MPI_Comm_split_type( group, MPI_COMM_TYPE_SHARED, rank, info, &local_comm ); + + const int local_rank = GetRank( local_comm ); + + MPI_Comm_free(&local_comm); + MPI_Info_free(&info); + + return local_rank; +#else + return 0; +#endif +} + +} // namespace MPI +} // namespace TNL diff --git a/src/TNL/MPI/Wrappers.h b/src/TNL/MPI/Wrappers.h new file mode 100644 index 0000000000000000000000000000000000000000..8a455dcb75d4bba5d38b993cca932a6cb2c4ea2f --- /dev/null +++ b/src/TNL/MPI/Wrappers.h @@ -0,0 +1,407 @@ +/*************************************************************************** + MPI/Wrappers.h - description + ------------------- + begin : Apr 23, 2005 + copyright : (C) 2005 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include +#include + +#ifdef HAVE_MPI + #include +#else + #include "DummyDefs.h" + #include // std::memcpy + #include +#endif + +#include +#include "getDataType.h" +#include "Profiling.h" + +namespace TNL { +namespace MPI { + +// forward declaration to break cyclic inclusion +inline void selectGPU(); + +// function wrappers for MPI constants + +inline MPI_Comm AllGroup() +{ +#ifdef HAVE_MPI + return MPI_COMM_WORLD; +#else + return 1; +#endif +} + +inline MPI_Comm NullGroup() +{ +#ifdef HAVE_MPI + return MPI_COMM_NULL; +#else + return 0; +#endif +} + +inline MPI_Request NullRequest() +{ +#ifdef HAVE_MPI + return MPI_REQUEST_NULL; +#else + return 0; +#endif +} + +// wrappers for basic MPI functions + +inline void Init( int& argc, char**& argv, int required_thread_level = MPI_THREAD_SINGLE ) +{ +#ifdef HAVE_MPI + switch( required_thread_level ) { + case MPI_THREAD_SINGLE: + case MPI_THREAD_FUNNELED: + 
case MPI_THREAD_SERIALIZED: + case MPI_THREAD_MULTIPLE: + break; + default: + std::cerr << "ERROR: invalid argument for the 'required' thread level support: " << required_thread_level << std::endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + int provided; + MPI_Init_thread( &argc, &argv, required_thread_level, &provided ); + if( provided < required_thread_level ) { + const char* level = ""; + switch( required_thread_level ) { + case MPI_THREAD_SINGLE: + level = "MPI_THREAD_SINGLE"; + break; + case MPI_THREAD_FUNNELED: + level = "MPI_THREAD_FUNNELED"; + break; + case MPI_THREAD_SERIALIZED: + level = "MPI_THREAD_SERIALIZED"; + break; + case MPI_THREAD_MULTIPLE: + level = "MPI_THREAD_MULTIPLE"; + break; + } + std::cerr << "ERROR: The MPI library does not have the required level of thread support: " << level << std::endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + selectGPU(); +#endif +} + +inline void Finalize() +{ +#ifdef HAVE_MPI + MPI_Finalize(); +#endif +} + +inline bool Initialized() +{ +#ifdef HAVE_MPI + int flag; + MPI_Initialized(&flag); + return flag; +#else + return true; +#endif +} + +inline bool Finalized() +{ +#ifdef HAVE_MPI + int flag; + MPI_Finalized(&flag); + return flag; +#else + return false; +#endif +} + +inline int GetRank( MPI_Comm group = AllGroup() ) +{ +#ifdef HAVE_MPI + TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" ); + TNL_ASSERT_NE( group, NullGroup(), "GetRank cannot be called with NullGroup" ); + int rank; + MPI_Comm_rank( group, &rank ); + return rank; +#else + return 0; +#endif +} + +inline int GetSize( MPI_Comm group = AllGroup() ) +{ +#ifdef HAVE_MPI + TNL_ASSERT_TRUE( Initialized() && ! 
Finalized(), "Fatal Error - MPI is not initialized" ); + TNL_ASSERT_NE( group, NullGroup(), "GetSize cannot be called with NullGroup" ); + int size; + MPI_Comm_size( group, &size ); + return size; +#else + return 1; +#endif +} + +// wrappers for MPI helper functions + +inline MPI_Comm Comm_split( MPI_Comm comm, int color, int key ) +{ +#ifdef HAVE_MPI + MPI_Comm newcomm; + MPI_Comm_split( comm, color, key, &newcomm ); + return newcomm; +#else + return comm; +#endif +} + +/** + * \brief Wrapper for \ref MPI_Dims_create. + * + * \param nproc - number of processes in the group to be distributed + * \param ndims - number of dimensions of the Cartesian grid + * \param dims - distribution of processes into the \e dim-dimensional + * Cartesian grid (array of length \e ndims) + * + * Negative input values of \e dims[i] are erroneous. An error will occur if + * \e nproc is not a multiple of the product of all non-zero values \e dims[i]. + * + * See the MPI documentation for more information. + */ +inline void Compute_dims( int nproc, int ndims, int* dims ) +{ +#ifdef HAVE_MPI + int prod = 1; + for( int i = 0; i < ndims; i++ ) { + if( dims[ i ] < 0 ) + throw std::invalid_argument( "Negative value passed to MPI::Compute_dims in the dims array argument." ); + if( dims[ i ] > 0 ) + prod *= dims[ i ]; + } + + if( nproc % prod != 0 ) + throw std::logic_error( "The program tries to call MPI_Dims_create with wrong dimensions." + "The product of the non-zero values dims[i] is " + std::to_string(prod) + " and the " + "number of processes (" + std::to_string(nproc) + ") is not a multiple of the product." ); + + MPI_Dims_create( nproc, ndims, dims ); +#else + for( int i = 0; i < ndims; i++) + dims[ i ] = 1; +#endif +} + +// wrappers for MPI communication functions + +inline void Barrier( MPI_Comm group = AllGroup() ) +{ +#ifdef HAVE_MPI + TNL_ASSERT_TRUE( Initialized() && ! 
Finalized(), "Fatal Error - MPI is not initialized" ); + TNL_ASSERT_NE( group, NullGroup(), "Barrier cannot be called with NullGroup" ); + MPI_Barrier(group); +#endif +} + +inline void Waitall( MPI_Request* reqs, int length ) +{ +#ifdef HAVE_MPI + TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" ); + MPI_Waitall( length, reqs, MPI_STATUSES_IGNORE ); +#endif +} + +template< typename T > +void Send( const T* data, + int count, + int dest, + int tag, + MPI_Comm group = AllGroup() ) +{ +#ifdef HAVE_MPI + TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" ); + TNL_ASSERT_NE( group, NullGroup(), "Send cannot be called with NullGroup" ); + MPI_Send( (const void*) data, count, getDataType(), dest, tag, group ); +#endif +} + +template< typename T > +void Recv( T* data, + int count, + int src, + int tag, + MPI_Comm group = AllGroup() ) +{ +#ifdef HAVE_MPI + TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" ); + TNL_ASSERT_NE( group, NullGroup(), "Recv cannot be called with NullGroup" ); + MPI_Recv( (void*) data, count, getDataType(), src, tag, group, MPI_STATUS_IGNORE ); +#endif +} + +template< typename T > +void Sendrecv( const T* sendData, + int sendCount, + int destination, + int sendTag, + T* receiveData, + int receiveCount, + int source, + int receiveTag, + MPI_Comm group = AllGroup() ) +{ +#ifdef HAVE_MPI + TNL_ASSERT_TRUE( Initialized() && ! 
Finalized(), "Fatal Error - MPI is not initialized" ); + TNL_ASSERT_NE( group, NullGroup(), "Sendrecv cannot be called with NullGroup" ); + MPI_Sendrecv( (void*) sendData, + sendCount, + getDataType(), + destination, + sendTag, + (void*) receiveData, + receiveCount, + getDataType(), + source, + receiveTag, + group, + MPI_STATUS_IGNORE ); +#else + throw Exceptions::MPISupportMissing(); +#endif +} + +template< typename T > +MPI_Request Isend( const T* data, + int count, + int dest, + int tag, + MPI_Comm group = AllGroup() ) +{ +#ifdef HAVE_MPI + TNL_ASSERT_TRUE( Initialized() && ! Finalized(), "Fatal Error - MPI is not initialized" ); + TNL_ASSERT_NE( group, NullGroup(), "Isend cannot be called with NullGroup" ); + MPI_Request req; + MPI_Isend( (const void*) data, count, getDataType(), dest, tag, group, &req ); + return req; +#else + return NullRequest(); +#endif +} + +template< typename T > +MPI_Request Irecv( T* data, + int count, + int src, + int tag, + MPI_Comm group = AllGroup() ) +{ +#ifdef HAVE_MPI + TNL_ASSERT_TRUE( Initialized() && ! 
Finalized(), "Fatal Error - MPI is not initialized" ); + TNL_ASSERT_NE( group, NullGroup(), "Irecv cannot be called with NullGroup" ); + MPI_Request req; + MPI_Irecv( (void*) data, count, getDataType(), src, tag, group, &req ); + return req; +#else + return NullRequest(); +#endif +} + +template< typename T > +void Allreduce( const T* data, + T* reduced_data, + int count, + const MPI_Op& op, + MPI_Comm group) +{ +#ifdef HAVE_MPI + TNL_ASSERT_NE( group, NullGroup(), "Allreduce cannot be called with NullGroup" ); + getTimerAllreduce().start(); + MPI_Allreduce( (const void*) data, (void*) reduced_data, count, getDataType(), op, group ); + getTimerAllreduce().stop(); +#else + std::memcpy( (void*) reduced_data, (const void*) data, count * sizeof(T) ); +#endif +} + +// in-place variant of Allreduce +template< typename T > +void Allreduce( T* data, + int count, + const MPI_Op& op, + MPI_Comm group) +{ +#ifdef HAVE_MPI + TNL_ASSERT_NE( group, NullGroup(), "Allreduce cannot be called with NullGroup" ); + getTimerAllreduce().start(); + MPI_Allreduce( MPI_IN_PLACE, (void*) data, count, getDataType(), op, group ); + getTimerAllreduce().stop(); +#endif +} + +template< typename T > +void Reduce( const T* data, + T* reduced_data, + int count, + const MPI_Op& op, + int root, + MPI_Comm group) +{ +#ifdef HAVE_MPI + TNL_ASSERT_NE( group, NullGroup(), "Reduce cannot be called with NullGroup" ); + MPI_Reduce( (const void*) data, (void*) reduced_data, count, getDataType(), op, root, group ); +#else + std::memcpy( (void*) reduced_data, (void*) data, count * sizeof(T) ); +#endif +} + +template< typename T > +void Bcast( T* data, int count, int root, MPI_Comm group) +{ +#ifdef HAVE_MPI + TNL_ASSERT_TRUE( Initialized() && ! 
Finalized(), "Fatal Error - MPI is not initialized" ); + TNL_ASSERT_NE( group, NullGroup(), "Bcast cannot be called with NullGroup" ); + MPI_Bcast( (void*) data, count, getDataType(), root, group ); +#endif +} + +template< typename T > +void Alltoall( const T* sendData, + int sendCount, + T* receiveData, + int receiveCount, + MPI_Comm group ) +{ +#ifdef HAVE_MPI + TNL_ASSERT_NE( group, NullGroup(), "Alltoall cannot be called with NullGroup" ); + MPI_Alltoall( (const void*) sendData, + sendCount, + getDataType(), + (void*) receiveData, + receiveCount, + getDataType(), + group ); +#else + TNL_ASSERT_EQ( sendCount, receiveCount, "sendCount must be equal to receiveCount when running without MPI." ); + std::memcpy( (void*) receiveData, (const void*) sendData, sendCount * sizeof(T) ); +#endif +} + +} // namespace MPI +} // namespace TNL + +// late inclusion to break cyclic inclusion +#include "selectGPU.h" diff --git a/src/TNL/MPI/getDataType.h b/src/TNL/MPI/getDataType.h new file mode 100644 index 0000000000000000000000000000000000000000..f3570679bf2708cca08de3e485890588396a051e --- /dev/null +++ b/src/TNL/MPI/getDataType.h @@ -0,0 +1,119 @@ +/*************************************************************************** + getDataType.h - description + ------------------- + begin : Feb 4, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#ifdef HAVE_MPI + #include +#endif + +namespace TNL { +namespace MPI { + +#ifdef HAVE_MPI +template< typename T > +struct TypeResolver +{ + static inline MPI_Datatype getType() + { + static_assert( sizeof(T) == sizeof(char) || + sizeof(T) == sizeof(int) || + sizeof(T) == sizeof(short int) || + sizeof(T) == sizeof(long int), + "Fatal Error - Unknown MPI Type"); + switch( sizeof(T) ) + { + case sizeof(char): + return MPI_CHAR; + case sizeof(int): + return MPI_INT; + case sizeof(short int): + return MPI_SHORT; + case sizeof(long int): + return MPI_LONG; + } + // This will never happen thanks to the static_assert above, but icpc is + // not that smart and complains about missing return statement at the end + // of non-void function. + throw 0; + } +}; + +template<> struct TypeResolver< char > +{ + static inline MPI_Datatype getType(){return MPI_CHAR;}; +}; + +template<> struct TypeResolver< int > +{ + static inline MPI_Datatype getType(){return MPI_INT;}; +}; + +template<> struct TypeResolver< short int > +{ + static inline MPI_Datatype getType(){return MPI_SHORT;}; +}; + +template<> struct TypeResolver< long int > +{ + static inline MPI_Datatype getType(){return MPI_LONG;}; +}; + +template<> struct TypeResolver< unsigned char > +{ + static inline MPI_Datatype getType(){return MPI_UNSIGNED_CHAR;}; +}; + +template<> struct TypeResolver< unsigned short int > +{ + static inline MPI_Datatype getType(){return MPI_UNSIGNED_SHORT;}; +}; + +template<> struct TypeResolver< unsigned int > +{ + static inline MPI_Datatype getType(){return MPI_UNSIGNED;}; +}; + +template<> struct TypeResolver< unsigned long int > +{ + static inline MPI_Datatype getType(){return MPI_UNSIGNED_LONG;}; +}; + +template<> struct TypeResolver< float > +{ + static inline MPI_Datatype getType(){return MPI_FLOAT;}; +}; + 
+template<> struct TypeResolver< double > +{ + static inline MPI_Datatype getType(){return MPI_DOUBLE;}; +}; + +template<> struct TypeResolver< long double > +{ + static inline MPI_Datatype getType(){return MPI_LONG_DOUBLE;}; +}; + +template<> struct TypeResolver< bool > +{ + // sizeof(bool) is implementation-defined: https://stackoverflow.com/a/4897859 + static_assert( sizeof(bool) == 1, "The systems where sizeof(bool) != 1 are not supported by MPI." ); + static inline MPI_Datatype getType() { return MPI_C_BOOL; }; +}; + +template< typename T > +MPI_Datatype getDataType( const T& = T{} ) +{ + return TypeResolver< T >::getType(); +} +#endif + +} // namespace MPI +} // namespace TNL diff --git a/src/TNL/MPI/selectGPU.h b/src/TNL/MPI/selectGPU.h new file mode 100644 index 0000000000000000000000000000000000000000..781a52809a0151f30b3c031acbb7aaadf51b766d --- /dev/null +++ b/src/TNL/MPI/selectGPU.h @@ -0,0 +1,37 @@ +/*************************************************************************** + MPI/Wrappers.h - description + ------------------- + begin : Apr 23, 2005 + copyright : (C) 2005 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include + +#include "Utils.h" + +namespace TNL { +namespace MPI { + +inline void selectGPU() +{ +#ifdef HAVE_MPI +#ifdef HAVE_CUDA + int gpuCount = 0; + cudaGetDeviceCount(&gpuCount); + + const int local_rank = getRankOnNode(); + /* No usable device (or the query failed): skip the selection instead of dividing by zero. + The check is placed after getRankOnNode(), which splits the communicator and therefore + has to be called by all ranks. */ + if( gpuCount < 1 ) + return; + const int gpuNumber = local_rank % gpuCount; + + cudaSetDevice(gpuNumber); + TNL_CHECK_CUDA_DEVICE; +#endif +#endif +} + +} // namespace MPI +} // namespace TNL diff --git a/src/TNL/Matrices/DistributedMatrix.h b/src/TNL/Matrices/DistributedMatrix.h index faa220da69975ed3a3a963fcf48186fb6a98740b..61e4eabb6dd1a3629e0f95ac67332386a4f94760 100644 --- a/src/TNL/Matrices/DistributedMatrix.h +++ b/src/TNL/Matrices/DistributedMatrix.h @@ -14,7 +14,6 @@ #include -#include #include #include #include @@ -23,65 +22,42 @@ namespace TNL { namespace Matrices { -template< typename T, typename R = void > -struct enable_if_type -{ - using type = R; -}; - -template< typename T, typename Enable = void > -struct has_communicator : std::false_type {}; - -template< typename T > -struct has_communicator< T, typename enable_if_type< typename T::CommunicatorType >::type > -: std::true_type -{}; - - // TODO: 2D distribution for dense matrices (maybe it should be in different template, // because e.g. 
setRowFast doesn't make sense for dense matrices) -template< typename Matrix, - typename Communicator = Communicators::MpiCommunicator > +template< typename Matrix > class DistributedMatrix { - using CommunicationGroup = typename Communicator::CommunicationGroup; public: using MatrixType = Matrix; using RealType = typename Matrix::RealType; using DeviceType = typename Matrix::DeviceType; using IndexType = typename Matrix::IndexType; - using CommunicatorType = Communicator; using LocalRangeType = Containers::Subrange< typename Matrix::IndexType >; - using CompressedRowLengthsVector = Containers::DistributedVector< IndexType, DeviceType, IndexType, CommunicatorType >; + using CompressedRowLengthsVector = Containers::DistributedVector< IndexType, DeviceType, IndexType >; using MatrixRow = typename Matrix::RowView; using ConstMatrixRow = typename Matrix::ConstRowView; template< typename _Real = RealType, typename _Device = DeviceType, - typename _Index = IndexType, - typename _Communicator = Communicator > - using Self = DistributedMatrix< typename MatrixType::template Self< _Real, _Device, _Index >, _Communicator >; + typename _Index = IndexType > + using Self = DistributedMatrix< typename MatrixType::template Self< _Real, _Device, _Index > >; DistributedMatrix() = default; DistributedMatrix( DistributedMatrix& ) = default; - DistributedMatrix( LocalRangeType localRowRange, IndexType rows, IndexType columns, CommunicationGroup group = Communicator::AllGroup ); + DistributedMatrix( LocalRangeType localRowRange, IndexType rows, IndexType columns, MPI_Comm group = MPI::AllGroup() ); - void setDistribution( LocalRangeType localRowRange, IndexType rows, IndexType columns, CommunicationGroup group = Communicator::AllGroup ); + void setDistribution( LocalRangeType localRowRange, IndexType rows, IndexType columns, MPI_Comm group = MPI::AllGroup() ); - __cuda_callable__ const LocalRangeType& getLocalRowRange() const; - __cuda_callable__ - CommunicationGroup 
getCommunicationGroup() const; + MPI_Comm getCommunicationGroup() const; - __cuda_callable__ const Matrix& getLocalMatrix() const; - __cuda_callable__ Matrix& getLocalMatrix(); @@ -99,10 +75,8 @@ public: void reset(); - __cuda_callable__ IndexType getRows() const; - __cuda_callable__ IndexType getColumns() const; template< typename RowCapacitiesVector > @@ -120,20 +94,17 @@ public: RealType getElement( IndexType row, IndexType column ) const; - __cuda_callable__ RealType getElementFast( IndexType row, IndexType column ) const; - __cuda_callable__ MatrixRow getRow( IndexType row ); - __cuda_callable__ ConstMatrixRow getRow( IndexType row ) const; // multiplication with a global vector template< typename InVector, typename OutVector > - typename std::enable_if< ! has_communicator< InVector >::value >::type + typename std::enable_if< ! HasGetCommunicationGroupMethod< InVector >::value >::type vectorProduct( const InVector& inVector, OutVector& outVector ) const; @@ -144,7 +115,7 @@ public: // (not const because it modifies internal bufers) template< typename InVector, typename OutVector > - typename std::enable_if< has_communicator< InVector >::value >::type + typename std::enable_if< HasGetCommunicationGroupMethod< InVector >::value >::type vectorProduct( const InVector& inVector, OutVector& outVector ) const; @@ -158,10 +129,10 @@ public: protected: LocalRangeType localRowRange; IndexType rows = 0; // global rows count - CommunicationGroup group = Communicator::NullGroup; + MPI_Comm group = MPI::NullGroup(); Matrix localMatrix; - DistributedSpMV< Matrix, Communicator > spmv; + DistributedSpMV< Matrix > spmv; }; } // namespace Matrices diff --git a/src/TNL/Matrices/DistributedMatrix_impl.h b/src/TNL/Matrices/DistributedMatrix_impl.h index 806703ca6a28ea647d1760010b3a4febe9a0e439..8bc5d09820d0d7961bf91710f97b7eb4247dce1f 100644 --- a/src/TNL/Matrices/DistributedMatrix_impl.h +++ b/src/TNL/Matrices/DistributedMatrix_impl.h @@ -17,64 +17,54 @@ namespace TNL { namespace 
Matrices { -template< typename Matrix, - typename Communicator > -DistributedMatrix< Matrix, Communicator >:: -DistributedMatrix( LocalRangeType localRowRange, IndexType rows, IndexType columns, CommunicationGroup group ) +template< typename Matrix > +DistributedMatrix< Matrix >:: +DistributedMatrix( LocalRangeType localRowRange, IndexType rows, IndexType columns, MPI_Comm group ) { setDistribution( localRowRange, rows, columns, group ); } -template< typename Matrix, - typename Communicator > +template< typename Matrix > void -DistributedMatrix< Matrix, Communicator >:: -setDistribution( LocalRangeType localRowRange, IndexType rows, IndexType columns, CommunicationGroup group ) +DistributedMatrix< Matrix >:: +setDistribution( LocalRangeType localRowRange, IndexType rows, IndexType columns, MPI_Comm group ) { this->localRowRange = localRowRange; this->rows = rows; this->group = group; - if( group != Communicator::NullGroup ) + if( group != MPI::NullGroup() ) localMatrix.setDimensions( localRowRange.getSize(), columns ); spmv.reset(); } -template< typename Matrix, - typename Communicator > -__cuda_callable__ +template< typename Matrix > const Containers::Subrange< typename Matrix::IndexType >& -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: getLocalRowRange() const { return localRowRange; } -template< typename Matrix, - typename Communicator > -__cuda_callable__ -typename Communicator::CommunicationGroup -DistributedMatrix< Matrix, Communicator >:: +template< typename Matrix > +MPI_Comm +DistributedMatrix< Matrix >:: getCommunicationGroup() const { return group; } -template< typename Matrix, - typename Communicator > -__cuda_callable__ +template< typename Matrix > const Matrix& -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: getLocalMatrix() const { return localMatrix; } -template< typename Matrix, - typename Communicator > -__cuda_callable__ +template< typename Matrix > Matrix& -DistributedMatrix< Matrix, 
Communicator >:: +DistributedMatrix< Matrix >:: getLocalMatrix() { return localMatrix; @@ -85,10 +75,9 @@ getLocalMatrix() * Some common Matrix methods follow below. */ -template< typename Matrix, - typename Communicator > -DistributedMatrix< Matrix, Communicator >& -DistributedMatrix< Matrix, Communicator >:: +template< typename Matrix > +DistributedMatrix< Matrix >& +DistributedMatrix< Matrix >:: operator=( const DistributedMatrix& matrix ) { setLike( matrix ); @@ -96,11 +85,10 @@ operator=( const DistributedMatrix& matrix ) return *this; } -template< typename Matrix, - typename Communicator > +template< typename Matrix > template< typename MatrixT > -DistributedMatrix< Matrix, Communicator >& -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >& +DistributedMatrix< Matrix >:: operator=( const MatrixT& matrix ) { setLike( matrix ); @@ -108,11 +96,10 @@ operator=( const MatrixT& matrix ) return *this; } -template< typename Matrix, - typename Communicator > +template< typename Matrix > template< typename MatrixT > void -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: setLike( const MatrixT& matrix ) { localRowRange = matrix.getLocalRowRange(); @@ -123,86 +110,77 @@ setLike( const MatrixT& matrix ) spmv.reset(); } -template< typename Matrix, - typename Communicator > +template< typename Matrix > void -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: reset() { localRowRange.reset(); rows = 0; - group = Communicator::NullGroup; + group = MPI::NullGroup(); localMatrix.reset(); spmv.reset(); } -template< typename Matrix, - typename Communicator > -__cuda_callable__ +template< typename Matrix > typename Matrix::IndexType -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: getRows() const { return rows; } -template< typename Matrix, - typename Communicator > -__cuda_callable__ +template< typename Matrix > typename Matrix::IndexType -DistributedMatrix< Matrix, Communicator >:: 
+DistributedMatrix< Matrix >:: getColumns() const { return localMatrix.getColumns(); } -template< typename Matrix, - typename Communicator > +template< typename Matrix > template< typename RowCapacitiesVector > void -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: setRowCapacities( const RowCapacitiesVector& rowCapacities ) { TNL_ASSERT_EQ( rowCapacities.getSize(), getRows(), "row lengths vector has wrong size" ); TNL_ASSERT_EQ( rowCapacities.getLocalRange(), getLocalRowRange(), "row lengths vector has wrong distribution" ); TNL_ASSERT_EQ( rowCapacities.getCommunicationGroup(), getCommunicationGroup(), "row lengths vector has wrong communication group" ); - if( getCommunicationGroup() != CommunicatorType::NullGroup ) { + if( getCommunicationGroup() != MPI::NullGroup() ) { localMatrix.setRowCapacities( rowCapacities.getConstLocalView() ); spmv.reset(); } } -template< typename Matrix, - typename Communicator > +template< typename Matrix > template< typename Vector > void -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: getCompressedRowLengths( Vector& rowLengths ) const { - if( getCommunicationGroup() != CommunicatorType::NullGroup ) { - rowLengths.setDistribution( getLocalRowRange(), getRows(), getCommunicationGroup() ); + if( getCommunicationGroup() != MPI::NullGroup() ) { + rowLengths.setDistribution( getLocalRowRange(), 0, getRows(), getCommunicationGroup() ); auto localRowLengths = rowLengths.getLocalView(); localMatrix.getCompressedRowLengths( localRowLengths ); } } -template< typename Matrix, - typename Communicator > +template< typename Matrix > typename Matrix::IndexType -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: getRowCapacity( IndexType row ) const { const IndexType localRow = localRowRange.getLocalIndex( row ); return localMatrix.getRowCapacity( localRow ); } -template< typename Matrix, - typename Communicator > +template< typename Matrix > void -DistributedMatrix< 
Matrix, Communicator >:: +DistributedMatrix< Matrix >:: setElement( IndexType row, IndexType column, RealType value ) @@ -211,10 +189,9 @@ setElement( IndexType row, localMatrix.setElement( localRow, column, value ); } -template< typename Matrix, - typename Communicator > +template< typename Matrix > typename Matrix::RealType -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: getElement( IndexType row, IndexType column ) const { @@ -222,11 +199,9 @@ getElement( IndexType row, return localMatrix.getElement( localRow, column ); } -template< typename Matrix, - typename Communicator > -__cuda_callable__ +template< typename Matrix > typename Matrix::RealType -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: getElementFast( IndexType row, IndexType column ) const { @@ -234,34 +209,29 @@ getElementFast( IndexType row, return localMatrix.getElementFast( localRow, column ); } -template< typename Matrix, - typename Communicator > -__cuda_callable__ -typename DistributedMatrix< Matrix, Communicator >::MatrixRow -DistributedMatrix< Matrix, Communicator >:: +template< typename Matrix > +typename DistributedMatrix< Matrix >::MatrixRow +DistributedMatrix< Matrix >:: getRow( IndexType row ) { const IndexType localRow = localRowRange.getLocalIndex( row ); return localMatrix.getRow( localRow ); } -template< typename Matrix, - typename Communicator > -__cuda_callable__ -typename DistributedMatrix< Matrix, Communicator >::ConstMatrixRow -DistributedMatrix< Matrix, Communicator >:: +template< typename Matrix > +typename DistributedMatrix< Matrix >::ConstMatrixRow +DistributedMatrix< Matrix >:: getRow( IndexType row ) const { const IndexType localRow = localRowRange.getLocalIndex( row ); return localMatrix.getRow( localRow ); } -template< typename Matrix, - typename Communicator > +template< typename Matrix > template< typename InVector, typename OutVector > -typename std::enable_if< ! 
has_communicator< InVector >::value >::type -DistributedMatrix< Matrix, Communicator >:: +typename std::enable_if< ! HasGetCommunicationGroupMethod< InVector >::value >::type +DistributedMatrix< Matrix >:: vectorProduct( const InVector& inVector, OutVector& outVector ) const { @@ -274,44 +244,57 @@ vectorProduct( const InVector& inVector, localMatrix.vectorProduct( inVector, outView ); } -template< typename Matrix, - typename Communicator > +template< typename Matrix > void -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: updateVectorProductCommunicationPattern() { - if( getCommunicationGroup() == CommunicatorType::NullGroup ) + if( getCommunicationGroup() == MPI::NullGroup() ) return; spmv.updateCommunicationPattern( getLocalMatrix(), getCommunicationGroup() ); } -template< typename Matrix, - typename Communicator > +template< typename Matrix > template< typename InVector, typename OutVector > -typename std::enable_if< has_communicator< InVector >::value >::type -DistributedMatrix< Matrix, Communicator >:: +typename std::enable_if< HasGetCommunicationGroupMethod< InVector >::value >::type +DistributedMatrix< Matrix >:: vectorProduct( const InVector& inVector, OutVector& outVector ) const { - TNL_ASSERT_EQ( inVector.getSize(), getColumns(), "input vector has wrong size" ); TNL_ASSERT_EQ( inVector.getLocalRange(), getLocalRowRange(), "input vector has wrong distribution" ); TNL_ASSERT_EQ( inVector.getCommunicationGroup(), getCommunicationGroup(), "input vector has wrong communication group" ); TNL_ASSERT_EQ( outVector.getSize(), getRows(), "output vector has wrong size" ); TNL_ASSERT_EQ( outVector.getLocalRange(), getLocalRowRange(), "output vector has wrong distribution" ); TNL_ASSERT_EQ( outVector.getCommunicationGroup(), getCommunicationGroup(), "output vector has wrong communication group" ); - if( getCommunicationGroup() == CommunicatorType::NullGroup ) + if( getCommunicationGroup() == MPI::NullGroup() ) return; - const_cast< 
DistributedMatrix* >( this )->spmv.vectorProduct( outVector, localMatrix, localRowRange, inVector, getCommunicationGroup() ); + if( inVector.getGhosts() == 0 ) { + // NOTE: this branch is deprecated and kept only due to existing benchmarks + TNL_ASSERT_EQ( inVector.getSize(), getColumns(), "input vector has wrong size" ); + const_cast< DistributedMatrix* >( this )->spmv.vectorProduct( outVector, localMatrix, localRowRange, inVector, getCommunicationGroup() ); + } + else { + TNL_ASSERT_EQ( inVector.getConstLocalViewWithGhosts().getSize(), localMatrix.getColumns(), "the matrix uses non-local and non-ghost column indices" ); + TNL_ASSERT_EQ( inVector.getGhosts(), localMatrix.getColumns() - localMatrix.getRows(), "input vector has wrong ghosts size" ); + TNL_ASSERT_EQ( outVector.getGhosts(), localMatrix.getColumns() - localMatrix.getRows(), "output vector has wrong ghosts size" ); + TNL_ASSERT_EQ( outVector.getConstLocalView().getSize(), localMatrix.getRows(), "number of local matrix rows does not match the output vector local size" ); + + inVector.waitForSynchronization(); + const auto inView = inVector.getConstLocalViewWithGhosts(); + auto outView = outVector.getLocalView(); + localMatrix.vectorProduct( inView, outView ); + // TODO: synchronization is not always necessary, e.g. 
when a preconditioning step follows +// outVector.startSynchronization(); + } } -template< typename Matrix, - typename Communicator > +template< typename Matrix > template< typename Vector1, typename Vector2 > bool -DistributedMatrix< Matrix, Communicator >:: +DistributedMatrix< Matrix >:: performSORIteration( const Vector1& b, const IndexType row, Vector2& x, diff --git a/src/TNL/Matrices/DistributedSpMV.h b/src/TNL/Matrices/DistributedSpMV.h index 76aaa77fef49997429db3ea47076a01db0f48997..bea864eadcbb368ef2748fbc3bfef47ad47a9abc 100644 --- a/src/TNL/Matrices/DistributedSpMV.h +++ b/src/TNL/Matrices/DistributedSpMV.h @@ -33,7 +33,7 @@ namespace TNL { namespace Matrices { -template< typename Matrix, typename Communicator > +template< typename Matrix > class DistributedSpMV { public: @@ -41,8 +41,6 @@ public: using RealType = typename Matrix::RealType; using DeviceType = typename Matrix::DeviceType; using IndexType = typename Matrix::IndexType; - using CommunicatorType = Communicator; - using CommunicationGroup = typename CommunicatorType::CommunicationGroup; using LocalRangeType = Containers::Subrange< typename Matrix::IndexType >; // - communication pattern: vector components whose indices are in the range @@ -55,10 +53,10 @@ public: // - assembly of the i-th row involves traversal of the local matrix stored // in the i-th process // - assembly of the full matrix needs all-to-all communication - void updateCommunicationPattern( const MatrixType& localMatrix, const LocalRangeType& localRowRange, CommunicationGroup group ) + void updateCommunicationPattern( const MatrixType& localMatrix, const LocalRangeType& localRowRange, MPI_Comm group ) { - const int rank = CommunicatorType::GetRank( group ); - const int nproc = CommunicatorType::GetSize( group ); + const int rank = MPI::GetRank( group ); + const int nproc = MPI::GetSize( group ); commPatternStarts.setDimensions( nproc, nproc ); commPatternEnds.setDimensions( nproc, nproc ); @@ -67,9 +65,9 @@ public: { 
Containers::Array< IndexType, Devices::Host, int > sendbuf( nproc ); sendbuf.setValue( localRowRange.getBegin() ); - CommunicatorType::Alltoall( sendbuf.getData(), 1, - globalOffsets.getData(), 1, - group ); + MPI::Alltoall( sendbuf.getData(), 1, + globalOffsets.getData(), 1, + group ); } const auto globalOffsetsView = globalOffsets.getConstView(); auto getOwner = [=] __cuda_callable__ ( IndexType global_idx ) -> int @@ -150,12 +148,12 @@ public: } // assemble the commPattern* matrices - CommunicatorType::Alltoall( &preCommPatternStarts(0, 0), nproc, - &commPatternStarts(0, 0), nproc, - group ); - CommunicatorType::Alltoall( &preCommPatternEnds(0, 0), nproc, - &commPatternEnds(0, 0), nproc, - group ); + MPI::Alltoall( &preCommPatternStarts(0, 0), nproc, + &commPatternStarts(0, 0), nproc, + group ); + MPI::Alltoall( &preCommPatternEnds(0, 0), nproc, + &commPatternEnds(0, 0), nproc, + group ); } template< typename InVector, @@ -164,10 +162,10 @@ public: const MatrixType& localMatrix, const LocalRangeType& localRowRange, const InVector& inVector, - CommunicationGroup group ) + MPI_Comm group ) { - const int rank = CommunicatorType::GetRank( group ); - const int nproc = CommunicatorType::GetSize( group ); + const int rank = MPI::GetRank( group ); + const int nproc = MPI::GetSize( group ); // handle trivial case if( nproc == 1 ) { @@ -190,14 +188,14 @@ public: TNL_ASSERT_EQ( globalBuffer.getSize(), localMatrix.getColumns(), "the global buffer size does not match the number of matrix columns" ); // buffer for asynchronous communication requests - std::vector< typename CommunicatorType::Request > commRequests; + std::vector< MPI_Request > commRequests; // send our data to all processes that need it for( int i = 0; i < commPatternStarts.getRows(); i++ ) { if( i == rank ) continue; if( commPatternStarts( i, rank ) < commPatternEnds( i, rank ) ) - commRequests.push_back( CommunicatorType::ISend( + commRequests.push_back( MPI::Isend( inVector.getConstLocalView().getData() + 
commPatternStarts( i, rank ) - localRowRange.getBegin(), commPatternEnds( i, rank ) - commPatternStarts( i, rank ), i, 0, group ) ); @@ -208,7 +206,7 @@ public: if( j == rank ) continue; if( commPatternStarts( rank, j ) < commPatternEnds( rank, j ) ) - commRequests.push_back( CommunicatorType::IRecv( + commRequests.push_back( MPI::Irecv( globalBuffer.getPointer( commPatternStarts( rank, j ) ), commPatternEnds( rank, j ) - commPatternStarts( rank, j ), j, 0, group ) ); @@ -217,7 +215,7 @@ public: // general variant if( localOnlySpan.first >= localOnlySpan.second ) { // wait for all communications to finish - CommunicatorType::WaitAll( commRequests.data(), commRequests.size() ); + MPI::Waitall( commRequests.data(), commRequests.size() ); // perform matrix-vector multiplication auto outVectorView = outVector.getLocalView(); @@ -231,7 +229,7 @@ public: localMatrix.vectorProduct( inVector, outVectorView, 1.0, 0.0, localOnlySpan.first, localOnlySpan.second ); // wait for all communications to finish - CommunicatorType::WaitAll( commRequests.data(), commRequests.size() ); + MPI::Waitall( commRequests.data(), commRequests.size() ); // finish the multiplication by adding the non-local entries localMatrix.vectorProduct( globalBuffer, outVectorView, 1.0, 0.0, 0, localOnlySpan.first ); diff --git a/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h b/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h index 6030b976f038ab290ada814575db1bfb444ce694..04647cb4af883d07bf00e6ef177a8205c5be559c 100644 --- a/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h +++ b/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h @@ -12,7 +12,6 @@ #include #include -#include namespace TNL { namespace Meshes { diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGrid.h b/src/TNL/Meshes/DistributedMeshes/DistributedGrid.h index 6e346668ccaddcc0b124afc733d1a602d3dfadfa..4082024e378d844f9155bbabbedaccfd6f468599 100644 --- a/src/TNL/Meshes/DistributedMeshes/DistributedGrid.h +++ 
b/src/TNL/Meshes/DistributedMeshes/DistributedGrid.h @@ -11,8 +11,6 @@ #pragma once -#include - #include #include #include @@ -20,7 +18,7 @@ namespace TNL { -namespace Meshes { +namespace Meshes { namespace DistributedMeshes { @@ -28,7 +26,7 @@ namespace DistributedMeshes { template< int Dimension, typename Real, typename Device, - typename Index > + typename Index > class DistributedMesh< Grid< Dimension, Real, Device, Index > > { public: @@ -41,44 +39,43 @@ class DistributedMesh< Grid< Dimension, Real, Device, Index > > typedef Containers::StaticVector< Dimension, IndexType > CoordinatesType; typedef Containers::StaticVector< Dimension, IndexType > SubdomainOverlapsType; - static constexpr int getMeshDimension() { return Dimension; }; + static constexpr int getMeshDimension() { return Dimension; }; - static constexpr int getNeighborsCount() { return DirectionCount::get(); } //c++14 may use Directions::pow3(Dimension)-1 + static constexpr int getNeighborsCount() { return DirectionCount::get(); } //c++14 may use Directions::pow3(Dimension)-1 DistributedMesh(); ~DistributedMesh(); - + static void configSetup( Config::ConfigDescription& config ); - + bool setup( const Config::ParameterContainer& parameters, - const String& prefix ); - + const String& prefix ); + void setDomainDecomposition( const CoordinatesType& domainDecomposition ); - + const CoordinatesType& getDomainDecomposition() const; - - template< typename CommunicatorType > + void setGlobalGrid( const GridType& globalGrid ); - + const GridType& getGlobalGrid() const; - + void setOverlaps( const SubdomainOverlapsType& lower, const SubdomainOverlapsType& upper); - + void setupGrid( GridType& grid); bool isDistributed() const; - + bool isBoundarySubdomain() const; - + // TODO: replace it with getLowerOverlap() and getUpperOverlap() // It is still being used in cuts set-up const CoordinatesType& getOverlap() const { return this->overlap;}; - + //currently used overlaps at this subdomain const 
SubdomainOverlapsType& getLowerOverlap() const; - + const SubdomainOverlapsType& getUpperOverlap() const; //number of elements of local sub domain WITHOUT overlap @@ -95,7 +92,7 @@ class DistributedMesh< Grid< Dimension, Real, Device, Index > > //number of elements of local sub domain WITH overlap // TODO: replace with localGrid const CoordinatesType& getLocalGridSize() const; - + //coordinates of begin of local subdomain without overlaps in local grid const CoordinatesType& getLocalBegin() const; @@ -104,40 +101,40 @@ class DistributedMesh< Grid< Dimension, Real, Device, Index > > const PointType& getLocalOrigin() const; const PointType& getSpaceSteps() const; - //aka MPI-communcicator - void setCommunicationGroup(void * group); - void * getCommunicationGroup() const; + //aka MPI-communcicator + void setCommunicationGroup(MPI_Comm group); + MPI_Comm getCommunicationGroup() const; template< int EntityDimension > IndexType getEntitiesCount() const; template< typename Entity > - IndexType getEntitiesCount() const; + IndexType getEntitiesCount() const; const int* getNeighbors() const; - - const int* getPeriodicNeighbors() const; - template - bool SetupByCut(DistributedGridType &inputDistributedGrid, - Containers::StaticVector savedDimensions, - Containers::StaticVector reducedDimensions, + const int* getPeriodicNeighbors() const; + + template + bool SetupByCut(DistributedGridType &inputDistributedGrid, + Containers::StaticVector savedDimensions, + Containers::StaticVector reducedDimensions, Containers::StaticVector fixedIndexs); int getRankOfProcCoord(const CoordinatesType &nodeCoordinates) const; - + String printProcessCoords() const; String printProcessDistr() const; - + void writeProlog( Logger& logger ); - public: - + public: + bool isThereNeighbor(const CoordinatesType &direction) const; void setupNeighbors(); - + void print( std::ostream& str ) const; GridType globalGrid; @@ -149,26 +146,26 @@ class DistributedMesh< Grid< Dimension, Real, Device, Index > > 
//CoordinatesType globalDimensions; CoordinatesType globalBegin; PointType spaceSteps; - + SubdomainOverlapsType lowerOverlap, upperOverlap, globalLowerOverlap, globalUpperOverlap; CoordinatesType domainDecomposition; - CoordinatesType subdomainCoordinates; + CoordinatesType subdomainCoordinates; // TODO: static arrays int neighbors[ getNeighborsCount() ]; int periodicNeighbors[ getNeighborsCount() ]; - IndexType Dimensions; + IndexType Dimensions; bool distributed; - + int rank; int nproc; bool isSet; - //aka MPI-communicator - void * communicationGroup; + //aka MPI-communicator + MPI_Comm group; }; diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp b/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp index a35b539629544c88e4946fcadff68cc19b6b4bc4..c48fec9af40e64d4f8dc12ff51c323e47360f09f 100644 --- a/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp +++ b/src/TNL/Meshes/DistributedMeshes/DistributedGrid.hpp @@ -11,9 +11,9 @@ #pragma once #include -#include #include "DistributedGrid.h" +#include namespace TNL { namespace Meshes { @@ -28,8 +28,6 @@ template DistributedMesh< Grid< Dimension, Real, Device, Index > >:: ~DistributedMesh() { - if(isSet && this->communicationGroup!=nullptr) - std::free(this->communicationGroup); } @@ -57,7 +55,7 @@ setup( const Config::ParameterContainer& parameters, return true; } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > void DistributedMesh< Grid< Dimension, Real, Device, Index > >:: setDomainDecomposition( const CoordinatesType& domainDecomposition ) @@ -65,7 +63,7 @@ setDomainDecomposition( const CoordinatesType& domainDecomposition ) this->domainDecomposition = domainDecomposition; } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const typename DistributedMesh< Grid< Dimension, Real, Device, Index > 
>::CoordinatesType& DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getDomainDecomposition() const @@ -73,18 +71,12 @@ getDomainDecomposition() const return this->domainDecomposition; } -template< int Dimension, typename Real, typename Device, typename Index > -template< typename CommunicatorType > +template< int Dimension, typename Real, typename Device, typename Index > void DistributedMesh< Grid< Dimension, Real, Device, Index > >:: setGlobalGrid( const GridType &globalGrid ) { - if(this->isSet && this->communicationGroup != nullptr) - std::free(this->communicationGroup); - this->communicationGroup= std::malloc(sizeof(typename CommunicatorType::CommunicationGroup)); - - *((typename CommunicatorType::CommunicationGroup *)this->communicationGroup) = CommunicatorType::AllGroup; - auto group=*((typename CommunicatorType::CommunicationGroup *)this->communicationGroup); + this->group = MPI::AllGroup(); this->globalGrid = globalGrid; this->isSet=true; @@ -99,15 +91,12 @@ setGlobalGrid( const GridType &globalGrid ) this->spaceSteps=globalGrid.getSpaceSteps(); this->distributed=false; - if( CommunicatorType::IsInitialized() ) + this->rank=MPI::GetRank(group); + this->nproc=MPI::GetSize(group); + //use MPI only if have more than one process + if(this->nproc>1) { - this->rank=CommunicatorType::GetRank(group); - this->nproc=CommunicatorType::GetSize(group); - //use MPI only if have more than one process - if(this->nproc>1) - { - this->distributed=true; - } + this->distributed=true; } if( !this->distributed ) @@ -127,10 +116,8 @@ setGlobalGrid( const GridType &globalGrid ) //compute node distribution int dims[ Dimension ]; for( int i = 0; i < Dimension; i++ ) - dims[ i ]= this->domainDecomposition[ i ]; - - - CommunicatorType::DimsCreate( this->nproc, Dimension, dims ); + dims[ i ] = this->domainDecomposition[ i ]; + MPI::Compute_dims( this->nproc, Dimension, dims ); for( int i = 0; i < Dimension; i++ ) this->domainDecomposition[ i ] = dims[ i ]; @@ -146,16 
+133,16 @@ setGlobalGrid( const GridType &globalGrid ) for( int i = 0; i < Dimension; i++ ) { numberOfLarger[ i ] = globalGrid.getDimensions()[ i ] % this->domainDecomposition[ i ]; - + this->localSize[ i ] = globalGrid.getDimensions()[ i ] / this->domainDecomposition[ i ]; - + if( numberOfLarger[ i ] > this->subdomainCoordinates[ i ] ) this->localSize[ i ] += 1; - + if( numberOfLarger[ i ] > this->subdomainCoordinates[ i ] ) this->globalBegin[ i ] = this->subdomainCoordinates[ i ] * this->localSize[ i ]; else - this->globalBegin[ i ] = numberOfLarger[ i ] * ( this->localSize[ i ] + 1 ) + + this->globalBegin[ i ] = numberOfLarger[ i ] * ( this->localSize[ i ] + 1 ) + ( this->subdomainCoordinates[ i ] - numberOfLarger[ i ] ) * this->localSize[ i ]; } @@ -164,7 +151,7 @@ setGlobalGrid( const GridType &globalGrid ) } } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > void DistributedMesh< Grid< Dimension, Real, Device, Index > >:: setOverlaps( const SubdomainOverlapsType& lower, @@ -191,7 +178,7 @@ setupGrid( GridType& grid) grid.setDistMesh(this); }; -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType& DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getSubdomainCoordinates() const @@ -199,7 +186,7 @@ getSubdomainCoordinates() const return this->subdomainCoordinates; } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::PointType& DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getLocalOrigin() const @@ -207,15 +194,15 @@ getLocalOrigin() const return this->localOrigin; } -template< int 
Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::PointType& DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getSpaceSteps() const { return this->spaceSteps; } - -template< int Dimension, typename Real, typename Device, typename Index > + +template< int Dimension, typename Real, typename Device, typename Index > bool DistributedMesh< Grid< Dimension, Real, Device, Index > >:: isDistributed() const @@ -223,7 +210,7 @@ isDistributed() const return this->distributed; }; -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > bool DistributedMesh< Grid< Dimension, Real, Device, Index > >:: isBoundarySubdomain() const @@ -234,7 +221,7 @@ isBoundarySubdomain() const return false; } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType& DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getLowerOverlap() const @@ -242,7 +229,7 @@ getLowerOverlap() const return this->lowerOverlap; }; -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType& DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getUpperOverlap() const @@ -250,7 +237,7 @@ getUpperOverlap() const return this->upperOverlap; }; -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType& 
DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getLocalSize() const @@ -258,7 +245,7 @@ getLocalSize() const return this->localSize; } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType& DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getGlobalSize() const @@ -266,7 +253,7 @@ getGlobalSize() const return this->globalGrid.getDimensions(); } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::GridType& DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getGlobalGrid() const @@ -274,7 +261,7 @@ getGlobalGrid() const return this->globalGrid; } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType& DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getGlobalBegin() const @@ -282,7 +269,7 @@ getGlobalBegin() const return this->globalBegin; } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType& DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getLocalGridSize() const @@ -290,7 +277,7 @@ getLocalGridSize() const return this->localGridSize; } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const typename DistributedMesh< Grid< Dimension, Real, Device, Index > >::CoordinatesType& DistributedMesh< 
Grid< Dimension, Real, Device, Index > >:: getLocalBegin() const @@ -298,7 +285,7 @@ getLocalBegin() const return this->localBegin; } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > template< int EntityDimension > Index DistributedMesh< Grid< Dimension, Real, Device, Index > >:: @@ -307,7 +294,7 @@ getEntitiesCount() const return this->globalGrid. template getEntitiesCount< EntityDimension >(); } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > template< typename Entity > Index DistributedMesh< Grid< Dimension, Real, Device, Index > >:: @@ -316,23 +303,23 @@ getEntitiesCount() const return this->globalGrid. template getEntitiesCount< Entity >(); } -template< int Dimension, typename Real, typename Device, typename Index > -void +template< int Dimension, typename Real, typename Device, typename Index > +void DistributedMesh< Grid< Dimension, Real, Device, Index > >:: -setCommunicationGroup(void * group) +setCommunicationGroup(MPI_Comm group) { - this->communicationGroup=group; + this->group=group; } -template< int Dimension, typename Real, typename Device, typename Index > -void * +template< int Dimension, typename Real, typename Device, typename Index > +MPI_Comm DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getCommunicationGroup() const { - return this->communicationGroup; + return this->group; } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > int DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getRankOfProcCoord(const CoordinatesType &nodeCoordinates) const @@ -347,7 +334,7 @@ getRankOfProcCoord(const CoordinatesType &nodeCoordinates) const return ret; } -template< int Dimension, typename Real, typename Device, typename Index > +template< 
int Dimension, typename Real, typename Device, typename Index > bool DistributedMesh< Grid< Dimension, Real, Device, Index > >:: isThereNeighbor(const CoordinatesType &direction) const @@ -365,7 +352,7 @@ isThereNeighbor(const CoordinatesType &direction) const } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > void DistributedMesh< Grid< Dimension, Real, Device, Index > >:: setupNeighbors() @@ -378,7 +365,7 @@ setupNeighbors() this->neighbors[ i ] = this->getRankOfProcCoord( coordinates ); else this->neighbors[ i ] = -1; - + // Handling periodic neighbors for( int d = 0; d < Dimension; d++ ) { @@ -388,12 +375,12 @@ setupNeighbors() coordinates[ d ] = 0; this->periodicNeighbors[ i ] = this->getRankOfProcCoord( coordinates ); } - + //std::cout << "Setting i-th neighbour to " << neighbors[ i ] << " and " << periodicNeighbors[ i ] << std::endl; } } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const int* DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getNeighbors() const @@ -402,7 +389,7 @@ getNeighbors() const return this->neighbors; } -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > const int* DistributedMesh< Grid< Dimension, Real, Device, Index > >:: getPeriodicNeighbors() const @@ -412,12 +399,12 @@ getPeriodicNeighbors() const } template< int Dimension, typename Real, typename Device, typename Index > - template -bool + template +bool DistributedMesh< Grid< Dimension, Real, Device, Index > >:: -SetupByCut(DistributedGridType &inputDistributedGrid, - Containers::StaticVector savedDimensions, - Containers::StaticVector reducedDimensions, +SetupByCut(DistributedGridType &inputDistributedGrid, + Containers::StaticVector savedDimensions, + 
Containers::StaticVector reducedDimensions, Containers::StaticVector fixedIndexs) { @@ -432,21 +419,17 @@ SetupByCut(DistributedGridType &inputDistributedGrid, } //create new group with used nodes - typename CommunicatorType::CommunicationGroup *oldGroup=(typename CommunicatorType::CommunicationGroup *)(inputDistributedGrid.getCommunicationGroup()); - if(this->isSet && this->communicationGroup != nullptr) - free(this->communicationGroup); - this->communicationGroup = std::malloc(sizeof(typename CommunicatorType::CommunicationGroup)); - + const MPI_Comm oldGroup=inputDistributedGrid.getCommunicationGroup(); if(isInCut) { this->isSet=true; - + auto fromGlobalMesh=inputDistributedGrid.getGlobalGrid(); //set global grid typename GridType::PointType outOrigin; typename GridType::PointType outProportions; typename GridType::CoordinatesType outDimensions; - + for(int i=0; ispaceSteps[i]=inputDistributedGrid.getSpaceSteps()[savedDimensions[i]]; } - int newRank= getRankOfProcCoord(this->subdomainCoordinates); - - CommunicatorType::CreateNewGroup(isInCut,newRank,*oldGroup ,*((typename CommunicatorType::CommunicationGroup*) this->communicationGroup)); + int newRank = getRankOfProcCoord(this->subdomainCoordinates); + this->group = MPI::Comm_split( oldGroup, 1, newRank ); setupNeighbors(); - + bool isDistributed=false; for(int i=0;idistributed=isDistributed; - + this->globalGrid.setDimensions(outDimensions); this->globalGrid.setDomain(outOrigin,outProportions); @@ -491,7 +473,7 @@ SetupByCut(DistributedGridType &inputDistributedGrid, } else { - CommunicatorType::CreateNewGroup(isInCut,0,*oldGroup ,*((typename CommunicatorType::CommunicationGroup*) this->communicationGroup)); + this->group = MPI::Comm_split( oldGroup, MPI_UNDEFINED, 0 ); } return false; @@ -517,7 +499,7 @@ printProcessDistr() const for(int i=1; idomainDecomposition[i]); return res; -}; +}; template< int Dimension, typename Real, typename Device, typename Index > void @@ -525,19 +507,18 @@ DistributedMesh< Grid< 
Dimension, Real, Device, Index > >:: writeProlog( Logger& logger ) { logger.writeParameter( "Domain decomposition:", this->getDomainDecomposition() ); -} +} -template< int Dimension, typename Real, typename Device, typename Index > +template< int Dimension, typename Real, typename Device, typename Index > void DistributedMesh< Grid< Dimension, Real, Device, Index > >:: print( std::ostream& str ) const { - using Communicator = Communicators::MpiCommunicator; - for( int j = 0; j < Communicator::GetSize( Communicator::AllGroup ); j++ ) + for( int j = 0; j < MPI::GetSize(); j++ ) { - if( j == Communicator::GetRank( Communicator::AllGroup ) ) + if( j == MPI::GetRank() ) { - str << "Node : " << Communicator::GetRank( Communicator::AllGroup ) << std::endl + str << "Node : " << MPI::GetRank() << std::endl << " localOrigin : " << localOrigin << std::endl << " localBegin : " << localBegin << std::endl << " localSize : " << localSize << std::endl @@ -558,7 +539,7 @@ print( std::ostream& str ) const str << " " << periodicNeighbors[ i ]; str << std::endl; } - Communicator::Barrier( Communicator::AllGroup ); + MPI::Barrier(); } } diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO.h index 38a7c04f0b5e1d3a86fe7cb30740dba2a242908d..edb08baf7b6cd909988e50446a19a1a66df42e6a 100644 --- a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO.h +++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO.h @@ -13,7 +13,6 @@ #include #include -#include #include #include #include @@ -21,11 +20,11 @@ #include namespace TNL { -namespace Meshes { +namespace Meshes { namespace DistributedMeshes { enum DistrGridIOTypes { Dummy = 0 , LocalCopy = 1, MpiIO=2 }; - + template< typename MeshFunction, DistrGridIOTypes type = LocalCopy, typename Mesh = typename MeshFunction::MeshType, @@ -34,7 +33,7 @@ class DistributedGridIO { }; -template< typename MeshFunctionType > +template< typename MeshFunctionType > class DistributedGridIO< 
MeshFunctionType, Dummy > { bool save(const String& fileName, MeshFunctionType &meshFunction) diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h index 60605c6eb07514359e799026b73416eb404d9d95..698d7e41dc7ebbf5b2cf2481bb97c991620dcc96 100644 --- a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h +++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_MeshFunction.h @@ -12,6 +12,7 @@ #include #include +#include namespace TNL { namespace Meshes { @@ -19,7 +20,7 @@ namespace DistributedMeshes { /* - * This variant cerate copy of MeshFunction but smaller, reduced to local entities, without overlap. + * This variant cerate copy of MeshFunction but smaller, reduced to local entities, without overlap. * It is slow and has high RAM consumption */ template< typename MeshFunction, @@ -88,8 +89,8 @@ class DistributedGridIO< return true; }; - - static bool load(const String& fileName,MeshFunctionType &meshFunction) + + static bool load(const String& fileName,MeshFunctionType &meshFunction) { auto *distrGrid=meshFunction.getMesh().getDistributedMesh(); if(distrGrid==NULL) //not distributed @@ -99,10 +100,10 @@ class DistributedGridIO< } const MeshType& mesh=meshFunction.getMesh(); - + PointType spaceSteps=mesh.getSpaceSteps(); PointType origin=mesh.getOrigin(); - + CoordinatesType localSize=distrGrid->getLocalSize(); CoordinatesType localBegin=distrGrid->getLocalBegin(); @@ -111,33 +112,33 @@ class DistributedGridIO< newMesh->setSpaceSteps(spaceSteps); CoordinatesType newOrigin; newMesh->setOrigin(origin+spaceSteps*localBegin); - + VectorType newDof(newMesh-> template getEntitiesCount< typename MeshType::Cell >()); MeshFunctionType newMeshFunction; - newMeshFunction.bind(newMesh,newDof); + newMeshFunction.bind(newMesh,newDof); CoordinatesType zeroCoord; - zeroCoord.setValue(0); + zeroCoord.setValue(0); File file; file.open( 
fileName+String("-")+distrGrid->printProcessCoords()+String(".tnl"), std::ios_base::in ); newMeshFunction.boundLoad(file); file.close(); CopyEntitiesHelper::Copy(newMeshFunction,meshFunction,zeroCoord,localBegin,localSize); - + return true; }; - + }; /* - * Save distributed data into single file without overlaps using MPIIO and MPI datatypes, + * Save distributed data into single file without overlaps using MPIIO and MPI datatypes, * EXPLOSIVE: works with only Grids and MPI * BAD IMPLEMENTTION creating MPI-Types at every save! -- I dont want contamine more places by MPI.. */ #ifdef HAVE_MPI -template +template class DistributedGridIO_MPIIOBase { public: @@ -152,13 +153,13 @@ class DistributedGridIO_MPIIOBase static bool save(const String& fileName, MeshFunctionType &meshFunction, RealType *data) { auto *distrGrid=meshFunction.getMesh().getDistributedMesh(); - + if(distrGrid==NULL) //not distributed { meshFunction.save(fileName); } - MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup())); + MPI_Comm group=distrGrid->getCommunicationGroup(); MPI_File file; int ok=MPI_File_open( group, @@ -168,7 +169,7 @@ class DistributedGridIO_MPIIOBase &file); if( ok != 0 ) throw std::runtime_error("Open file falied"); - + int written=save(file,meshFunction, data,0); MPI_File_close(&file); @@ -176,21 +177,21 @@ class DistributedGridIO_MPIIOBase return written>0; }; - + static int save(MPI_File &file, MeshFunctionType &meshFunction, RealType *data, int offset) { auto *distrGrid=meshFunction.getMesh().getDistributedMesh(); - MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup())); + MPI_Comm group=distrGrid->getCommunicationGroup(); MPI_Datatype ftype; MPI_Datatype atype; int dataCount=CreateDataTypes(distrGrid,&ftype,&atype); int headerSize; - + MPI_File_set_view(file,0,MPI_BYTE,MPI_BYTE,"native",MPI_INFO_NULL); - if(Communicators::MpiCommunicator::GetRank(group)==0) + if(MPI::GetRank(group)==0) { MPI_File_seek(file,offset,MPI_SEEK_SET); 
headerSize=writeMeshFunctionHeader(file,meshFunction,dataCount); @@ -200,9 +201,9 @@ class DistributedGridIO_MPIIOBase offset +=headerSize; MPI_File_set_view(file,offset, - Communicators::MPITypeResolver::getType(), + TNL::MPI::getDataType(), ftype,"native",MPI_INFO_NULL); - + MPI_Status wstatus; MPI_File_write(file,data,1,atype,&wstatus); @@ -222,7 +223,7 @@ class DistributedGridIO_MPIIOBase int fstarts[dim]; int flsize[dim]; int fgsize[dim]; - + hackArray(dim,fstarts,distrGrid->getGlobalBegin().getData()); hackArray(dim,flsize,distrGrid->getLocalSize().getData()); hackArray(dim,fgsize,distrGrid->getGlobalSize().getData()); @@ -230,14 +231,14 @@ class DistributedGridIO_MPIIOBase MPI_Type_create_subarray(dim, fgsize,flsize,fstarts, MPI_ORDER_C, - Communicators::MPITypeResolver::getType(), + TNL::MPI::getDataType(), ftype); MPI_Type_commit(ftype); int agsize[dim]; int alsize[dim]; - int astarts[dim]; + int astarts[dim]; hackArray(dim,astarts,distrGrid->getLocalBegin().getData()); hackArray(dim,alsize,distrGrid->getLocalSize().getData()); @@ -246,7 +247,7 @@ class DistributedGridIO_MPIIOBase MPI_Type_create_subarray(dim, agsize,alsize,astarts, MPI_ORDER_C, - Communicators::MPITypeResolver::getType(), + TNL::MPI::getDataType(), atype); MPI_Type_commit(atype); @@ -333,7 +334,7 @@ class DistributedGridIO_MPIIOBase return true; } - MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup())); + MPI_Comm group=distrGrid->getCommunicationGroup(); MPI_File file; if( MPI_File_open( group, @@ -350,39 +351,39 @@ class DistributedGridIO_MPIIOBase MPI_File_close(&file); return ret; } - + /* Funky bomb - no checks - only dirty load */ - static int load(MPI_File &file,MeshFunctionType &meshFunction, RealType* data, int offset ) + static int load(MPI_File &file,MeshFunctionType &meshFunction, RealType* data, int offset ) { auto *distrGrid=meshFunction.getMesh().getDistributedMesh(); - MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup())); + MPI_Comm 
group=distrGrid->getCommunicationGroup(); MPI_Datatype ftype; MPI_Datatype atype; int dataCount=CreateDataTypes(distrGrid,&ftype,&atype); - + MPI_File_set_view(file,0,MPI_BYTE,MPI_BYTE,"native",MPI_INFO_NULL); int headerSize=0; - if(Communicators::MpiCommunicator::GetRank(group)==0) + if(MPI::GetRank(group)==0) { MPI_File_seek(file,offset,MPI_SEEK_SET); headerSize=readMeshFunctionHeader(file,meshFunction,dataCount); } MPI_Bcast(&headerSize, 1, MPI_INT,0, group); - + if(headerSize<0) return false; offset+=headerSize; MPI_File_set_view(file,offset, - Communicators::MPITypeResolver::getType(), + TNL::MPI::getDataType(), ftype,"native",MPI_INFO_NULL); MPI_Status wstatus; MPI_File_read(file,(void*)data,1,atype,&wstatus); - + MPI_Type_free(&atype); MPI_Type_free(&ftype); @@ -412,7 +413,7 @@ class DistributedGridIO_MPIIOBase size+=count*sizeof(char); MPI_File_read(file, (void *)&count,1, MPI_INT, &rstatus);//DATACOUNT size+=1*sizeof(int); - + if(count!=length) { std::cerr<<"Chyba načítání MeshFunction, délka dat v souboru neodpovídá očekávané délce" << std::endl; @@ -421,7 +422,7 @@ class DistributedGridIO_MPIIOBase return size; }; - + }; #endif @@ -442,25 +443,25 @@ class DistributedGridIO< static bool save(const String& fileName, MeshFunctionType &meshFunction) { #ifdef HAVE_MPI - if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed + if(MPI::isInitialized())//i.e. - isUsed { - using HostVectorType = Containers::Vector; + using HostVectorType = Containers::Vector; HostVectorType hostVector; hostVector=meshFunction.getData(); - typename MeshFunctionType::RealType * data=hostVector.getData(); + typename MeshFunctionType::RealType * data=hostVector.getData(); return DistributedGridIO_MPIIOBase::save(fileName,meshFunction,data); } #endif - std::cout << "MPIIO can be used only with MPICommunicator." << std::endl; + std::cout << "MPIIO can be used only when MPI is initialized." 
<< std::endl; return false; }; - static bool load(const String& fileName,MeshFunctionType &meshFunction) + static bool load(const String& fileName,MeshFunctionType &meshFunction) { #ifdef HAVE_MPI - if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed + if(MPI::isInitialized())//i.e. - isUsed { - using HostVectorType = Containers::Vector; + using HostVectorType = Containers::Vector; HostVectorType hostVector; hostVector.setLike(meshFunction.getData()); auto* data=hostVector.getData(); @@ -469,7 +470,7 @@ class DistributedGridIO< return true; } #endif - std::cout << "MPIIO can be used only with MPICommunicator." << std::endl; + std::cout << "MPIIO can be used only when MPI is initialized." << std::endl; return false; }; }; @@ -491,26 +492,26 @@ class DistributedGridIO< static bool save(const String& fileName, MeshFunctionType &meshFunction) { #ifdef HAVE_MPI - if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed + if(MPI::isInitialized())//i.e. - isUsed { typename MeshFunctionType::RealType* data=meshFunction.getData().getData(); return DistributedGridIO_MPIIOBase::save(fileName,meshFunction,data); } #endif - std::cout << "MPIIO can be used only with MPICommunicator." << std::endl; + std::cout << "MPIIO can be used only when MPI is initialized." << std::endl; return false; }; - static bool load(const String& fileName,MeshFunctionType &meshFunction) + static bool load(const String& fileName,MeshFunctionType &meshFunction) { #ifdef HAVE_MPI - if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed + if(MPI::isInitialized())//i.e. - isUsed { typename MeshFunctionType::RealType* data = meshFunction.getData().getData(); return DistributedGridIO_MPIIOBase::load(fileName,meshFunction,data); } #endif - std::cout << "MPIIO can be used only with MPICommunicator." << std::endl; + std::cout << "MPIIO can be used only when MPI is initialized." 
<< std::endl; return false; }; }; diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_VectorField.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_VectorField.h index 52217c336f8226854322e5cdd5ebcb29da108c47..8febf3c723b19bfffef001a70c6b3ed769a96420 100644 --- a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_VectorField.h +++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO_VectorField.h @@ -49,7 +49,7 @@ class DistributedGridIO_VectorField< static bool save(const String& fileName, Functions::VectorField< Size, MeshFunctionType > &vectorField) { #ifdef HAVE_MPI - if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed + if(MPI::isInitialized())//i.e. - isUsed { auto *distrGrid=vectorField.getMesh().getDistributedMesh(); if(distrGrid==NULL) @@ -58,9 +58,9 @@ class DistributedGridIO_VectorField< return true; } - MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup())); + MPI_Comm group=distrGrid->getCommunicationGroup(); - //write + //write MPI_File file; MPI_File_open( group, const_cast< char* >( fileName.getString() ), @@ -68,12 +68,12 @@ class DistributedGridIO_VectorField< MPI_INFO_NULL, &file); - - int offset=0; //global offset -> every mesh function creates it's own data types we need manage global offset - if(Communicators::MpiCommunicator::GetRank(group)==0) + + int offset=0; //global offset -> every mesh function creates it's own data types we need manage global offset + if(MPI::GetRank(group)==0) offset+=writeVectorFieldHeader(file,vectorField); MPI_Bcast(&offset, 1, MPI_INT,0, group); - + for( int i = 0; i < vectorField.getVectorDimension(); i++ ) { typename MeshFunctionType::RealType * data=vectorField[i]->getData().getData(); //here manage data transfer Device... @@ -83,13 +83,13 @@ class DistributedGridIO_VectorField< return false; } - MPI_File_close(&file); - return true; + MPI_File_close(&file); + return true; } #endif - std::cout << "MPIIO can be used only with MPICommunicator." 
<< std::endl; + std::cout << "MPIIO can be used only when MPI is initialized." << std::endl; return false; - + }; #ifdef HAVE_MPI @@ -140,7 +140,7 @@ class DistributedGridIO_VectorField< static bool load(const String& fileName, Functions::VectorField &vectorField) { #ifdef HAVE_MPI - if(Communicators::MpiCommunicator::IsInitialized())//i.e. - isUsed + if(MPI::isInitialized())//i.e. - isUsed { auto *distrGrid=vectorField.getMesh().getDistributedMesh(); if(distrGrid==NULL) @@ -149,9 +149,9 @@ class DistributedGridIO_VectorField< return true; } - MPI_Comm group=*((MPI_Comm*)(distrGrid->getCommunicationGroup())); + MPI_Comm group=distrGrid->getCommunicationGroup(); - //write + //write MPI_File file; MPI_File_open( group, const_cast< char* >( fileName.getString() ), @@ -159,12 +159,12 @@ class DistributedGridIO_VectorField< MPI_INFO_NULL, &file); - - int offset=0; //global offset -> every meshfunctoion creates it's own datatypes we need manage global offset - if(Communicators::MpiCommunicator::GetRank(group)==0) + + int offset=0; //global offset -> every meshfunctoion creates it's own datatypes we need manage global offset + if(MPI::GetRank(group)==0) offset+=readVectorFieldHeader(file,vectorField); MPI_Bcast(&offset, 1, MPI_INT,0, group); - + for( int i = 0; i < vectorField.getVectorDimension(); i++ ) { typename MeshFunctionType::RealType * data=vectorField[i]->getData().getData(); //here manage data transfer Device... @@ -174,13 +174,13 @@ class DistributedGridIO_VectorField< return false; } - MPI_File_close(&file); - return true; + MPI_File_close(&file); + return true; } #endif - std::cout << "MPIIO can be used only with MPICommunicator." << std::endl; + std::cout << "MPIIO can be used only when MPI is initialized." 
<< std::endl; return false; - + }; }; diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h index 5a11502403ca54be090c350b802e80d102926392..ed68150a041dc4ed209ac3a15ea226b96c801c6e 100644 --- a/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h +++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h @@ -16,7 +16,6 @@ #include #include #include -#include #include namespace TNL { @@ -112,8 +111,7 @@ class DistributedMeshSynchronizer< DistributedMesh< Grid< MeshDimension, GridRea } } - template< typename CommunicatorType, - typename MeshFunctionType, + template< typename MeshFunctionType, typename PeriodicBoundariesMaskPointer = Pointers::SharedPointer< MeshFunctionType > > void synchronize( MeshFunctionType &meshFunction, bool periodicBoundaries = false, @@ -145,9 +143,8 @@ class DistributedMeshSynchronizer< DistributedMesh< Grid< MeshDimension, GridRea PeriodicBoundariesMaskPointer( nullptr ) ); // the mask is used only when receiving data ); //async send and receive - typename CommunicatorType::Request requests[2*this->getNeighborCount()]; - typename CommunicatorType::CommunicationGroup group; - group=*((typename CommunicatorType::CommunicationGroup *)(distributedGrid->getCommunicationGroup())); + MPI_Request requests[2*this->getNeighborCount()]; + MPI_Comm group = distributedGrid->getCommunicationGroup(); int requestsCount( 0 ); //send everything, recieve everything @@ -159,22 +156,22 @@ class DistributedMeshSynchronizer< DistributedMesh< Grid< MeshDimension, GridRea if( neighbors[ i ] != -1 ) { //TNL_MPI_PRINT( "Sending data to node " << neighbors[ i ] ); - requests[ requestsCount++ ] = CommunicatorType::ISend( reinterpret_cast( sendBuffers[ i ].getData() ), sendSizes[ i ], neighbors[ i ], 0, group ); + requests[ requestsCount++ ] = MPI::Isend( reinterpret_cast( sendBuffers[ i ].getData() ), sendSizes[ i ], neighbors[ i ], 0, group ); //TNL_MPI_PRINT( 
"Receiving data from node " << neighbors[ i ] ); - requests[ requestsCount++ ] = CommunicatorType::IRecv( reinterpret_cast( recieveBuffers[ i ].getData() ), sendSizes[ i ], neighbors[ i ], 0, group ); + requests[ requestsCount++ ] = MPI::Irecv( reinterpret_cast( recieveBuffers[ i ].getData() ), sendSizes[ i ], neighbors[ i ], 0, group ); } else if( periodicBoundaries && sendSizes[ i ] !=0 ) { //TNL_MPI_PRINT( "Sending data to node " << periodicNeighbors[ i ] ); - requests[ requestsCount++ ] = CommunicatorType::ISend( reinterpret_cast( sendBuffers[ i ].getData() ), sendSizes[ i ], periodicNeighbors[ i ], 1, group ); + requests[ requestsCount++ ] = MPI::Isend( reinterpret_cast( sendBuffers[ i ].getData() ), sendSizes[ i ], periodicNeighbors[ i ], 1, group ); //TNL_MPI_PRINT( "Receiving data to node " << periodicNeighbors[ i ] ); - requests[ requestsCount++ ] = CommunicatorType::IRecv( reinterpret_cast( recieveBuffers[ i ].getData() ), sendSizes[ i ], periodicNeighbors[ i ], 1, group ); + requests[ requestsCount++ ] = MPI::Irecv( reinterpret_cast( recieveBuffers[ i ].getData() ), sendSizes[ i ], periodicNeighbors[ i ], 1, group ); } } //wait until send is done //TNL_MPI_PRINT( "Waiting for data ..." ) - CommunicatorType::WaitAll( requests, requestsCount ); + MPI::Waitall( requests, requestsCount ); //copy data from receive buffers //TNL_MPI_PRINT( "Copying data ..." 
) diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedMesh.h b/src/TNL/Meshes/DistributedMeshes/DistributedMesh.h index 9a79f823d1379fbf4d314c9bc4bb3641fd5e9a78..21116d35725281aed1ea457c57bc19c370d395c1 100644 --- a/src/TNL/Meshes/DistributedMeshes/DistributedMesh.h +++ b/src/TNL/Meshes/DistributedMeshes/DistributedMesh.h @@ -13,7 +13,7 @@ #pragma once #include -#include +#include #include #include @@ -34,8 +34,6 @@ public: using PointType = typename Mesh::PointType; using RealType = typename PointType::RealType; using GlobalIndexArray = typename Mesh::GlobalIndexArray; - using CommunicatorType = Communicators::MpiCommunicator; - using CommunicationGroup = typename CommunicatorType::CommunicationGroup; using VTKTypesArrayType = Containers::Array< std::uint8_t, Devices::Sequential, GlobalIndexType >; DistributedMesh() = default; @@ -101,12 +99,12 @@ public: /** * Methods specific to the distributed mesh */ - void setCommunicationGroup( CommunicationGroup group ) + void setCommunicationGroup( MPI_Comm group ) { this->group = group; } - CommunicationGroup getCommunicationGroup() const + MPI_Comm getCommunicationGroup() const { return group; } @@ -190,10 +188,10 @@ public: const GlobalIndexType verticesCount = localMesh.template getEntitiesCount< 0 >(); const GlobalIndexType cellsCount = localMesh.template getEntitiesCount< Mesh::getMeshDimension() >(); - CommunicatorType::Barrier(); - for( int i = 0; i < CommunicatorType::GetSize(); i++ ) { - if( i == CommunicatorType::GetRank() ) { - str << "MPI rank:\t" << CommunicatorType::GetRank() << "\n" + MPI::Barrier(); + for( int i = 0; i < MPI::GetSize(); i++ ) { + if( i == MPI::GetRank() ) { + str << "MPI rank:\t" << MPI::GetRank() << "\n" << "\tMesh dimension:\t" << getMeshDimension() << "\n" << "\tCell topology:\t" << getType( typename Cell::EntityTopology{} ) << "\n" << "\tCells count:\t" << cellsCount << "\n" @@ -230,13 +228,13 @@ public: } str.flush(); } - CommunicatorType::Barrier(); + MPI::Barrier(); } } 
protected: MeshType localMesh; - CommunicationGroup group = CommunicatorType::NullGroup; + MPI_Comm group = MPI::NullGroup(); int ghostLevels = 0; // vtkGhostType arrays for points and cells (cached for output into VTK formats) diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h b/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h index 724510bf4a6c576eafd5ba58d5d1065d4733a674..36f28ba458b67e872f9ea7d317f3654fb019215d 100644 --- a/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h +++ b/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h @@ -12,8 +12,10 @@ #pragma once +#include #include #include +#include namespace TNL { namespace Meshes { @@ -32,11 +34,22 @@ struct HasMeshType< T, typename Containers::Expressions::enable_if_type< typenam template< typename DistributedMesh, int EntityDimension = DistributedMesh::getMeshDimension() > class DistributedMeshSynchronizer +: public Containers::ByteArraySynchronizer< typename DistributedMesh::DeviceType, typename DistributedMesh::GlobalIndexType > { + using Base = Containers::ByteArraySynchronizer< typename DistributedMesh::DeviceType, typename DistributedMesh::GlobalIndexType >; + public: using DeviceType = typename DistributedMesh::DeviceType; using GlobalIndexType = typename DistributedMesh::GlobalIndexType; - using CommunicatorType = typename DistributedMesh::CommunicatorType; + using ByteArrayView = typename Base::ByteArrayView; + using RequestsVector = typename Base::RequestsVector; + + ~DistributedMeshSynchronizer() + { + // wait for pending async operation, otherwise it would crash + if( this->async_op.valid() ) + this->async_op.wait(); + } DistributedMeshSynchronizer() = default; @@ -47,15 +60,9 @@ public: TNL_ASSERT_EQ( mesh.template getGlobalIndices< EntityDimension >().getSize(), mesh.getLocalMesh().template getEntitiesCount< EntityDimension >(), "Global indices are not allocated properly." 
); - // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/ - #ifdef HAVE_CUDA - if( std::is_same< DeviceType, Devices::Cuda >::value ) - cudaGetDevice(&this->gpu_id); - #endif - group = mesh.getCommunicationGroup(); - const int rank = CommunicatorType::GetRank( group ); - const int nproc = CommunicatorType::GetSize( group ); + const int rank = MPI::GetRank( group ); + const int nproc = MPI::GetSize( group ); // exchange the global index offsets so that each rank can determine the // owner of every entity by its global index @@ -64,9 +71,9 @@ public: { Containers::Array< GlobalIndexType, Devices::Host, int > sendbuf( nproc ); sendbuf.setValue( ownStart ); - CommunicatorType::Alltoall( sendbuf.getData(), 1, - globalOffsets.getData(), 1, - group ); + MPI::Alltoall( sendbuf.getData(), 1, + globalOffsets.getData(), 1, + group ); } // count local ghost entities for each rank @@ -103,9 +110,9 @@ public: for( int j = 0; j < nproc; j++ ) for( int i = 0; i < nproc; i++ ) sendbuf.setElement( j, i, localGhostCounts[ i ] ); - CommunicatorType::Alltoall( &sendbuf(0, 0), nproc, - &ghostEntitiesCounts(0, 0), nproc, - group ); + MPI::Alltoall( &sendbuf(0, 0), nproc, + &ghostEntitiesCounts(0, 0), nproc, + group ); } // allocate ghost offsets @@ -122,14 +129,14 @@ public: // send indices of ghost entities - set them as ghost neighbors on the target rank { - std::vector< typename CommunicatorType::Request > requests; + RequestsVector requests; // send our ghost indices to the neighboring ranks GlobalIndexType ghostOffset = mesh.getLocalMesh().template getGhostEntitiesOffset< EntityDimension >(); ghostOffsets[ 0 ] = ghostOffset; for( int i = 0; i < nproc; i++ ) { if( ghostEntitiesCounts( rank, i ) > 0 ) { - requests.push_back( CommunicatorType::ISend( + requests.push_back( MPI::Isend( mesh.template getGlobalIndices< EntityDimension >().getData() + ghostOffset, ghostEntitiesCounts( rank, i ), i, 0, group ) ); @@ -144,7 +151,7 @@ public: 
// receive ghost indices from the neighboring ranks for( int j = 0; j < nproc; j++ ) { if( ghostEntitiesCounts( j, rank ) > 0 ) { - requests.push_back( CommunicatorType::IRecv( + requests.push_back( MPI::Irecv( ghostNeighbors.getData() + ghostNeighborOffsets[ j ], ghostEntitiesCounts( j, rank ), j, 0, group ) ); @@ -152,7 +159,7 @@ public: } // wait for all communications to finish - CommunicatorType::WaitAll( requests.data(), requests.size() ); + MPI::Waitall( requests.data(), requests.size() ); // convert received ghost indices from global to local ghostNeighbors -= ownStart; @@ -182,43 +189,53 @@ public: template< typename Array > void synchronizeArray( Array& array, int valuesPerElement = 1 ) { - TNL_ASSERT_EQ( array.getSize(), valuesPerElement * ghostOffsets[ ghostOffsets.getSize() - 1 ], - "The array does not have the expected size." ); + static_assert( std::is_same< typename Array::DeviceType, DeviceType >::value, + "mismatched DeviceType of the array" ); using ValueType = typename Array::ValueType; - // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/ - #ifdef HAVE_CUDA - if( std::is_same< DeviceType, Devices::Cuda >::value ) - cudaSetDevice(gpu_id); - #endif + ByteArrayView view; + view.bind( reinterpret_cast( array.getData() ), sizeof(ValueType) * array.getSize() ); + synchronizeByteArray( view, sizeof(ValueType) * valuesPerElement ); + } - const int rank = CommunicatorType::GetRank( group ); - const int nproc = CommunicatorType::GetSize( group ); + virtual void synchronizeByteArray( ByteArrayView array, int bytesPerValue ) override + { + auto requests = synchronizeByteArrayAsyncWorker( array, bytesPerValue ); + MPI::Waitall( requests.data(), requests.size() ); + } + + virtual RequestsVector synchronizeByteArrayAsyncWorker( ByteArrayView array, int bytesPerValue ) override + { + TNL_ASSERT_EQ( array.getSize(), bytesPerValue * ghostOffsets[ ghostOffsets.getSize() - 1 ], + "The array does not have the 
expected size." ); + + const int rank = MPI::GetRank( group ); + const int nproc = MPI::GetSize( group ); // allocate send buffers (setSize does nothing if the array size is already correct) - sendBuffers.setSize( valuesPerElement * ghostNeighborOffsets[ nproc ] * sizeof(ValueType) ); + sendBuffers.setSize( bytesPerValue * ghostNeighborOffsets[ nproc ] ); // buffer for asynchronous communication requests - std::vector< typename CommunicatorType::Request > requests; + RequestsVector requests; // issue all receive async operations for( int j = 0; j < nproc; j++ ) { if( ghostEntitiesCounts( rank, j ) > 0 ) { - requests.push_back( CommunicatorType::IRecv( - array.getData() + valuesPerElement * ghostOffsets[ j ], - valuesPerElement * ghostEntitiesCounts( rank, j ), + requests.push_back( MPI::Irecv( + array.getData() + bytesPerValue * ghostOffsets[ j ], + bytesPerValue * ghostEntitiesCounts( rank, j ), j, 0, group ) ); } } - Containers::ArrayView< ValueType, DeviceType, GlobalIndexType > sendBuffersView; - sendBuffersView.bind( reinterpret_cast( sendBuffers.getData() ), valuesPerElement * ghostNeighborOffsets[ nproc ] ); + ByteArrayView sendBuffersView; + sendBuffersView.bind( sendBuffers.getData(), bytesPerValue * ghostNeighborOffsets[ nproc ] ); const auto ghostNeighborsView = ghostNeighbors.getConstView(); const auto arrayView = array.getConstView(); - auto copy_kernel = [sendBuffersView, arrayView, ghostNeighborsView, valuesPerElement] __cuda_callable__ ( GlobalIndexType k, GlobalIndexType offset ) mutable + auto copy_kernel = [sendBuffersView, arrayView, ghostNeighborsView, bytesPerValue] __cuda_callable__ ( GlobalIndexType k, GlobalIndexType offset ) mutable { - for( int i = 0; i < valuesPerElement; i++ ) - sendBuffersView[ i + valuesPerElement * (offset + k) ] = arrayView[ i + valuesPerElement * ghostNeighborsView[ offset + k ] ]; + for( int i = 0; i < bytesPerValue; i++ ) + sendBuffersView[ i + bytesPerValue * (offset + k) ] = arrayView[ i + bytesPerValue * 
ghostNeighborsView[ offset + k ] ]; }; for( int i = 0; i < nproc; i++ ) { @@ -228,15 +245,14 @@ public: Algorithms::ParallelFor< DeviceType >::exec( (GlobalIndexType) 0, ghostEntitiesCounts( i, rank ), copy_kernel, offset ); // issue async send operation - requests.push_back( CommunicatorType::ISend( - sendBuffersView.getData() + valuesPerElement * ghostNeighborOffsets[ i ], - valuesPerElement * ghostEntitiesCounts( i, rank ), + requests.push_back( MPI::Isend( + sendBuffersView.getData() + bytesPerValue * ghostNeighborOffsets[ i ], + bytesPerValue * ghostEntitiesCounts( i, rank ), i, 0, group ) ); } } - // wait for all communications to finish - CommunicatorType::WaitAll( requests.data(), requests.size() ); + return requests; } // performs a synchronization of a sparse matrix @@ -252,11 +268,11 @@ public: { TNL_ASSERT_EQ( pattern.getRows(), ghostOffsets[ ghostOffsets.getSize() - 1 ], "invalid sparse pattern matrix" ); - const int rank = CommunicatorType::GetRank( group ); - const int nproc = CommunicatorType::GetSize( group ); + const int rank = MPI::GetRank( group ); + const int nproc = MPI::GetSize( group ); // buffer for asynchronous communication requests - std::vector< typename CommunicatorType::Request > requests; + RequestsVector requests; Containers::Array< GlobalIndexType, Devices::Host, int > send_rankOffsets( nproc + 1 ), recv_rankOffsets( nproc + 1 ); Containers::Array< GlobalIndexType, Devices::Host, GlobalIndexType > send_rowCapacities, send_rowPointers, send_columnIndices, recv_rowPointers, recv_columnIndices; @@ -290,7 +306,7 @@ public: // send our row sizes to the target rank if( ! 
assumeConsistentRowCapacities ) // issue async send operation - requests.push_back( CommunicatorType::ISend( + requests.push_back( MPI::Isend( send_rowCapacities.getData() + send_rankOffsets[ i ], ghostNeighborOffsets[ i + 1 ] - ghostNeighborOffsets[ i ], i, 1, group ) ); @@ -318,7 +334,7 @@ public: if( send_rankOffsets[ i + 1 ] == send_rankOffsets[ i ] ) continue; // issue async send operation - requests.push_back( CommunicatorType::ISend( + requests.push_back( MPI::Isend( send_columnIndices.getData() + send_rowPointers[ send_rankOffsets[ i ] ], send_rowPointers[ send_rankOffsets[ i + 1 ] ] - send_rowPointers[ send_rankOffsets[ i ] ], i, 0, group ) ); @@ -335,7 +351,7 @@ public: // allocate row pointers recv_rowPointers.setSize( recv_rankOffsets[ nproc ] + 1 ); - std::vector< typename CommunicatorType::Request > row_lengths_requests; + RequestsVector row_lengths_requests; // set row pointers GlobalIndexType rowPtr = 0; @@ -353,7 +369,7 @@ public: else { // receive row sizes from the sender // issue async recv operation - row_lengths_requests.push_back( CommunicatorType::IRecv( + row_lengths_requests.push_back( MPI::Irecv( recv_rowPointers.getData() + recv_rankOffsets[ i ], ghostOffsets[ i + 1 ] - ghostOffsets[ i ], i, 1, group ) ); @@ -362,7 +378,7 @@ public: if( ! 
assumeConsistentRowCapacities ) { // wait for all row lengths - CommunicatorType::WaitAll( row_lengths_requests.data(), row_lengths_requests.size() ); + MPI::Waitall( row_lengths_requests.data(), row_lengths_requests.size() ); // scan the rowPointers array to convert Containers::VectorView< GlobalIndexType, Devices::Host, GlobalIndexType > rowPointersView; @@ -377,7 +393,7 @@ public: if( recv_rankOffsets[ i + 1 ] == recv_rankOffsets[ i ] ) continue; // issue async recv operation - requests.push_back( CommunicatorType::IRecv( + requests.push_back( MPI::Irecv( recv_columnIndices.getData() + recv_rowPointers[ recv_rankOffsets[ i ] ], recv_rowPointers[ recv_rankOffsets[ i + 1 ] ] - recv_rowPointers[ recv_rankOffsets[ i ] ], i, 0, group ) ); @@ -385,7 +401,7 @@ public: } // wait for all communications to finish - CommunicatorType::WaitAll( requests.data(), requests.size() ); + MPI::Waitall( requests.data(), requests.size() ); return std::make_tuple( recv_rankOffsets, recv_rowPointers, recv_columnIndices ); } @@ -428,11 +444,8 @@ public: } protected: - // GOTCHA (see above) - int gpu_id = 0; - // communication group taken from the distributed mesh - typename CommunicatorType::CommunicationGroup group; + MPI_Comm group; /** * Global offsets: array of size nproc where the i-th value is the lowest diff --git a/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h b/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h index 851ff66273fcc97f705aa37d5f6e1c8af3336260..b479544f7e1ebccb739027758d046a4263aa25e6 100644 --- a/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h +++ b/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h @@ -16,23 +16,21 @@ namespace TNL { namespace Meshes { namespace DistributedMeshes { - -template< typename Mesh, - typename Communicator > + +template< typename Mesh > class SubdomainOverlapsGetter {}; -// TODO: Specializations by the grid dimension can be avoided when the MPI directions are +// TODO: Specializations by the grid 
dimension can be avoided when the MPI directions are // rewritten in a dimension independent way template< typename Real, typename Device, - typename Index, - typename Communicator > -class SubdomainOverlapsGetter< Grid< 1, Real, Device, Index >, Communicator > + typename Index > +class SubdomainOverlapsGetter< Grid< 1, Real, Device, Index > > { public: - + static const int Dimension = 1; using MeshType = Grid< Dimension, Real, Device, Index >; using DeviceType = Device; @@ -40,10 +38,9 @@ class SubdomainOverlapsGetter< Grid< 1, Real, Device, Index >, Communicator > using DistributedMeshType = DistributedMesh< MeshType >; using SubdomainOverlapsType = typename DistributedMeshType::SubdomainOverlapsType; using CoordinatesType = typename DistributedMeshType::CoordinatesType; - using CommunicatorType = Communicator; - + // Computes subdomain overlaps - /* SubdomainOverlapsType is a touple of the same size as the mesh dimensions. + /* SubdomainOverlapsType is a tuple of the same size as the mesh dimensions. 
* lower.x() is overlap of the subdomain at boundary where x = 0, * upper.x() is overlap of the subdomain at boundary where x = grid.getDimensions().x() - 1, */ @@ -53,18 +50,17 @@ class SubdomainOverlapsGetter< Grid< 1, Real, Device, Index >, Communicator > IndexType subdomainOverlapSize, const SubdomainOverlapsType& lowerPeriodicBoundariesOverlapSize = 0, const SubdomainOverlapsType& upperPeriodicBoundariesOverlapSize = 0 ); - + }; template< typename Real, typename Device, - typename Index, - typename Communicator > -class SubdomainOverlapsGetter< Grid< 2, Real, Device, Index >, Communicator > + typename Index > +class SubdomainOverlapsGetter< Grid< 2, Real, Device, Index > > { public: - + static const int Dimension = 2; using MeshType = Grid< Dimension, Real, Device, Index >; using DeviceType = Device; @@ -72,10 +68,9 @@ class SubdomainOverlapsGetter< Grid< 2, Real, Device, Index >, Communicator > using DistributedMeshType = DistributedMesh< MeshType >; using SubdomainOverlapsType = typename DistributedMeshType::SubdomainOverlapsType; using CoordinatesType = typename DistributedMeshType::CoordinatesType; - using CommunicatorType = Communicator; - + // Computes subdomain overlaps - /* SubdomainOverlapsType is a touple of the same size as the mesh dimensions. + /* SubdomainOverlapsType is a tuple of the same size as the mesh dimensions. 
* lower.x() is overlap of the subdomain at boundary where x = 0, * lower.y() is overlap of the subdomain at boundary where y = 0, * upper.x() is overlap of the subdomain at boundary where x = grid.getDimensions().x() - 1, @@ -87,17 +82,16 @@ class SubdomainOverlapsGetter< Grid< 2, Real, Device, Index >, Communicator > IndexType subdomainOverlapSize, const SubdomainOverlapsType& lowerPeriodicBoundariesOverlapSize = 0, const SubdomainOverlapsType& upperPeriodicBoundariesOverlapSize = 0 ); - + }; template< typename Real, typename Device, - typename Index, - typename Communicator > -class SubdomainOverlapsGetter< Grid< 3, Real, Device, Index >, Communicator > + typename Index > +class SubdomainOverlapsGetter< Grid< 3, Real, Device, Index > > { public: - + static const int Dimension = 3; using MeshType = Grid< Dimension, Real, Device, Index >; using DeviceType = Device; @@ -105,10 +99,9 @@ class SubdomainOverlapsGetter< Grid< 3, Real, Device, Index >, Communicator > using DistributedMeshType = DistributedMesh< MeshType >; using SubdomainOverlapsType = typename DistributedMeshType::SubdomainOverlapsType; using CoordinatesType = typename DistributedMeshType::CoordinatesType; - using CommunicatorType = Communicator; - + // Computes subdomain overlaps - /* SubdomainOverlapsType is a touple of the same size as the mesh dimensions. + /* SubdomainOverlapsType is a tuple of the same size as the mesh dimensions. 
* lower.x() is overlap of the subdomain at boundary where x = 0, * lower.y() is overlap of the subdomain at boundary where y = 0, * lower.z() is overlap of the subdomain at boundary where z = 0, @@ -122,7 +115,7 @@ class SubdomainOverlapsGetter< Grid< 3, Real, Device, Index >, Communicator > IndexType subdomainOverlapSize, const SubdomainOverlapsType& lowerPeriodicBoundariesOverlapSize = 0, const SubdomainOverlapsType& upperPeriodicBoundariesOverlapSize = 0 ); - + }; diff --git a/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.hpp b/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.hpp index 9dbb1372b061007b55082a81819d11888000bd74..aa185e1ecf0d08193feb8a58abeb40785914eead 100644 --- a/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.hpp +++ b/src/TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.hpp @@ -10,6 +10,7 @@ #pragma once +#include #include #include @@ -19,26 +20,25 @@ namespace TNL { /* * TODO: This could work when the MPI directions are rewritten - + template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > void -SubdomainOverlapsGetter< Grid< 1, Real, Device, Index >, Communicator >:: +SubdomainOverlapsGetter< Grid< 1, Real, Device, Index > >:: getOverlaps( const DistributedMeshType* distributedMesh, SubdomainOverlapsType& lower, SubdomainOverlapsType& upper, IndexType subdomainOverlapSize, const SubdomainOverlapsType& periodicBoundariesOverlapSize ) { - if( ! CommunicatorType::isDistributed() ) + if( ! 
MPI::isDistributed() ) return; TNL_ASSERT_TRUE( distributedMesh != NULL, "" ); const CoordinatesType& subdomainCoordinates = distributedMesh->getSubdomainCoordinates(); - int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup ); - + int rank = MPI::GetRank(); + for( int i = 0; i < Dimension; i++ ) { CoordinatesType neighborDirection( 0 ); @@ -47,7 +47,7 @@ getOverlaps( const DistributedMeshType* distributedMesh, lower[ i ] = subdomainOverlapSize; else if( distributedMesh->getPeriodicNeighbors()[ Directions::getDirection( neighborDirection ) ] != rank ) lower[ i ] = periodicBoundariesOverlapSize[ i ]; - + neighborDirection[ i ] = 1; if( subdomainCoordinates[ i ] < distributedMesh->getDomainDecomposition()[ i ] - 1 ) upper[ i ] = subdomainOverlapSize; @@ -55,15 +55,14 @@ getOverlaps( const DistributedMeshType* distributedMesh, upper[ i ] = periodicBoundariesOverlapSize[ i ]; } } - + */ template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > void -SubdomainOverlapsGetter< Grid< 1, Real, Device, Index >, Communicator >:: +SubdomainOverlapsGetter< Grid< 1, Real, Device, Index > >:: getOverlaps( const DistributedMeshType* distributedMesh, SubdomainOverlapsType& lower, SubdomainOverlapsType& upper, @@ -71,13 +70,13 @@ getOverlaps( const DistributedMeshType* distributedMesh, const SubdomainOverlapsType& lowerPeriodicBoundariesOverlapSize, const SubdomainOverlapsType& upperPeriodicBoundariesOverlapSize ) { - if( ! 
CommunicatorType::isDistributed() ) + if( MPI::GetSize() == 1 ) return; TNL_ASSERT_TRUE( distributedMesh != NULL, "" ); const CoordinatesType& subdomainCoordinates = distributedMesh->getSubdomainCoordinates(); - int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup ); - + int rank = MPI::GetRank(); + if( subdomainCoordinates[ 0 ] > 0 ) lower[ 0 ] = subdomainOverlapSize; else if( distributedMesh->getPeriodicNeighbors()[ ZzYzXm ] != rank ) @@ -92,10 +91,9 @@ getOverlaps( const DistributedMeshType* distributedMesh, template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > void -SubdomainOverlapsGetter< Grid< 2, Real, Device, Index >, Communicator >:: +SubdomainOverlapsGetter< Grid< 2, Real, Device, Index > >:: getOverlaps( const DistributedMeshType* distributedMesh, SubdomainOverlapsType& lower, SubdomainOverlapsType& upper, @@ -103,15 +101,15 @@ getOverlaps( const DistributedMeshType* distributedMesh, const SubdomainOverlapsType& lowerPeriodicBoundariesOverlapSize, const SubdomainOverlapsType& upperPeriodicBoundariesOverlapSize ) { - if( ! 
CommunicatorType::isDistributed() ) + if( MPI::GetSize() == 1 ) return; TNL_ASSERT_TRUE( distributedMesh != NULL, "" ); const CoordinatesType& subdomainCoordinates = distributedMesh->getSubdomainCoordinates(); - int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup ); + int rank = MPI::GetRank(); lower = 0; upper = 0; - + if( subdomainCoordinates[ 0 ] > 0 ) lower[ 0 ] = subdomainOverlapSize; else if( distributedMesh->getPeriodicNeighbors()[ ZzYzXm ] != rank ) @@ -121,7 +119,7 @@ getOverlaps( const DistributedMeshType* distributedMesh, upper[ 0 ] = subdomainOverlapSize; else if( distributedMesh->getPeriodicNeighbors()[ ZzYzXp ] != rank ) upper[ 0 ] = upperPeriodicBoundariesOverlapSize[ 0 ]; - + if( subdomainCoordinates[ 1 ] > 0 ) lower[ 1 ] = subdomainOverlapSize; else if( distributedMesh->getPeriodicNeighbors()[ ZzYmXz ] != rank ) @@ -135,10 +133,9 @@ getOverlaps( const DistributedMeshType* distributedMesh, template< typename Real, typename Device, - typename Index, - typename Communicator > + typename Index > void -SubdomainOverlapsGetter< Grid< 3, Real, Device, Index >, Communicator >:: +SubdomainOverlapsGetter< Grid< 3, Real, Device, Index > >:: getOverlaps( const DistributedMeshType* distributedMesh, SubdomainOverlapsType& lower, SubdomainOverlapsType& upper, @@ -146,13 +143,13 @@ getOverlaps( const DistributedMeshType* distributedMesh, const SubdomainOverlapsType& lowerPeriodicBoundariesOverlapSize, const SubdomainOverlapsType& upperPeriodicBoundariesOverlapSize ) { - if( ! 
CommunicatorType::isDistributed() ) + if( MPI::GetSize() == 1 ) return; TNL_ASSERT_TRUE( distributedMesh != NULL, "" ); const CoordinatesType& subdomainCoordinates = distributedMesh->getSubdomainCoordinates(); - int rank = CommunicatorType::GetRank( CommunicatorType::AllGroup ); - + int rank = MPI::GetRank(); + if( subdomainCoordinates[ 0 ] > 0 ) lower[ 0 ] = subdomainOverlapSize; else if( distributedMesh->getPeriodicNeighbors()[ ZzYzXm ] != rank ) @@ -162,7 +159,7 @@ getOverlaps( const DistributedMeshType* distributedMesh, upper[ 0 ] = subdomainOverlapSize; else if( distributedMesh->getPeriodicNeighbors()[ ZzYzXp ] != rank ) upper[ 0 ] = upperPeriodicBoundariesOverlapSize[ 0 ]; - + if( subdomainCoordinates[ 1 ] > 0 ) lower[ 1 ] = subdomainOverlapSize; else if( distributedMesh->getPeriodicNeighbors()[ ZzYmXz ] != rank ) @@ -172,7 +169,7 @@ getOverlaps( const DistributedMeshType* distributedMesh, upper[ 1 ] = subdomainOverlapSize; else if( distributedMesh->getPeriodicNeighbors()[ ZzYpXz ] != rank ) upper[ 1 ] = upperPeriodicBoundariesOverlapSize[ 1 ]; - + if( subdomainCoordinates[ 2 ] > 0 ) lower[ 2 ] = subdomainOverlapSize; else if( distributedMesh->getPeriodicNeighbors()[ ZmYzXz ] != rank ) diff --git a/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h b/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h index 63a10b1cf08bd6c121402fa792c1f77e002f7516..120cadf808f7d4171d80e4f7ac375939ad1caf17 100644 --- a/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h +++ b/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h @@ -19,14 +19,14 @@ namespace TNL { namespace Meshes { namespace DistributedMeshes { -template< typename CommunicatorType, typename GlobalIndexType > +template< typename GlobalIndexType > auto -exchangeGhostEntitySeeds( typename CommunicatorType::CommunicationGroup group, +exchangeGhostEntitySeeds( MPI_Comm group, const std::vector< std::vector< GlobalIndexType > >& seeds_vertex_indices, const std::vector< std::vector< 
GlobalIndexType > >& seeds_entity_offsets ) { - const int rank = CommunicatorType::GetRank( group ); - const int nproc = CommunicatorType::GetSize( group ); + const int rank = MPI::GetRank( group ); + const int nproc = MPI::GetSize( group ); // exchange sizes of the arrays Containers::Array< GlobalIndexType, Devices::Host, int > sizes_vertex_indices( nproc ), sizes_entity_offsets( nproc ); @@ -36,12 +36,12 @@ exchangeGhostEntitySeeds( typename CommunicatorType::CommunicationGroup group, sendbuf_indices[ i ] = seeds_vertex_indices[ i ].size(); sendbuf_offsets[ i ] = seeds_entity_offsets[ i ].size(); } - CommunicatorType::Alltoall( sendbuf_indices.getData(), 1, - sizes_vertex_indices.getData(), 1, - group ); - CommunicatorType::Alltoall( sendbuf_offsets.getData(), 1, - sizes_entity_offsets.getData(), 1, - group ); + MPI::Alltoall( sendbuf_indices.getData(), 1, + sizes_vertex_indices.getData(), 1, + group ); + MPI::Alltoall( sendbuf_offsets.getData(), 1, + sizes_entity_offsets.getData(), 1, + group ); } // allocate arrays for the results @@ -54,17 +54,17 @@ exchangeGhostEntitySeeds( typename CommunicatorType::CommunicationGroup group, } // buffer for asynchronous communication requests - std::vector< typename CommunicatorType::Request > requests; + std::vector< MPI_Request > requests; // issue all async receive operations for( int j = 0; j < nproc; j++ ) { if( j == rank ) continue; - requests.push_back( CommunicatorType::IRecv( + requests.push_back( MPI::Irecv( foreign_seeds_vertex_indices[ j ].data(), foreign_seeds_vertex_indices[ j ].size(), j, 0, group ) ); - requests.push_back( CommunicatorType::IRecv( + requests.push_back( MPI::Irecv( foreign_seeds_entity_offsets[ j ].data(), foreign_seeds_entity_offsets[ j ].size(), j, 1, group ) ); @@ -74,30 +74,30 @@ exchangeGhostEntitySeeds( typename CommunicatorType::CommunicationGroup group, for( int i = 0; i < nproc; i++ ) { if( i == rank ) continue; - requests.push_back( CommunicatorType::ISend( + requests.push_back( 
MPI::Isend( seeds_vertex_indices[ i ].data(), seeds_vertex_indices[ i ].size(), i, 0, group ) ); - requests.push_back( CommunicatorType::ISend( + requests.push_back( MPI::Isend( seeds_entity_offsets[ i ].data(), seeds_entity_offsets[ i ].size(), i, 1, group ) ); } // wait for all communications to finish - CommunicatorType::WaitAll( requests.data(), requests.size() ); + MPI::Waitall( requests.data(), requests.size() ); return std::make_tuple( foreign_seeds_vertex_indices, foreign_seeds_entity_offsets ); } -template< typename CommunicatorType, typename GlobalIndexType > +template< typename GlobalIndexType > auto -exchangeGhostIndices( typename CommunicatorType::CommunicationGroup group, +exchangeGhostIndices( MPI_Comm group, const std::vector< std::vector< GlobalIndexType > >& foreign_ghost_indices, const std::vector< std::vector< GlobalIndexType > >& seeds_local_indices ) { - const int rank = CommunicatorType::GetRank( group ); - const int nproc = CommunicatorType::GetSize( group ); + const int rank = MPI::GetRank( group ); + const int nproc = MPI::GetSize( group ); // allocate arrays for the results std::vector< std::vector< GlobalIndexType > > ghost_indices; @@ -106,13 +106,13 @@ exchangeGhostIndices( typename CommunicatorType::CommunicationGroup group, ghost_indices[ i ].resize( seeds_local_indices[ i ].size() ); // buffer for asynchronous communication requests - std::vector< typename CommunicatorType::Request > requests; + std::vector< MPI_Request > requests; // issue all async receive operations for( int j = 0; j < nproc; j++ ) { if( j == rank ) continue; - requests.push_back( CommunicatorType::IRecv( + requests.push_back( MPI::Irecv( ghost_indices[ j ].data(), ghost_indices[ j ].size(), j, 0, group ) ); @@ -122,14 +122,14 @@ exchangeGhostIndices( typename CommunicatorType::CommunicationGroup group, for( int i = 0; i < nproc; i++ ) { if( i == rank ) continue; - requests.push_back( CommunicatorType::ISend( + requests.push_back( MPI::Isend( 
foreign_ghost_indices[ i ].data(), foreign_ghost_indices[ i ].size(), i, 0, group ) ); } // wait for all communications to finish - CommunicatorType::WaitAll( requests.data(), requests.size() ); + MPI::Waitall( requests.data(), requests.size() ); return ghost_indices; } @@ -145,7 +145,6 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true ) using GlobalIndexType = typename DistributedMesh::GlobalIndexType; using LocalIndexType = typename DistributedMesh::LocalIndexType; using LocalMesh = typename DistributedMesh::MeshType; - using CommunicatorType = typename DistributedMesh::CommunicatorType; static_assert( ! std::is_same< DeviceType, Devices::Cuda >::value, "this method can be called only for host meshes" ); @@ -154,8 +153,8 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true ) if( mesh.getGhostLevels() <= 0 ) throw std::logic_error( "There are no ghost levels on the distributed mesh." ); - const int rank = CommunicatorType::GetRank( mesh.getCommunicationGroup() ); - const int nproc = CommunicatorType::GetSize( mesh.getCommunicationGroup() ); + const int rank = MPI::GetRank( mesh.getCommunicationGroup() ); + const int nproc = MPI::GetSize( mesh.getCommunicationGroup() ); // 0. 
exchange cell data to prepare getCellOwner for use in getEntityOwner DistributedMeshSynchronizer< DistributedMesh, DistributedMesh::getMeshDimension() > cell_synchronizer; @@ -235,9 +234,9 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true ) Containers::Array< GlobalIndexType, Devices::Host, int > sendbuf( nproc ); sendbuf.setValue( localEntitiesCount ); - CommunicatorType::Alltoall( sendbuf.getData(), 1, - globalOffsets.getData(), 1, - mesh.getCommunicationGroup() ); + MPI::Alltoall( sendbuf.getData(), 1, + globalOffsets.getData(), 1, + mesh.getCommunicationGroup() ); } globalOffsets.template scan< Algorithms::ScanType::Exclusive >(); @@ -288,7 +287,7 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true ) } // 5. exchange seeds for ghost entities - const auto foreign_seeds = exchangeGhostEntitySeeds< CommunicatorType >( mesh.getCommunicationGroup(), seeds_vertex_indices, seeds_entity_offsets ); + const auto foreign_seeds = exchangeGhostEntitySeeds( mesh.getCommunicationGroup(), seeds_vertex_indices, seeds_entity_offsets ); const auto& foreign_seeds_vertex_indices = std::get< 0 >( foreign_seeds ); const auto& foreign_seeds_entity_offsets = std::get< 1 >( foreign_seeds ); @@ -373,7 +372,7 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true ) }); // 6b. exchange global ghost indices - const auto ghost_indices = exchangeGhostIndices< CommunicatorType >( mesh.getCommunicationGroup(), foreign_ghost_indices, seeds_local_indices ); + const auto ghost_indices = exchangeGhostIndices( mesh.getCommunicationGroup(), foreign_ghost_indices, seeds_local_indices ); // 6c. set the global indices of our ghost entities bool done = true; @@ -387,7 +386,7 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true ) // 6d. 
check if finished bool all_done = false; - CommunicatorType::Allreduce( &done, &all_done, 1, MPI_LAND, mesh.getCommunicationGroup() ); + MPI::Allreduce( &done, &all_done, 1, MPI_LAND, mesh.getCommunicationGroup() ); if( all_done ) break; } diff --git a/src/TNL/Meshes/DistributedMeshes/loadDistributedMesh.h b/src/TNL/Meshes/DistributedMeshes/loadDistributedMesh.h index 135e3c15a8c5f64b3032b2d2dba28a0b3d964bd9..52c0b543b641a6a9a433109c7f380063d16ac66e 100644 --- a/src/TNL/Meshes/DistributedMeshes/loadDistributedMesh.h +++ b/src/TNL/Meshes/DistributedMeshes/loadDistributedMesh.h @@ -94,8 +94,7 @@ resolveAndLoadDistributedMesh( Functor&& functor, return resolveDistributedMeshType< ConfigTag, Device >( wrapper, fileName, fileFormat ); } -template< typename CommunicatorType, - typename MeshConfig, +template< typename MeshConfig, typename Device > bool loadDistributedMesh( Mesh< MeshConfig, Device >& mesh, @@ -145,8 +144,7 @@ decomposeMesh( const Config::ParameterContainer& parameters, } // overloads for grids -template< typename CommunicatorType, - int Dimension, +template< int Dimension, typename Real, typename Device, typename Index > @@ -171,7 +169,7 @@ loadDistributedMesh( Grid< Dimension, Real, Device, Index >& mesh, std::cout << " [ OK ] " << std::endl; typename Meshes::DistributedMeshes::DistributedMesh>::SubdomainOverlapsType overlap; - distributedMesh.template setGlobalGrid< CommunicatorType >( globalGrid ); + distributedMesh.setGlobalGrid( globalGrid ); distributedMesh.setupGrid(mesh); return true; } @@ -191,7 +189,6 @@ decomposeMesh( const Config::ParameterContainer& parameters, using GridType = Grid< Dimension, Real, Device, Index >; using DistributedGridType = DistributedMeshes::DistributedMesh< GridType >; using SubdomainOverlapsType = typename DistributedGridType::SubdomainOverlapsType; - using CommunicatorType = typename Problem::CommunicatorType; SubdomainOverlapsType lower( 0 ), upper( 0 ); distributedMesh.setOverlaps( lower, upper ); diff --git 
a/src/TNL/Meshes/Geometry/getEntityCenter.h b/src/TNL/Meshes/Geometry/getEntityCenter.h index 6e869f6ec2655797d56b11adee010160eeb2890f..addef6b9f3d01839d81bfeb64c8cf948405d8942 100644 --- a/src/TNL/Meshes/Geometry/getEntityCenter.h +++ b/src/TNL/Meshes/Geometry/getEntityCenter.h @@ -39,7 +39,7 @@ getEntityCenter( const Mesh< MeshConfig, Device > & mesh, /* * Get an arithmetic mean of the entity's subvertices. * - * For an simplex entity this corresponds to the centroid of the entity, but + * For a simplex entity this corresponds to the centroid of the entity, but * note that other shapes such as general polygons have different formulas for * the centroid: https://en.wikipedia.org/wiki/Centroid#Centroid_of_a_polygon */ diff --git a/src/TNL/Meshes/Geometry/getEntityMeasure.h b/src/TNL/Meshes/Geometry/getEntityMeasure.h index 70d5614ce9de85691da7d83f53eb65fabcb9f695..fb1e2d468b097b9a292d5d901bdbe2f32630e565 100644 --- a/src/TNL/Meshes/Geometry/getEntityMeasure.h +++ b/src/TNL/Meshes/Geometry/getEntityMeasure.h @@ -19,6 +19,7 @@ #include #include #include +#include namespace TNL { namespace Meshes { @@ -148,5 +149,28 @@ getEntityMeasure( const Mesh< MeshConfig, Device > & mesh, return getTetrahedronVolume( v3 - v0, v2 - v0, v1 - v0 ); } +template< typename MeshConfig, typename Device > +__cuda_callable__ +typename MeshConfig::RealType +getEntityMeasure( const Mesh< MeshConfig, Device > & mesh, + const MeshEntity< MeshConfig, Device, Topologies::Hexahedron > & entity ) +{ + const auto& v0 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 0 ) ); + const auto& v1 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 1 ) ); + const auto& v2 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 2 ) ); + const auto& v3 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 3 ) ); + const auto& v4 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 4 ) ); + const auto& v5 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 5 ) ); + const auto& v6 = 
mesh.getPoint( entity.template getSubentityIndex< 0 >( 6 ) ); + const auto& v7 = mesh.getPoint( entity.template getSubentityIndex< 0 >( 7 ) ); + // https://www.cfd-online.com/Forums/main/163122-volume-general-hexahedron.html#post574650 + return getTetrahedronVolume( v0 - v4, v3 - v4, v1 - v4 ) + + getTetrahedronVolume( v2 - v4, v3 - v4, v1 - v4 ) + + getTetrahedronVolume( v1 - v4, v2 - v4, v5 - v4 ) + + getTetrahedronVolume( v6 - v4, v2 - v4, v5 - v4 ) + + getTetrahedronVolume( v3 - v4, v2 - v4, v7 - v4 ) + + getTetrahedronVolume( v6 - v4, v2 - v4, v7 - v4 ); +} + } // namespace Meshes } // namespace TNL diff --git a/src/TNL/Meshes/Geometry/getOutwardNormalVector.h b/src/TNL/Meshes/Geometry/getOutwardNormalVector.h index 53680086264bd6a803ce26307264d6a28171a387..d3fa6ea50fef14482bc913aaf0fe2fec62f6c110 100644 --- a/src/TNL/Meshes/Geometry/getOutwardNormalVector.h +++ b/src/TNL/Meshes/Geometry/getOutwardNormalVector.h @@ -11,6 +11,7 @@ #pragma once #include +#include namespace TNL { namespace Meshes { @@ -87,5 +88,63 @@ getOutwardNormalVector( const Grid & grid, } } +template< typename MeshConfig, typename Device > +__cuda_callable__ +typename MeshTraits< MeshConfig >::PointType +getOutwardNormalVector( const Mesh< MeshConfig, Device > & mesh, + const MeshEntity< MeshConfig, Device, Topologies::Edge > & face, + typename MeshTraits< MeshConfig >::PointType cellCenter ) +{ + using MeshType = Mesh< MeshConfig, Device >; + using FaceType = MeshEntity< MeshConfig, Device, Topologies::Edge >; + using PointType = typename MeshTraits< MeshConfig >::PointType; + static_assert( std::is_same< typename MeshType::Face, FaceType >::value, "getOutwardNormalVector called for an entity which is not a face" ); + static_assert( MeshConfig::worldDimension == 2, "TODO: normal vectors for 2D meshes in a 3D space are not implemented yet" ); + + const auto& v0 = mesh.getPoint( face.template getSubentityIndex< 0 >( 0 ) ); + const auto& v1 = mesh.getPoint( face.template getSubentityIndex< 0 
>( 1 ) ); + const PointType u = v0 - v1; + const PointType n {u[1], -u[0]}; + + // check on which side of the face is the reference cell center + const PointType faceCenter = getEntityCenter( mesh, face ); + if( dot( n, cellCenter - faceCenter ) < 0 ) + return n / l2Norm( n ); + else + return - n / l2Norm( n ); +} + +template< typename MeshConfig, typename Device, typename EntityTopology > +__cuda_callable__ +typename MeshTraits< MeshConfig >::PointType +getOutwardNormalVector( const Mesh< MeshConfig, Device > & mesh, + const MeshEntity< MeshConfig, Device, EntityTopology > & face, + typename MeshTraits< MeshConfig >::PointType cellCenter ) +{ + using MeshType = Mesh< MeshConfig, Device >; + using FaceType = MeshEntity< MeshConfig, Device, EntityTopology >; + using PointType = typename MeshTraits< MeshConfig >::PointType; + static_assert( std::is_same< typename MeshType::Face, FaceType >::value, "getOutwardNormalVector called for an entity which is not a face" ); + static_assert( MeshConfig::worldDimension == 3, "general overload intended for 3D was called with the wrong world dimension" ); + + const auto& v0 = mesh.getPoint( face.template getSubentityIndex< 0 >( 0 ) ); + const auto& v1 = mesh.getPoint( face.template getSubentityIndex< 0 >( 1 ) ); + const auto& v2 = mesh.getPoint( face.template getSubentityIndex< 0 >( 2 ) ); + const PointType u1 = v0 - v1; + const PointType u2 = v0 - v2; + const PointType n { + u1.y() * u2.z() - u1.z() * u2.y(), // first component of the cross product + u1.z() * u2.x() - u1.x() * u2.z(), // second component of the cross product + u1.x() * u2.y() - u1.y() * u2.x() // third component of the cross product + }; + + // check on which side of the face is the reference cell center + const PointType faceCenter = getEntityCenter( mesh, face ); + if( dot( n, cellCenter - faceCenter ) < 0 ) + return n / l2Norm( n ); + else + return - n / l2Norm( n ); +} + } // namespace Meshes } // namespace TNL diff --git 
a/src/TNL/Meshes/Readers/MeshReader.h b/src/TNL/Meshes/Readers/MeshReader.h index 88e2986bad3b8590ac53f6112202486348fcc7a8..8bf8189ba161899a54527ad7c0fee281ee9c246b 100644 --- a/src/TNL/Meshes/Readers/MeshReader.h +++ b/src/TNL/Meshes/Readers/MeshReader.h @@ -150,6 +150,19 @@ public: throw MeshReaderError( "VTKReader", "MeshBuilder failed" ); } + virtual VariantVector + readPointData( std::string arrayName ) + { + throw Exceptions::NotImplementedError( "readPointData is not implemented in the mesh reader for this specific file format." ); + } + + // fallback for mesh readers that do not implement reading of cell data; always throws + virtual VariantVector + readCellData( std::string arrayName ) + { + throw Exceptions::NotImplementedError( "readCellData is not implemented in the mesh reader for this specific file format." ); + } + std::string getMeshType() const { diff --git a/src/TNL/Meshes/Readers/PVTUReader.h b/src/TNL/Meshes/Readers/PVTUReader.h index 666aa4f453478c260240fb45a1dc78d8698ce39e..725aa7fec4baaf797afdda3908f126b728744fc2 100644 --- a/src/TNL/Meshes/Readers/PVTUReader.h +++ b/src/TNL/Meshes/Readers/PVTUReader.h @@ -14,7 +14,7 @@ #include -#include +#include #include #include @@ -67,13 +67,13 @@ class PVTUReader throw MeshReaderError( "PVTUReader", "the file does not contain any element." ); // check that the number of pieces matches the number of MPI ranks - const int nproc = CommunicatorType::GetSize( group ); + const int nproc = MPI::GetSize( group ); if( (int) pieceSources.size() != nproc ) throw MeshReaderError( "PVTUReader", "the number of subdomains does not match the number of MPI ranks (" + std::to_string(pieceSources.size()) + " vs " + std::to_string(nproc) + ")." 
); // read the local piece source - const int rank = CommunicatorType::GetRank( group ); + const int rank = MPI::GetRank( group ); localReader.setFileName( pieceSources[ rank ] ); localReader.detectMesh(); @@ -100,12 +100,9 @@ class PVTUReader #endif public: - using CommunicatorType = Communicators::MpiCommunicator; - using CommunicationGroup = typename CommunicatorType::CommunicationGroup; - PVTUReader() = default; - PVTUReader( const std::string& fileName, CommunicationGroup group = CommunicatorType::AllGroup ) + PVTUReader( const std::string& fileName, MPI_Comm group = MPI::AllGroup() ) : XMLVTK( fileName ), group( group ) {} @@ -211,6 +208,18 @@ public: mesh.setCommunicationGroup( group ); } + virtual VariantVector + readPointData( std::string arrayName ) override + { + return localReader.readPointData( arrayName ); + } + + virtual VariantVector + readCellData( std::string arrayName ) override + { + return localReader.readCellData( arrayName ); + } + virtual void reset() override { resetBase(); @@ -221,7 +230,7 @@ public: } protected: - CommunicationGroup group; + MPI_Comm group; int ghostLevels = 0; int minCommonVertices = 0; diff --git a/src/TNL/Meshes/Readers/XMLVTK.h b/src/TNL/Meshes/Readers/XMLVTK.h index fb8e1eb40df1919a7a57b86db8978fde042b2262..af864e6e9c603f5998c6c29b9a91a0b43c64167d 100644 --- a/src/TNL/Meshes/Readers/XMLVTK.h +++ b/src/TNL/Meshes/Readers/XMLVTK.h @@ -325,8 +325,8 @@ public: #endif } - VariantVector - readPointData( std::string arrayName ) + virtual VariantVector + readPointData( std::string arrayName ) override { #ifdef HAVE_TINYXML2 return readPointOrCellData( "PointData", arrayName ); @@ -335,8 +335,8 @@ public: #endif } - VariantVector - readCellData( std::string arrayName ) + virtual VariantVector + readCellData( std::string arrayName ) override { #ifdef HAVE_TINYXML2 return readPointOrCellData( "CellData", arrayName ); diff --git a/src/TNL/Meshes/Readers/getMeshReader.h b/src/TNL/Meshes/Readers/getMeshReader.h new file mode 
100644 index 0000000000000000000000000000000000000000..2c2c18a8e3da03a923a62be80629aec81ce6d246 --- /dev/null +++ b/src/TNL/Meshes/Readers/getMeshReader.h @@ -0,0 +1,61 @@ +/*************************************************************************** + getMeshReader.h - description + ------------------- + begin : Nov 7, 2020 + copyright : (C) 2020 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovský + +#pragma once + +#include + +#include +#include +#include +#include + +namespace TNL { +namespace Meshes { +namespace Readers { + +// Creates a mesh reader for the given file. If fileFormat is "auto", the format is +// deduced from the extension of fileName. Prints a message to std::cerr and returns +// nullptr for unsupported formats. Declared inline because it is defined in a header. +inline std::shared_ptr< Readers::MeshReader > +getMeshReader( const std::string& fileName, + const std::string& fileFormat ) +{ + namespace fs = std::experimental::filesystem; + std::string format = fileFormat; + if( format == "auto" ) { + format = fs::path(fileName).extension(); + if( format.length() > 0 ) + // remove dot from the extension + format = format.substr(1); + } + + if( format == "ng" ) + return std::make_shared< Readers::NetgenReader >( fileName ); + else if( format == "vtk" ) + return std::make_shared< Readers::VTKReader >( fileName ); + else if( format == "vtu" ) + return std::make_shared< Readers::VTUReader >( fileName ); + else if( format == "pvtu" ) + return std::make_shared< Readers::PVTUReader >( fileName ); + + if( fileFormat == "auto" ) + std::cerr << "File '" << fileName << "' has unsupported format (based on the file extension): " << format << "."; + else + std::cerr << "Unsupported fileFormat parameter: " << fileFormat << "."; + std::cerr << " Supported formats are 'vtk', 'vtu', 'pvtu' and 'ng'." 
<< std::endl; + return nullptr; +} + +} // namespace Readers +} // namespace Meshes +} // namespace TNL diff --git a/src/TNL/Meshes/VTKTraits.h b/src/TNL/Meshes/VTKTraits.h index e09b6c34205aeeeb95e51d7a56a730b82c07c837..0883b607a54ab755b96907ee243dca614f14f08b 100644 --- a/src/TNL/Meshes/VTKTraits.h +++ b/src/TNL/Meshes/VTKTraits.h @@ -172,16 +172,16 @@ enum class CellGhostTypes DUPLICATECELL = 1, // the cell is present on multiple processors HIGHCONNECTIVITYCELL = 2, // the cell has more neighbors than in a regular mesh LOWCONNECTIVITYCELL = 4, // the cell has less neighbors than in a regular mesh - REFINEDCELL = 8, // other cells are present that refines it. + REFINEDCELL = 8, // other cells are present that refines it EXTERIORCELL = 16, // the cell is on the exterior of the data set - HIDDENCELL = 32 // the cell is needed to maintain connectivity, but the data values should be ignored. + HIDDENCELL = 32 // the cell is needed to maintain connectivity, but the data values should be ignored }; enum class PointGhostTypes : std::uint8_t { DUPLICATEPOINT = 1, // the cell is present on multiple processors - HIDDENPOINT = 2 // the point is needed to maintain connectivity, but the data values should be ignored. + HIDDENPOINT = 2 // the point is needed to maintain connectivity, but the data values should be ignored }; /** diff --git a/src/TNL/Meshes/Writers/PVTUWriter.h b/src/TNL/Meshes/Writers/PVTUWriter.h index 8ef4d2b7bc1c6f84090cf2d8f0dbd929bb4c9bda..2f332d20ee39fa89a2ec3f765c61f6eaf832f67d 100644 --- a/src/TNL/Meshes/Writers/PVTUWriter.h +++ b/src/TNL/Meshes/Writers/PVTUWriter.h @@ -31,7 +31,7 @@ public: PVTUWriter() = delete; PVTUWriter( std::ostream& str, VTK::FileFormat format = VTK::FileFormat::zlib_compressed ) - : str(str), format(format) + : str(str.rdbuf()), format(format) {} // If desired, cycle and time of the simulation can put into the file. 
This follows the instructions at @@ -65,9 +65,8 @@ public: // add all pieces and return the source path for the current rank // (useful for parallel writing) - template< typename Communicator > std::string addPiece( const String& mainFileName, - const typename Communicator::CommunicationGroup group ); + const MPI_Comm group ); ~PVTUWriter(); @@ -79,7 +78,7 @@ protected: void writeFooter(); - std::ostream& str; + std::ostream str; VTK::FileFormat format; diff --git a/src/TNL/Meshes/Writers/PVTUWriter.hpp b/src/TNL/Meshes/Writers/PVTUWriter.hpp index 71e19da1de2bcf4e77f22f31473eacd0d57340a9..affee65a289d2465150cb80a41aa396710939e90 100644 --- a/src/TNL/Meshes/Writers/PVTUWriter.hpp +++ b/src/TNL/Meshes/Writers/PVTUWriter.hpp @@ -137,15 +137,14 @@ PVTUWriter< Mesh >::addPiece( const String& mainFileName, } template< typename Mesh > - template< typename Communicator > std::string PVTUWriter< Mesh >::addPiece( const String& mainFileName, - const typename Communicator::CommunicationGroup group ) + const MPI_Comm group ) { std::string source; - for( int i = 0; i < Communicator::GetSize( group ); i++ ) { + for( int i = 0; i < MPI::GetSize( group ); i++ ) { const std::string s = addPiece( mainFileName, i ); - if( i == Communicator::GetRank( group ) ) + if( i == MPI::GetRank( group ) ) source = s; } return source; diff --git a/src/TNL/Meshes/Writers/VTKWriter.h b/src/TNL/Meshes/Writers/VTKWriter.h index e1c5fae9786c670dc43002efb3a65856a9d3b78c..db0c09b1306ce3524d2d088215f29919a0c4740d 100644 --- a/src/TNL/Meshes/Writers/VTKWriter.h +++ b/src/TNL/Meshes/Writers/VTKWriter.h @@ -45,7 +45,7 @@ public: VTKWriter() = delete; VTKWriter( std::ostream& str, VTK::FileFormat format = VTK::FileFormat::binary ) - : str(str), format(format) + : str(str.rdbuf()), format(format) { if( format != VTK::FileFormat::ascii && format != VTK::FileFormat::binary ) throw std::domain_error("The Legacy VTK file formats support only ASCII and BINARY formats."); @@ -78,7 +78,7 @@ protected: void 
writeHeader(); - std::ostream& str; + std::ostream str; VTK::FileFormat format; diff --git a/src/TNL/Meshes/Writers/VTKWriter.hpp b/src/TNL/Meshes/Writers/VTKWriter.hpp index 125366d0334507ff9f50c765417077cc1f253978..801d3bc1926a944396fc19b9765e3d6bfb9ad841 100644 --- a/src/TNL/Meshes/Writers/VTKWriter.hpp +++ b/src/TNL/Meshes/Writers/VTKWriter.hpp @@ -509,7 +509,7 @@ VTKWriter< Mesh >::writeDataArray( const Array& array, // use a host buffer if direct access to the array elements is not possible if( std::is_same< typename Array::DeviceType, Devices::Cuda >::value ) { - using HostArray = typename Array::template Self< typename Array::ValueType, Devices::Host >; + using HostArray = typename Array::template Self< std::remove_const_t< typename Array::ValueType >, Devices::Host, typename Array::IndexType >; HostArray hostBuffer; hostBuffer = array; writeDataArray( hostBuffer, name, numberOfComponents ); diff --git a/src/TNL/Meshes/Writers/VTUWriter.h b/src/TNL/Meshes/Writers/VTUWriter.h index 9f715dce65af314acfa5eca21467c5d7eabcef84..00765cc0d14434379850d59d857a5cde463adfdb 100644 --- a/src/TNL/Meshes/Writers/VTUWriter.h +++ b/src/TNL/Meshes/Writers/VTUWriter.h @@ -44,7 +44,7 @@ public: VTUWriter() = delete; VTUWriter( std::ostream& str, VTK::FileFormat format = VTK::FileFormat::zlib_compressed ) - : str(str), format(format) + : str(str.rdbuf()), format(format) {} // If desired, cycle and time of the simulation can put into the file. 
This follows the instructions at @@ -78,7 +78,7 @@ protected: void writeFooter(); - std::ostream& str; + std::ostream str; VTK::FileFormat format; diff --git a/src/TNL/Meshes/Writers/VTUWriter.hpp b/src/TNL/Meshes/Writers/VTUWriter.hpp index 8d609f0a78a9327885751065f015b607c39f45e0..c8093010d6db57db63675d8adc0fcab002a997b4 100644 --- a/src/TNL/Meshes/Writers/VTUWriter.hpp +++ b/src/TNL/Meshes/Writers/VTUWriter.hpp @@ -83,6 +83,7 @@ template< typename MeshReal, struct MeshEntitiesVTUCollector< Meshes::Grid< 1, MeshReal, Device, MeshIndex >, 1 > { using Mesh = Meshes::Grid< 1, MeshReal, Device, MeshIndex >; + using Entity = typename Mesh::template EntityType< 1 >; static void exec( const Mesh& mesh, std::vector< typename Mesh::GlobalIndexType > & connectivity, @@ -94,7 +95,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 1, MeshReal, Device, MeshIndex >, connectivity.push_back( i ); connectivity.push_back( i+1 ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Line ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } } }; @@ -106,6 +107,7 @@ template< typename MeshReal, struct MeshEntitiesVTUCollector< Meshes::Grid< 1, MeshReal, Device, MeshIndex >, 0 > { using Mesh = Meshes::Grid< 1, MeshReal, Device, MeshIndex >; + using Entity = typename Mesh::template EntityType< 0 >; static void exec( const Mesh& mesh, std::vector< typename Mesh::GlobalIndexType > & connectivity, @@ -116,7 +118,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 1, MeshReal, Device, MeshIndex >, { connectivity.push_back( i ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Vertex ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } } }; @@ -128,6 +130,7 @@ template< typename MeshReal, struct MeshEntitiesVTUCollector< Meshes::Grid< 2, MeshReal, Device, MeshIndex >, 2 > { using Mesh = Meshes::Grid< 2, MeshReal, Device, MeshIndex >; + using Entity = 
typename Mesh::template EntityType< 2 >; static void exec( const Mesh& mesh, std::vector< typename Mesh::GlobalIndexType > & connectivity, @@ -142,7 +145,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 2, MeshReal, Device, MeshIndex >, connectivity.push_back( (j+1) * ( mesh.getDimensions().x() + 1 ) + i ); connectivity.push_back( (j+1) * ( mesh.getDimensions().x() + 1 ) + i + 1 ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Pixel ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } } }; @@ -154,6 +157,7 @@ template< typename MeshReal, struct MeshEntitiesVTUCollector< Meshes::Grid< 2, MeshReal, Device, MeshIndex >, 1 > { using Mesh = Meshes::Grid< 2, MeshReal, Device, MeshIndex >; + using Entity = typename Mesh::template EntityType< 1 >; static void exec( const Mesh& mesh, std::vector< typename Mesh::GlobalIndexType > & connectivity, @@ -161,21 +165,21 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 2, MeshReal, Device, MeshIndex >, std::vector< std::uint8_t > & types ) { for( MeshIndex j = 0; j < mesh.getDimensions().y(); j++ ) - for( MeshIndex i = 0; i < ( mesh.getDimensions().x() + 1 ); i++ ) + for( MeshIndex i = 0; i < (mesh.getDimensions().x() + 1); i++ ) { connectivity.push_back( j * ( mesh.getDimensions().x() + 1 ) + i ); connectivity.push_back( (j+1) * ( mesh.getDimensions().x() + 1 ) + i ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Line ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } - for( MeshIndex j = 0; j < (mesh.getDimensions().y()+1); j++ ) + for( MeshIndex j = 0; j < (mesh.getDimensions().y() + 1); j++ ) for( MeshIndex i = 0; i < mesh.getDimensions().x(); i++ ) { connectivity.push_back( j * ( mesh.getDimensions().x() + 1 ) + i ); connectivity.push_back( j * ( mesh.getDimensions().x() + 1 ) + i + 1 ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) 
VTK::EntityShape::Line ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } } }; @@ -187,6 +191,7 @@ template< typename MeshReal, struct MeshEntitiesVTUCollector< Meshes::Grid< 2, MeshReal, Device, MeshIndex >, 0 > { using Mesh = Meshes::Grid< 2, MeshReal, Device, MeshIndex >; + using Entity = typename Mesh::template EntityType< 0 >; static void exec( const Mesh& mesh, std::vector< typename Mesh::GlobalIndexType > & connectivity, @@ -198,7 +203,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 2, MeshReal, Device, MeshIndex >, { connectivity.push_back( j * mesh.getDimensions().x() + i ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Vertex ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } } }; @@ -210,6 +215,7 @@ template< typename MeshReal, struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 3 > { using Mesh = Meshes::Grid< 3, MeshReal, Device, MeshIndex >; + using Entity = typename Mesh::template EntityType< 3 >; static void exec( const Mesh& mesh, std::vector< typename Mesh::GlobalIndexType > & connectivity, @@ -229,7 +235,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + (j+1) * ( mesh.getDimensions().x() + 1 ) + i ); connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + (j+1) * ( mesh.getDimensions().x() + 1 ) + i + 1 ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Voxel ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } } }; @@ -241,6 +247,7 @@ template< typename MeshReal, struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 2 > { using Mesh = Meshes::Grid< 3, MeshReal, Device, MeshIndex >; + using Entity = typename Mesh::template 
EntityType< 2 >; static void exec( const Mesh& mesh, std::vector< typename Mesh::GlobalIndexType > & connectivity, @@ -256,7 +263,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i ); connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + (j+1) * ( mesh.getDimensions().x() + 1 ) + i ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Pixel ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } for( MeshIndex k = 0; k < mesh.getDimensions().z(); k++ ) @@ -268,7 +275,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i ); connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i + 1 ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Pixel ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } for( MeshIndex k = 0; k <= mesh.getDimensions().z(); k++ ) @@ -280,7 +287,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + (j+1) * ( mesh.getDimensions().x() + 1 ) + i ); connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + (j+1) * ( mesh.getDimensions().x() + 1 ) + i + 1 ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Pixel ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } } }; @@ -292,6 +299,7 @@ template< 
typename MeshReal, struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 1 > { using Mesh = Meshes::Grid< 3, MeshReal, Device, MeshIndex >; + using Entity = typename Mesh::template EntityType< 1 >; static void exec( const Mesh& mesh, std::vector< typename Mesh::GlobalIndexType > & connectivity, @@ -305,7 +313,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i ); connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i + 1 ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Line ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } for( MeshIndex k = 0; k <= mesh.getDimensions().z(); k++ ) @@ -315,7 +323,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i ); connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + (j+1) * ( mesh.getDimensions().x() + 1 ) + i ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Line ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } for( MeshIndex k = 0; k < mesh.getDimensions().z(); k++ ) @@ -325,7 +333,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i ); connectivity.push_back( (k+1) * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i ); offsets.push_back( 
connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Line ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } } }; @@ -337,6 +345,7 @@ template< typename MeshReal, struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, 0 > { using Mesh = Meshes::Grid< 3, MeshReal, Device, MeshIndex >; + using Entity = typename Mesh::template EntityType< 0 >; static void exec( const Mesh& mesh, std::vector< typename Mesh::GlobalIndexType > & connectivity, @@ -349,7 +358,7 @@ struct MeshEntitiesVTUCollector< Meshes::Grid< 3, MeshReal, Device, MeshIndex >, { connectivity.push_back( k * ( mesh.getDimensions().y() + 1 ) * ( mesh.getDimensions().x() + 1 ) + j * ( mesh.getDimensions().x() + 1 ) + i ); offsets.push_back( connectivity.size() ); - types.push_back( (std::uint8_t) VTK::EntityShape::Vertex ); + types.push_back( (std::uint8_t) VTK::GridEntityShape< Entity >::shape ); } } }; @@ -459,7 +468,7 @@ VTUWriter< Mesh >::writeDataArray( const Array& array, // use a host buffer if direct access to the array elements is not possible if( std::is_same< typename Array::DeviceType, Devices::Cuda >::value ) { - using HostArray = typename Array::template Self< typename Array::ValueType, Devices::Host >; + using HostArray = typename Array::template Self< std::remove_const_t< typename Array::ValueType >, Devices::Host, typename Array::IndexType >; HostArray hostBuffer; hostBuffer = array; writeDataArray( hostBuffer, name, numberOfComponents ); diff --git a/src/TNL/Problems/HeatEquationProblem_impl.h b/src/TNL/Problems/HeatEquationProblem_impl.h index 1da61c51ef80c77b133ae9060ef410d1a4a00949..131697afb38443a8c5bbc2206c30a3953a578d27 100644 --- a/src/TNL/Problems/HeatEquationProblem_impl.h +++ b/src/TNL/Problems/HeatEquationProblem_impl.h @@ -146,7 +146,7 @@ setInitialCondition( const Config::ParameterContainer& parameters, if(distributedIOType==Meshes::DistributedMeshes::LocalCopy) 
Meshes::DistributedMeshes::DistributedGridIO ::load(initialConditionFile, *uPointer ); synchronizer.setDistributedGrid( uPointer->getMesh().getDistributedMesh() ); - synchronizer.template synchronize( *uPointer ); + synchronizer.synchronize( *uPointer ); } else { @@ -173,7 +173,7 @@ template< typename Mesh, typename RightHandSide, typename Communicator, typename DifferentialOperator > - template< typename MatrixPointer > + template< typename MatrixPointer > bool HeatEquationProblem< Mesh, BoundaryCondition, RightHandSide, Communicator, DifferentialOperator >:: setupLinearSystem( MatrixPointer& matrixPointer ) @@ -247,7 +247,7 @@ getExplicitUpdate( const RealType& time, * * You may use supporting vectors again if you need. */ - + this->bindDofs( uDofs ); this->fuPointer->bind( this->getMesh(), *fuDofs ); this->explicitUpdater.template update< typename Mesh::Cell, Communicator >( time, tau, this->getMesh(), this->uPointer, this->fuPointer ); @@ -258,7 +258,7 @@ template< typename Mesh, typename RightHandSide, typename Communicator, typename DifferentialOperator > -void +void HeatEquationProblem< Mesh, BoundaryCondition, RightHandSide, Communicator, DifferentialOperator >:: applyBoundaryConditions( const RealType& time, DofVectorPointer& uDofs ) @@ -272,7 +272,7 @@ template< typename Mesh, typename RightHandSide, typename Communicator, typename DifferentialOperator > - template< typename MatrixPointer > + template< typename MatrixPointer > void HeatEquationProblem< Mesh, BoundaryCondition, RightHandSide, Communicator, DifferentialOperator >:: assemblyLinearSystem( const RealType& time, @@ -282,7 +282,7 @@ assemblyLinearSystem( const RealType& time, DofVectorPointer& bPointer ) { this->bindDofs( dofsPointer ); - this->systemAssembler.template assembly< typename Mesh::Cell, typename MatrixPointer::ObjectType >( + this->systemAssembler.template assembly< typename Mesh::Cell, typename MatrixPointer::ObjectType >( time, tau, this->getMesh(), diff --git 
a/src/TNL/Problems/PDEProblem_impl.h b/src/TNL/Problems/PDEProblem_impl.h index 6a3aa63e6d82bce68b9f549b413d275504f137aa..f42f18b165887c4ad006bad75e5ac0bdc3beea98 100644 --- a/src/TNL/Problems/PDEProblem_impl.h +++ b/src/TNL/Problems/PDEProblem_impl.h @@ -59,7 +59,7 @@ template< typename Mesh, typename PDEProblem< Mesh, Communicator, Real, Device, Index >::IndexType PDEProblem< Mesh, Communicator, Real, Device, Index >:: subdomainOverlapSize() -{ +{ return 1; } @@ -77,9 +77,9 @@ getSubdomainOverlaps( const Config::ParameterContainer& parameters, SubdomainOverlapsType& upper ) { using namespace Meshes::DistributedMeshes; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( mesh.getDistributedMesh(), lower, upper, this->subdomainOverlapSize() ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( mesh.getDistributedMesh(), lower, upper, this->subdomainOverlapSize() ); } - + template< typename Mesh, typename Communicator, typename Real, diff --git a/src/TNL/Solvers/Linear/BICGStab.h b/src/TNL/Solvers/Linear/BICGStab.h index 2cede824ad00c4ea8b4cb2f270d86882f5bfcfe3..474a45d023a579095bf22db383cdbee4feb43294 100644 --- a/src/TNL/Solvers/Linear/BICGStab.h +++ b/src/TNL/Solvers/Linear/BICGStab.h @@ -37,6 +37,10 @@ public: bool solve( ConstVectorViewType b, VectorViewType x ) override; protected: + void compute_residue( VectorViewType r, ConstVectorViewType x, ConstVectorViewType b ); + + void preconditioned_matvec( ConstVectorViewType src, VectorViewType dst ); + void setSize( const VectorViewType& x ); bool exact_residue = false; diff --git a/src/TNL/Solvers/Linear/BICGStab_impl.h b/src/TNL/Solvers/Linear/BICGStab_impl.h index baa4b6363e712ec4156e7a4bc79bc6e32bcc031c..ff3b42ed0c7b7cc65527481f8edd2990b94b639d 100644 --- a/src/TNL/Solvers/Linear/BICGStab_impl.h +++ b/src/TNL/Solvers/Linear/BICGStab_impl.h @@ -38,111 +38,80 @@ setup( const Config::ParameterContainer& parameters, } template< typename Matrix > -bool BICGStab< Matrix >::solve( 
ConstVectorViewType b, VectorViewType x ) +bool +BICGStab< Matrix >:: +solve( ConstVectorViewType b, VectorViewType x ) { this->setSize( x ); - RealType alpha, beta, omega, aux, rho, rho_old, b_norm; + RealType alpha, beta, omega, rho, rho_old, b_norm, r_ast_sqnorm; + // initialize the norm of the preconditioned right-hand-side if( this->preconditioner ) { this->preconditioner->solve( b, M_tmp ); b_norm = lpNorm( M_tmp, 2.0 ); - - this->matrix->vectorProduct( x, M_tmp ); - M_tmp = b - M_tmp; - this->preconditioner->solve( M_tmp, r ); } - else { + else b_norm = lpNorm( b, 2.0 ); - this->matrix->vectorProduct( x, r ); - r = b - r; - } + if( b_norm == 0.0 ) + b_norm = 1.0; + + // r = M.solve(b - A * x); + compute_residue( r, x, b ); p = r_ast = r; s.setValue( 0.0 ); - rho = (r, r_ast); + r_ast_sqnorm = rho = (r, r_ast); - if( b_norm == 0.0 ) - b_norm = 1.0; + const RealType eps2 = std::numeric_limits::epsilon() * std::numeric_limits::epsilon(); this->resetIterations(); this->setResidue( std::sqrt( rho ) / b_norm ); while( this->nextIteration() ) { - /**** - * alpha_j = ( r_j, r^ast_0 ) / ( A * p_j, r^ast_0 ) - */ - if( this->preconditioner ) { - this->matrix->vectorProduct( p, M_tmp ); - this->preconditioner->solve( M_tmp, Ap ); - } - else { - this->matrix->vectorProduct( p, Ap ); - } - aux = (Ap, r_ast); - alpha = rho / aux; + // alpha_j = ( r_j, r^ast_0 ) / ( A * p_j, r^ast_0 ) + preconditioned_matvec( p, Ap ); + alpha = rho / (Ap, r_ast); - /**** - * s_j = r_j - alpha_j * A p_j - */ + // s_j = r_j - alpha_j * A p_j s = r - alpha * Ap; - /**** - * omega_j = ( A s_j, s_j ) / ( A s_j, A s_j ) - */ - if( this->preconditioner ) { - this->matrix->vectorProduct( s, M_tmp ); - this->preconditioner->solve( M_tmp, As ); - } - else { - this->matrix->vectorProduct( s, As ); - } - aux = lpNorm( As, 2.0 ); - omega = (As, s) / (aux * aux); + // omega_j = ( A s_j, s_j ) / ( A s_j, A s_j ) + preconditioned_matvec( s, As ); + omega = (As, s) / (As, As); - /**** - * x_{j+1} = x_j + 
alpha_j * p_j + omega_j * s_j - */ + // x_{j+1} = x_j + alpha_j * p_j + omega_j * s_j x += alpha * p + omega * s; - /**** - * r_{j+1} = s_j - omega_j * A s_j - */ + // r_{j+1} = s_j - omega_j * A s_j r = s - omega * As; - /**** - * beta = alpha_j / omega_j * ( r_{j+1}, r^ast_0 ) / ( r_j, r^ast_0 ) - */ + // compute scalar product of the residual vectors rho_old = rho; rho = (r, r_ast); + if( abs(rho) < eps2 * r_ast_sqnorm ) { + // The new residual vector has become too orthogonal to the arbitrarily chosen direction r_ast. + // Let's restart with a new r0: + compute_residue( r, x, b ); + r_ast = r; + r_ast_sqnorm = rho = (r, r_ast); + } + + // beta = alpha_j / omega_j * ( r_{j+1}, r^ast_0 ) / ( r_j, r^ast_0 ) beta = (rho / rho_old) * (alpha / omega); - /**** - * p_{j+1} = r_{j+1} + beta_j * ( p_j - omega_j * A p_j ) - */ + // p_{j+1} = r_{j+1} + beta_j * ( p_j - omega_j * A p_j ) p = r + beta * p - (beta * omega) * Ap; if( exact_residue ) { - /**** - * Compute the exact preconditioned residue into the 's' vector. - */ - if( this->preconditioner ) { - this->matrix->vectorProduct( x, M_tmp ); - M_tmp = b - M_tmp; - this->preconditioner->solve( M_tmp, s ); - } - else { - this->matrix->vectorProduct( x, s ); - s = b - s; - } + // Compute the exact preconditioned residue into the 's' vector. + compute_residue( s, x, b ); const RealType residue = lpNorm( s, 2.0 ); this->setResidue( residue / b_norm ); } else { - /**** - * Use the "orthogonal residue vector" for stopping. - */ + // Use the "orthogonal residue vector" for stopping. 
const RealType residue = lpNorm( r, 2.0 ); this->setResidue( residue / b_norm ); } @@ -153,7 +122,40 @@ bool BICGStab< Matrix >::solve( ConstVectorViewType b, VectorViewType x ) } template< typename Matrix > -void BICGStab< Matrix > :: setSize( const VectorViewType& x ) +void +BICGStab< Matrix >:: +compute_residue( VectorViewType r, ConstVectorViewType x, ConstVectorViewType b ) +{ + // r = M.solve(b - A * x); + if( this->preconditioner ) { + this->matrix->vectorProduct( x, M_tmp ); + M_tmp = b - M_tmp; + this->preconditioner->solve( M_tmp, r ); + } + else { + this->matrix->vectorProduct( x, r ); + r = b - r; + } +} + +template< typename Matrix > +void +BICGStab< Matrix >:: +preconditioned_matvec( ConstVectorViewType src, VectorViewType dst ) +{ + if( this->preconditioner ) { + this->matrix->vectorProduct( src, M_tmp ); + this->preconditioner->solve( M_tmp, dst ); + } + else { + this->matrix->vectorProduct( src, dst ); + } +} + +template< typename Matrix > +void +BICGStab< Matrix >:: +setSize( const VectorViewType& x ) { r.setLike( x ); r_ast.setLike( x ); diff --git a/src/TNL/Solvers/Linear/GMRES.h b/src/TNL/Solvers/Linear/GMRES.h index e1c02f0ab5eadbaea7ba2c2678641b0c1a05ee6b..818f1c163019a3f83347c2f8a0ca0ce1c518a667 100644 --- a/src/TNL/Solvers/Linear/GMRES.h +++ b/src/TNL/Solvers/Linear/GMRES.h @@ -23,10 +23,7 @@ class GMRES : public LinearSolver< Matrix > { using Base = LinearSolver< Matrix >; - - // compatibility shortcuts using Traits = Linear::Traits< Matrix >; - using CommunicatorType = typename Traits::CommunicatorType; public: using RealType = typename Base::RealType; diff --git a/src/TNL/Solvers/Linear/GMRES_impl.h b/src/TNL/Solvers/Linear/GMRES_impl.h index 02a122a5dd178cb7100edd52210004dccddf2626..3b13e0b28ed4c6f17bea1b9c582fbdcfd6edc961 100644 --- a/src/TNL/Solvers/Linear/GMRES_impl.h +++ b/src/TNL/Solvers/Linear/GMRES_impl.h @@ -477,20 +477,20 @@ hauseholder_generate( const int i, ConstVectorViewType z ) { // XXX: the upper-right triangle of Y will 
be full of zeros, which can be exploited for optimization + ConstDeviceView z_local = Traits::getConstLocalView( z ); + DeviceView y_i_local = Traits::getLocalView( y_i ); if( localOffset == 0 ) { TNL_ASSERT_LT( i, size, "upper-right triangle of Y is not on rank 0" ); auto kernel_truncation = [=] __cuda_callable__ ( IndexType j ) mutable { if( j < i ) - y_i[ j ] = 0.0; + y_i_local[ j ] = 0.0; else - y_i[ j ] = z[ j ]; + y_i_local[ j ] = z_local[ j ]; }; Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, size, kernel_truncation ); } else { - ConstDeviceView z_local = Traits::getConstLocalView( z ); - DeviceView y_i_local = Traits::getLocalView( y_i ); y_i_local = z_local; } @@ -510,7 +510,7 @@ hauseholder_generate( const int i, norm_yi_squared = 2 * (normz * normz + std::fabs( y_ii ) * normz); } // no-op if the problem is not distributed - CommunicatorType::Bcast( &norm_yi_squared, 1, 0, Traits::getCommunicationGroup( *this->matrix ) ); + MPI::Bcast( &norm_yi_squared, 1, 0, Traits::getCommunicationGroup( *this->matrix ) ); // XXX: normalization is slower, but more stable // y_i *= 1.0 / std::sqrt( norm_yi_squared ); @@ -534,7 +534,7 @@ hauseholder_generate( const int i, i, aux ); // no-op if the problem is not distributed - CommunicatorType::Allreduce( aux, i, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) ); + MPI::Allreduce( aux, i, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) ); // [T_i]_{0..i-1} = - T_{i-1} * t_i * aux for( int k = 0; k < i; k++ ) { @@ -559,7 +559,7 @@ hauseholder_apply_trunc( HostView out, HostView YL_i( &YL[ i * (restarting_max + 1) ], restarting_max + 1 ); Algorithms::MultiDeviceMemoryOperations< Devices::Host, DeviceType >::copy( YL_i.getData(), Traits::getLocalView( y_i ).getData(), YL_i.getSize() ); // no-op if the problem is not distributed - CommunicatorType::Bcast( YL_i.getData(), YL_i.getSize(), 0, Traits::getCommunicationGroup( *this->matrix ) ); + MPI::Bcast( YL_i.getData(), YL_i.getSize(), 0, 
Traits::getCommunicationGroup( *this->matrix ) ); // NOTE: aux = t_i * (y_i, z) = 1 since t_i = 2 / ||y_i||^2 and // (y_i, z) = ||z_trunc||^2 + |z_i| ||z_trunc|| = ||y_i||^2 / 2 @@ -579,7 +579,7 @@ hauseholder_apply_trunc( HostView out, } // no-op if the problem is not distributed - CommunicatorType::Bcast( out.getData(), i + 1, 0, Traits::getCommunicationGroup( *this->matrix ) ); + MPI::Bcast( out.getData(), i + 1, 0, Traits::getCommunicationGroup( *this->matrix ) ); } template< typename Matrix > @@ -634,7 +634,7 @@ hauseholder_cwy_transposed( VectorViewType z, i + 1, aux ); // no-op if the problem is not distributed - Traits::CommunicatorType::Allreduce( aux, i + 1, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) ); + MPI::Allreduce( aux, i + 1, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) ); // aux = T_i^T * aux // Note that T_i^T is lower triangular, so we can overwrite the aux vector with the result in place diff --git a/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h b/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h index f88e315ccf734a12ec20e53fb930016aa0330b36..7c03dd7ce6bf073a1a675798eb19f677d8339520 100644 --- a/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h +++ b/src/TNL/Solvers/Linear/Preconditioners/Diagonal.h @@ -42,12 +42,12 @@ protected: VectorType diagonal; }; -template< typename Matrix, typename Communicator > -class Diagonal< Matrices::DistributedMatrix< Matrix, Communicator > > -: public Preconditioner< Matrices::DistributedMatrix< Matrix, Communicator > > +template< typename Matrix > +class Diagonal< Matrices::DistributedMatrix< Matrix > > +: public Preconditioner< Matrices::DistributedMatrix< Matrix > > { public: - using MatrixType = Matrices::DistributedMatrix< Matrix, Communicator >; + using MatrixType = Matrices::DistributedMatrix< Matrix >; using RealType = typename MatrixType::RealType; using DeviceType = typename MatrixType::DeviceType; using IndexType = typename MatrixType::IndexType; diff --git 
a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h index 788fc228d0d226db0e53e507c331859483ab7f69..17746373a338fc885d2ab46c9743448811fac857 100644 --- a/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h +++ b/src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h @@ -49,57 +49,58 @@ void Diagonal< Matrix >:: solve( ConstVectorViewType b, VectorViewType x ) const { - ConstVectorViewType diag_view( diagonal ); - - auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable - { - x[ i ] = b[ i ] / diag_view[ i ]; - }; - - Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel ); + x = b / diagonal; } -template< typename Matrix, typename Communicator > +template< typename Matrix > void -Diagonal< Matrices::DistributedMatrix< Matrix, Communicator > >:: +Diagonal< Matrices::DistributedMatrix< Matrix > >:: update( const MatrixPointer& matrixPointer ) { TNL_ASSERT_GT( matrixPointer->getRows(), 0, "empty matrix" ); - TNL_ASSERT_EQ( matrixPointer->getRows(), matrixPointer->getColumns(), "matrix must be square" ); - diagonal.setSize( matrixPointer->getLocalMatrix().getRows() ); LocalViewType diag_view( diagonal ); - const MatrixType* kernel_matrix = &matrixPointer.template getData< DeviceType >(); - - auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable - { - const IndexType gi = kernel_matrix->getLocalRowRange().getGlobalIndex( i ); - diag_view[ i ] = kernel_matrix->getLocalMatrix().getElement( i, gi ); - }; - - Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel ); + // FIXME: SparseMatrix::getConstView is broken +// const auto matrix_view = matrixPointer->getLocalMatrix().getConstView(); + const auto matrix_view = matrixPointer->getLocalMatrix().getView(); + + if( matrixPointer->getRows() == matrixPointer->getColumns() ) { + // square matrix, assume global column indices + const auto row_range = matrixPointer->getLocalRowRange(); + 
auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable + { + const IndexType gi = row_range.getGlobalIndex( i ); + diag_view[ i ] = matrix_view.getElement( i, gi ); + }; + Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel ); + } + else { + // non-square matrix, assume ghost indexing + TNL_ASSERT_LT( matrixPointer->getLocalMatrix().getRows(), matrixPointer->getLocalMatrix().getColumns(), "the local matrix should have more columns than rows" ); + auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable + { + diag_view[ i ] = matrix_view.getElement( i, i ); + }; + Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel ); + } } -template< typename Matrix, typename Communicator > +template< typename Matrix > void -Diagonal< Matrices::DistributedMatrix< Matrix, Communicator > >:: +Diagonal< Matrices::DistributedMatrix< Matrix > >:: solve( ConstVectorViewType b, VectorViewType x ) const { ConstLocalViewType diag_view( diagonal ); const auto b_view = b.getConstLocalView(); auto x_view = x.getLocalView(); - TNL_ASSERT_EQ( b_view.getSize(), diagonal.getSize(), "The size of the vector b does not match the size of the extracted diagonal." ); - TNL_ASSERT_EQ( x_view.getSize(), diagonal.getSize(), "The size of the vector x does not match the size of the extracted diagonal." 
); + // compute without ghosts (diagonal includes only local rows) + x_view = b_view / diag_view; - auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable - { - x_view[ i ] = b_view[ i ] / diag_view[ i ]; - }; - - Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel ); + // synchronize ghosts + x.startSynchronization(); } } // namespace Preconditioners diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h index c4b409bb3a047e12410066bddbe9bdcce509ee89..a4eb9e8aae26786412fe8945a9ccf2795f6293fa 100644 --- a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h +++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h @@ -90,7 +90,12 @@ protected: template< typename M > static IndexType getMinColumn( const Matrices::DistributedMatrix< M >& m ) { - return m.getLocalRowRange().getBegin(); + if( m.getRows() == m.getColumns() ) + // square matrix, assume global column indices + return m.getLocalRowRange().getBegin(); + else + // non-square matrix, assume ghost indexing + return 0; } }; @@ -189,11 +194,11 @@ protected: #endif }; -template< typename Matrix, typename Communicator > -class ILU0_impl< Matrices::DistributedMatrix< Matrix, Communicator >, double, Devices::Cuda, int > -: public Preconditioner< Matrices::DistributedMatrix< Matrix, Communicator > > +template< typename Matrix > +class ILU0_impl< Matrices::DistributedMatrix< Matrix >, double, Devices::Cuda, int > +: public Preconditioner< Matrices::DistributedMatrix< Matrix > > { - using MatrixType = Matrices::DistributedMatrix< Matrix, Communicator >; + using MatrixType = Matrices::DistributedMatrix< Matrix >; public: using RealType = double; using DeviceType = Devices::Cuda; diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h index c11909c073e9ec10b5310bebd2c4a20bbfbee5dc..f68a93f16c21c2a96ce1ed55132f021dc573b068 100644 --- 
a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h +++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h @@ -145,6 +145,9 @@ solve( ConstVectorViewType _b, VectorViewType _x ) const // Step 2: solve x from Ux = y triangularSolveUpper< true, true >( U, x, x ); + + // synchronize ghosts + Traits< Matrix >::startSynchronization( _x ); } diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILUT.h b/src/TNL/Solvers/Linear/Preconditioners/ILUT.h index d46f3f900f4357dd7bf15dce170e5d63ecf22497..344daf1a0103a0a93ca576358b2da787d7578f8b 100644 --- a/src/TNL/Solvers/Linear/Preconditioners/ILUT.h +++ b/src/TNL/Solvers/Linear/Preconditioners/ILUT.h @@ -79,7 +79,12 @@ protected: template< typename M > static IndexType getMinColumn( const Matrices::DistributedMatrix< M >& m ) { - return m.getLocalRowRange().getBegin(); + if( m.getRows() == m.getColumns() ) + // square matrix, assume global column indices + return m.getLocalRowRange().getBegin(); + else + // non-square matrix, assume ghost indexing + return 0; } }; diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h b/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h index c9c2a0b7768d1a0da4ba11460a2e0fa0c67eb068..21b895c48a2074b78b54d3eea11a301549f1afa6 100644 --- a/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h +++ b/src/TNL/Solvers/Linear/Preconditioners/ILUT_impl.h @@ -272,6 +272,9 @@ solve( ConstVectorViewType _b, VectorViewType _x ) const // Step 2: solve x from Ux = y triangularSolveUpper< true, false >( U, x, x ); + + // synchronize ghosts + Traits< Matrix >::startSynchronization( _x ); } } // namespace Preconditioners diff --git a/src/TNL/Solvers/Linear/Traits.h b/src/TNL/Solvers/Linear/Traits.h index 9a5db2c40297aeba9713482603fc8fcb15d6793e..d98b78294cd148584a51fa037a5763df8c9ebc3e 100644 --- a/src/TNL/Solvers/Linear/Traits.h +++ b/src/TNL/Solvers/Linear/Traits.h @@ -12,7 +12,7 @@ #pragma once -#include +#include #include #include #include @@ -26,8 +26,6 @@ namespace Linear { template< typename 
Matrix > struct Traits { - using CommunicatorType = Communicators::NoDistrCommunicator; - using VectorType = Containers::Vector < typename Matrix::RealType, typename Matrix::DeviceType, @@ -51,29 +49,26 @@ struct Traits static ConstLocalViewType getConstLocalView( ConstVectorViewType v ) { return v; } static LocalViewType getLocalView( VectorViewType v ) { return v; } - static typename CommunicatorType::CommunicationGroup getCommunicationGroup( const Matrix& m ) { return CommunicatorType::AllGroup; } + static MPI_Comm getCommunicationGroup( const Matrix& m ) { return MPI::AllGroup(); } + static void startSynchronization( VectorViewType v ) {} + static void waitForSynchronization( VectorViewType v ) {} }; -template< typename Matrix, typename Communicator > -struct Traits< Matrices::DistributedMatrix< Matrix, Communicator > > +template< typename Matrix > +struct Traits< Matrices::DistributedMatrix< Matrix > > { - using CommunicatorType = Communicator; - using VectorType = Containers::DistributedVector < typename Matrix::RealType, typename Matrix::DeviceType, - typename Matrix::IndexType, - Communicator >; + typename Matrix::IndexType >; using VectorViewType = Containers::DistributedVectorView < typename Matrix::RealType, typename Matrix::DeviceType, - typename Matrix::IndexType, - Communicator >; + typename Matrix::IndexType >; using ConstVectorViewType = Containers::DistributedVectorView < std::add_const_t< typename Matrix::RealType >, typename Matrix::DeviceType, - typename Matrix::IndexType, - Communicator >; + typename Matrix::IndexType >; using LocalVectorType = Containers::Vector < typename Matrix::RealType, @@ -89,12 +84,13 @@ struct Traits< Matrices::DistributedMatrix< Matrix, Communicator > > typename Matrix::IndexType >; // compatibility wrappers for some DistributedMatrix methods - static const Matrix& getLocalMatrix( const Matrices::DistributedMatrix< Matrix, Communicator >& m ) - { return m.getLocalMatrix(); } + static const Matrix& getLocalMatrix( const 
Matrices::DistributedMatrix< Matrix >& m ) { return m.getLocalMatrix(); } static ConstLocalViewType getConstLocalView( ConstVectorViewType v ) { return v.getConstLocalView(); } static LocalViewType getLocalView( VectorViewType v ) { return v.getLocalView(); } - static typename CommunicatorType::CommunicationGroup getCommunicationGroup( const Matrices::DistributedMatrix< Matrix, Communicator >& m ) { return m.getCommunicationGroup(); } + static MPI_Comm getCommunicationGroup( const Matrices::DistributedMatrix< Matrix >& m ) { return m.getCommunicationGroup(); } + static void startSynchronization( VectorViewType v ) { v.startSynchronization(); } + static void waitForSynchronization( VectorViewType v ) { v.waitForSynchronization(); } }; } // namespace Linear diff --git a/src/TNL/Solvers/ODE/Merson_impl.h b/src/TNL/Solvers/ODE/Merson_impl.h index 4c7b21bc93c5bcfb5adff76e89d876778f1049aa..247318f330d6154a419aa235f49cb74d22afdba5 100644 --- a/src/TNL/Solvers/ODE/Merson_impl.h +++ b/src/TNL/Solvers/ODE/Merson_impl.h @@ -13,14 +13,13 @@ #include #include #include -#include -#include +#include #include "Merson.h" namespace TNL { namespace Solvers { -namespace ODE { +namespace ODE { /**** * In this code we do not use constants and references as we would like to. 
@@ -155,9 +154,9 @@ bool Merson< Problem, SolverMonitor >::solve( DofVectorPointer& _u ) RealType error( 0.0 ); if( adaptivity != 0.0 ) { - const RealType localError = + const RealType localError = max( currentTau / 3.0 * abs( 0.2 * k1 -0.9 * k3 + 0.8 * k4 -0.1 * k5 ) ); - Problem::CommunicatorType::Allreduce( &localError, &error, 1, MPI_MAX, Problem::CommunicatorType::AllGroup ); + MPI::Allreduce( &localError, &error, 1, MPI_MAX, MPI::AllGroup() ); } if( adaptivity == 0.0 || error < adaptivity ) @@ -186,7 +185,7 @@ bool Merson< Problem, SolverMonitor >::solve( DofVectorPointer& _u ) currentTau = min( currentTau, this->getMaxTau() ); #ifdef USE_MPI TNLMPI::Bcast( currentTau, 1, 0 ); -#endif +#endif } if( time + currentTau > this->getStopTime() ) currentTau = this->getStopTime() - time; //we don't want to keep such tau diff --git a/src/TNL/Solvers/PDE/PDESolver.h b/src/TNL/Solvers/PDE/PDESolver.h index b9bbcd5e2e3ba89d0a424611ea3547008f5f9632..70f19d8de9e58fcb8ae3c141eeecd7ea95e3e862 100644 --- a/src/TNL/Solvers/PDE/PDESolver.h +++ b/src/TNL/Solvers/PDE/PDESolver.h @@ -18,8 +18,8 @@ namespace TNL { namespace Solvers { -namespace PDE { - +namespace PDE { + template< typename Real, typename Index > class PDESolver @@ -28,8 +28,8 @@ class PDESolver using RealType = Real; using IndexType = Index; using SolverMonitorType = IterativeSolverMonitor< RealType, IndexType >; - - + + PDESolver(); static void configSetup( Config::ConfigDescription& config, @@ -38,29 +38,28 @@ class PDESolver bool setup( const Config::ParameterContainer& parameters, const String& prefix = "" ); - template< typename Communicator > bool writeProlog( Logger& logger, const Config::ParameterContainer& parameters ); - + void setIoTimer( Timer& ioTimer); void setComputeTimer( Timer& computeTimer ); - + void setTotalTimer( Timer& totalTimer ); - + void setSolverMonitor( SolverMonitorType& solverMonitor ); - + SolverMonitorType& getSolverMonitor(); - bool writeEpilog( Logger& logger ) const; - + bool 
writeEpilog( Logger& logger ) const; + protected: Timer *ioTimer, *computeTimer, *totalTimer; - + SolverMonitorType *solverMonitorPointer; }; - + } // namespace PDE } // namespace Solvers } // namespace TNL diff --git a/src/TNL/Solvers/PDE/PDESolver_impl.h b/src/TNL/Solvers/PDE/PDESolver_impl.h index 37ade9f38e74427903b1c624a383544179839bed..8bdcbd86ab3905dc08227370580f034d53abc612 100644 --- a/src/TNL/Solvers/PDE/PDESolver_impl.h +++ b/src/TNL/Solvers/PDE/PDESolver_impl.h @@ -11,21 +11,22 @@ #pragma once #include +#include namespace TNL { namespace Solvers { -namespace PDE { +namespace PDE { template< typename Real, - typename Index > -PDESolver< Real, Index >::PDESolver() + typename Index > +PDESolver< Real, Index >::PDESolver() : ioTimer( 0 ), computeTimer( 0 ), totalTimer( 0 ), solverMonitorPointer( 0 ) { } - + template< typename Real, typename Index > void @@ -65,7 +66,6 @@ getSolverMonitor() template< typename Real, typename Index > - template< typename Communicator > bool PDESolver< Real, Index >:: writeProlog( Logger& logger, @@ -84,7 +84,8 @@ writeProlog( Logger& logger, else logger.writeParameter< String >( "OMP enabled:", "no", 1 ); } - Communicator::writeProlog( logger ); + if( MPI::isInitialized() ) + logger.writeParameter( "MPI processes:", MPI::GetSize() ); logger.writeSeparator(); const bool printGPUs = parameters.getParameter< String >( "device" ) == "cuda"; logger.writeSystemInformation( printGPUs ); @@ -116,9 +117,9 @@ void PDESolver< Real, Index >:: setTotalTimer( Timer& totalTimer ) { this->totalTimer = &totalTimer; -} - +} + } // namespace PDE } // namespace Solvers } // namespace TNL - + diff --git a/src/TNL/Solvers/PDE/TimeDependentPDESolver_impl.h b/src/TNL/Solvers/PDE/TimeDependentPDESolver_impl.h index 46ffa6fea83ac96ab8e987da149516ac0f6f7213..34f2798f8d6ff70196ed0c7e375eab63f371eb19 100644 --- a/src/TNL/Solvers/PDE/TimeDependentPDESolver_impl.h +++ b/src/TNL/Solvers/PDE/TimeDependentPDESolver_impl.h @@ -63,7 +63,7 @@ setup( const 
Config::ParameterContainer& parameters, const String& meshFileFormat = parameters.getParameter< String >( "mesh-format" ); this->distributedMesh.setup( parameters, prefix ); if( Problem::CommunicatorType::isDistributed() ) { - if( ! Meshes::loadDistributedMesh< typename Problem::CommunicatorType >( *this->meshPointer, distributedMesh, meshFile, meshFileFormat ) ) + if( ! Meshes::loadDistributedMesh( *this->meshPointer, distributedMesh, meshFile, meshFileFormat ) ) return false; if( ! Meshes::decomposeMesh< Problem >( parameters, prefix, *this->meshPointer, distributedMesh, *problem ) ) return false; @@ -165,7 +165,7 @@ writeProlog( Logger& logger, logger.writeParameter< int >( "Maximal number of iterations:", "max-iterations", parameters ); logger.writeParameter< int >( "Minimal number of iterations:", "min-iterations", parameters ); logger.writeSeparator(); - return BaseType::template writeProlog< typename Problem::CommunicatorType >( logger, parameters ); + return BaseType::writeProlog( logger, parameters ); } template< typename Problem, diff --git a/src/TNL/Solvers/PDE/TimeIndependentPDESolver_impl.h b/src/TNL/Solvers/PDE/TimeIndependentPDESolver_impl.h index 455682e2b6498f13061f8a86c005c2402c44c7da..880d0ab31de1fcf6557ab40a5d2bbb1fbe3f0cd3 100644 --- a/src/TNL/Solvers/PDE/TimeIndependentPDESolver_impl.h +++ b/src/TNL/Solvers/PDE/TimeIndependentPDESolver_impl.h @@ -15,7 +15,7 @@ * * ***************************************************************************/ -#pragma once +#pragma once #include #include @@ -23,7 +23,7 @@ namespace TNL { namespace Solvers { -namespace PDE { +namespace PDE { template< typename Problem > @@ -54,7 +54,7 @@ setup( const Config::ParameterContainer& parameters, const String& meshFileFormat = parameters.getParameter< String >( "mesh-format" ); this->distributedMesh.setup( parameters, prefix ); if( Problem::CommunicatorType::isDistributed() ) { - if( ! 
Meshes::loadDistributedMesh< typename Problem::CommunicatorType >( *this->meshPointer, distributedMesh, meshFile, meshFileFormat ) ) + if( ! Meshes::loadDistributedMesh( *this->meshPointer, distributedMesh, meshFile, meshFileFormat ) ) return false; if( ! Meshes::decomposeMesh< Problem >( parameters, prefix, *this->meshPointer, distributedMesh, *problem ) ) return false; @@ -75,7 +75,7 @@ setup( const Config::ParameterContainer& parameters, return false; } problem->setCommonData( this->commonDataPointer ); - + /**** * Setup the problem */ @@ -83,7 +83,7 @@ setup( const Config::ParameterContainer& parameters, { std::cerr << "The problem initiation failed!" << std::endl; return false; - } + } /**** * Set DOFs (degrees of freedom) @@ -91,9 +91,9 @@ setup( const Config::ParameterContainer& parameters, TNL_ASSERT_GT( problem->getDofs(), 0, "number of DOFs must be positive" ); this->dofs->setSize( problem->getDofs() ); this->dofs->setValue( 0.0 ); - this->problem->bindDofs( this->dofs ); - - + this->problem->bindDofs( this->dofs ); + + /*** * Set-up the initial condition */ @@ -102,7 +102,7 @@ setup( const Config::ParameterContainer& parameters, if( ! 
this->problem->setInitialCondition( parameters, this->dofs ) ) return false; std::cout << " [ OK ]" << std::endl; - + return true; } @@ -128,7 +128,7 @@ writeProlog( Logger& logger, logger.writeParameter< int >( "Maximal number of iterations:", "max-iterations", parameters ); logger.writeParameter< int >( "Minimal number of iterations:", "min-iterations", parameters ); logger.writeSeparator(); - return BaseType::template writeProlog< typename Problem::CommunicatorType >( logger, parameters ); + return BaseType::writeProlog( logger, parameters ); } template< typename Problem > diff --git a/src/TNL/Solvers/SolverInitiator.h b/src/TNL/Solvers/SolverInitiator.h index 0ba4dc55a9bbad7354dfe296cecb2c10074da3ff..06285752054fdd675e449e68cbc2506ac48a6565 100644 --- a/src/TNL/Solvers/SolverInitiator.h +++ b/src/TNL/Solvers/SolverInitiator.h @@ -16,7 +16,7 @@ namespace TNL { namespace Solvers { -template< template< typename Real, typename Device, typename Index, typename MeshType, typename ConfigTag, typename SolverStarter , typename CommunicatorType > class ProblemSetter, +template< template< typename Real, typename Device, typename Index, typename MeshType, typename ConfigTag, typename SolverStarter, typename CommunicatorType > class ProblemSetter, typename ConfigTag > class SolverInitiator { diff --git a/src/TNL/Solvers/SolverInitiator_impl.h b/src/TNL/Solvers/SolverInitiator_impl.h index 16e0fd2227830a5687e8ee9a04a878d723efd8d3..3d704426dd2715d18355b6a1329c6333efaedf73 100644 --- a/src/TNL/Solvers/SolverInitiator_impl.h +++ b/src/TNL/Solvers/SolverInitiator_impl.h @@ -18,7 +18,6 @@ #include #include -#include #include namespace TNL { @@ -50,15 +49,6 @@ template< template< typename Real, typename Device, typename Index, typename Mes typename Device, typename Index, typename ConfigTag, - bool enabled = true > -class CommunicatorTypeResolver {}; - -template< template< typename Real, typename Device, typename Index, typename MeshType, typename ConfigTag, typename 
SolverStarter, typename CommunicatorType > class ProblemSetter, - typename Real, - typename Device, - typename Index, - typename ConfigTag, - typename CommunicatorType, bool enabled = ConfigTagMeshResolve< ConfigTag >::enabled > class SolverInitiatorMeshResolver {}; @@ -169,7 +159,7 @@ class SolverInitiatorIndexResolver< ProblemSetter, Real, Device, Index, ConfigTa public: static bool run( const Config::ParameterContainer& parameters ) { - return CommunicatorTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true >::run( parameters ); + return SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag >::run( parameters ); } }; @@ -178,28 +168,12 @@ template< template< typename Real, typename Device, typename Index, typename Mes typename Device, typename Index, typename ConfigTag > -class CommunicatorTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, true > -{ - public: - static bool run( const Config::ParameterContainer& parameters ) - { - if( Communicators::MpiCommunicator::isDistributed() ) - return SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, Communicators::MpiCommunicator >::run( parameters ); - return SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, Communicators::NoDistrCommunicator >::run( parameters ); - } -}; - -template< template< typename Real, typename Device, typename Index, typename MeshType, typename ConfigTag, typename SolverStarter, typename CommunicatorType > class ProblemSetter, - typename Real, - typename Device, - typename Index, - typename ConfigTag, - typename CommunicatorType > -class SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, CommunicatorType, false > +class SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, false > { public: static bool run( const Config::ParameterContainer& parameters ) { + using CommunicatorType = Communicators::MpiCommunicator; return ProblemSetter< Real, Device, Index, 
@@ -213,10 +187,11 @@ template< template< typename Real, typename Device, typename Index, typename Mes typename Real, typename Device, typename Index, - typename ConfigTag, - typename CommunicatorType > -class SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag,CommunicatorType, true > + typename ConfigTag > +class SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, true > { + using CommunicatorType = Communicators::MpiCommunicator; + // wrapper for MeshTypeResolver template< typename MeshType > using ProblemSetterWrapper = ProblemSetter< Real, Device, Index, MeshType, ConfigTag, SolverStarter< ConfigTag >, CommunicatorType >; diff --git a/src/TNL/Solvers/SolverStarter_impl.h b/src/TNL/Solvers/SolverStarter_impl.h index d2bbd81594658ca60b74426e6b4bce8d0b68f74c..dbecdaad98dc62f9a633a0f629ec9b246ce00df8 100644 --- a/src/TNL/Solvers/SolverStarter_impl.h +++ b/src/TNL/Solvers/SolverStarter_impl.h @@ -14,8 +14,7 @@ #include #include #include -#include -#include +#include #include #include #include @@ -25,14 +24,14 @@ #include namespace TNL { -namespace Solvers { +namespace Solvers { template< typename Problem, typename ConfigTag, bool TimeDependent = Problem::isTimeDependent() > class TimeDependencyResolver {}; - + template< typename Problem, typename ConfigTag, typename TimeStepper = typename Problem::TimeStepper > @@ -66,8 +65,7 @@ bool SolverStarter< ConfigTag > :: run( const Config::ParameterContainer& parame */ if( ! Devices::Host::setup( parameters ) || ! Devices::Cuda::setup( parameters ) || - ! Communicators::NoDistrCommunicator::setup( parameters ) || - ! Communicators::MpiCommunicator::setup( parameters ) + ! 
MPI::setup( parameters ) ) return false; Problem problem; @@ -95,7 +93,7 @@ class TimeDependencyResolver< Problem, ConfigTag, false > const Config::ParameterContainer& parameters ) { // TODO: This should be improved - at least rename to LinearSolverSetter - return SolverStarterTimeDiscretisationSetter< Problem, SemiImplicitTimeDiscretisationTag, ConfigTag, true >::run( problem, parameters ); + return SolverStarterTimeDiscretisationSetter< Problem, SemiImplicitTimeDiscretisationTag, ConfigTag, true >::run( problem, parameters ); } }; @@ -338,7 +336,7 @@ bool SolverStarter< ConfigTag > :: runPDESolver( Problem& problem, */ this->computeTimer.reset(); this->ioTimer.reset(); - + /**** * Create solver monitor thread */ diff --git a/src/TNL/Solvers/Solver_impl.h b/src/TNL/Solvers/Solver_impl.h index 9182c620fe2ed589389ab5a64328917f392795c7..bc1f43c7779e4af5d48063f3e3794fc3e4a1cd06 100644 --- a/src/TNL/Solvers/Solver_impl.h +++ b/src/TNL/Solvers/Solver_impl.h @@ -15,12 +15,12 @@ #include #include #include -#include -#include +#include +#include namespace TNL { namespace Solvers { - + template< template< typename Real, typename Device, typename Index, typename MeshType, typename MeshConfig, typename SolverStarter, typename CommunicatorType > class ProblemSetter, template< typename MeshConfig > class ProblemConfig, typename MeshConfig > @@ -35,9 +35,9 @@ run( int argc, char* argv[] ) configDescription.addDelimiter( "Parallelization setup:" ); Devices::Host::configSetup( configDescription ); Devices::Cuda::configSetup( configDescription ); - Communicators::MpiCommunicator::configSetup( configDescription ); + MPI::configSetup( configDescription ); - Communicators::ScopedInitializer< Communicators::MpiCommunicator > mpi( argc, argv ); + TNL::MPI::ScopedInitializer mpi( argc, argv ); if( ! 
parseCommandLine( argc, argv, configDescription, parameters ) ) return false; diff --git a/src/TNL/TypeTraits.h b/src/TNL/TypeTraits.h index 2afda7aad4522e6a62eb42838c8a23deca9d6b4b..63b8fc27391ebf9682c4a6a0e7a022a81caa2599 100644 --- a/src/TNL/TypeTraits.h +++ b/src/TNL/TypeTraits.h @@ -253,4 +253,21 @@ public: static constexpr bool value = type::value; }; +/** + * \brief Type trait for checking if T has getCommunicationGroup method. + */ +template< typename T > +class HasGetCommunicationGroupMethod +{ +private: + typedef char YesType[1]; + typedef char NoType[2]; + + template< typename C > static YesType& test( decltype(std::declval< C >().getCommunicationGroup()) ); + template< typename C > static NoType& test(...); + +public: + static constexpr bool value = ( sizeof( test< std::decay_t< T > >(0) ) == sizeof( YesType ) ); +}; + } //namespace TNL diff --git a/src/Tools/tnl-game-of-life.cpp b/src/Tools/tnl-game-of-life.cpp index c33ae829439885aa0e69ed2efde610796d4d0ed2..7003489ab287dd733fa71761c0bc44302eb478d3 100644 --- a/src/Tools/tnl-game-of-life.cpp +++ b/src/Tools/tnl-game-of-life.cpp @@ -17,13 +17,11 @@ #include #include #include -#include -#include +#include +#include using namespace TNL; -using CommunicatorType = Communicators::MpiCommunicator; - struct MyConfigTag {}; namespace TNL { @@ -198,8 +196,8 @@ bool runGameOfLife( const Mesh& mesh ) } } Index max_count; - CommunicatorType::Allreduce( &count, &max_count, 1, MPI_MAX, mesh.getCommunicationGroup() ); - std::cout << "Rank " << CommunicatorType::GetRank() << ": count=" << count << ", max_count=" << max_count << std::endl; + TNL::MPI::Allreduce( &count, &max_count, 1, MPI_MAX, mesh.getCommunicationGroup() ); + std::cout << "Rank " << TNL::MPI::GetRank() << ": count=" << count << ", max_count=" << max_count << std::endl; Index reference_cell = 0; if( count == max_count ) { // find cell which has all points in the central box @@ -256,7 +254,7 @@ bool runGameOfLife( const Mesh& mesh ) // create a .pvtu file
(only rank 0 actually writes to the file) const std::string mainFilePath = "GoL." + std::to_string(iteration) + ".pvtu"; std::ofstream file; - if( CommunicatorType::GetRank() == 0 ) + if( TNL::MPI::GetRank() == 0 ) file.open( mainFilePath ); using PVTU = Meshes::Writers::PVTUWriter< LocalMesh >; PVTU pvtu( file ); @@ -266,7 +264,7 @@ bool runGameOfLife( const Mesh& mesh ) if( mesh.getGhostLevels() > 0 ) pvtu.template writePCellData< std::uint8_t >( Meshes::VTK::ghostArrayName() ); pvtu.template writePCellData< Real >( "function values" ); - const std::string subfilePath = pvtu.template addPiece< CommunicatorType >( mainFilePath, mesh.getCommunicationGroup() ); + const std::string subfilePath = pvtu.addPiece( mainFilePath, mesh.getCommunicationGroup() ); // create a .vtu file for local data using Writer = Meshes::Writers::VTUWriter< LocalMesh >; @@ -292,7 +290,7 @@ bool runGameOfLife( const Mesh& mesh ) Index iteration = 0; do { iteration++; - if( CommunicatorType::GetRank() == 0 ) + if( TNL::MPI::GetRank() == 0 ) std::cout << "Computing iteration " << iteration << "..." << std::endl; // iterate over all local entities @@ -338,7 +336,7 @@ bool runGameOfLife( const Mesh& mesh ) // check if finished const bool done = max( f_in.getData() ) == 0 || iteration > max_iter || f_in.getData() == f_out.getData(); - CommunicatorType::Allreduce( &done, &all_done, 1, MPI_LAND, mesh.getCommunicationGroup() ); + TNL::MPI::Allreduce( &done, &all_done, 1, MPI_LAND, mesh.getCommunicationGroup() ); } while( all_done == false ); @@ -351,7 +349,7 @@ void configSetup( Config::ConfigDescription& config ) config.addRequiredEntry< String >( "input-file", "Input file with the mesh." 
); config.addEntry< String >( "input-file-format", "Input mesh file format.", "auto" ); config.addDelimiter( "MPI settings:" ); - CommunicatorType::configSetup( config ); + TNL::MPI::configSetup( config ); } int main( int argc, char* argv[] ) @@ -361,12 +359,12 @@ int main( int argc, char* argv[] ) configSetup( conf_desc ); - Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv); + TNL::MPI::ScopedInitializer mpi(argc, argv); if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) return EXIT_FAILURE; - if( ! CommunicatorType::setup( parameters ) ) + if( ! TNL::MPI::setup( parameters ) ) return EXIT_FAILURE; const String inputFileName = parameters.getParameter< String >( "input-file" ); diff --git a/src/Tools/tnl-init.cpp b/src/Tools/tnl-init.cpp index 1a7769b5c89911018127c0fb4e105f3931f6a7a1..a1b3a8ff33c259562bb23d779553d23b11368a1a 100644 --- a/src/Tools/tnl-init.cpp +++ b/src/Tools/tnl-init.cpp @@ -15,8 +15,8 @@ #include #include -#include -#include +#include +#include using namespace TNL; @@ -53,9 +53,9 @@ int main( int argc, char* argv[] ) Config::ConfigDescription configDescription; setupConfig( configDescription ); - Communicators::MpiCommunicator::configSetup( configDescription ); + TNL::MPI::configSetup( configDescription ); - Communicators::ScopedInitializer< Communicators::MpiCommunicator > mpi(argc, argv); + TNL::MPI::ScopedInitializer mpi(argc, argv); if( ! 
parseCommandLine( argc, argv, configDescription, parameters ) ) return EXIT_FAILURE; diff --git a/src/Tools/tnl-init.h b/src/Tools/tnl-init.h index a0d171f14b4f102a56d862fb283d3bb4437e680a..e78db1153b9bb52001bba79c182f1401f54307b8 100644 --- a/src/Tools/tnl-init.h +++ b/src/Tools/tnl-init.h @@ -10,6 +10,7 @@ #pragma once +#include #include #include #include @@ -21,37 +22,32 @@ #include #include -#include -#include - using namespace TNL; template< typename MeshType, typename RealType, - typename CommunicatorType, int xDiff, int yDiff, int zDiff > bool renderFunction( const Config::ParameterContainer& parameters ) { - using namespace Meshes::DistributedMeshes; using DistributedGridType = Meshes::DistributedMeshes::DistributedMesh< MeshType >; DistributedGridType distributedMesh; Pointers::SharedPointer< MeshType > meshPointer; MeshType globalMesh; - if(CommunicatorType::isDistributed()) + if(TNL::MPI::GetSize() > 1) { //suppose global mesh loaded from single file String meshFile = parameters.getParameter< String >( "mesh" ); std::cout << "+ -> Loading mesh from " << meshFile << " ... " << std::endl; globalMesh.load( meshFile ); - + // TODO: This should work with no overlaps - distributedMesh.template setGlobalGrid(globalMesh); + distributedMesh.setGlobalGrid(globalMesh); typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedMesh, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedMesh, lowerOverlap, upperOverlap, 1 ); distributedMesh.setOverlaps( lowerOverlap, upperOverlap ); distributedMesh.setupGrid(*meshPointer); } @@ -74,7 +70,7 @@ bool renderFunction( const Config::ParameterContainer& parameters ) MeshFunctionPointer meshFunction( meshPointer ); //if( !
discreteFunction.setSize( mesh.template getEntitiesCount< typename MeshType::Cell >() ) ) // return false; - + double finalTime = parameters.getParameter< double >( "final-time" ); double initialTime = parameters.getParameter< double >( "initial-time" ); double tau = parameters.getParameter< double >( "snapshot-period" ); @@ -116,7 +112,7 @@ bool renderFunction( const Config::ParameterContainer& parameters ) else std::cout << "+ -> Writing the function to " << outputFile << " ... " << std::endl; - if(CommunicatorType::isDistributed()) + if(TNL::MPI::GetSize() > 1) { if( ! Meshes::DistributedMeshes::DistributedGridIO ::save(outputFile, *meshFunction ) ) return false; @@ -130,20 +126,6 @@ bool renderFunction( const Config::ParameterContainer& parameters ) return true; } -template< typename MeshType, - typename RealType, - int xDiff, - int yDiff, - int zDiff > -bool resolveCommunicator( const Config::ParameterContainer& parameters ) -{ -#ifdef HAVE_MPI - if( Communicators::MpiCommunicator::isDistributed() ) - return renderFunction(parameters); -#endif - return renderFunction(parameters); -} - template< typename MeshType, typename RealType > bool resolveDerivatives( const Config::ParameterContainer& parameters ) @@ -160,75 +142,75 @@ bool resolveDerivatives( const Config::ParameterContainer& parameters ) return false; } if( xDiff == 0 && yDiff == 0 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 0, 0, 0 >( parameters ); + return renderFunction< MeshType, RealType, 0, 0, 0 >( parameters ); if( xDiff == 0 && yDiff == 0 && zDiff == 1 ) - return resolveCommunicator< MeshType, RealType, 0, 0, 1 >( parameters ); + return renderFunction< MeshType, RealType, 0, 0, 1 >( parameters ); if( xDiff == 0 && yDiff == 0 && zDiff == 2 ) - return resolveCommunicator< MeshType, RealType, 0, 0, 2 >( parameters ); + return renderFunction< MeshType, RealType, 0, 0, 2 >( parameters ); if( xDiff == 0 && yDiff == 0 && zDiff == 3 ) - return resolveCommunicator< MeshType, 
RealType, 0, 0, 3 >( parameters ); + return renderFunction< MeshType, RealType, 0, 0, 3 >( parameters ); if( xDiff == 0 && yDiff == 0 && zDiff == 4 ) - return resolveCommunicator< MeshType, RealType, 0, 0, 4 >( parameters ); + return renderFunction< MeshType, RealType, 0, 0, 4 >( parameters ); if( xDiff == 0 && yDiff == 1 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 0, 1, 0 >( parameters ); + return renderFunction< MeshType, RealType, 0, 1, 0 >( parameters ); if( xDiff == 0 && yDiff == 1 && zDiff == 1 ) - return resolveCommunicator< MeshType, RealType, 0, 1, 1 >( parameters ); + return renderFunction< MeshType, RealType, 0, 1, 1 >( parameters ); if( xDiff == 0 && yDiff == 1 && zDiff == 2 ) - return resolveCommunicator< MeshType, RealType, 0, 1, 2 >( parameters ); + return renderFunction< MeshType, RealType, 0, 1, 2 >( parameters ); if( xDiff == 0 && yDiff == 1 && zDiff == 3 ) - return resolveCommunicator< MeshType, RealType, 0, 1, 3 >( parameters ); + return renderFunction< MeshType, RealType, 0, 1, 3 >( parameters ); if( xDiff == 0 && yDiff == 2 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 0, 2, 0 >( parameters ); + return renderFunction< MeshType, RealType, 0, 2, 0 >( parameters ); if( xDiff == 0 && yDiff == 2 && zDiff == 1 ) - return resolveCommunicator< MeshType, RealType, 0, 2, 1 >( parameters ); + return renderFunction< MeshType, RealType, 0, 2, 1 >( parameters ); if( xDiff == 0 && yDiff == 2 && zDiff == 2 ) - return resolveCommunicator< MeshType, RealType, 0, 2, 2 >( parameters ); + return renderFunction< MeshType, RealType, 0, 2, 2 >( parameters ); if( xDiff == 0 && yDiff == 3 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 0, 3, 0 >( parameters ); + return renderFunction< MeshType, RealType, 0, 3, 0 >( parameters ); if( xDiff == 0 && yDiff == 3 && zDiff == 1 ) - return resolveCommunicator< MeshType, RealType, 0, 3, 1 >( parameters ); + return renderFunction< MeshType, RealType, 0, 3, 1 >( 
parameters ); if( xDiff == 0 && yDiff == 4 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 0, 4, 0 >( parameters ); + return renderFunction< MeshType, RealType, 0, 4, 0 >( parameters ); if( xDiff == 1 && yDiff == 0 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 1, 0, 0 >( parameters ); + return renderFunction< MeshType, RealType, 1, 0, 0 >( parameters ); if( xDiff == 1 && yDiff == 0 && zDiff == 1 ) - return resolveCommunicator< MeshType, RealType, 1, 0, 1 >( parameters ); + return renderFunction< MeshType, RealType, 1, 0, 1 >( parameters ); if( xDiff == 1 && yDiff == 0 && zDiff == 2 ) - return resolveCommunicator< MeshType, RealType, 1, 0, 2 >( parameters ); + return renderFunction< MeshType, RealType, 1, 0, 2 >( parameters ); if( xDiff == 1 && yDiff == 0 && zDiff == 3 ) - return resolveCommunicator< MeshType, RealType, 1, 0, 3 >( parameters ); + return renderFunction< MeshType, RealType, 1, 0, 3 >( parameters ); if( xDiff == 1 && yDiff == 1 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 1, 1, 0 >( parameters ); + return renderFunction< MeshType, RealType, 1, 1, 0 >( parameters ); if( xDiff == 1 && yDiff == 1 && zDiff == 1 ) - return resolveCommunicator< MeshType, RealType, 1, 1, 1 >( parameters ); + return renderFunction< MeshType, RealType, 1, 1, 1 >( parameters ); if( xDiff == 1 && yDiff == 1 && zDiff == 2 ) - return resolveCommunicator< MeshType, RealType, 1, 1, 2 >( parameters ); + return renderFunction< MeshType, RealType, 1, 1, 2 >( parameters ); if( xDiff == 1 && yDiff == 2 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 1, 2, 0 >( parameters ); + return renderFunction< MeshType, RealType, 1, 2, 0 >( parameters ); if( xDiff == 1 && yDiff == 2 && zDiff == 1 ) - return resolveCommunicator< MeshType, RealType, 1, 2, 1 >( parameters ); + return renderFunction< MeshType, RealType, 1, 2, 1 >( parameters ); if( xDiff == 1 && yDiff == 3 && zDiff == 0 ) - return resolveCommunicator< MeshType, 
RealType, 1, 3, 0 >( parameters ); + return renderFunction< MeshType, RealType, 1, 3, 0 >( parameters ); if( xDiff == 2 && yDiff == 0 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 2, 0, 0 >( parameters ); + return renderFunction< MeshType, RealType, 2, 0, 0 >( parameters ); if( xDiff == 2 && yDiff == 0 && zDiff == 1 ) - return resolveCommunicator< MeshType, RealType, 2, 0, 1 >( parameters ); + return renderFunction< MeshType, RealType, 2, 0, 1 >( parameters ); if( xDiff == 2 && yDiff == 0 && zDiff == 2 ) - return resolveCommunicator< MeshType, RealType, 2, 0, 2 >( parameters ); + return renderFunction< MeshType, RealType, 2, 0, 2 >( parameters ); if( xDiff == 2 && yDiff == 1 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 2, 1, 0 >( parameters ); + return renderFunction< MeshType, RealType, 2, 1, 0 >( parameters ); if( xDiff == 2 && yDiff == 1 && zDiff == 1 ) - return resolveCommunicator< MeshType, RealType, 2, 1, 1 >( parameters ); + return renderFunction< MeshType, RealType, 2, 1, 1 >( parameters ); if( xDiff == 2 && yDiff == 2 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 2, 2, 0 >( parameters ); + return renderFunction< MeshType, RealType, 2, 2, 0 >( parameters ); if( xDiff == 3 && yDiff == 0 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 3, 0, 0 >( parameters ); + return renderFunction< MeshType, RealType, 3, 0, 0 >( parameters ); if( xDiff == 3 && yDiff == 0 && zDiff == 1 ) - return resolveCommunicator< MeshType, RealType, 3, 0, 1 >( parameters ); + return renderFunction< MeshType, RealType, 3, 0, 1 >( parameters ); if( xDiff == 3 && yDiff == 1 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 3, 1, 0 >( parameters ); + return renderFunction< MeshType, RealType, 3, 1, 0 >( parameters ); if( xDiff == 4 && yDiff == 0 && zDiff == 0 ) - return resolveCommunicator< MeshType, RealType, 4, 0, 0 >( parameters ); + return renderFunction< MeshType, RealType, 4, 0, 0 >( 
parameters ); return false; } diff --git a/src/Tools/tnl-test-distributed-mesh.h b/src/Tools/tnl-test-distributed-mesh.h index 0be53242b8122f4173a11d27181a73232067b125..6b748d99355375e0f0bb4ac20eaf54129a07ad2f 100644 --- a/src/Tools/tnl-test-distributed-mesh.h +++ b/src/Tools/tnl-test-distributed-mesh.h @@ -18,13 +18,11 @@ #include #include #include -#include -#include +#include +#include using namespace TNL; -using CommunicatorType = Communicators::MpiCommunicator; - struct MyConfigTag {}; namespace TNL { @@ -214,7 +212,7 @@ void testSynchronizerOnDevice( const MeshType& mesh ) if( received != center ) { IndexType cellIndexes[ 2 ] = {0, 0}; const int numCells = getCellsForFace( mesh.getLocalMesh(), i, cellIndexes ); - std::cerr << "rank " << CommunicatorType::GetRank() + std::cerr << "rank " << TNL::MPI::GetRank() << ": wrong result for entity " << i << " (gid " << mesh.template getGlobalIndices< EntityType::getEntityDimension() >()[i] << ")" << " of dimension = " << EntityType::getEntityDimension() << ": received " << received << ", expected = " << center @@ -224,7 +222,7 @@ void testSynchronizerOnDevice( const MeshType& mesh ) } } if( errors > 0 ) { - std::cerr << "rank " << CommunicatorType::GetRank() << ": " << errors << " errors in total." << std::endl; + std::cerr << "rank " << TNL::MPI::GetRank() << ": " << errors << " errors in total." 
<< std::endl; TNL_ASSERT_TRUE( false, "test failed" ); } } @@ -273,7 +271,7 @@ bool testPropagationOverFaces( const Mesh& mesh, int max_iterations ) // create a .pvtu file (only rank 0 actually writes to the file) const std::string mainFilePath = "data_" + std::to_string(iteration) + ".pvtu"; std::ofstream file; - if( CommunicatorType::GetRank() == 0 ) + if( TNL::MPI::GetRank() == 0 ) file.open( mainFilePath ); using PVTU = Meshes::Writers::PVTUWriter< LocalMesh >; PVTU pvtu( file ); @@ -284,7 +282,7 @@ bool testPropagationOverFaces( const Mesh& mesh, int max_iterations ) pvtu.template writePCellData< std::uint8_t >( Meshes::VTK::ghostArrayName() ); pvtu.template writePCellData< Real >( "function values" ); pvtu.template writePCellData< Real >( "test values" ); - const std::string subfilePath = pvtu.template addPiece< CommunicatorType >( mainFilePath, mesh.getCommunicationGroup() ); + const std::string subfilePath = pvtu.addPiece( mainFilePath, mesh.getCommunicationGroup() ); // create a .vtu file for local data using Writer = Meshes::Writers::VTUWriter< LocalMesh >; @@ -315,7 +313,7 @@ bool testPropagationOverFaces( const Mesh& mesh, int max_iterations ) int iteration = 0; do { iteration++; - if( CommunicatorType::GetRank() == 0 ) + if( TNL::MPI::GetRank() == 0 ) std::cout << "Computing iteration " << iteration << "..." << std::endl; const Index prev_sum = sum( f_K.getData() ); @@ -400,14 +398,14 @@ bool testPropagationOverFaces( const Mesh& mesh, int max_iterations ) std::cerr << "ERROR: propatation over faces differs from the propagation over neighbor cells. 
Differing values are:\n"; for( Index K = 0; K < f_K_view.getSize(); K++ ) if( f_K_view[ K ] != f_K_test_view[ K ] ) - std::cerr << " rank = " << CommunicatorType::GetRank() << ", K = " << K << ": " << f_K_view[ K ] << " instead of " << f_K_test_view[ K ] << "\n"; + std::cerr << " rank = " << TNL::MPI::GetRank() << ", K = " << K << ": " << f_K_view[ K ] << " instead of " << f_K_test_view[ K ] << "\n"; std::cerr.flush(); TNL_ASSERT_TRUE( false, "test failed" ); } // check if finished const bool done = sum( f_K.getData() ) == prev_sum || iteration > max_iterations; - CommunicatorType::Allreduce( &done, &all_done, 1, MPI_LAND, mesh.getCommunicationGroup() ); + TNL::MPI::Allreduce( &done, &all_done, 1, MPI_LAND, mesh.getCommunicationGroup() ); } while( all_done == false ); @@ -421,7 +419,7 @@ void configSetup( Config::ConfigDescription& config ) config.addEntry< String >( "input-file-format", "Input mesh file format.", "auto" ); config.addEntry< int >( "max-iterations", "Maximum number of iterations to compute", 100 ); config.addDelimiter( "MPI settings:" ); - CommunicatorType::configSetup( config ); + TNL::MPI::configSetup( config ); } int main( int argc, char* argv[] ) @@ -431,12 +429,12 @@ int main( int argc, char* argv[] ) configSetup( conf_desc ); - Communicators::ScopedInitializer< CommunicatorType > scopedInit(argc, argv); + TNL::MPI::ScopedInitializer mpi(argc, argv); if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) return EXIT_FAILURE; - if( ! CommunicatorType::setup( parameters ) ) + if( ! 
TNL::MPI::setup( parameters ) ) return EXIT_FAILURE; const String inputFileName = parameters.getParameter< String >( "input-file" ); diff --git a/src/UnitTests/CMakeLists.txt b/src/UnitTests/CMakeLists.txt index 8e4ac724954bcf92e169ae3bc67f11a533d37c04..2c0ba865069600a497932227759c3de5d9a3e9f6 100644 --- a/src/UnitTests/CMakeLists.txt +++ b/src/UnitTests/CMakeLists.txt @@ -1,4 +1,3 @@ -ADD_SUBDIRECTORY( Communicators ) ADD_SUBDIRECTORY( Containers ) ADD_SUBDIRECTORY( Functions ) # Matrices are included from src/CMakeLists.txt diff --git a/src/UnitTests/Communicators/CMakeLists.txt b/src/UnitTests/Communicators/CMakeLists.txt deleted file mode 100644 index 1a3331c3a50990b2f0e0f3fa29e5b72891fe434a..0000000000000000000000000000000000000000 --- a/src/UnitTests/Communicators/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -if( ${BUILD_MPI} ) - ADD_EXECUTABLE( MpiCommunicatorTest MpiCommunicatorTest.cpp ) - TARGET_COMPILE_OPTIONS( MpiCommunicatorTest PRIVATE ${CXX_TESTS_FLAGS} ) - TARGET_LINK_LIBRARIES( MpiCommunicatorTest ${GTEST_BOTH_LIBRARIES} ) - - SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/MpiCommunicatorTest${CMAKE_EXECUTABLE_SUFFIX}" ) - ADD_TEST( NAME MpiCommunicatorTest COMMAND "mpirun" ${mpi_test_parameters}) - -endif() diff --git a/src/UnitTests/Communicators/MpiCommunicatorTest.cpp b/src/UnitTests/Communicators/MpiCommunicatorTest.cpp deleted file mode 100644 index b78011953e638a84796d5b30daa848f168274303..0000000000000000000000000000000000000000 --- a/src/UnitTests/Communicators/MpiCommunicatorTest.cpp +++ /dev/null @@ -1,51 +0,0 @@ -/*************************************************************************** - MpiCommunicatorTest.h - description - ------------------- - begin : Jul 10, 2019 - copyright : (C) 2019 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#ifdef HAVE_GTEST - 
-#include "gtest/gtest.h" -#include - -using namespace TNL; -using namespace TNL::Communicators; - -// test fixture for typed tests -template< typename Real > -class MpiCommunicatorTest : public ::testing::Test -{ - protected: - using RealType = Real; - using CommunicatorType = MpiCommunicator; -}; - -// types for which MpiCommunicatorTest is instantiated -using MpiCommunicatorTypes = ::testing::Types< - short, - int, - long, - float, - double ->; - -TYPED_TEST_SUITE( MpiCommunicatorTest, MpiCommunicatorTypes ); - -TYPED_TEST( MpiCommunicatorTest, allReduce ) -{ - using RealType = typename TestFixture::RealType; - using CommunicatorType = typename TestFixture::CommunicatorType; - RealType a = CommunicatorType::GetRank(); - RealType b = 0; - CommunicatorType::Allreduce( &a, &b, 1, MPI_MAX, MPI_COMM_WORLD ); - EXPECT_EQ( b, CommunicatorType::GetSize() - 1 ); -} - -#endif // HAVE_GTEST - -#include "../main_mpi.h" \ No newline at end of file diff --git a/src/UnitTests/Containers/CMakeLists.txt b/src/UnitTests/Containers/CMakeLists.txt index fdde0a8b723b971d26304d129c9c3a64c3a3a862..efba5e50de2756a9f9335ecf18094e99689bad08 100644 --- a/src/UnitTests/Containers/CMakeLists.txt +++ b/src/UnitTests/Containers/CMakeLists.txt @@ -92,30 +92,39 @@ if( ${BUILD_MPI} ) SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedArrayTest${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedArrayTest COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedArrayTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedArrayTest${CMAKE_EXECUTABLE_SUFFIX}" ) SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorTest${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedVectorTest COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedVectorTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorTest${CMAKE_EXECUTABLE_SUFFIX}" ) SET( mpi_test_parameters 
-np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorBinaryOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedVectorBinaryOperationsTest COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedVectorBinaryOperationsTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorBinaryOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" ) SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorUnaryOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedVectorUnaryOperationsTest COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedVectorUnaryOperationsTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorUnaryOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" ) SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorVerticalOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedVectorVerticalOperationsTest COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedVectorVerticalOperationsTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorVerticalOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" ) if( BUILD_CUDA ) SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorTestCuda${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedVectorTestCuda COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedVectorTestCuda_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorTestCuda${CMAKE_EXECUTABLE_SUFFIX}" ) SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorBinaryOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedVectorBinaryOperationsTestCuda COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedVectorBinaryOperationsTestCuda_nodistr COMMAND 
"${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorBinaryOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" ) SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorUnaryOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedVectorUnaryOperationsTestCuda COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedVectorUnaryOperationsTestCuda_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorUnaryOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" ) SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorVerticalOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedVectorVerticalOperationsTestCuda COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedVectorVerticalOperationsTestCuda_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorVerticalOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" ) endif() endif() diff --git a/src/UnitTests/Containers/DistributedArrayTest.h b/src/UnitTests/Containers/DistributedArrayTest.h index 204bc6fe753c9f75b55bd3523eb1f708faf0b857..e25739afe1b8fb6608e5d319953f4d072f77f875 100644 --- a/src/UnitTests/Containers/DistributedArrayTest.h +++ b/src/UnitTests/Containers/DistributedArrayTest.h @@ -9,13 +9,14 @@ #ifdef HAVE_GTEST #include -#include -#include #include #include +#include "VectorHelperFunctions.h" + using namespace TNL; using namespace TNL::Containers; +using namespace TNL::MPI; /* * Light check of DistributedArray. 
@@ -31,7 +32,6 @@ class DistributedArrayTest protected: using ValueType = typename DistributedArray::ValueType; using DeviceType = typename DistributedArray::DeviceType; - using CommunicatorType = typename DistributedArray::CommunicatorType; using IndexType = typename DistributedArray::IndexType; using DistributedArrayType = DistributedArray; using ArrayViewType = typename DistributedArrayType::LocalViewType; @@ -39,44 +39,55 @@ protected: const int globalSize = 97; // prime number to force non-uniform distribution - const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + const MPI_Comm group = AllGroup(); DistributedArrayType distributedArray; - const int rank = CommunicatorType::GetRank(group); - const int nproc = CommunicatorType::GetSize(group); + const int rank = GetRank(group); + const int nproc = GetSize(group); + + // some arbitrary even value (but must be 0 if not distributed) + const int ghosts = (nproc > 1) ? 4 : 0; DistributedArrayTest() { using LocalRangeType = typename DistributedArray::LocalRangeType; - const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); - distributedArray.setDistribution( localRange, globalSize, group ); + const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group ); + distributedArray.setDistribution( localRange, ghosts, globalSize, group ); + + using Synchronizer = typename Partitioner< IndexType >::template ArraySynchronizer< DeviceType >; + distributedArray.setSynchronizer( std::make_shared( localRange, ghosts / 2, group ) ); EXPECT_EQ( distributedArray.getLocalRange(), localRange ); + EXPECT_EQ( distributedArray.getGhosts(), ghosts ); EXPECT_EQ( distributedArray.getCommunicationGroup(), group ); } }; // types for which DistributedArrayTest is instantiated using DistributedArrayTypes = ::testing::Types< - DistributedArray< double, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedArray< 
double, Devices::Host, int, Communicators::NoDistrCommunicator > + DistributedArray< double, Devices::Host, int > #ifdef HAVE_CUDA , - DistributedArray< double, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedArray< double, Devices::Cuda, int, Communicators::NoDistrCommunicator > + DistributedArray< double, Devices::Cuda, int > #endif >; TYPED_TEST_SUITE( DistributedArrayTest, DistributedArrayTypes ); -TYPED_TEST( DistributedArrayTest, checkSumOfLocalSizes ) +TYPED_TEST( DistributedArrayTest, checkLocalSizes ) { - using CommunicatorType = typename TestFixture::CommunicatorType; + EXPECT_EQ( this->distributedArray.getLocalView().getSize(), this->distributedArray.getLocalRange().getSize() ); + EXPECT_EQ( this->distributedArray.getConstLocalView().getSize(), this->distributedArray.getLocalRange().getSize() ); + EXPECT_EQ( this->distributedArray.getLocalViewWithGhosts().getSize(), this->distributedArray.getLocalRange().getSize() + this->ghosts ); + EXPECT_EQ( this->distributedArray.getConstLocalViewWithGhosts().getSize(), this->distributedArray.getLocalRange().getSize() + this->ghosts ); +} +TYPED_TEST( DistributedArrayTest, checkSumOfLocalSizes ) +{ const int localSize = this->distributedArray.getLocalView().getSize(); int sumOfLocalSizes = 0; - CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); + Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); EXPECT_EQ( sumOfLocalSizes, this->globalSize ); EXPECT_EQ( this->distributedArray.getSize(), this->globalSize ); } @@ -88,14 +99,26 @@ TYPED_TEST( DistributedArrayTest, copyFromGlobal ) this->distributedArray.setValue( 0.0 ); ArrayType globalArray( this->globalSize ); - globalArray.setValue( 1.0 ); + setLinearSequence( globalArray ); this->distributedArray.copyFromGlobal( globalArray ); + this->distributedArray.waitForSynchronization(); - ArrayViewType localArrayView = this->distributedArray.getLocalView(); - auto globalView = globalArray.getConstView(); 
const auto localRange = this->distributedArray.getLocalRange(); - globalView.bind( &globalArray.getData()[ localRange.getBegin() ], localRange.getEnd() - localRange.getBegin() ); + ArrayViewType localArrayView; + localArrayView.bind( this->distributedArray.getLocalView().getData(), localRange.getSize() ); + auto globalView = globalArray.getConstView(); + globalView.bind( &globalArray.getData()[ localRange.getBegin() ], localRange.getSize() ); EXPECT_EQ( localArrayView, globalView ); + + // check ghost values + for( int o = 0; o < this->ghosts / 2; o++ ) { + const int left_i = localRange.getSize() + o; + const int left_gi = ((this->rank > 0) ? localRange.getBegin() : this->globalSize) - this->ghosts / 2 + o; + EXPECT_EQ( this->distributedArray.getConstLocalViewWithGhosts().getElement( left_i ), globalArray.getElement( left_gi ) ); + const int right_i = localRange.getSize() + this->ghosts / 2 + o; + const int right_gi = ((this->rank < this->nproc - 1) ? localRange.getEnd() : 0) + o; + EXPECT_EQ( this->distributedArray.getConstLocalViewWithGhosts().getElement( right_i ), globalArray.getElement( right_gi ) ); + } } TYPED_TEST( DistributedArrayTest, setLike ) @@ -126,23 +149,47 @@ TYPED_TEST( DistributedArrayTest, setValue ) using ArrayType = typename TestFixture::ArrayType; this->distributedArray.setValue( 1.0 ); + this->distributedArray.waitForSynchronization(); ArrayViewType localArrayView = this->distributedArray.getLocalView(); ArrayType expected( localArrayView.getSize() ); expected.setValue( 1.0 ); EXPECT_EQ( localArrayView, expected ); } +TYPED_TEST( DistributedArrayTest, setValueGhosts ) +{ + using ArrayViewType = typename TestFixture::ArrayViewType; + using ArrayType = typename TestFixture::ArrayType; + + this->distributedArray.setValue( this->rank ); + this->distributedArray.waitForSynchronization(); + ArrayViewType localArrayView = this->distributedArray.getLocalViewWithGhosts(); + ArrayType expected( localArrayView.getSize() ); + expected.setValue( 
this->rank ); + + // set expected ghost values + const int left = (this->rank > 0) ? this->rank - 1 : this->nproc - 1; + const int right = (this->rank < this->nproc - 1) ? this->rank + 1 : 0; + for( int o = 0; o < this->ghosts / 2; o++ ) { + expected.setElement( this->distributedArray.getLocalRange().getSize() + o, left ); + expected.setElement( this->distributedArray.getLocalRange().getSize() + this->ghosts / 2 + o, right ); + } + + EXPECT_EQ( localArrayView, expected ); +} + TYPED_TEST( DistributedArrayTest, elementwiseAccess ) { using ArrayViewType = typename TestFixture::ArrayViewType; using IndexType = typename TestFixture::IndexType; this->distributedArray.setValue( 0 ); + this->distributedArray.waitForSynchronization(); ArrayViewType localArrayView = this->distributedArray.getLocalView(); const auto localRange = this->distributedArray.getLocalRange(); // check initial value - for( IndexType i = 0; i < localArrayView.getSize(); i++ ) { + for( IndexType i = 0; i < localRange.getSize(); i++ ) { const IndexType gi = localRange.getGlobalIndex( i ); EXPECT_EQ( localArrayView.getElement( i ), 0 ); EXPECT_EQ( this->distributedArray.getElement( gi ), 0 ); @@ -152,13 +199,13 @@ TYPED_TEST( DistributedArrayTest, elementwiseAccess ) } // use setValue - for( IndexType i = 0; i < localArrayView.getSize(); i++ ) { + for( IndexType i = 0; i < localRange.getSize(); i++ ) { const IndexType gi = localRange.getGlobalIndex( i ); this->distributedArray.setElement( gi, i + 1 ); } // check set value - for( IndexType i = 0; i < localArrayView.getSize(); i++ ) { + for( IndexType i = 0; i < localRange.getSize(); i++ ) { const IndexType gi = localRange.getGlobalIndex( i ); EXPECT_EQ( localArrayView.getElement( i ), i + 1 ); EXPECT_EQ( this->distributedArray.getElement( gi ), i + 1 ); @@ -168,16 +215,17 @@ TYPED_TEST( DistributedArrayTest, elementwiseAccess ) } this->distributedArray.setValue( 0 ); + this->distributedArray.waitForSynchronization(); // use operator[] if( std::is_same< 
typename TestFixture::DeviceType, Devices::Host >::value ) { - for( IndexType i = 0; i < localArrayView.getSize(); i++ ) { + for( IndexType i = 0; i < localRange.getSize(); i++ ) { const IndexType gi = localRange.getGlobalIndex( i ); this->distributedArray[ gi ] = i + 1; } // check set value - for( IndexType i = 0; i < localArrayView.getSize(); i++ ) { + for( IndexType i = 0; i < localRange.getSize(); i++ ) { const IndexType gi = localRange.getGlobalIndex( i ); EXPECT_EQ( localArrayView.getElement( i ), i + 1 ); EXPECT_EQ( this->distributedArray.getElement( gi ), i + 1 ); @@ -192,8 +240,9 @@ TYPED_TEST( DistributedArrayTest, copyConstructor ) this->distributedArray.setValue( 1 ); DistributedArrayType copy( this->distributedArray ); - // Array has "binding" copy-constructor - //EXPECT_EQ( copy.getLocalView().getData(), this->distributedArray.getLocalView().getData() ); + // no binding, but deep copy + EXPECT_NE( copy.getLocalView().getData(), this->distributedArray.getLocalView().getData() ); + EXPECT_EQ( copy.getLocalView(), this->distributedArray.getLocalView() ); } TYPED_TEST( DistributedArrayTest, copyAssignment ) @@ -219,7 +268,7 @@ TYPED_TEST( DistributedArrayTest, comparisonOperators ) v.setLike( u ); w.setLike( u ); - for( int i = 0; i < u.getLocalView().getSize(); i ++ ) { + for( int i = 0; i < localRange.getSize(); i ++ ) { const IndexType gi = localRange.getGlobalIndex( i ); u.setElement( gi, i ); v.setElement( gi, i ); @@ -248,7 +297,7 @@ TYPED_TEST( DistributedArrayTest, containsValue ) const auto localRange = this->distributedArray.getLocalRange(); - for( int i = 0; i < this->distributedArray.getLocalView().getSize(); i++ ) { + for( int i = 0; i < localRange.getSize(); i++ ) { const IndexType gi = localRange.getGlobalIndex( i ); this->distributedArray.setElement( gi, i % 10 ); } @@ -266,7 +315,7 @@ TYPED_TEST( DistributedArrayTest, containsOnlyValue ) const auto localRange = this->distributedArray.getLocalRange(); - for( int i = 0; i < 
this->distributedArray.getLocalView().getSize(); i++ ) { + for( int i = 0; i < localRange.getSize(); i++ ) { const IndexType gi = localRange.getGlobalIndex( i ); this->distributedArray.setElement( gi, i % 10 ); } @@ -275,6 +324,7 @@ TYPED_TEST( DistributedArrayTest, containsOnlyValue ) EXPECT_FALSE( this->distributedArray.containsOnlyValue( i ) ); this->distributedArray.setValue( 100 ); + this->distributedArray.waitForSynchronization(); EXPECT_TRUE( this->distributedArray.containsOnlyValue( 100 ) ); } diff --git a/src/UnitTests/Containers/DistributedVectorTest.h b/src/UnitTests/Containers/DistributedVectorTest.h index 2a1834f318fa616d25a77ccccbdb68bb1cc016a4..a90f09506d083db52e4b45ded4c0a49485d9d7e2 100644 --- a/src/UnitTests/Containers/DistributedVectorTest.h +++ b/src/UnitTests/Containers/DistributedVectorTest.h @@ -11,8 +11,6 @@ #include -#include -#include #include #include #include @@ -22,6 +20,7 @@ using namespace TNL; using namespace TNL::Containers; +using namespace TNL::MPI; /* * Light check of DistributedVector. 
@@ -37,31 +36,40 @@ class DistributedVectorTest protected: using RealType = typename DistributedVector::RealType; using DeviceType = typename DistributedVector::DeviceType; - using CommunicatorType = typename DistributedVector::CommunicatorType; using IndexType = typename DistributedVector::IndexType; using DistributedVectorType = DistributedVector; using VectorViewType = typename DistributedVectorType::LocalViewType; - using DistributedVectorView = Containers::DistributedVectorView< RealType, DeviceType, IndexType, CommunicatorType >; + using DistributedVectorView = Containers::DistributedVectorView< RealType, DeviceType, IndexType >; using HostDistributedVectorType = typename DistributedVectorType::template Self< RealType, Devices::Sequential >; - const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + const MPI_Comm group = AllGroup(); DistributedVectorType v; DistributedVectorView v_view; HostDistributedVectorType v_host; - const int rank = CommunicatorType::GetRank(group); - const int nproc = CommunicatorType::GetSize(group); + const int rank = GetRank(group); + const int nproc = GetSize(group); // should be small enough to have fast tests, but large enough to test // scan with multiple CUDA grids const int globalSize = 10000 * nproc; + // some arbitrary even value (but must be 0 if not distributed) + const int ghosts = (nproc > 1) ? 
4 : 0; + DistributedVectorTest() { using LocalRangeType = typename DistributedVector::LocalRangeType; - const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); - v.setDistribution( localRange, globalSize, group ); + const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group ); + v.setDistribution( localRange, ghosts, globalSize, group ); + + using Synchronizer = typename Partitioner< IndexType >::template ArraySynchronizer< DeviceType >; + using HostSynchronizer = typename Partitioner< IndexType >::template ArraySynchronizer< Devices::Sequential >; + v.setSynchronizer( std::make_shared( localRange, ghosts / 2, group ) ); + v_view.setSynchronizer( v.getSynchronizer() ); + v_host.setSynchronizer( std::make_shared( localRange, ghosts / 2, group ) ); + v_view.bind( v ); setConstantSequence( v, 1 ); } @@ -69,17 +77,17 @@ protected: // types for which DistributedVectorTest is instantiated using DistributedVectorTypes = ::testing::Types< - DistributedVector< double, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedVector< double, Devices::Host, int, Communicators::NoDistrCommunicator > + DistributedVector< double, Devices::Host, int > #ifdef HAVE_CUDA , - DistributedVector< double, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVector< double, Devices::Cuda, int, Communicators::NoDistrCommunicator > + DistributedVector< double, Devices::Cuda, int > #endif >; TYPED_TEST_SUITE( DistributedVectorTest, DistributedVectorTypes ); +// TODO: test that horizontal operations are computed for ghost values without synchronization + TYPED_TEST( DistributedVectorTest, scan ) { using RealType = typename TestFixture::DistributedVectorType::RealType; diff --git a/src/UnitTests/Containers/VectorBinaryOperationsTest.h b/src/UnitTests/Containers/VectorBinaryOperationsTest.h index d0979845376f9addb314ef0dc5f4beca86785126..b79b675cf7237950be48d68992f7bcd8c794b01d 100644 
--- a/src/UnitTests/Containers/VectorBinaryOperationsTest.h +++ b/src/UnitTests/Containers/VectorBinaryOperationsTest.h @@ -13,11 +13,10 @@ #ifdef HAVE_GTEST #if defined(DISTRIBUTED_VECTOR) - #include - #include #include #include #include + using namespace TNL::MPI; #elif defined(STATIC_VECTOR) #include #else @@ -62,11 +61,16 @@ protected: using RightReal = std::remove_const_t< typename Right::RealType >; #ifndef STATIC_VECTOR #ifdef DISTRIBUTED_VECTOR - using CommunicatorType = typename Left::CommunicatorType; - static_assert( std::is_same< typename Right::CommunicatorType, CommunicatorType >::value, - "CommunicatorType must be the same for both Left and Right vectors." ); - using LeftVector = DistributedVector< LeftReal, typename Left::DeviceType, typename Left::IndexType, CommunicatorType >; - using RightVector = DistributedVector< RightReal, typename Right::DeviceType, typename Right::IndexType, CommunicatorType >; + using LeftVector = DistributedVector< LeftReal, typename Left::DeviceType, typename Left::IndexType >; + using RightVector = DistributedVector< RightReal, typename Right::DeviceType, typename Right::IndexType >; + + const MPI_Comm group = AllGroup(); + + const int rank = GetRank(group); + const int nproc = GetSize(group); + + // some arbitrary even value (but must be 0 if not distributed) + const int ghosts = (nproc > 1) ? 
4 : 0; #else using LeftVector = Vector< LeftReal, typename Left::DeviceType, typename Left::IndexType >; using RightVector = Vector< RightReal, typename Right::DeviceType, typename Right::IndexType >; @@ -90,14 +94,20 @@ protected: R2 = 2; #else #ifdef DISTRIBUTED_VECTOR - const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; using LocalRangeType = typename LeftVector::LocalRangeType; - const LocalRangeType localRange = Partitioner< typename Left::IndexType, CommunicatorType >::splitRange( size, group ); - - _L1.setDistribution( localRange, size, group ); - _L2.setDistribution( localRange, size, group ); - _R1.setDistribution( localRange, size, group ); - _R2.setDistribution( localRange, size, group ); + using Synchronizer = typename Partitioner< typename Left::IndexType >::template ArraySynchronizer< typename Left::DeviceType >; + const LocalRangeType localRange = Partitioner< typename Left::IndexType >::splitRange( size, group ); + + _L1.setDistribution( localRange, ghosts, size, group ); + _L2.setDistribution( localRange, ghosts, size, group ); + _R1.setDistribution( localRange, ghosts, size, group ); + _R2.setDistribution( localRange, ghosts, size, group ); + + auto synchronizer = std::make_shared( localRange, ghosts / 2, group ); + _L1.setSynchronizer( synchronizer ); + _L2.setSynchronizer( synchronizer ); + _R1.setSynchronizer( synchronizer ); + _R2.setSynchronizer( synchronizer ); #else _L1.setSize( size ); _L2.setSize( size ); @@ -147,40 +157,23 @@ protected: #if defined(DISTRIBUTED_VECTOR) using VectorPairs = ::testing::Types< #ifndef HAVE_CUDA - Pair< DistributedVector< int, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedVector< short, Devices::Host, int, Communicators::MpiCommunicator > >, - Pair< DistributedVector< int, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedVectorView< short, Devices::Host, int, Communicators::MpiCommunicator > >, - Pair< DistributedVectorView< int, 
Devices::Host, int, Communicators::MpiCommunicator >, - DistributedVector< short, Devices::Host, int, Communicators::MpiCommunicator > >, - Pair< DistributedVectorView< int, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedVectorView< short, Devices::Host, int, Communicators::MpiCommunicator > >, - - Pair< DistributedVector< int, Devices::Host, int, Communicators::NoDistrCommunicator >, - DistributedVector< short, Devices::Host, int, Communicators::NoDistrCommunicator > >, - Pair< DistributedVector< int, Devices::Host, int, Communicators::NoDistrCommunicator >, - DistributedVectorView< short, Devices::Host, int, Communicators::NoDistrCommunicator > >, - Pair< DistributedVectorView< int, Devices::Host, int, Communicators::NoDistrCommunicator >, - DistributedVector< short, Devices::Host, int, Communicators::NoDistrCommunicator > >, - Pair< DistributedVectorView< int, Devices::Host, int, Communicators::NoDistrCommunicator >, - DistributedVectorView< short, Devices::Host, int, Communicators::NoDistrCommunicator > > + Pair< DistributedVector< int, Devices::Host, int >, + DistributedVector< short, Devices::Host, int > >, + Pair< DistributedVector< int, Devices::Host, int >, + DistributedVectorView< short, Devices::Host, int > >, + Pair< DistributedVectorView< int, Devices::Host, int >, + DistributedVector< short, Devices::Host, int > >, + Pair< DistributedVectorView< int, Devices::Host, int >, + DistributedVectorView< short, Devices::Host, int > > #else - Pair< DistributedVector< int, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVector< short, Devices::Cuda, int, Communicators::MpiCommunicator > >, - Pair< DistributedVector< int, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVectorView< short, Devices::Cuda, int, Communicators::MpiCommunicator > >, - Pair< DistributedVectorView< int, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVector< short, Devices::Cuda, int, 
Communicators::MpiCommunicator > >, - Pair< DistributedVectorView< int, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVectorView< short, Devices::Cuda, int, Communicators::MpiCommunicator > >, - Pair< DistributedVector< int, Devices::Cuda, int, Communicators::NoDistrCommunicator >, - DistributedVector< short, Devices::Cuda, int, Communicators::NoDistrCommunicator > >, - Pair< DistributedVector< int, Devices::Cuda, int, Communicators::NoDistrCommunicator >, - DistributedVectorView< short, Devices::Cuda, int, Communicators::NoDistrCommunicator > >, - Pair< DistributedVectorView< int, Devices::Cuda, int, Communicators::NoDistrCommunicator >, - DistributedVector< short, Devices::Cuda, int, Communicators::NoDistrCommunicator > >, - Pair< DistributedVectorView< int, Devices::Cuda, int, Communicators::NoDistrCommunicator >, - DistributedVectorView< short, Devices::Cuda, int, Communicators::NoDistrCommunicator > > + Pair< DistributedVector< int, Devices::Cuda, int >, + DistributedVector< short, Devices::Cuda, int > >, + Pair< DistributedVector< int, Devices::Cuda, int >, + DistributedVectorView< short, Devices::Cuda, int > >, + Pair< DistributedVectorView< int, Devices::Cuda, int >, + DistributedVector< short, Devices::Cuda, int > >, + Pair< DistributedVectorView< int, Devices::Cuda, int >, + DistributedVectorView< short, Devices::Cuda, int > > #endif >; #elif defined(STATIC_VECTOR) diff --git a/src/UnitTests/Containers/VectorHelperFunctions.h b/src/UnitTests/Containers/VectorHelperFunctions.h index 649de1cee9c70bc6a0989e6729543c4c23a1744b..32f2d52ba7d8cdba98eb671e57c5a7ae1de64583 100644 --- a/src/UnitTests/Containers/VectorHelperFunctions.h +++ b/src/UnitTests/Containers/VectorHelperFunctions.h @@ -2,6 +2,7 @@ #include #include +#include template< typename Vector > void setLinearSequence( Vector& deviceVector ) @@ -9,15 +10,17 @@ void setLinearSequence( Vector& deviceVector ) #ifdef STATIC_VECTOR Vector a; #else - using HostVector = typename 
Vector::template Self< typename Vector::RealType, TNL::Devices::Host >; + using HostVector = typename Vector::template Self< typename Vector::ValueType, TNL::Devices::Host >; HostVector a; a.setLike( deviceVector ); #endif #ifdef DISTRIBUTED_VECTOR - for( int i = 0; i < a.getLocalView().getSize(); i++ ) { + for( int i = 0; i < a.getLocalRange().getSize(); i++ ) { const auto gi = a.getLocalRange().getGlobalIndex( i ); a[ gi ] = gi; } + for( int i = a.getLocalRange().getSize(); i < a.getLocalView().getSize(); i++ ) + a.getLocalView()[ i ] = -1; // dummy ghost value #else for( int i = 0; i < a.getSize(); i++ ) a[ i ] = i; @@ -62,10 +65,12 @@ void setNegativeLinearSequence( Vector& deviceVector ) HostVector a; a.setLike( deviceVector ); #ifdef DISTRIBUTED_VECTOR - for( int i = 0; i < a.getLocalView().getSize(); i++ ) { + for( int i = 0; i < a.getLocalRange().getSize(); i++ ) { const auto gi = a.getLocalRange().getGlobalIndex( i ); a[ gi ] = -gi; } + for( int i = a.getLocalRange().getSize(); i < a.getLocalView().getSize(); i++ ) + a.getLocalView()[ i ] = 1; // dummy ghost value #else for( int i = 0; i < a.getSize(); i++ ) a[ i ] = -i; @@ -85,10 +90,12 @@ void setOscilatingSequence( Vector& deviceVector, a.setLike( deviceVector ); #endif #ifdef DISTRIBUTED_VECTOR - for( int i = 0; i < a.getLocalView().getSize(); i++ ) { + for( int i = 0; i < a.getLocalRange().getSize(); i++ ) { const auto gi = a.getLocalRange().getGlobalIndex( i ); a[ gi ] = v * std::pow( -1, gi ); } + for( int i = a.getLocalRange().getSize(); i < a.getLocalView().getSize(); i++ ) + a.getLocalView()[ i ] = 42; // dummy ghost value #else for( int i = 0; i < a.getSize(); i++ ) a[ i ] = v * std::pow( -1, i ); diff --git a/src/UnitTests/Containers/VectorUnaryOperationsTest.h b/src/UnitTests/Containers/VectorUnaryOperationsTest.h index a5beb58d96063d70658e3e317d99275d35036eb9..485265e4e9b0fa43b99b6c84fbf1da7d119b0e82 100644 --- a/src/UnitTests/Containers/VectorUnaryOperationsTest.h +++ 
b/src/UnitTests/Containers/VectorUnaryOperationsTest.h @@ -13,11 +13,10 @@ #ifdef HAVE_GTEST #if defined(DISTRIBUTED_VECTOR) - #include - #include #include #include #include + using namespace TNL::MPI; #elif defined(STATIC_VECTOR) #include #else @@ -52,10 +51,17 @@ protected: #else using NonConstReal = std::remove_const_t< typename VectorOrView::RealType >; #ifdef DISTRIBUTED_VECTOR - using CommunicatorType = typename VectorOrView::CommunicatorType; - using VectorType = DistributedVector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType, CommunicatorType >; + using VectorType = DistributedVector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType >; template< typename Real > - using Vector = DistributedVector< Real, typename VectorOrView::DeviceType, typename VectorOrView::IndexType, CommunicatorType >; + using Vector = DistributedVector< Real, typename VectorOrView::DeviceType, typename VectorOrView::IndexType >; + + const MPI_Comm group = AllGroup(); + + const int rank = GetRank(group); + const int nproc = GetSize(group); + + // some arbitrary even value (but must be 0 if not distributed) + const int ghosts = (nproc > 1) ? 
4 : 0; #else using VectorType = Containers::Vector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType >; template< typename Real > @@ -68,19 +74,13 @@ protected: #if defined(DISTRIBUTED_VECTOR) using VectorTypes = ::testing::Types< #ifndef HAVE_CUDA - DistributedVector< double, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedVectorView< double, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedVectorView< const double, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedVector< double, Devices::Host, int, Communicators::NoDistrCommunicator >, - DistributedVectorView< double, Devices::Host, int, Communicators::NoDistrCommunicator >, - DistributedVectorView< const double, Devices::Host, int, Communicators::NoDistrCommunicator > + DistributedVector< double, Devices::Host, int >, + DistributedVectorView< double, Devices::Host, int >, + DistributedVectorView< const double, Devices::Host, int > #else - DistributedVector< double, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVectorView< double, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVectorView< const double, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVector< double, Devices::Cuda, int, Communicators::NoDistrCommunicator >, - DistributedVectorView< double, Devices::Cuda, int, Communicators::NoDistrCommunicator >, - DistributedVectorView< const double, Devices::Cuda, int, Communicators::NoDistrCommunicator > + DistributedVector< double, Devices::Cuda, int >, + DistributedVectorView< double, Devices::Cuda, int >, + DistributedVectorView< const double, Devices::Cuda, int > #endif >; #elif defined(STATIC_VECTOR) @@ -173,14 +173,17 @@ TYPED_TEST_SUITE( VectorUnaryOperationsTest, VectorTypes ); #define SETUP_UNARY_VECTOR_TEST( size ) \ using VectorType = typename TestFixture::VectorType; \ using VectorOrView = typename TestFixture::VectorOrView; \ - using CommunicatorType = 
typename VectorOrView::CommunicatorType; \ - const auto group = CommunicatorType::AllGroup; \ using LocalRangeType = typename VectorOrView::LocalRangeType; \ - const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType, CommunicatorType >::splitRange( size, group ); \ + const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType >::splitRange( size, this->group ); \ + using Synchronizer = typename Partitioner< typename VectorOrView::IndexType >::template ArraySynchronizer< typename VectorOrView::DeviceType >; \ \ VectorType _V1, _V2; \ - _V1.setDistribution( localRange, size, group ); \ - _V2.setDistribution( localRange, size, group ); \ + _V1.setDistribution( localRange, this->ghosts, size, this->group ); \ + _V2.setDistribution( localRange, this->ghosts, size, this->group ); \ + \ + auto _synchronizer = std::make_shared( localRange, this->ghosts / 2, this->group ); \ + _V1.setSynchronizer( _synchronizer ); \ + _V2.setSynchronizer( _synchronizer ); \ \ _V1 = 1; \ _V2 = 2; \ @@ -194,15 +197,14 @@ TYPED_TEST_SUITE( VectorUnaryOperationsTest, VectorTypes ); EXPECTED_VECTOR( TestFixture, function ); \ using HostVector = typename VectorType::template Self< RealType, Devices::Host >; \ using HostExpectedVector = typename ExpectedVector::template Self< typename ExpectedVector::RealType, Devices::Host >; \ - using CommunicatorType = typename VectorOrView::CommunicatorType; \ - const auto group = CommunicatorType::AllGroup; \ using LocalRangeType = typename VectorOrView::LocalRangeType; \ - const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType, CommunicatorType >::splitRange( size, group ); \ + const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType >::splitRange( size, this->group ); \ + using Synchronizer = typename Partitioner< typename VectorOrView::IndexType >::template ArraySynchronizer< typename VectorOrView::DeviceType >; \ \ HostVector _V1h; \ HostExpectedVector expected_h; 
\ - _V1h.setDistribution( localRange, size, group ); \ - expected_h.setDistribution( localRange, size, group ); \ + _V1h.setDistribution( localRange, this->ghosts, size, this->group ); \ + expected_h.setDistribution( localRange, this->ghosts, size, this->group ); \ \ const double h = (double) (end - begin) / size; \ for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) \ @@ -211,10 +213,17 @@ TYPED_TEST_SUITE( VectorUnaryOperationsTest, VectorTypes ); _V1h[ i ] = x; \ expected_h[ i ] = function(x); \ } \ + for( int i = localRange.getSize(); i < _V1h.getLocalView().getSize(); i++ ) \ + _V1h.getLocalView()[ i ] = expected_h.getLocalView()[ i ] = 0; \ \ VectorType _V1; _V1 = _V1h; \ VectorOrView V1( _V1 ); \ ExpectedVector expected; expected = expected_h; \ + \ + auto _synchronizer = std::make_shared( localRange, this->ghosts / 2, this->group ); \ + _V1.setSynchronizer( _synchronizer ); \ + expected.setSynchronizer( _synchronizer ); \ + expected.startSynchronization(); \ #else #define SETUP_UNARY_VECTOR_TEST( size ) \ @@ -270,11 +279,8 @@ void expect_vectors_near( const Left& _v1, const Right& _v2 ) using LeftNonConstReal = Expressions::RemoveET< std::remove_const_t< typename Left::RealType > >; using RightNonConstReal = Expressions::RemoveET< std::remove_const_t< typename Right::RealType > >; #ifdef DISTRIBUTED_VECTOR - using CommunicatorType = typename Left::CommunicatorType; - static_assert( std::is_same< typename Right::CommunicatorType, CommunicatorType >::value, - "CommunicatorType must be the same for both Left and Right vectors." 
); - using LeftVector = DistributedVector< LeftNonConstReal, typename Left::DeviceType, typename Left::IndexType, CommunicatorType >; - using RightVector = DistributedVector< RightNonConstReal, typename Right::DeviceType, typename Right::IndexType, CommunicatorType >; + using LeftVector = DistributedVector< LeftNonConstReal, typename Left::DeviceType, typename Left::IndexType >; + using RightVector = DistributedVector< RightNonConstReal, typename Right::DeviceType, typename Right::IndexType >; #else using LeftVector = Vector< LeftNonConstReal, typename Left::DeviceType, typename Left::IndexType >; using RightVector = Vector< RightNonConstReal, typename Right::DeviceType, typename Right::IndexType >; diff --git a/src/UnitTests/Containers/VectorVerticalOperationsTest.h b/src/UnitTests/Containers/VectorVerticalOperationsTest.h index 3aa60e6123b06e8b59eccffb967cc157c786c2fa..f73b502ccc9ee12bb812acf963b680991c11a877 100644 --- a/src/UnitTests/Containers/VectorVerticalOperationsTest.h +++ b/src/UnitTests/Containers/VectorVerticalOperationsTest.h @@ -13,11 +13,10 @@ #ifdef HAVE_GTEST #if defined(DISTRIBUTED_VECTOR) - #include - #include #include #include #include + using namespace TNL::MPI; #elif defined(STATIC_VECTOR) #include #else @@ -53,10 +52,17 @@ protected: #else using NonConstReal = std::remove_const_t< typename VectorOrView::RealType >; #ifdef DISTRIBUTED_VECTOR - using CommunicatorType = typename VectorOrView::CommunicatorType; - using VectorType = DistributedVector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType, CommunicatorType >; + using VectorType = DistributedVector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType >; template< typename Real > - using Vector = DistributedVector< Real, typename VectorOrView::DeviceType, typename VectorOrView::IndexType, CommunicatorType >; + using Vector = DistributedVector< Real, typename VectorOrView::DeviceType, typename VectorOrView::IndexType >; + + 
const MPI_Comm group = AllGroup(); + + const int rank = GetRank(group); + const int nproc = GetSize(group); + + // some arbitrary value (but must be 0 if not distributed) + const int ghosts = (nproc > 1) ? 4 : 0; #else using VectorType = Containers::Vector< NonConstReal, typename VectorOrView::DeviceType, typename VectorOrView::IndexType >; template< typename Real > @@ -76,11 +82,11 @@ protected: setLinearSequence( V1 ); #else #ifdef DISTRIBUTED_VECTOR - const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; using LocalRangeType = typename VectorOrView::LocalRangeType; - const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType, CommunicatorType >::splitRange( size, group ); - - _V1.setDistribution( localRange, size, group ); + using Synchronizer = typename Partitioner< typename VectorOrView::IndexType >::template ArraySynchronizer< typename VectorOrView::DeviceType >; + const LocalRangeType localRange = Partitioner< typename VectorOrView::IndexType >::splitRange( size, group ); + _V1.setDistribution( localRange, ghosts, size, group ); + _V1.setSynchronizer( std::make_shared( localRange, ghosts / 2, group ) ); #else _V1.setSize( size ); #endif @@ -104,19 +110,13 @@ protected: #if defined(DISTRIBUTED_VECTOR) using VectorTypes = ::testing::Types< #ifndef HAVE_CUDA - DistributedVector< double, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedVectorView< double, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedVectorView< const double, Devices::Host, int, Communicators::MpiCommunicator >, - DistributedVector< double, Devices::Host, int, Communicators::NoDistrCommunicator >, - DistributedVectorView< double, Devices::Host, int, Communicators::NoDistrCommunicator >, - DistributedVectorView< const double, Devices::Host, int, Communicators::NoDistrCommunicator > + DistributedVector< double, Devices::Host, int >, + DistributedVectorView< double, Devices::Host, int >, + 
DistributedVectorView< const double, Devices::Host, int > #else - DistributedVector< double, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVectorView< double, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVectorView< const double, Devices::Cuda, int, Communicators::MpiCommunicator >, - DistributedVector< double, Devices::Cuda, int, Communicators::NoDistrCommunicator >, - DistributedVectorView< double, Devices::Cuda, int, Communicators::NoDistrCommunicator >, - DistributedVectorView< const double, Devices::Cuda, int, Communicators::NoDistrCommunicator > + DistributedVector< double, Devices::Cuda, int >, + DistributedVectorView< double, Devices::Cuda, int >, + DistributedVectorView< const double, Devices::Cuda, int > #endif >; #elif defined(STATIC_VECTOR) diff --git a/src/UnitTests/Containers/ndarray/CMakeLists.txt b/src/UnitTests/Containers/ndarray/CMakeLists.txt index 5be285b5e56a9857509dd367e9246a4a99a5dbaf..f5fb11bdfa28777655558d0a36361a107ba8cd3e 100644 --- a/src/UnitTests/Containers/ndarray/CMakeLists.txt +++ b/src/UnitTests/Containers/ndarray/CMakeLists.txt @@ -58,13 +58,17 @@ if( ${BUILD_MPI} ) SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArray_1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedNDArray_1D_test COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedNDArray_1D_test_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArray_1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArray_semi1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedNDArray_semi1D_test COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedNDArray_semi1D_test_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArray_semi1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) SET( mpi_test_parameters -np 4 -H localhost:4 
"${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArrayOverlaps_1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedNDArrayOverlaps_1D_test COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedNDArrayOverlaps_1D_test_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArrayOverlaps_1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArrayOverlaps_semi1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedNDArrayOverlaps_semi1D_test COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedNDArrayOverlaps_semi1D_test_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedNDArrayOverlaps_semi1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) endif() diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h index 7377cbff2450debc69d05cc3e1fa69b7b20ba8e8..36c4ea5b7039974867f15697dc5c49f54bdcddd6 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h @@ -9,9 +9,6 @@ #ifdef HAVE_GTEST #include -#include -#include -#include #include #include #include @@ -35,7 +32,6 @@ class DistributedNDArrayOverlaps_1D_test protected: using ValueType = typename DistributedNDArray::ValueType; using DeviceType = typename DistributedNDArray::DeviceType; - using CommunicatorType = typename DistributedNDArray::CommunicatorType; using IndexType = typename DistributedNDArray::IndexType; using DistributedNDArrayType = DistributedNDArray; @@ -46,17 +42,17 @@ protected: const int globalSize = 97; // prime number to force non-uniform distribution const int overlaps = __ndarray_impl::get< 0 >( typename DistributedNDArray::OverlapsType{} ); - const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + const MPI_Comm group = TNL::MPI::AllGroup(); 
DistributedNDArrayType distributedNDArray; - const int rank = CommunicatorType::GetRank(group); - const int nproc = CommunicatorType::GetSize(group); + const int rank = TNL::MPI::GetRank(group); + const int nproc = TNL::MPI::GetSize(group); DistributedNDArrayOverlaps_1D_test() { using LocalRangeType = typename DistributedNDArray::LocalRangeType; - const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); + const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group ); distributedNDArray.setSizes( globalSize ); distributedNDArray.template setDistribution< 0 >( localRange.getBegin(), localRange.getEnd(), group ); distributedNDArray.allocate(); @@ -72,30 +68,14 @@ using DistributedNDArrayTypes = ::testing::Types< SizesHolder< int, 0 >, std::index_sequence< 0 >, Devices::Host >, - Communicators::MpiCommunicator, std::index_sequence< 2 > > -// TODO: does it make sense for NoDistrCommunicator? -// DistributedNDArray< NDArray< double, -// SizesHolder< int, 0 >, -// std::index_sequence< 0 >, -// Devices::Host >, -// Communicators::NoDistrCommunicator, -// std::index_sequence< 2 > > #ifdef HAVE_CUDA , DistributedNDArray< NDArray< double, SizesHolder< int, 0 >, std::index_sequence< 0 >, Devices::Cuda >, - Communicators::MpiCommunicator, std::index_sequence< 2 > > -// TODO: does it make sense for NoDistrCommunicator? 
-// DistributedNDArray< NDArray< double, -// SizesHolder< int, 0 >, -// std::index_sequence< 0 >, -// Devices::Cuda >, -// Communicators::NoDistrCommunicator, -// std::index_sequence< 2 > > #endif >; @@ -103,12 +83,10 @@ TYPED_TEST_SUITE( DistributedNDArrayOverlaps_1D_test, DistributedNDArrayTypes ); TYPED_TEST( DistributedNDArrayOverlaps_1D_test, checkSumOfLocalSizes ) { - using CommunicatorType = typename TestFixture::CommunicatorType; - const auto localRange = this->distributedNDArray.template getLocalRange< 0 >(); const int localSize = localRange.getEnd() - localRange.getBegin(); int sumOfLocalSizes = 0; - CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); + TNL::MPI::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); EXPECT_EQ( sumOfLocalSizes, this->globalSize ); EXPECT_EQ( this->distributedNDArray.template getSize< 0 >(), this->globalSize ); diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h index f1ac970eb2d26f268cc5bd294b05ffd5dcb78551..0b6838639f0c689f132fb4077c275f5edb6e0d92 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h @@ -9,9 +9,6 @@ #ifdef HAVE_GTEST #include -#include -#include -#include #include #include #include @@ -35,7 +32,6 @@ class DistributedNDArrayOverlaps_semi1D_test protected: using ValueType = typename DistributedNDArray::ValueType; using DeviceType = typename DistributedNDArray::DeviceType; - using CommunicatorType = typename DistributedNDArray::CommunicatorType; using IndexType = typename DistributedNDArray::IndexType; using DistributedNDArrayType = DistributedNDArray; @@ -46,17 +42,17 @@ protected: const int globalSize = 97; // prime number to force non-uniform distribution const int overlaps = __ndarray_impl::get< 1 >( typename 
DistributedNDArray::OverlapsType{} ); - const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + const MPI_Comm group = TNL::MPI::AllGroup(); DistributedNDArrayType distributedNDArray; - const int rank = CommunicatorType::GetRank(group); - const int nproc = CommunicatorType::GetSize(group); + const int rank = TNL::MPI::GetRank(group); + const int nproc = TNL::MPI::GetSize(group); DistributedNDArrayOverlaps_semi1D_test() { using LocalRangeType = typename DistributedNDArray::LocalRangeType; - const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); + const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group ); distributedNDArray.setSizes( 0, globalSize, globalSize / 2 ); distributedNDArray.template setDistribution< 1 >( localRange.getBegin(), localRange.getEnd(), group ); distributedNDArray.allocate(); @@ -72,7 +68,6 @@ using DistributedNDArrayTypes = ::testing::Types< SizesHolder< int, 9, 0, 0 >, // Q, X, Y std::index_sequence< 0, 1, 2 >, // permutation - should not matter Devices::Host >, - Communicators::MpiCommunicator, std::index_sequence< 0, 2, 0 > > #ifdef HAVE_CUDA , @@ -80,7 +75,6 @@ using DistributedNDArrayTypes = ::testing::Types< SizesHolder< int, 9, 0, 0 >, // Q, X, Y std::index_sequence< 0, 1, 2 >, // permutation - should not matter Devices::Cuda >, - Communicators::MpiCommunicator, std::index_sequence< 0, 2, 0 > > #endif >; @@ -89,12 +83,10 @@ TYPED_TEST_SUITE( DistributedNDArrayOverlaps_semi1D_test, DistributedNDArrayType TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, checkSumOfLocalSizes ) { - using CommunicatorType = typename TestFixture::CommunicatorType; - const auto localRange = this->distributedNDArray.template getLocalRange< 1 >(); const int localSize = localRange.getEnd() - localRange.getBegin(); int sumOfLocalSizes = 0; - CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); + 
TNL::MPI::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); EXPECT_EQ( sumOfLocalSizes, this->globalSize ); EXPECT_EQ( this->distributedNDArray.template getSize< 1 >(), this->globalSize ); diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h index a8d3bcdab1cf911cf5386d050e0cc29636340a86..e5519297133d8630e7a6b7c1ed1e68b828e8bd4c 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h @@ -9,9 +9,6 @@ #ifdef HAVE_GTEST #include -#include -#include -#include #include #include #include @@ -34,7 +31,6 @@ class DistributedNDArray_1D_test protected: using ValueType = typename DistributedNDArray::ValueType; using DeviceType = typename DistributedNDArray::DeviceType; - using CommunicatorType = typename DistributedNDArray::CommunicatorType; using IndexType = typename DistributedNDArray::IndexType; using DistributedNDArrayType = DistributedNDArray; @@ -44,17 +40,17 @@ protected: const int globalSize = 97; // prime number to force non-uniform distribution - const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + const MPI_Comm group = TNL::MPI::AllGroup(); DistributedNDArrayType distributedNDArray; - const int rank = CommunicatorType::GetRank(group); - const int nproc = CommunicatorType::GetSize(group); + const int rank = TNL::MPI::GetRank(group); + const int nproc = TNL::MPI::GetSize(group); DistributedNDArray_1D_test() { using LocalRangeType = typename DistributedNDArray::LocalRangeType; - const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); + const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group ); distributedNDArray.setSizes( globalSize ); distributedNDArray.template setDistribution< 0 >( localRange.getBegin(), localRange.getEnd(), group ); 
distributedNDArray.allocate(); @@ -69,25 +65,13 @@ using DistributedNDArrayTypes = ::testing::Types< DistributedNDArray< NDArray< double, SizesHolder< int, 0 >, std::index_sequence< 0 >, - Devices::Host >, - Communicators::MpiCommunicator >, - DistributedNDArray< NDArray< double, - SizesHolder< int, 0 >, - std::index_sequence< 0 >, - Devices::Host >, - Communicators::NoDistrCommunicator > + Devices::Host > > #ifdef HAVE_CUDA , DistributedNDArray< NDArray< double, SizesHolder< int, 0 >, std::index_sequence< 0 >, - Devices::Cuda >, - Communicators::MpiCommunicator >, - DistributedNDArray< NDArray< double, - SizesHolder< int, 0 >, - std::index_sequence< 0 >, - Devices::Cuda >, - Communicators::NoDistrCommunicator > + Devices::Cuda > > #endif >; @@ -95,12 +79,10 @@ TYPED_TEST_SUITE( DistributedNDArray_1D_test, DistributedNDArrayTypes ); TYPED_TEST( DistributedNDArray_1D_test, checkSumOfLocalSizes ) { - using CommunicatorType = typename TestFixture::CommunicatorType; - const auto localRange = this->distributedNDArray.template getLocalRange< 0 >(); const int localSize = localRange.getEnd() - localRange.getBegin(); int sumOfLocalSizes = 0; - CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); + TNL::MPI::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); EXPECT_EQ( sumOfLocalSizes, this->globalSize ); EXPECT_EQ( this->distributedNDArray.template getSize< 0 >(), this->globalSize ); } diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h index 6f777c215aab824f8d7a2c2916bd459845194e55..e3cbb3223c9e411105a019dcefd4ba29c047c179 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h @@ -9,9 +9,6 @@ #ifdef HAVE_GTEST #include -#include -#include -#include #include #include #include @@ -34,7 +31,6 @@ class DistributedNDArray_semi1D_test 
protected: using ValueType = typename DistributedNDArray::ValueType; using DeviceType = typename DistributedNDArray::DeviceType; - using CommunicatorType = typename DistributedNDArray::CommunicatorType; using IndexType = typename DistributedNDArray::IndexType; using DistributedNDArrayType = DistributedNDArray; @@ -44,17 +40,17 @@ protected: const int globalSize = 97; // prime number to force non-uniform distribution - const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + const MPI_Comm group = TNL::MPI::AllGroup(); DistributedNDArrayType distributedNDArray; - const int rank = CommunicatorType::GetRank(group); - const int nproc = CommunicatorType::GetSize(group); + const int rank = TNL::MPI::GetRank(group); + const int nproc = TNL::MPI::GetSize(group); DistributedNDArray_semi1D_test() { using LocalRangeType = typename DistributedNDArray::LocalRangeType; - const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); + const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group ); distributedNDArray.setSizes( 0, globalSize, globalSize / 2 ); distributedNDArray.template setDistribution< 1 >( localRange.getBegin(), localRange.getEnd(), group ); distributedNDArray.allocate(); @@ -69,15 +65,13 @@ using DistributedNDArrayTypes = ::testing::Types< DistributedNDArray< NDArray< double, SizesHolder< int, 9, 0, 0 >, // Q, X, Y, Z std::index_sequence< 0, 1, 2 >, // permutation - should not matter - Devices::Host >, - Communicators::MpiCommunicator > + Devices::Host > > #ifdef HAVE_CUDA , DistributedNDArray< NDArray< double, SizesHolder< int, 9, 0, 0 >, // Q, X, Y, Z std::index_sequence< 0, 1, 2 >, // permutation - should not matter - Devices::Cuda >, - Communicators::NoDistrCommunicator > + Devices::Cuda > > #endif >; @@ -85,12 +79,10 @@ TYPED_TEST_SUITE( DistributedNDArray_semi1D_test, DistributedNDArrayTypes ); TYPED_TEST( DistributedNDArray_semi1D_test, 
checkSumOfLocalSizes ) { - using CommunicatorType = typename TestFixture::CommunicatorType; - const auto localRange = this->distributedNDArray.template getLocalRange< 1 >(); const int localSize = localRange.getEnd() - localRange.getBegin(); int sumOfLocalSizes = 0; - CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); + TNL::MPI::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); EXPECT_EQ( sumOfLocalSizes, this->globalSize ); EXPECT_EQ( this->distributedNDArray.template getSize< 1 >(), this->globalSize ); } diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt index 65723ac889325a97ecc9db56081ce0d901d9e271..b713c8f0ca76d534b8abb903097b77b8bc8bd22b 100644 --- a/src/UnitTests/Matrices/CMakeLists.txt +++ b/src/UnitTests/Matrices/CMakeLists.txt @@ -58,4 +58,5 @@ if( ${BUILD_MPI} ) SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedMatrixTest${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedMatrixTest COMMAND "mpirun" ${mpi_test_parameters}) + ADD_TEST( NAME DistributedMatrixTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedMatrixTest${CMAKE_EXECUTABLE_SUFFIX}" ) endif() diff --git a/src/UnitTests/Matrices/DistributedMatrixTest.h b/src/UnitTests/Matrices/DistributedMatrixTest.h index 30a76f86a60c707926c6f2b7aa88de08964477c0..5e893e111221272912d8c874797b304ab7e68142 100644 --- a/src/UnitTests/Matrices/DistributedMatrixTest.h +++ b/src/UnitTests/Matrices/DistributedMatrixTest.h @@ -9,13 +9,12 @@ #ifdef HAVE_GTEST #include -#include -#include #include #include #include using namespace TNL; +using namespace TNL::MPI; template< typename Vector > void setLinearSequence( Vector& deviceVector, typename Vector::RealType offset = 0 ) @@ -33,7 +32,7 @@ void setLinearSequence( Vector& deviceVector, typename Vector::RealType offset = template< typename Matrix, typename RowCapacities > void setMatrix( Matrix& matrix, const 
RowCapacities& rowCapacities ) { - using HostMatrix = Matrices::DistributedMatrix< typename Matrix::MatrixType::template Self< typename Matrix::RealType, TNL::Devices::Sequential >, typename Matrix::CommunicatorType >; + using HostMatrix = Matrices::DistributedMatrix< typename Matrix::MatrixType::template Self< typename Matrix::RealType, TNL::Devices::Sequential > >; using HostRowCapacities = typename RowCapacities::template Self< typename RowCapacities::RealType, TNL::Devices::Sequential >; HostMatrix hostMatrix; @@ -66,20 +65,19 @@ class DistributedMatrixTest protected: using RealType = typename DistributedMatrix::RealType; using DeviceType = typename DistributedMatrix::DeviceType; - using CommunicatorType = typename DistributedMatrix::CommunicatorType; using IndexType = typename DistributedMatrix::IndexType; using DistributedMatrixType = DistributedMatrix; using RowCapacitiesVector = typename DistributedMatrixType::CompressedRowLengthsVector; using GlobalVector = Containers::Vector< RealType, DeviceType, IndexType >; - using DistributedVector = Containers::DistributedVector< RealType, DeviceType, IndexType, CommunicatorType >; + using DistributedVector = Containers::DistributedVector< RealType, DeviceType, IndexType >; const int globalSize = 97; // prime number to force non-uniform distribution - const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + const MPI_Comm group = AllGroup(); - const int rank = CommunicatorType::GetRank(group); - const int nproc = CommunicatorType::GetSize(group); + const int rank = GetRank(group); + const int nproc = GetSize(group); DistributedMatrixType matrix; @@ -88,9 +86,9 @@ protected: DistributedMatrixTest() { using LocalRangeType = typename DistributedMatrix::LocalRangeType; - const LocalRangeType localRange = Containers::Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); + const LocalRangeType localRange = Containers::Partitioner< IndexType >::splitRange( globalSize, 
group ); matrix.setDistribution( localRange, globalSize, globalSize, group ); - rowCapacities.setDistribution( localRange, globalSize, group ); + rowCapacities.setDistribution( localRange, 0, globalSize, group ); EXPECT_EQ( matrix.getLocalRowRange(), localRange ); EXPECT_EQ( matrix.getCommunicationGroup(), group ); @@ -101,12 +99,10 @@ protected: // types for which DistributedMatrixTest is instantiated using DistributedMatrixTypes = ::testing::Types< - Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Host, int >, Communicators::MpiCommunicator >, - Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Host, int >, Communicators::NoDistrCommunicator > + Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Host, int > > #ifdef HAVE_CUDA , - Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Cuda, int >, Communicators::MpiCommunicator >, - Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Cuda, int >, Communicators::NoDistrCommunicator > + Matrices::DistributedMatrix< Matrices::SparseMatrix< double, Devices::Cuda, int > > #endif >; @@ -114,11 +110,9 @@ TYPED_TEST_SUITE( DistributedMatrixTest, DistributedMatrixTypes ); TYPED_TEST( DistributedMatrixTest, checkSumOfLocalSizes ) { - using CommunicatorType = typename TestFixture::CommunicatorType; - const int localSize = this->matrix.getLocalMatrix().getRows(); int sumOfLocalSizes = 0; - CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); + Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); EXPECT_EQ( sumOfLocalSizes, this->globalSize ); EXPECT_EQ( this->matrix.getRows(), this->globalSize ); } @@ -218,7 +212,7 @@ TYPED_TEST( DistributedMatrixTest, vectorProduct_globalInput ) GlobalVector inVector( this->globalSize ); inVector.setValue( 1 ); - DistributedVector outVector( this->matrix.getLocalRowRange(), this->globalSize, this->matrix.getCommunicationGroup() ); + DistributedVector 
outVector( this->matrix.getLocalRowRange(), 0, this->globalSize, this->matrix.getCommunicationGroup() ); this->matrix.vectorProduct( inVector, outVector ); EXPECT_EQ( outVector, this->rowCapacities ) @@ -233,9 +227,9 @@ TYPED_TEST( DistributedMatrixTest, vectorProduct_distributedInput ) this->matrix.setRowCapacities( this->rowCapacities ); setMatrix( this->matrix, this->rowCapacities ); - DistributedVector inVector( this->matrix.getLocalRowRange(), this->globalSize, this->matrix.getCommunicationGroup() ); + DistributedVector inVector( this->matrix.getLocalRowRange(), 0, this->globalSize, this->matrix.getCommunicationGroup() ); inVector.setValue( 1 ); - DistributedVector outVector( this->matrix.getLocalRowRange(), this->globalSize, this->matrix.getCommunicationGroup() ); + DistributedVector outVector( this->matrix.getLocalRowRange(), 0, this->globalSize, this->matrix.getCommunicationGroup() ); this->matrix.vectorProduct( inVector, outVector ); EXPECT_EQ( outVector, this->rowCapacities ) diff --git a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedGridTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedGridTest.cpp index 54071032c5ceba47386286a7fd30c317859a8da3..dccd68f23b4b8cf678d6a1f156e8f359bd15762b 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedGridTest.cpp +++ b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedGridTest.cpp @@ -1,9 +1,8 @@ -#ifdef HAVE_GTEST +#ifdef HAVE_GTEST #include -#ifdef HAVE_MPI +#ifdef HAVE_MPI -#include #include #include @@ -12,30 +11,25 @@ using namespace TNL::Containers; using namespace TNL::Meshes; using namespace TNL::Meshes::DistributedMeshes; using namespace TNL::Devices; -using namespace TNL::Communicators; -typedef MpiCommunicator CommunicatorType; - -template< - typename MeshType, - typename CommunicatorType> +template< typename MeshType > void SetUpDistributedGrid(DistributedMesh &distributedGrid, MeshType &globalGrid,int size,typename MeshType::CoordinatesType distribution ) { typename 
MeshType::PointType globalOrigin; typename MeshType::PointType globalProportions; using DistributedMeshType = DistributedMesh< MeshType >; - + globalOrigin.setValue( -0.5 ); globalProportions.setValue( size ); globalGrid.setDimensions( size ); globalGrid.setDomain( globalOrigin,globalProportions ); - + distributedGrid.setDomainDecomposition( distribution ); - distributedGrid.template setGlobalGrid(globalGrid); + distributedGrid.setGlobalGrid(globalGrid); typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); } @@ -44,47 +38,47 @@ void SetUpDistributedGrid(DistributedMesh &distributedGrid, MeshType & TEST(CutDistributedGirdTest_2D, IsInCut) { typedef Grid<2,double,Host,int> MeshType; - typedef typename MeshType::CoordinatesType CoordinatesType; + typedef typename MeshType::CoordinatesType CoordinatesType; typedef DistributedMesh DistributedMeshType; typedef Grid<1,double,Host,int> CutGridType; typedef DistributedMesh CutDistributedGridType; MeshType globalGrid; DistributedMeshType distributedGrid; - SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(3,4)); + SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(3,4)); CutDistributedGridType cutDistributedGrid; - bool result=cutDistributedGrid.SetupByCut( + bool result=cutDistributedGrid.SetupByCut( distributedGrid, StaticVector<1,int>(1), StaticVector<1,int>(0), StaticVector<1,int>(5) ); - if(CommunicatorType::GetRank(CommunicatorType::AllGroup)%3==1) + if(TNL::MPI::GetRank()%3==1) { - ASSERT_TRUE(result); + ASSERT_TRUE(result); } else { ASSERT_FALSE(result); - } + } } TEST(CutDistributedGirdTest_2D, GloblaGridDimesion) { typedef Grid<2,double,Host,int> MeshType; - typedef typename 
MeshType::CoordinatesType CoordinatesType; + typedef typename MeshType::CoordinatesType CoordinatesType; typedef DistributedMesh DistributedMeshType; typedef Grid<1,double,Host,int> CutGridType; typedef DistributedMesh CutDistributedGridType; MeshType globalGrid; DistributedMeshType distributedGrid; - SetUpDistributedGrid(distributedGrid, globalGrid, 10, CoordinatesType(3,4)); + SetUpDistributedGrid(distributedGrid, globalGrid, 10, CoordinatesType(3,4)); CutDistributedGridType cutDistributedGrid; - if(cutDistributedGrid.SetupByCut( + if(cutDistributedGrid.SetupByCut( distributedGrid, StaticVector<1,int>(1), StaticVector<1,int>(0), @@ -92,24 +86,24 @@ TEST(CutDistributedGirdTest_2D, GloblaGridDimesion) )) { EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getMeshDimension(),1) << "Dimenze globálního gridu neodpovídajá řezu"; - EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().x(),10) << "Rozměry globálního gridu neodpovídají"; + EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().x(),10) << "Rozměry globálního gridu neodpovídají"; } } TEST(CutDistributedGirdTest_2D, IsDistributed) { typedef Grid<2,double,Host,int> MeshType; - typedef typename MeshType::CoordinatesType CoordinatesType; + typedef typename MeshType::CoordinatesType CoordinatesType; typedef DistributedMesh DistributedMeshType; typedef Grid<1,double,Host,int> CutGridType; typedef DistributedMesh CutDistributedGridType; MeshType globalGrid; DistributedMeshType distributedGrid; - SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(3,4)); + SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(3,4)); CutDistributedGridType cutDistributedGrid; - if(cutDistributedGrid.SetupByCut( + if(cutDistributedGrid.SetupByCut( distributedGrid, StaticVector<1,int>(1), StaticVector<1,int>(0), @@ -123,17 +117,17 @@ TEST(CutDistributedGirdTest_2D, IsDistributed) TEST(CutDistributedGirdTest_2D, IsNotDistributed) { typedef Grid<2,double,Host,int> MeshType; - typedef typename 
MeshType::CoordinatesType CoordinatesType; + typedef typename MeshType::CoordinatesType CoordinatesType; typedef DistributedMesh DistributedMeshType; typedef Grid<1,double,Host,int> CutGridType; typedef DistributedMesh CutDistributedGridType; MeshType globalGrid; DistributedMeshType distributedGrid; - SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(12,1)); + SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(12,1)); CutDistributedGridType cutDistributedGrid; - if(cutDistributedGrid.SetupByCut( + if(cutDistributedGrid.SetupByCut( distributedGrid, StaticVector<1,int>(1), StaticVector<1,int>(0), @@ -149,47 +143,47 @@ TEST(CutDistributedGirdTest_2D, IsNotDistributed) TEST(CutDistributedGirdTest_3D, IsInCut_1D) { typedef Grid<3,double,Host,int> MeshType; - typedef typename MeshType::CoordinatesType CoordinatesType; + typedef typename MeshType::CoordinatesType CoordinatesType; typedef DistributedMesh DistributedMeshType; typedef Grid<1,double,Host,int> CutGridType; typedef DistributedMesh CutDistributedGridType; MeshType globalGrid; DistributedMeshType distributedGrid; - SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3)); + SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3)); CutDistributedGridType cutDistributedGrid; - bool result=cutDistributedGrid.SetupByCut( + bool result=cutDistributedGrid.SetupByCut( distributedGrid, StaticVector<1,int>(2), StaticVector<2,int>(0,1), StaticVector<2,int>(2,2) ); - if(CommunicatorType::GetRank(CommunicatorType::AllGroup)%4==0) + if(TNL::MPI::GetRank()%4==0) { - ASSERT_TRUE(result); + ASSERT_TRUE(result); } else { ASSERT_FALSE(result); - } + } } TEST(CutDistributedGirdTest_3D, GloblaGridDimesion_1D) { typedef Grid<3,double,Host,int> MeshType; - typedef typename MeshType::CoordinatesType CoordinatesType; + typedef typename MeshType::CoordinatesType CoordinatesType; typedef DistributedMesh DistributedMeshType; typedef Grid<1,double,Host,int> 
CutGridType; typedef DistributedMesh CutDistributedGridType; MeshType globalGrid; DistributedMeshType distributedGrid; - SetUpDistributedGrid(distributedGrid, globalGrid, 10, CoordinatesType(2,2,3)); + SetUpDistributedGrid(distributedGrid, globalGrid, 10, CoordinatesType(2,2,3)); CutDistributedGridType cutDistributedGrid; - if(cutDistributedGrid.SetupByCut( + if(cutDistributedGrid.SetupByCut( distributedGrid, StaticVector<1,int>(2), StaticVector<2,int>(0,1), @@ -197,24 +191,24 @@ TEST(CutDistributedGirdTest_3D, GloblaGridDimesion_1D) )) { EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getMeshDimension(),1) << "Dimenze globálního gridu neodpovídajá řezu"; - EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().x(),10) << "Rozměry globálního gridu neodpovídají"; + EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().x(),10) << "Rozměry globálního gridu neodpovídají"; } } TEST(CutDistributedGirdTest_3D, IsDistributed_1D) { typedef Grid<3,double,Host,int> MeshType; - typedef typename MeshType::CoordinatesType CoordinatesType; + typedef typename MeshType::CoordinatesType CoordinatesType; typedef DistributedMesh DistributedMeshType; typedef Grid<1,double,Host,int> CutGridType; typedef DistributedMesh CutDistributedGridType; MeshType globalGrid; DistributedMeshType distributedGrid; - SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3)); + SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3)); CutDistributedGridType cutDistributedGrid; - if(cutDistributedGrid.SetupByCut( + if(cutDistributedGrid.SetupByCut( distributedGrid, StaticVector<1,int>(2), StaticVector<2,int>(0,1), @@ -228,17 +222,17 @@ TEST(CutDistributedGirdTest_3D, IsDistributed_1D) TEST(CutDistributedGirdTest_3D, IsNotDistributed_1D) { typedef Grid<3,double,Host,int> MeshType; - typedef typename MeshType::CoordinatesType CoordinatesType; + typedef typename MeshType::CoordinatesType CoordinatesType; typedef DistributedMesh DistributedMeshType; typedef 
Grid<1,double,Host,int> CutGridType; typedef DistributedMesh CutDistributedGridType; MeshType globalGrid; DistributedMeshType distributedGrid; - SetUpDistributedGrid(distributedGrid,globalGrid, 30, CoordinatesType(12,1,1)); + SetUpDistributedGrid(distributedGrid,globalGrid, 30, CoordinatesType(12,1,1)); CutDistributedGridType cutDistributedGrid; - if(cutDistributedGrid.SetupByCut( + if(cutDistributedGrid.SetupByCut( distributedGrid, StaticVector<1,int>(2), StaticVector<2,int>(0,1), @@ -254,48 +248,48 @@ TEST(CutDistributedGirdTest_3D, IsNotDistributed_1D) TEST(CutDistributedGirdTest_3D, IsInCut_2D) { typedef Grid<3,double,Host,int> MeshType; - typedef typename MeshType::CoordinatesType CoordinatesType; + typedef typename MeshType::CoordinatesType CoordinatesType; typedef DistributedMesh DistributedMeshType; typedef Grid<2,double,Host,int> CutGridType; typedef DistributedMesh CutDistributedGridType; MeshType globalGrid; DistributedMeshType distributedGrid; - SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3)); + SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3)); CutDistributedGridType cutDistributedGrid; - bool result=cutDistributedGrid.SetupByCut( + bool result=cutDistributedGrid.SetupByCut( distributedGrid, StaticVector<2,int>(0,1), StaticVector<1,int>(2), StaticVector<1,int>(5) ); - int rank=CommunicatorType::GetRank(CommunicatorType::AllGroup); + int rank=TNL::MPI::GetRank(); if(rank>3 && rank<8) { - ASSERT_TRUE(result); + ASSERT_TRUE(result); } else { ASSERT_FALSE(result); - } + } } TEST(CutDistributedGirdTest_3D, GloblaGridDimesion_2D) { typedef Grid<3,double,Host,int> MeshType; - typedef typename MeshType::CoordinatesType CoordinatesType; + typedef typename MeshType::CoordinatesType CoordinatesType; typedef DistributedMesh DistributedMeshType; typedef Grid<2,double,Host,int> CutGridType; typedef DistributedMesh CutDistributedGridType; MeshType globalGrid; DistributedMeshType distributedGrid; - 
SetUpDistributedGrid(distributedGrid, globalGrid, 10, CoordinatesType(2,2,3)); + SetUpDistributedGrid(distributedGrid, globalGrid, 10, CoordinatesType(2,2,3)); CutDistributedGridType cutDistributedGrid; - if(cutDistributedGrid.SetupByCut( + if(cutDistributedGrid.SetupByCut( distributedGrid, StaticVector<2,int>(0,1), StaticVector<1,int>(2), @@ -303,7 +297,7 @@ TEST(CutDistributedGirdTest_3D, GloblaGridDimesion_2D) )) { EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getMeshDimension(),2) << "Dimenze globálního gridu neodpovídajá řezu"; - EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().x(),10) << "Rozměry globálního gridu neodpovídají"; + EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().x(),10) << "Rozměry globálního gridu neodpovídají"; EXPECT_EQ(cutDistributedGrid.getGlobalGrid().getDimensions().y(),10) << "Rozměry globálního gridu neodpovídají"; } } @@ -311,17 +305,17 @@ TEST(CutDistributedGirdTest_3D, GloblaGridDimesion_2D) TEST(CutDistributedGirdTest_3D, IsDistributed_2D) { typedef Grid<3,double,Host,int> MeshType; - typedef typename MeshType::CoordinatesType CoordinatesType; + typedef typename MeshType::CoordinatesType CoordinatesType; typedef DistributedMesh DistributedMeshType; typedef Grid<2,double,Host,int> CutGridType; typedef DistributedMesh CutDistributedGridType; MeshType globalGrid; DistributedMeshType distributedGrid; - SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3)); + SetUpDistributedGrid(distributedGrid,globalGrid, 10, CoordinatesType(2,2,3)); CutDistributedGridType cutDistributedGrid; - if(cutDistributedGrid.SetupByCut( + if(cutDistributedGrid.SetupByCut( distributedGrid, StaticVector<2,int>(0,1), StaticVector<1,int>(2), @@ -335,17 +329,17 @@ TEST(CutDistributedGirdTest_3D, IsDistributed_2D) TEST(CutDistributedGirdTest_3D, IsNotDistributed_2D) { typedef Grid<3,double,Host,int> MeshType; - typedef typename MeshType::CoordinatesType CoordinatesType; + typedef typename MeshType::CoordinatesType 
CoordinatesType; typedef DistributedMesh DistributedMeshType; typedef Grid<2,double,Host,int> CutGridType; typedef DistributedMesh CutDistributedGridType; MeshType globalGrid; DistributedMeshType distributedGrid; - SetUpDistributedGrid(distributedGrid,globalGrid, 30, CoordinatesType(1,1,12)); + SetUpDistributedGrid(distributedGrid,globalGrid, 30, CoordinatesType(1,1,12)); CutDistributedGridType cutDistributedGrid; - if(cutDistributedGrid.SetupByCut( + if(cutDistributedGrid.SetupByCut( distributedGrid, StaticVector<2,int>(0,1), StaticVector<1,int>(2), diff --git a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp index 4d5bb4baf66abbabcf6f989e4ed8389acf0d7b9a..9ad46b41221a4cb230ab67fdda50b3ef2636199d 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp +++ b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -18,9 +17,6 @@ using namespace TNL::Containers; using namespace TNL::Meshes; using namespace TNL::Meshes::DistributedMeshes; using namespace TNL::Devices; -using namespace TNL::Communicators; - -typedef MpiCommunicator CommunicatorType; static const char* TEST_FILE_NAME = "test_CutDistributedMeshFunctionTest.tnl"; @@ -52,9 +48,9 @@ TEST(CutDistributedMeshFunction, 2D_Data) DistributedMeshType distributedGrid; distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 3, 4 ) ); - distributedGrid.template setGlobalGrid(globalOriginalGrid); + distributedGrid.setGlobalGrid(globalOriginalGrid); typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); 
distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); @@ -73,14 +69,14 @@ TEST(CutDistributedMeshFunction, 2D_Data) DistributedMeshSynchronizer< DistributedMeshType > synchronizer; synchronizer.setDistributedGrid( &distributedGrid ); - synchronizer.template synchronize( *meshFunctionptr ); + synchronizer.synchronize( *meshFunctionptr ); //Prepare Mesh Function parts for Cut CutDistributedMeshType cutDistributedGrid; Pointers::SharedPointer cutGrid; cutGrid->setDistMesh(&cutDistributedGrid); DofType cutDof(0); - bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( + bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( *meshFunctionptr,*cutGrid, cutDof, StaticVector<1,int>(1), StaticVector<1,int>(0), @@ -134,9 +130,9 @@ TEST(CutDistributedMeshFunction, 3D_1_Data) DistributedMeshType distributedGrid; distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 2, 2, 3 ) ); - distributedGrid.template setGlobalGrid(globalOriginalGrid); + distributedGrid.setGlobalGrid(globalOriginalGrid); typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); Pointers::SharedPointer originalGrid; @@ -154,14 +150,14 @@ TEST(CutDistributedMeshFunction, 3D_1_Data) DistributedMeshSynchronizer< DistributedMeshType > synchronizer; synchronizer.setDistributedGrid( &distributedGrid ); - synchronizer.template synchronize( *meshFunctionptr ); + synchronizer.synchronize( *meshFunctionptr ); //Prepare Mesh Function parts for Cut CutDistributedMeshType cutDistributedGrid; Pointers::SharedPointer cutGrid; cutGrid->setDistMesh(&cutDistributedGrid); DofType cutDof(0); - bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( + bool 
inCut=CutMeshFunction,CutMeshType,DofType>::Cut( *meshFunctionptr,*cutGrid, cutDof, StaticVector<1,int>(2), StaticVector<2,int>(1,0), @@ -215,9 +211,9 @@ TEST(CutDistributedMeshFunction, 3D_2_Data) DistributedMeshType distributedGrid; distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 2, 2, 3 ) ); - distributedGrid.template setGlobalGrid(globalOriginalGrid); + distributedGrid.setGlobalGrid(globalOriginalGrid); typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); Pointers::SharedPointer originalGrid; @@ -235,14 +231,14 @@ TEST(CutDistributedMeshFunction, 3D_2_Data) DistributedMeshSynchronizer< DistributedMeshType > synchronizer; synchronizer.setDistributedGrid( &distributedGrid ); - synchronizer.template synchronize( *meshFunctionptr ); + synchronizer.synchronize( *meshFunctionptr ); //Prepare Mesh Function parts for Cut CutDistributedMeshType cutDistributedGrid; Pointers::SharedPointer cutGrid; cutGrid->setDistMesh(&cutDistributedGrid); DofType cutDof(0); - bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( + bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( *meshFunctionptr,*cutGrid, cutDof, StaticVector<2,int>(0,2), StaticVector<1,int>(1), @@ -302,9 +298,9 @@ TEST(CutDistributedMeshFunction, 2D_Synchronization) DistributedMeshType distributedGrid; distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 3, 4 ) ); - distributedGrid.template setGlobalGrid(globalOriginalGrid); + distributedGrid.setGlobalGrid(globalOriginalGrid); typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( 
&distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); Pointers::SharedPointer originalGrid; @@ -325,7 +321,7 @@ TEST(CutDistributedMeshFunction, 2D_Synchronization) Pointers::SharedPointer cutGrid; cutGrid->setDistMesh(&cutDistributedGrid); DofType cutDof(0); - bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( + bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( *meshFunctionptr,*cutGrid, cutDof, StaticVector<1,int>(1), StaticVector<1,int>(0), @@ -338,7 +334,7 @@ TEST(CutDistributedMeshFunction, 2D_Synchronization) DistributedMeshSynchronizer< CutDistributedMeshType > synchronizer; synchronizer.setDistributedGrid( &cutDistributedGrid ); - synchronizer.template synchronize( cutMeshFunction ); + synchronizer.synchronize( cutMeshFunction ); typename MeshType::Cell fromEntity(meshFunctionptr->getMesh()); typename CutMeshType::Cell outEntity(*cutGrid); @@ -387,9 +383,9 @@ TEST(CutDistributedMeshFunction, 3D_1_Synchronization) DistributedMeshType distributedGrid; distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 2,2,3 ) ); - distributedGrid.template setGlobalGrid( globalOriginalGrid ); + distributedGrid.setGlobalGrid( globalOriginalGrid ); typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); Pointers::SharedPointer originalGrid; @@ -410,7 +406,7 @@ TEST(CutDistributedMeshFunction, 3D_1_Synchronization) Pointers::SharedPointer cutGrid; cutGrid->setDistMesh(&cutDistributedGrid); DofType cutDof(0); - bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( + bool 
inCut=CutMeshFunction,CutMeshType,DofType>::Cut( *meshFunctionptr,*cutGrid, cutDof, StaticVector<1,int>(1), StaticVector<2,int>(0,2), @@ -423,7 +419,7 @@ TEST(CutDistributedMeshFunction, 3D_1_Synchronization) DistributedMeshSynchronizer< CutDistributedMeshType > synchronizer; synchronizer.setDistributedGrid( &cutDistributedGrid ); - synchronizer.template synchronize( cutMeshFunction ); + synchronizer.synchronize( cutMeshFunction ); typename MeshType::Cell fromEntity(meshFunctionptr->getMesh()); typename CutMeshType::Cell outEntity(*cutGrid); @@ -476,9 +472,9 @@ TEST(CutDistributedMeshFunction, 3D_2_Synchronization) overlap.setValue(1); DistributedMeshType distributedGrid; distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 2,2,3 ) ); - distributedGrid.template setGlobalGrid(globalOriginalGrid); + distributedGrid.setGlobalGrid(globalOriginalGrid); typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); Pointers::SharedPointer originalGrid; @@ -499,7 +495,7 @@ TEST(CutDistributedMeshFunction, 3D_2_Synchronization) Pointers::SharedPointer cutGrid; cutGrid->setDistMesh(&cutDistributedGrid); DofType cutDof(0); - bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( + bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( *meshFunctionptr,*cutGrid, cutDof, StaticVector<2,int>(0,2), StaticVector<1,int>(1), @@ -512,7 +508,7 @@ TEST(CutDistributedMeshFunction, 3D_2_Synchronization) DistributedMeshSynchronizer< CutDistributedMeshType > synchronizer; synchronizer.setDistributedGrid( &cutDistributedGrid ); - synchronizer.template synchronize( cutMeshFunction ); + synchronizer.synchronize( cutMeshFunction ); typename MeshType::Cell 
fromEntity(meshFunctionptr->getMesh()); typename CutMeshType::Cell outEntity(*cutGrid); @@ -563,9 +559,9 @@ TEST(CutDistributedMeshFunction, 3D_2_Save) overlap.setValue(1); DistributedMeshType distributedGrid; distributedGrid.setDomainDecomposition( typename DistributedMeshType::CoordinatesType( 2,2,3 ) ); - distributedGrid.template setGlobalGrid( globalOriginalGrid ); + distributedGrid.setGlobalGrid( globalOriginalGrid ); typename DistributedMeshType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); Pointers::SharedPointer originalGrid; @@ -586,7 +582,7 @@ TEST(CutDistributedMeshFunction, 3D_2_Save) Pointers::SharedPointer cutGrid; cutGrid->setDistMesh(&cutDistributedGrid); DofType cutDof(0); - bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( + bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( *meshFunctionptr,*cutGrid, cutDof, StaticVector<2,int>(0,2), StaticVector<1,int>(1), @@ -600,9 +596,8 @@ TEST(CutDistributedMeshFunction, 3D_2_Save) DistributedGridIO,MpiIO> ::save(TEST_FILE_NAME, cutMeshFunction ); //save globalgrid for debug render - typename CommunicatorType::CommunicationGroup *group; - group=(typename CommunicatorType::CommunicationGroup *)(cutDistributedGrid.getCommunicationGroup()); - if(CommunicatorType::GetRank(*group)==0) + MPI_Comm group=cutDistributedGrid.getCommunicationGroup(); + if(TNL::MPI::GetRank(group)==0) { File meshFile; meshFile.open( TEST_FILE_NAME+String("-mesh.tnl"),std::ios_base::out); @@ -612,7 +607,7 @@ TEST(CutDistributedMeshFunction, 3D_2_Save) } - if(CommunicatorType::GetRank(CommunicatorType::AllGroup)==0) + if(TNL::MPI::GetRank()==0) { Pointers::SharedPointer globalCutGrid; MeshFunctionView loadMeshFunctionptr; diff --git 
a/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp index 5d034087f35a6ba75a96d84e692d6876bfbcb374..6621a01dd971715e923a28b5faa173d1bac044d9 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp +++ b/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp @@ -1,11 +1,10 @@ -#ifdef HAVE_GTEST +#ifdef HAVE_GTEST #include #include #include #include #include -#include #include "../../Functions/Functions.h" @@ -14,7 +13,6 @@ using namespace TNL::Containers; using namespace TNL::Functions; using namespace TNL::Meshes; using namespace TNL::Devices; -using namespace TNL::Communicators; TEST(CutMeshFunction, 2D) @@ -28,12 +26,12 @@ TEST(CutMeshFunction, 2D) typedef typename MeshType::Cell Cell; typedef LinearFunction LinearFunctionType; - + //Original MeshFunciton --filed with linear function Pointers::SharedPointer originalGrid; Pointers::SharedPointer> meshFunctionptr; - + PointType origin; origin.setValue(-0.5); PointType proportions; @@ -43,18 +41,18 @@ TEST(CutMeshFunction, 2D) DofType dof(originalGrid->template getEntitiesCount< Cell >()); - dof.setValue(0); + dof.setValue(0); meshFunctionptr->bind(originalGrid,dof); MeshFunctionEvaluator< MeshFunctionView, LinearFunctionType > linearFunctionEvaluator; Pointers::SharedPointer< LinearFunctionType, Host > linearFunctionPtr; linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr , linearFunctionPtr); - - //Prepare Mesh Function parts for Cut + + //Prepare Mesh Function parts for Cut Pointers::SharedPointer cutGrid; DofType cutDof(0); - bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( - *meshFunctionptr,*cutGrid, cutDof, + bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( + *meshFunctionptr,*cutGrid, cutDof, StaticVector<1,int>(0), StaticVector<1,int>(1), StaticVector<1,typename CutMeshType::IndexType>(5) ); @@ -62,13 +60,13 @@ TEST(CutMeshFunction, 2D) ASSERT_TRUE(inCut)<<"nedistribuovaná meshfunction 
musí být vždy v řezu"; MeshFunctionView cutMeshFunction; - cutMeshFunction.bind(cutGrid,cutDof); + cutMeshFunction.bind(cutGrid,cutDof); for(int i=0;i<10;i++) { typename MeshType::Cell fromEntity(meshFunctionptr->getMesh()); typename CutMeshType::Cell outEntity(*cutGrid); - + fromEntity.getCoordinates().x()=i; fromEntity.getCoordinates().y()=5; outEntity.getCoordinates().x()=i; @@ -91,12 +89,12 @@ TEST(CutMeshFunction, 3D_1) typedef typename MeshType::Cell Cell; typedef LinearFunction LinearFunctionType; - + //Original MeshFunciton --filed with linear function Pointers::SharedPointer originalGrid; Pointers::SharedPointer> meshFunctionptr; - + PointType origin; origin.setValue(-0.5); PointType proportions; @@ -106,18 +104,18 @@ TEST(CutMeshFunction, 3D_1) DofType dof(originalGrid->template getEntitiesCount< Cell >()); - dof.setValue(0); + dof.setValue(0); meshFunctionptr->bind(originalGrid,dof); MeshFunctionEvaluator< MeshFunctionView, LinearFunctionType > linearFunctionEvaluator; Pointers::SharedPointer< LinearFunctionType, Host > linearFunctionPtr; linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr , linearFunctionPtr); - - //Prepare Mesh Function parts for Cut + + //Prepare Mesh Function parts for Cut Pointers::SharedPointer cutGrid; DofType cutDof(0); - bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( - *meshFunctionptr,*cutGrid, cutDof, + bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( + *meshFunctionptr,*cutGrid, cutDof, StaticVector<1,int>(1), StaticVector<2,int>(0,2), StaticVector<2,typename CutMeshType::IndexType>(5,5) ); @@ -125,13 +123,13 @@ TEST(CutMeshFunction, 3D_1) ASSERT_TRUE(inCut)<<"nedistribuovaná meshfunction musí být vždy v řezu"; MeshFunctionView cutMeshFunction; - cutMeshFunction.bind(cutGrid,cutDof); + cutMeshFunction.bind(cutGrid,cutDof); for(int i=0;i<10;i++) { typename MeshType::Cell fromEntity(meshFunctionptr->getMesh()); typename CutMeshType::Cell outEntity(*cutGrid); - + fromEntity.getCoordinates().x()=5; 
fromEntity.getCoordinates().y()=i; fromEntity.getCoordinates().z()=5; @@ -154,12 +152,12 @@ TEST(CutMeshFunction, 3D_2) typedef typename MeshType::Cell Cell; typedef LinearFunction LinearFunctionType; - + //Original MeshFunciton --filed with linear function Pointers::SharedPointer originalGrid; Pointers::SharedPointer> meshFunctionptr; - + PointType origin; origin.setValue(-0.5); PointType proportions; @@ -169,18 +167,18 @@ TEST(CutMeshFunction, 3D_2) DofType dof(originalGrid->template getEntitiesCount< Cell >()); - dof.setValue(0); + dof.setValue(0); meshFunctionptr->bind(originalGrid,dof); MeshFunctionEvaluator< MeshFunctionView, LinearFunctionType > linearFunctionEvaluator; Pointers::SharedPointer< LinearFunctionType, Host > linearFunctionPtr; linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr , linearFunctionPtr); - - //Prepare Mesh Function parts for Cut + + //Prepare Mesh Function parts for Cut Pointers::SharedPointer cutGrid; DofType cutDof(0); - bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( - *meshFunctionptr,*cutGrid, cutDof, + bool inCut=CutMeshFunction,CutMeshType,DofType>::Cut( + *meshFunctionptr,*cutGrid, cutDof, StaticVector<2,int>(2,1), StaticVector<1,int>(0), StaticVector<1,typename CutMeshType::IndexType>(5) ); @@ -188,7 +186,7 @@ TEST(CutMeshFunction, 3D_2) ASSERT_TRUE(inCut)<<"nedistribuovaná meshfunction musí být vždy v řezu"; MeshFunctionView cutMeshFunction; - cutMeshFunction.bind(cutGrid,cutDof); + cutMeshFunction.bind(cutGrid,cutDof); for(int i=0;i<10;i++) { @@ -196,7 +194,7 @@ TEST(CutMeshFunction, 3D_2) { typename MeshType::Cell fromEntity(meshFunctionptr->getMesh()); typename CutMeshType::Cell outEntity(*cutGrid); - + fromEntity.getCoordinates().x()=5; fromEntity.getCoordinates().y()=j; fromEntity.getCoordinates().z()=i; diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTest.h b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTest.h index 
6b7c489af4acdf0596426409586b49fd22424178..11a85b68ded774ea4cbd8edc7ba49a5a18a64b88 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTest.h +++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTest.h @@ -6,7 +6,6 @@ email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ -#include #include #include #include @@ -18,7 +17,6 @@ using namespace TNL::Containers; using namespace TNL::Meshes; using namespace TNL::Functions; -using namespace TNL::Communicators; using namespace TNL::Meshes::DistributedMeshes; @@ -186,8 +184,6 @@ class ParameterProvider<3,Device> //------------------------------------------------------------------------------ -typedef MpiCommunicator CommunicatorType; - template class TestDistributedGridIO { @@ -227,9 +223,9 @@ class TestDistributedGridIO overlap.setValue(1); DistributedGridType distributedGrid; distributedGrid.setDomainDecomposition( parameters.getDistr() ); - distributedGrid.template setGlobalGrid( globalGrid ); + distributedGrid.setGlobalGrid( globalGrid ); typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); //std::cout << distributedGrid.printProcessDistr() < localGridptr; localGridptr->setDimensions(localProportions); @@ -313,14 +309,14 @@ class TestDistributedGridIO overlap.setValue(1); DistributedGridType distributedGrid; distributedGrid.setDomainDecomposition( parameters.getDistr() ); - distributedGrid.template setGlobalGrid( globalGrid ); + distributedGrid.setGlobalGrid( globalGrid ); typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( 
&distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); //save files from local mesh - PointType localOrigin=parameters.getOrigin(CommunicatorType::GetRank(CommunicatorType::AllGroup)); - PointType localProportions=parameters.getProportions(CommunicatorType::GetRank(CommunicatorType::AllGroup));; + PointType localOrigin=parameters.getOrigin(TNL::MPI::GetRank()); + PointType localProportions=parameters.getProportions(TNL::MPI::GetRank()); Pointers::SharedPointer localGridptr; localGridptr->setDimensions(localProportions); @@ -355,7 +351,7 @@ class TestDistributedGridIO DistributedMeshSynchronizer< DistributedGridType > synchronizer; synchronizer.setDistributedGrid( &distributedGrid ); - synchronizer.template synchronize( *loadMeshFunctionptr ); //need synchronization for overlaps to be filled corectly in loadDof + synchronizer.synchronize( *loadMeshFunctionptr ); //need synchronization for overlaps to be filled corectly in loadDof //Crete "distributedgrid driven" grid filed by evaluated linear function Pointers::SharedPointer gridptr; @@ -367,7 +363,7 @@ class TestDistributedGridIO meshFunctionptr->bind(gridptr,dof); linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr , linearFunctionPtr); - synchronizer.template synchronize( *meshFunctionptr ); + synchronizer.synchronize( *meshFunctionptr ); for(int i=0;i #include #include -#include #include #include @@ -24,13 +23,10 @@ using namespace TNL::Containers; using namespace TNL::Meshes; using namespace TNL::Functions; -using namespace TNL::Communicators; using namespace TNL::Meshes::DistributedMeshes; //------------------------------------------------------------------------------ -typedef MpiCommunicator CommunicatorType; - template class TestDistributedGridMPIIO{ public: @@ -63,9 +59,9 @@ class TestDistributedGridMPIIO{ 
globalGrid->setDomain(globalOrigin,globalProportions); DistributedGridType distributedGrid; - distributedGrid.template setGlobalGrid( *globalGrid ); + distributedGrid.setGlobalGrid( *globalGrid ); typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); ///std::cout << distributedGrid.printProcessDistr() < ::save(FileName, *meshFunctionptr ); //first process compare results - if(CommunicatorType::GetRank(CommunicatorType::AllGroup)==0) + if(TNL::MPI::GetRank()==0) { DofType globalEvaluatedDof(globalGrid->template getEntitiesCount< Cell >()); @@ -131,15 +127,15 @@ class TestDistributedGridMPIIO{ CoordinatesType overlap; overlap.setValue(1); DistributedGridType distributedGrid; - distributedGrid.template setGlobalGrid( *globalGrid ); + distributedGrid.setGlobalGrid( *globalGrid ); typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); String FileName=String("test-file-mpiio-load.tnl"); //Prepare file - if(CommunicatorType::GetRank(CommunicatorType::AllGroup)==0) + if(TNL::MPI::GetRank()==0) { DofType saveDof(globalGrid->template getEntitiesCount< Cell >()); @@ -165,7 +161,7 @@ class TestDistributedGridMPIIO{ DistributedMeshSynchronizer< DistributedGridType > synchronizer; synchronizer.setDistributedGrid( &distributedGrid ); - synchronizer.template synchronize( *loadMeshFunctionptr ); //need synchronization for overlaps to be filled corectly in loadDof + 
synchronizer.synchronize( *loadMeshFunctionptr ); //need synchronization for overlaps to be filled corectly in loadDof Pointers::SharedPointer evalGridPtr; Pointers::SharedPointer evalMeshFunctionptr; @@ -176,14 +172,14 @@ class TestDistributedGridMPIIO{ evalMeshFunctionptr->bind(evalGridPtr,evalDof); linearFunctionEvaluator.evaluateAllEntities(evalMeshFunctionptr , linearFunctionPtr); - synchronizer.template synchronize( *evalMeshFunctionptr ); + synchronizer.synchronize( *evalMeshFunctionptr ); for(int i=0;i -#ifdef HAVE_MPI +#ifdef HAVE_MPI -#include #include #include #include @@ -26,7 +25,6 @@ using namespace TNL::Meshes; using namespace TNL::Meshes::DistributedMeshes; using namespace TNL::Functions; using namespace TNL::Devices; -using namespace TNL::Communicators; template @@ -44,13 +42,13 @@ void check_Boundary_1D(int rank, int nproc, const DofType& dof, typename DofType EXPECT_EQ( dof[0], expectedValue) << "Left boundary test failed"; return; } - + if(rank==(nproc-1))//Right { EXPECT_EQ( dof[dof.getSize()-1], expectedValue) << "Right boundary test failed"; return; } - + }; template @@ -61,15 +59,15 @@ void check_Overlap_1D(int rank, int nproc, const DofType& dof, typename DofType: EXPECT_EQ( dof[dof.getSize()-1], expectedValue) << "Left boundary node overlap test failed"; return; } - + if( rank == ( nproc - 1 ) ) { EXPECT_EQ( dof[0], expectedValue) << "Right boundary node overlap test failed"; return; } - + EXPECT_EQ( dof[0], expectedValue) << "left overlap test failed"; - EXPECT_EQ( dof[dof.getSize()-1], expectedValue)<< "right overlap test failed"; + EXPECT_EQ( dof[dof.getSize()-1], expectedValue)<< "right overlap test failed"; }; template @@ -80,25 +78,24 @@ void check_Inner_1D(int rank, int nproc, const DofType& dof, typename DofType::R }; /* - * Light check of 1D distributed grid and its synchronization. + * Light check of 1D distributed grid and its synchronization. * Number of process is not limited. 
* Overlap is limited to 1 * Only double is tested as dof Real type -- it may be changed, extend test * Global size is hardcoded as 10 -- it can be changed, extend test */ -typedef MpiCommunicator CommunicatorType; typedef Grid<1,double,Host,int> GridType; typedef MeshFunctionView< GridType > MeshFunctionType; typedef MeshFunctionView< GridType, GridType::getMeshDimension(), bool > MaskType; typedef Vector< double,Host,int> DofType; typedef Vector< bool, Host, int > MaskDofType; typedef typename GridType::Cell Cell; -typedef typename GridType::IndexType IndexType; -typedef typename GridType::PointType PointType; +typedef typename GridType::IndexType IndexType; +typedef typename GridType::PointType PointType; typedef DistributedMesh DistributedGridType; using Synchronizer = DistributedMeshSynchronizer< DistributedGridType >; - + class DistributedGridTest_1D : public ::testing::Test { protected: @@ -123,14 +120,14 @@ class DistributedGridTest_1D : public ::testing::Test void SetUp() { int size=10; - rank=CommunicatorType::GetRank(CommunicatorType::AllGroup); - nproc=CommunicatorType::GetSize(CommunicatorType::AllGroup); + rank=TNL::MPI::GetRank(); + nproc=TNL::MPI::GetSize(); PointType globalOrigin; PointType globalProportions; GridType globalGrid; - globalOrigin.x()=-0.5; + globalOrigin.x()=-0.5; globalProportions.x()=size; @@ -142,9 +139,9 @@ class DistributedGridTest_1D : public ::testing::Test distributedGrid=new DistributedGridType(); typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - distributedGrid->template setGlobalGrid( globalGrid ); - //distributedGrid->setupGrid(*gridptr); - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + distributedGrid->setGlobalGrid( globalGrid ); + //distributedGrid->setupGrid(*gridptr); + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); @@ -155,14 +152,14 @@ class 
DistributedGridTest_1D : public ::testing::Test constFunctionPtr->Number=rank; } - + void SetUpPeriodicBoundaries() { typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); - distributedGrid->setupGrid(*gridptr); + distributedGrid->setupGrid(*gridptr); } void TearDown() @@ -209,7 +206,7 @@ TEST_F(DistributedGridTest_1D, evaluateInteriorEntities) check_Boundary_1D(rank, nproc, dof, -1); check_Overlap_1D(rank, nproc, dof, -1); check_Inner_1D(rank, nproc, dof, rank); -} +} TEST_F(DistributedGridTest_1D, SynchronizerNeighborsTest ) { @@ -217,7 +214,7 @@ TEST_F(DistributedGridTest_1D, SynchronizerNeighborsTest ) constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr ); Synchronizer synchronizer; synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() ); - synchronizer.template synchronize( *meshFunctionPtr ); + synchronizer.synchronize( *meshFunctionPtr ); if(rank!=0) { EXPECT_EQ((dof)[0],rank-1)<< "Left Overlap was filled by wrong process."; @@ -229,12 +226,12 @@ TEST_F(DistributedGridTest_1D, SynchronizerNeighborsTest ) TEST_F(DistributedGridTest_1D, EvaluateLinearFunction ) { - //fill mesh function with linear function (physical center of cell corresponds with its coordinates in grid) + //fill mesh function with linear function (physical center of cell corresponds with its coordinates in grid) setDof_1D(dof,-1); linearFunctionEvaluator.evaluateAllEntities(meshFunctionPtr, linearFunctionPtr); Synchronizer synchronizer; synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() ); - synchronizer.template synchronize( *meshFunctionPtr ); + synchronizer.synchronize( *meshFunctionPtr ); auto entity = gridptr->template getEntity< Cell >(0); entity.refresh(); @@ 
-250,7 +247,7 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithoutMask ) // Setup periodic boundaries // TODO: I do not know how to do it better with GTEST typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); distributedGrid->setupGrid(*gridptr); @@ -258,13 +255,13 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithoutMask ) maskDofs.setSize( gridptr->template getEntitiesCount< Cell >() ); meshFunctionPtr->bind( gridptr, dof ); maskPointer->bind( gridptr, maskDofs ); - + setDof_1D( dof, -rank-1 ); maskDofs.setValue( true ); //meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary ); Synchronizer synchronizer; synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() ); - synchronizer.template synchronize( *meshFunctionPtr, true ); + synchronizer.synchronize( *meshFunctionPtr, true ); if( rank == 0 ) { EXPECT_EQ( dof[ 0 ], -nproc ) << "Left Overlap was filled by wrong process."; @@ -279,7 +276,7 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithActiveMask ) // Setup periodic boundaries // TODO: I do not know how to do it better with GTEST typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); distributedGrid->setupGrid(*gridptr); @@ -287,14 +284,14 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithActiveMask ) maskDofs.setSize( gridptr->template getEntitiesCount< Cell >() ); meshFunctionPtr->bind( gridptr, dof ); maskPointer->bind( 
gridptr, maskDofs ); - + setDof_1D( dof, -rank-1 ); maskDofs.setValue( true ); //constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr, constFunctionPtr ); //meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary ); Synchronizer synchronizer; synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() ); - synchronizer.template synchronize( *meshFunctionPtr, true, maskPointer ); + synchronizer.synchronize( *meshFunctionPtr, true, maskPointer ); if( rank == 0 ) { EXPECT_EQ( dof[ 0 ], -nproc ) << "Left Overlap was filled by wrong process."; } @@ -310,7 +307,7 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithInactiveMaskOnLef // Setup periodic boundaries // TODO: I do not know how to do it better with GTEST typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); distributedGrid->setupGrid(*gridptr); @@ -325,9 +322,9 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithInactiveMaskOnLef //constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr ); //meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary ); TNL_MPI_PRINT( "#### " << dof ); - meshFunctionPtr->template synchronize( true, maskPointer ); + meshFunctionPtr->synchronize( true, maskPointer ); TNL_MPI_PRINT( ">>> " << dof ); - + if( rank == 0 ) EXPECT_EQ( dof[ 0 ], 0 ) << "Left Overlap was filled by wrong process."; if( rank == nproc-1 ) @@ -339,7 +336,7 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithInactiveMask ) // Setup periodic boundaries // TODO: I do not know how to do it better with GTEST typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; 
- SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); distributedGrid->setupGrid(*gridptr); @@ -350,27 +347,27 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicNeighborsWithInactiveMask ) setDof_1D( dof, -rank-1 ); maskDofs.setValue( true ); - maskDofs.setElement( 1, false ); + maskDofs.setElement( 1, false ); maskDofs.setElement( dof.getSize() - 2, false ); //constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr ); //meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary ); - meshFunctionPtr->template synchronize( true, maskPointer ); - + meshFunctionPtr->synchronize( true, maskPointer ); + if( rank == 0 ) EXPECT_EQ( dof[ 0 ], 0 ) << "Left Overlap was filled by wrong process."; if( rank == nproc-1 ) - EXPECT_EQ( dof[ dof.getSize() - 1 ], nproc - 1 )<< "Right Overlap was filled by wrong process."; - + EXPECT_EQ( dof[ dof.getSize() - 1 ], nproc - 1 )<< "Right Overlap was filled by wrong process."; + } */ TEST_F(DistributedGridTest_1D, SynchronizePeriodicBoundariesLinearTest ) { // Setup periodic boundaries - // TODO: I do not know how to do it better with GTEST - additional setup + // TODO: I do not know how to do it better with GTEST - additional setup // of the periodic boundaries typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); distributedGrid->setupGrid(*gridptr); @@ -382,13 +379,13 @@ TEST_F(DistributedGridTest_1D, SynchronizePeriodicBoundariesLinearTest ) Synchronizer synchronizer; synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() 
); - synchronizer.template synchronize( *meshFunctionPtr, true ); + synchronizer.synchronize( *meshFunctionPtr, true ); auto entity = gridptr->template getEntity< Cell >( 0 ); auto entity2= gridptr->template getEntity< Cell >( (dof).getSize() - 1 ); entity.refresh(); entity2.refresh(); - + if( rank == 0 ) { EXPECT_EQ( meshFunctionPtr->getValue(entity), 9 ) << "Linear function Overlap error on left Edge."; } diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_2D.cpp b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_2D.cpp index 71370cae2d0530f871cc4148ac0f06b58696e3c0..1f02dd2364e25a3fa40cf457f9e3f7ad353199ac 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_2D.cpp +++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_2D.cpp @@ -7,14 +7,13 @@ ***************************************************************************/ -#ifdef HAVE_GTEST +#ifdef HAVE_GTEST #include -#ifdef HAVE_MPI +#ifdef HAVE_MPI #include #include -#include #include #include @@ -25,10 +24,9 @@ using namespace TNL::Containers; using namespace TNL::Meshes; using namespace TNL::Functions; using namespace TNL::Devices; -using namespace TNL::Communicators; using namespace TNL::Meshes::DistributedMeshes; - + template void setDof_2D( DofType &dof, typename DofType::RealType value ) @@ -46,7 +44,7 @@ void checkLeftEdge( const GridType &grid, const DofType &dof, bool with_first, b int end = maxy; if( !with_first ) begin++; if( !with_last ) end--; - + for( int i=begin;i void check_Boundary_2D(int rank, const GridType &grid, const DofType &dof, typename DofType::RealType expectedValue) -{ +{ if(rank==0)//Up Left { checkUpEdge(grid,dof,true,false,expectedValue);//posledni je overlap checkLeftEdge(grid,dof,true,false, expectedValue);//posledni je overlap } - + if(rank==1)//Up Center { checkUpEdge(grid,dof,false,false, expectedValue);//prvni a posledni je overlap } - + if(rank==2)//Up Right { checkUpEdge(grid,dof,false,true,expectedValue);//prvni je 
overlap checkRightEdge(grid,dof,true,false,expectedValue);//posledni je overlap } - + if(rank==3)//Center Left { checkLeftEdge(grid,dof,false,false,expectedValue);//prvni a posledni je overlap } - + if(rank==4)//Center Center { //No boundary } - + if(rank==5)//Center Right { checkRightEdge(grid,dof,false,false,expectedValue); } - + if(rank==6)//Down Left { checkDownEdge(grid,dof,true,false,expectedValue); checkLeftEdge(grid,dof,false,true,expectedValue); } - + if(rank==7) //Down Center { checkDownEdge(grid,dof,false,false,expectedValue); } - + if(rank==8) //Down Right { checkDownEdge(grid,dof,false,true,expectedValue); @@ -241,27 +239,27 @@ void check_Overlap_2D(int rank, const GridType &grid, const DofType &dof, typena checkRightEdge(grid,dof,false,true,expectedValue); checkDownEdge(grid,dof,false,true,expectedValue); } - + if(rank==1)//Up Center { checkDownEdge(grid,dof,true,true,expectedValue); checkLeftEdge(grid,dof,false,true,expectedValue); checkRightEdge(grid,dof,false,true,expectedValue); } - + if(rank==2)//Up Right { checkDownEdge(grid,dof,true,false,expectedValue);//prvni je overlap checkLeftEdge(grid,dof,false,true,expectedValue); } - + if(rank==3)//Center Left { checkUpEdge(grid,dof,false,true,expectedValue); checkDownEdge(grid,dof,false,true,expectedValue); checkRightEdge(grid,dof,true,true,expectedValue); } - + if(rank==4)//Center Center { checkUpEdge(grid,dof,true,true,expectedValue); @@ -269,27 +267,27 @@ void check_Overlap_2D(int rank, const GridType &grid, const DofType &dof, typena checkRightEdge(grid,dof,true,true,expectedValue); checkLeftEdge(grid,dof,true,true,expectedValue); } - + if(rank==5)//Center Right { checkUpEdge(grid,dof,true,false,expectedValue); checkDownEdge(grid,dof,true,false,expectedValue); checkLeftEdge(grid,dof,true,true,expectedValue); } - + if(rank==6)//Down Left { checkUpEdge(grid,dof,false,true,expectedValue); checkRightEdge(grid,dof,true,false,expectedValue); } - + if(rank==7) //Down Center { 
checkUpEdge(grid,dof,true,true,expectedValue); checkLeftEdge(grid,dof,true,false,expectedValue); checkRightEdge(grid,dof,true,false,expectedValue); } - + if(rank==8) //Down Right { checkUpEdge(grid,dof,true,false,expectedValue); @@ -310,26 +308,25 @@ void check_Inner_2D(int rank, const GridType& grid, const DofType& dof, typename } /* - * Light check of 2D distributed grid and its synchronization. + * Light check of 2D distributed grid and its synchronization. * expected 9 processes */ -typedef MpiCommunicator CommunicatorType; typedef Grid<2,double,Host,int> GridType; typedef MeshFunctionView MeshFunctionType; typedef MeshFunctionView< GridType, GridType::getMeshDimension(), bool > MaskType; typedef Vector DofType; typedef Vector< bool, Host, int > MaskDofType; typedef typename GridType::Cell Cell; -typedef typename GridType::IndexType IndexType; -typedef typename GridType::PointType PointType; +typedef typename GridType::IndexType IndexType; +typedef typename GridType::PointType PointType; typedef DistributedMesh DistributedGridType; using Synchronizer = DistributedMeshSynchronizer< DistributedGridType >; class DistributedGridTest_2D : public ::testing::Test { - + public: - + using CoordinatesType = typename GridType::CoordinatesType; DistributedGridType *distributedGrid; @@ -347,20 +344,20 @@ class DistributedGridTest_2D : public ::testing::Test Pointers::SharedPointer< LinearFunction, Host > linearFunctionPtr; int rank; - int nproc; + int nproc; void SetUp() { int size=10; - rank=CommunicatorType::GetRank(CommunicatorType::AllGroup); - nproc=CommunicatorType::GetSize(CommunicatorType::AllGroup); + rank=TNL::MPI::GetRank(); + nproc=TNL::MPI::GetSize(); PointType globalOrigin; PointType globalProportions; GridType globalGrid; globalOrigin.x()=-0.5; - globalOrigin.y()=-0.5; + globalOrigin.y()=-0.5; globalProportions.x()=size; globalProportions.y()=size; @@ -369,9 +366,9 @@ class DistributedGridTest_2D : public ::testing::Test distributedGrid=new 
DistributedGridType(); distributedGrid->setDomainDecomposition( typename DistributedGridType::CoordinatesType( 3, 3 ) ); - distributedGrid->template setGlobalGrid( globalGrid ); + distributedGrid->setGlobalGrid( globalGrid ); typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); distributedGrid->setupGrid(*gridPtr); @@ -422,17 +419,17 @@ TEST_F(DistributedGridTest_2D, evaluateInteriorEntities) check_Boundary_2D(rank, *gridPtr, *dof, -1); check_Overlap_2D(rank, *gridPtr, *dof, -1); check_Inner_2D(rank, *gridPtr, *dof, rank); -} +} TEST_F(DistributedGridTest_2D, LinearFunctionTest) { - //fill meshfunction with linear function (physical center of cell corresponds with its coordinates in grid) + //fill meshfunction with linear function (physical center of cell corresponds with its coordinates in grid) setDof_2D(*dof,-1); linearFunctionEvaluator.evaluateAllEntities(meshFunctionPtr, linearFunctionPtr); Synchronizer synchronizer; synchronizer.setDistributedGrid( meshFunctionPtr->getMesh().getDistributedMesh() ); - synchronizer.template synchronize( *meshFunctionPtr ); - + synchronizer.synchronize( *meshFunctionPtr ); + int count =gridPtr->template getEntitiesCount< Cell >(); for(int i=0;igetMesh().getDistributedMesh() ); - synchronizer.template synchronize( *meshFunctionPtr ); - + synchronizer.synchronize( *meshFunctionPtr ); + // checkNeighbor_2D(rank, *gridPtr, *dof); - + if(rank==0)//Up Left { checkRightEdge(*gridPtr, *dof, true, false, 1 ); checkDownEdge( *gridPtr, *dof, true, false, 3 ); checkCorner( *gridPtr, *dof, false, false, 4 ); } - + if(rank==1)//Up Center { checkLeftEdge( *gridPtr, *dof, true, false, 0 ); @@ -468,14 +465,14 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborTest ) checkDownEdge( *gridPtr, *dof, 
false, false, 4 ); checkCorner( *gridPtr, *dof, false, false, 5 ); } - + if(rank==2)//Up Right { checkLeftEdge( *gridPtr, *dof, true, false, 1 ); checkCorner( *gridPtr, *dof, false, true, 4 ); checkDownEdge( *gridPtr, *dof, false, true, 5 ); } - + if(rank==3)//Center Left { checkUpEdge( *gridPtr, *dof, true, false, 0 ); @@ -484,7 +481,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborTest ) checkDownEdge( *gridPtr, *dof, true, false, 6 ); checkCorner( *gridPtr, *dof, false, false, 7 ); } - + if(rank==4)//Center Center { checkCorner( *gridPtr, *dof, true, true, 0 ); @@ -496,7 +493,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborTest ) checkDownEdge( *gridPtr, *dof, false, false, 7 ); checkCorner( *gridPtr, *dof, false, false, 8 ); } - + if(rank==5)//Center Right { checkCorner( *gridPtr, *dof, true, true, 1 ); @@ -505,14 +502,14 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborTest ) checkCorner( *gridPtr, *dof, false, true, 7 ); checkDownEdge( *gridPtr, *dof, false, true, 8 ); } - + if(rank==6)//Down Left { checkUpEdge( *gridPtr, *dof, true, false, 3 ); checkCorner( *gridPtr, *dof, true, false, 4 ); checkRightEdge( *gridPtr, *dof, false, true, 7 ); } - + if(rank==7) //Down Center { checkCorner( *gridPtr, *dof, true, true, 3 ); @@ -521,77 +518,77 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborTest ) checkLeftEdge( *gridPtr, *dof, false, true, 6 ); checkRightEdge( *gridPtr, *dof, false, true, 8 ); } - + if(rank==8) //Down Right { checkCorner( *gridPtr, *dof, true, true, 4 ); checkUpEdge( *gridPtr, *dof, false, true, 5 ); checkLeftEdge( *gridPtr, *dof, false, true, 7 ); - } + } } -// TODO: Fix tests for periodic BC - +// TODO: Fix tests for periodic BC - // checkLeftBoundary -> checkLeft Overlap etc. 
for direction BoundaryToOverlap // Fix the tests with mask to work with the direction OverlapToBoundary /* TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithoutMask ) { // Setup periodic boundaries - // TODO: I do not know how to do it better with GTEST - additional setup + // TODO: I do not know how to do it better with GTEST - additional setup // of the periodic boundaries typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); distributedGrid->setupGrid(*gridPtr); dof->setSize( gridPtr->template getEntitiesCount< Cell >() ); meshFunctionPtr->bind( gridPtr, *dof ); - + //Expecting 9 processes setDof_2D(*dof, -rank-1 ); constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr ); //meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary ); - meshFunctionPtr->template synchronize( true ); - + meshFunctionPtr->synchronize( true ); + if( rank == 0 ) { SCOPED_TRACE( "Up Left" ); checkLeftBoundary( *gridPtr, *dof, false, true, -3 ); checkUpBoundary( *gridPtr, *dof, false, true, -7 ); } - + if( rank == 1 ) { SCOPED_TRACE( "Up Center" ); checkUpBoundary( *gridPtr, *dof, true, true, -8 ); } - + if( rank == 2 ) { SCOPED_TRACE( "Up Right" ); checkRightBoundary( *gridPtr, *dof, false, true, -1 ); checkUpBoundary( *gridPtr, *dof, true, false, -9 ); } - + if( rank == 3 ) { SCOPED_TRACE( "Center Left" ); checkLeftBoundary( *gridPtr, *dof, true, true, -6 ); - } - + } + if( rank == 5 ) { SCOPED_TRACE( "Center Right" ); checkRightBoundary( *gridPtr, *dof, true, true, -4 ); } - + if( rank == 6 ) { SCOPED_TRACE( "Down Left" ); checkDownBoundary( *gridPtr, *dof, false, true, -1 ); checkLeftBoundary( *gridPtr, *dof, true, false, -9 ); } - + if( rank 
== 7 ) { SCOPED_TRACE( "Down Center" ); @@ -609,10 +606,10 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithoutMask TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithActiveMask ) { // Setup periodic boundaries - // TODO: I do not know how to do it better with GTEST - additional setup + // TODO: I do not know how to do it better with GTEST - additional setup // of the periodic boundaries typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); distributedGrid->setupGrid(*gridPtr); @@ -620,13 +617,13 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithActiveM maskDofs.setSize( gridPtr->template getEntitiesCount< Cell >() ); meshFunctionPtr->bind( gridPtr, *dof ); maskPointer->bind( gridPtr, maskDofs ); - + //Expecting 9 processes setDof_2D(*dof, -rank-1 ); maskDofs.setValue( true ); constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr ); meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary ); - meshFunctionPtr->template synchronize( true, maskPointer ); + meshFunctionPtr->synchronize( true, maskPointer ); if( rank == 0 ) { @@ -634,39 +631,39 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithActiveM checkLeftBoundary( *gridPtr, *dof, false, true, -3 ); checkUpBoundary( *gridPtr, *dof, false, true, -7 ); } - + if( rank == 1 ) { SCOPED_TRACE( "Up Center" ); checkUpBoundary( *gridPtr, *dof, true, true, -8 ); } - + if( rank == 2 ) { SCOPED_TRACE( "Up Right" ); checkRightBoundary( *gridPtr, *dof, false, true, -1 ); checkUpBoundary( *gridPtr, *dof, true, false, -9 ); } - + if( rank == 3 ) { SCOPED_TRACE( "Center Left" ); checkLeftBoundary( *gridPtr, *dof, 
true, true, -6 ); - } - + } + if( rank == 5 ) { SCOPED_TRACE( "Center Right" ); checkRightBoundary( *gridPtr, *dof, true, true, -4 ); } - + if( rank == 6 ) { SCOPED_TRACE( "Down Left" ); checkDownBoundary( *gridPtr, *dof, false, true, -1 ); checkLeftBoundary( *gridPtr, *dof, true, false, -9 ); } - + if( rank == 7 ) { SCOPED_TRACE( "Down Center" ); @@ -684,10 +681,10 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithActiveM TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInactiveMaskOnLeft ) { // Setup periodic boundaries - // TODO: I do not know how to do it better with GTEST - additional setup + // TODO: I do not know how to do it better with GTEST - additional setup // of the periodic boundaries typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); distributedGrid->setupGrid(*gridPtr); @@ -695,7 +692,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInactiv maskDofs.setSize( gridPtr->template getEntitiesCount< Cell >() ); meshFunctionPtr->bind( gridPtr, *dof ); maskPointer->bind( gridPtr, maskDofs ); - + //Expecting 9 processes setDof_2D(*dof, -rank-1 ); maskDofs.setValue( true ); @@ -711,47 +708,47 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInactiv } constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr ); meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary ); - meshFunctionPtr->template synchronize( true, maskPointer ); - + meshFunctionPtr->synchronize( true, maskPointer ); + if( rank == 0 ) { SCOPED_TRACE( "Up Left" ); checkLeftBoundary( *gridPtr, *dof, false, true, 0 ); checkUpBoundary( *gridPtr, *dof, false, true, -7 ); } - + 
if( rank == 1 ) { SCOPED_TRACE( "Up Center" ); checkUpBoundary( *gridPtr, *dof, true, true, -8 ); } - + if( rank == 2 ) { SCOPED_TRACE( "Up Right" ); checkRightBoundary( *gridPtr, *dof, false, true, -1 ); checkUpBoundary( *gridPtr, *dof, true, false, -9 ); } - + if( rank == 3 ) { SCOPED_TRACE( "Center Left" ); checkLeftBoundary( *gridPtr, *dof, true, true, 3 ); - } - + } + if( rank == 5 ) { SCOPED_TRACE( "Center Right" ); checkRightBoundary( *gridPtr, *dof, true, true, -4 ); } - + if( rank == 6 ) { SCOPED_TRACE( "Down Left" ); checkDownBoundary( *gridPtr, *dof, false, true, -1 ); checkLeftBoundary( *gridPtr, *dof, true, false, 6 ); } - + if( rank == 7 ) { SCOPED_TRACE( "Down Center" ); @@ -769,10 +766,10 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInactiv TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiveMaskOnRight ) { // Setup periodic boundaries - // TODO: I do not know how to do it better with GTEST - additional setup + // TODO: I do not know how to do it better with GTEST - additional setup // of the periodic boundaries typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); distributedGrid->setupGrid(*gridPtr); @@ -780,7 +777,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv maskDofs.setSize( gridPtr->template getEntitiesCount< Cell >() ); meshFunctionPtr->bind( gridPtr, *dof ); maskPointer->bind( gridPtr, maskDofs ); - + //Expecting 9 processes setDof_2D(*dof, -rank-1 ); maskDofs.setValue( true ); @@ -796,47 +793,47 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv } constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr ); 
meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary ); - meshFunctionPtr->template synchronize( true, maskPointer ); - + meshFunctionPtr->synchronize( true, maskPointer ); + if( rank == 0 ) { SCOPED_TRACE( "Up Left" ); checkLeftBoundary( *gridPtr, *dof, false, true, -3 ); checkUpBoundary( *gridPtr, *dof, false, true, -7 ); } - + if( rank == 1 ) { SCOPED_TRACE( "Up Center" ); checkUpBoundary( *gridPtr, *dof, true, true, -8 ); } - + if( rank == 2 ) { SCOPED_TRACE( "Up Right" ); checkRightBoundary( *gridPtr, *dof, false, true, 2 ); checkUpBoundary( *gridPtr, *dof, true, false, -9 ); } - + if( rank == 3 ) { SCOPED_TRACE( "Center Left" ); checkLeftBoundary( *gridPtr, *dof, true, true, -6 ); - } - + } + if( rank == 5 ) { SCOPED_TRACE( "Center Right" ); checkRightBoundary( *gridPtr, *dof, true, true, 5 ); } - + if( rank == 6 ) { SCOPED_TRACE( "Down Left" ); checkDownBoundary( *gridPtr, *dof, false, true, -1 ); checkLeftBoundary( *gridPtr, *dof, true, false, -9 ); } - + if( rank == 7 ) { SCOPED_TRACE( "Down Center" ); @@ -854,10 +851,10 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiveMaskUp ) { // Setup periodic boundaries - // TODO: I do not know how to do it better with GTEST - additional setup + // TODO: I do not know how to do it better with GTEST - additional setup // of the periodic boundaries typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); distributedGrid->setupGrid(*gridPtr); @@ -865,7 +862,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv maskDofs.setSize( gridPtr->template getEntitiesCount< Cell >() ); 
meshFunctionPtr->bind( gridPtr, *dof ); maskPointer->bind( gridPtr, maskDofs ); - + //Expecting 9 processes setDof_2D(*dof, -rank-1 ); maskDofs.setValue( true ); @@ -881,47 +878,47 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv } constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr ); meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary ); - meshFunctionPtr->template synchronize( true, maskPointer ); - + meshFunctionPtr->synchronize( true, maskPointer ); + if( rank == 0 ) { SCOPED_TRACE( "Up Left" ); checkLeftBoundary( *gridPtr, *dof, false, true, -3 ); checkUpBoundary( *gridPtr, *dof, false, true, 0 ); } - + if( rank == 1 ) { SCOPED_TRACE( "Up Center" ); checkUpBoundary( *gridPtr, *dof, true, true, 1 ); } - + if( rank == 2 ) { SCOPED_TRACE( "Up Right" ); checkRightBoundary( *gridPtr, *dof, false, true, -1 ); checkUpBoundary( *gridPtr, *dof, true, false, 2 ); } - + if( rank == 3 ) { SCOPED_TRACE( "Center Left" ); checkLeftBoundary( *gridPtr, *dof, true, true, -6 ); - } - + } + if( rank == 5 ) { SCOPED_TRACE( "Center Right" ); checkRightBoundary( *gridPtr, *dof, true, true, -4 ); } - + if( rank == 6 ) { SCOPED_TRACE( "Down Left" ); checkDownBoundary( *gridPtr, *dof, false, true, -1 ); checkLeftBoundary( *gridPtr, *dof, true, false, -9 ); } - + if( rank == 7 ) { SCOPED_TRACE( "Down Center" ); @@ -939,10 +936,10 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiveMaskDown ) { // Setup periodic boundaries - // TODO: I do not know how to do it better with GTEST - additional setup + // TODO: I do not know how to do it better with GTEST - additional setup // of the periodic boundaries typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< 
GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1, 1, 1 ); distributedGrid->setOverlaps( lowerOverlap, upperOverlap ); distributedGrid->setupGrid(*gridPtr); @@ -950,7 +947,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv maskDofs.setSize( gridPtr->template getEntitiesCount< Cell >() ); meshFunctionPtr->bind( gridPtr, *dof ); maskPointer->bind( gridPtr, maskDofs ); - + //Expecting 9 processes setDof_2D(*dof, -rank-1 ); maskDofs.setValue( true ); @@ -966,47 +963,47 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv } constFunctionEvaluator.evaluateAllEntities( meshFunctionPtr , constFunctionPtr ); meshFunctionPtr->getSynchronizer().setPeriodicBoundariesCopyDirection( Synchronizer::OverlapToBoundary ); - meshFunctionPtr->template synchronize( true, maskPointer ); - + meshFunctionPtr->synchronize( true, maskPointer ); + if( rank == 0 ) { SCOPED_TRACE( "Up Left" ); checkLeftBoundary( *gridPtr, *dof, false, true, -3 ); checkUpBoundary( *gridPtr, *dof, false, true, -7 ); } - + if( rank == 1 ) { SCOPED_TRACE( "Up Center" ); checkUpBoundary( *gridPtr, *dof, true, true, -8 ); } - + if( rank == 2 ) { SCOPED_TRACE( "Up Right" ); checkRightBoundary( *gridPtr, *dof, false, true, -1 ); checkUpBoundary( *gridPtr, *dof, true, false, -9 ); } - + if( rank == 3 ) { SCOPED_TRACE( "Center Left" ); checkLeftBoundary( *gridPtr, *dof, true, true, -6 ); - } - + } + if( rank == 5 ) { SCOPED_TRACE( "Center Right" ); checkRightBoundary( *gridPtr, *dof, true, true, -4 ); } - + if( rank == 6 ) { SCOPED_TRACE( "Down Left" ); checkDownBoundary( *gridPtr, *dof, false, true, 6 ); checkLeftBoundary( *gridPtr, *dof, true, false, -9 ); } - + if( rank == 7 ) { SCOPED_TRACE( "Down Center" ); @@ -1020,7 +1017,7 @@ TEST_F(DistributedGridTest_2D, SynchronizerNeighborPeriodicBoundariesWithInActiv checkRightBoundary( *gridPtr, *dof, true, false, -7 ); } } -*/ +*/ #endif #endif diff --git 
a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_3D.cpp b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_3D.cpp index 765341c1e5c2361c113ee0c2a83a0e435a3ebf3c..4f552dee5455c576111c60d783aee34d45aeb213 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_3D.cpp +++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_3D.cpp @@ -1,9 +1,8 @@ -#ifdef HAVE_GTEST +#ifdef HAVE_GTEST #include -#ifdef HAVE_MPI +#ifdef HAVE_MPI -#include #include #include #include @@ -16,8 +15,7 @@ using namespace TNL::Containers; using namespace TNL::Meshes; using namespace TNL::Functions; using namespace TNL::Devices; -using namespace TNL::Communicators; -using namespace TNL::Meshes::DistributedMeshes; +using namespace TNL::Meshes::DistributedMeshes; template void setDof_3D(DofType &dof, typename DofType::RealType value) @@ -49,14 +47,14 @@ void checkConner(const GridType &grid, const DofType &dof,bool bottom, bool nort { int i=getAdd(grid,bottom,north,west); EXPECT_EQ( dof[i], expectedValue) << "Conner test failed"; - + } template void checkXDirectionEdge(const GridType &grid, const DofType &dof, bool bottom, bool north, typename DofType::RealType expectedValue) { - int add=getAdd(grid,bottom,north,true); - for(int i=1;i void checkYDirectionEdge(const GridType &grid, const DofType &dof, bool bottom, bool west, typename DofType::RealType expectedValue) { int add=getAdd(grid,bottom,true,west); - for(int i=1;i void checkZDirectionEdge(const GridType &grid, const DofType &dof, bool north, bool west, typename DofType::RealType expectedValue) { int add=getAdd(grid,true,north,west); - for(int i=1;i @@ -429,7 +427,7 @@ void CheckYFaceNode_Overlap(const GridType &grid, const DofType &dof,bool north, checkXFace(grid, dof, true, expectedValue); checkYFace(grid, dof, !north, expectedValue); checkZFace(grid, dof, false, expectedValue); - checkZFace(grid, dof, true, expectedValue); + checkZFace(grid, dof, true, expectedValue); } template @@ -451,7 
+449,7 @@ void CheckZFaceNode_Overlap(const GridType &grid, const DofType &dof,bool bottom checkXFace(grid, dof, true, expectedValue); checkYFace(grid, dof, false, expectedValue); checkYFace(grid, dof, true, expectedValue); - checkZFace(grid, dof, !bottom, expectedValue); + checkZFace(grid, dof, !bottom, expectedValue); } template @@ -484,11 +482,11 @@ void CheckCentralNode_Overlap(const GridType &grid, const DofType &dof,typename checkYFace(grid, dof, false, expectedValue); checkYFace(grid, dof, true, expectedValue); checkZFace(grid, dof, false, expectedValue); - checkZFace(grid, dof, true, expectedValue); + checkZFace(grid, dof, true, expectedValue); } /* -* Expected 27 processes. +* Expected 27 processes. */ template void check_Overlap_3D(int rank, const GridType &grid, const DofType &dof, typename DofType::RealType expectedValue) @@ -499,7 +497,7 @@ void check_Overlap_3D(int rank, const GridType &grid, const DofType &dof, typena if(rank==1) CheckXEdgeNode_Overlap(grid,dof,true,true,expectedValue); - if(rank==2) + if(rank==2) CheckConnerNode_Overlap(grid,dof,true,true,false,expectedValue); if(rank==3) @@ -553,7 +551,7 @@ void check_Overlap_3D(int rank, const GridType &grid, const DofType &dof, typena if(rank==19) CheckXEdgeNode_Overlap(grid,dof,false,true,expectedValue); - if(rank==20) + if(rank==20) CheckConnerNode_Overlap(grid,dof,false,true,false,expectedValue); if(rank==21) @@ -590,19 +588,18 @@ void check_Inner_3D(int rank, const GridType& grid, const DofType& dof, typename /* - * Light check of 3D distributed grid and its synchronization. + * Light check of 3D distributed grid and its synchronization. 
* expected 27 processes */ -typedef MpiCommunicator CommunicatorType; typedef Grid<3,double,Host,int> GridType; typedef MeshFunctionView MeshFunctionType; typedef Vector DofType; typedef typename GridType::Cell Cell; -typedef typename GridType::IndexType IndexType; -typedef typename GridType::PointType PointType; +typedef typename GridType::IndexType IndexType; +typedef typename GridType::PointType PointType; typedef DistributedMesh DistributedGridType; using Synchronizer = DistributedMeshSynchronizer< DistributedGridType >; - + class DistributedGirdTest_3D : public ::testing::Test { protected: @@ -620,14 +617,14 @@ class DistributedGirdTest_3D : public ::testing::Test Pointers::SharedPointer< LinearFunction, Host > linearFunctionPtr; int rank; - int nproc; + int nproc; void SetUp() { int size=10; - rank=CommunicatorType::GetRank(CommunicatorType::AllGroup); - nproc=CommunicatorType::GetSize(CommunicatorType::AllGroup); + rank=TNL::MPI::GetRank(); + nproc=TNL::MPI::GetSize(); PointType globalOrigin; PointType globalProportions; @@ -635,7 +632,7 @@ class DistributedGirdTest_3D : public ::testing::Test globalOrigin.x()=-0.5; globalOrigin.y()=-0.5; - globalOrigin.z()=-0.5; + globalOrigin.z()=-0.5; globalProportions.x()=size; globalProportions.y()=size; globalProportions.z()=size; @@ -645,17 +642,17 @@ class DistributedGirdTest_3D : public ::testing::Test distributedGrid=new DistributedGridType(); distributedGrid->setDomainDecomposition( typename DistributedGridType::CoordinatesType( 3, 3, 3 ) ); - distributedGrid->template setGlobalGrid( globalGrid ); - distributedGrid->setupGrid(*gridptr); + distributedGrid->setGlobalGrid( globalGrid ); + distributedGrid->setupGrid(*gridptr); typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< GridType, CommunicatorType >:: + SubdomainOverlapsGetter< GridType >:: getOverlaps( distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid->setOverlaps( lowerOverlap, 
upperOverlap ); distributedGrid->setupGrid(*gridptr); dof=new DofType(gridptr->template getEntitiesCount< Cell >()); - meshFunctionptr->bind(gridptr,*dof); + meshFunctionptr->bind(gridptr,*dof); constFunctionPtr->Number=rank; } @@ -697,17 +694,17 @@ TEST_F(DistributedGirdTest_3D, evaluateInteriorEntities) check_Boundary_3D(rank, *gridptr, *dof, -1); check_Overlap_3D(rank, *gridptr, *dof, -1); check_Inner_3D(rank, *gridptr, *dof, rank); -} +} TEST_F(DistributedGirdTest_3D, LinearFunctionTest) { - //fill meshfunction with linear function (physical center of cell corresponds with its coordinates in grid) + //fill meshfunction with linear function (physical center of cell corresponds with its coordinates in grid) setDof_3D(*dof,-1); linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr, linearFunctionPtr); Synchronizer synchronizer; synchronizer.setDistributedGrid( meshFunctionptr->getMesh().getDistributedMesh() ); - synchronizer.template synchronize( *meshFunctionptr ); - + synchronizer.synchronize( *meshFunctionptr ); + int count =gridptr->template getEntitiesCount< Cell >(); for(int i=0;i #include #include -#include -#include #include #include #include @@ -33,9 +31,6 @@ using namespace TNL::Meshes::DistributedMeshes; // cannot be deduced from the grid using LocalIndexType = short int; -// we test only with MPI -using CommunicatorType = Communicators::MpiCommunicator; -using CommunicationGroup = typename CommunicatorType::CommunicationGroup; template< typename Mesh > struct GridDistributor; @@ -55,9 +50,9 @@ struct GridDistributor< TNL::Meshes::Grid< 2, Real, Device, Index > > GridDistributor() = delete; - GridDistributor( CoordinatesType rank_sizes, CommunicationGroup group ) - : rank(CommunicatorType::GetRank(group)), - nproc(CommunicatorType::GetSize(group)), + GridDistributor( CoordinatesType rank_sizes, MPI_Comm group ) + : rank(TNL::MPI::GetRank(group)), + nproc(TNL::MPI::GetSize(group)), rank_sizes(rank_sizes), group(group) {} @@ -329,7 +324,7 @@ struct 
GridDistributor< TNL::Meshes::Grid< 2, Real, Device, Index > > // input parameters int rank, nproc; CoordinatesType rank_sizes; - CommunicationGroup group; + MPI_Comm group; // output attributes (byproduct of the decomposition, useful for testing) CoordinatesType rank_coordinates, local_size, vert_begin, vert_end, cell_begin, cell_end; Index verticesCount, cellsCount, localVerticesCount, localCellsCount; @@ -342,7 +337,7 @@ void validateMesh( const Mesh& mesh, const Distributor& distributor, int ghostLe using Device = typename Mesh::DeviceType; // check basic interface - EXPECT_EQ( mesh.getCommunicationGroup(), CommunicatorType::AllGroup ); + EXPECT_EQ( mesh.getCommunicationGroup(), TNL::MPI::AllGroup() ); EXPECT_EQ( mesh.getGhostLevels(), ghostLevels ); if( ghostLevels > 0 ) { EXPECT_EQ( mesh.template getGlobalIndices< 0 >().getSize(), mesh.getLocalMesh().template getEntitiesCount< 0 >() ); @@ -399,12 +394,12 @@ void validateMesh( const Mesh& mesh, const Distributor& distributor, int ghostLe Containers::Array< Index, Device > vert_sendbuf( distributor.nproc ), cell_sendbuf( distributor.nproc ); vert_sendbuf.setValue( distributor.localVerticesCount ); cell_sendbuf.setValue( distributor.localCellsCount ); - CommunicatorType::Alltoall( vert_sendbuf.getData(), 1, - vert_offsets.getData(), 1, - distributor.group ); - CommunicatorType::Alltoall( cell_sendbuf.getData(), 1, - cell_offsets.getData(), 1, - distributor.group ); + TNL::MPI::Alltoall( vert_sendbuf.getData(), 1, + vert_offsets.getData(), 1, + distributor.group ); + TNL::MPI::Alltoall( cell_sendbuf.getData(), 1, + cell_offsets.getData(), 1, + distributor.group ); } vert_offsets.setElement( distributor.nproc, 0 ); cell_offsets.setElement( distributor.nproc, 0 ); @@ -662,7 +657,7 @@ void testSynchronizerOnDevice_entity_centers( const MeshType& mesh ) if( received != center ) { IndexType cellIndexes[ 2 ] = {0, 0}; const int numCells = getCellsForFace( mesh.getLocalMesh(), i, cellIndexes ); - std::cerr << "rank " << 
CommunicatorType::GetRank() + std::cerr << "rank " << TNL::MPI::GetRank() << ": wrong result for entity " << i << " (gid " << mesh.template getGlobalIndices< EntityType::getEntityDimension() >()[i] << ")" << " of dimension = " << EntityType::getEntityDimension() << ": received " << received << ", expected = " << center @@ -672,7 +667,7 @@ void testSynchronizerOnDevice_entity_centers( const MeshType& mesh ) } } if( errors > 0 ) - FAIL() << "rank " << CommunicatorType::GetRank() << ": " << errors << " errors in total." << std::endl; + FAIL() << "rank " << TNL::MPI::GetRank() << ": " << errors << " errors in total." << std::endl; } template< typename Device, typename EntityType, typename MeshType > @@ -704,10 +699,10 @@ TEST( DistributedMeshTest, 2D_ghostLevel0 ) using Mesh = DistributedMesh< LocalMesh >; GridType grid; grid.setDomain( {0, 0}, {1, 1} ); - const int nproc = CommunicatorType::GetSize(); + const int nproc = TNL::MPI::GetSize(); grid.setDimensions( nproc, nproc ); Mesh mesh; - GridDistributor< GridType > distributor( std::sqrt(nproc), CommunicatorType::AllGroup ); + GridDistributor< GridType > distributor( std::sqrt(nproc), TNL::MPI::AllGroup() ); const int ghostLevels = 0; distributor.decompose( grid, mesh, ghostLevels ); validateMesh( mesh, distributor, ghostLevels ); @@ -721,10 +716,10 @@ TEST( DistributedMeshTest, 2D_ghostLevel1 ) using Mesh = DistributedMesh< LocalMesh >; GridType grid; grid.setDomain( {0, 0}, {1, 1} ); - const int nproc = CommunicatorType::GetSize(); + const int nproc = TNL::MPI::GetSize(); grid.setDimensions( nproc, nproc ); Mesh mesh; - GridDistributor< GridType > distributor( std::sqrt(nproc), CommunicatorType::AllGroup ); + GridDistributor< GridType > distributor( std::sqrt(nproc), TNL::MPI::AllGroup() ); const int ghostLevels = 1; distributor.decompose( grid, mesh, ghostLevels ); validateMesh( mesh, distributor, ghostLevels ); @@ -739,10 +734,10 @@ TEST( DistributedMeshTest, 2D_ghostLevel2 ) using Mesh = DistributedMesh< 
LocalMesh >; GridType grid; grid.setDomain( {0, 0}, {1, 1} ); - const int nproc = CommunicatorType::GetSize(); + const int nproc = TNL::MPI::GetSize(); grid.setDimensions( nproc, nproc ); Mesh mesh; - GridDistributor< GridType > distributor( std::sqrt(nproc), CommunicatorType::AllGroup ); + GridDistributor< GridType > distributor( std::sqrt(nproc), TNL::MPI::AllGroup() ); const int ghostLevels = 2; distributor.decompose( grid, mesh, ghostLevels ); validateMesh( mesh, distributor, ghostLevels ); @@ -757,10 +752,10 @@ TEST( DistributedMeshTest, PVTUWriterReader ) using Mesh = DistributedMesh< LocalMesh >; GridType grid; grid.setDomain( {0, 0}, {1, 1} ); - const int nproc = CommunicatorType::GetSize(); + const int nproc = TNL::MPI::GetSize(); grid.setDimensions( nproc, nproc ); Mesh mesh; - GridDistributor< GridType > distributor( std::sqrt(nproc), CommunicatorType::AllGroup ); + GridDistributor< GridType > distributor( std::sqrt(nproc), TNL::MPI::AllGroup() ); const int ghostLevels = 2; distributor.decompose( grid, mesh, ghostLevels ); @@ -770,7 +765,7 @@ TEST( DistributedMeshTest, PVTUWriterReader ) std::string subfilePath; { std::ofstream file; - if( CommunicatorType::GetRank() == 0 ) + if( TNL::MPI::GetRank() == 0 ) file.open( mainFilePath ); using PVTU = Meshes::Writers::PVTUWriter< LocalMesh >; PVTU pvtu( file ); @@ -781,7 +776,7 @@ TEST( DistributedMeshTest, PVTUWriterReader ) pvtu.template writePCellData< std::uint8_t >( Meshes::VTK::ghostArrayName() ); pvtu.template writePCellData< typename Mesh::GlobalIndexType >( "GlobalIndex" ); } - subfilePath = pvtu.template addPiece< CommunicatorType >( mainFilePath, mesh.getCommunicationGroup() ); + subfilePath = pvtu.addPiece( mainFilePath, mesh.getCommunicationGroup() ); // create a .vtu file for local data using Writer = Meshes::Writers::VTUWriter< LocalMesh >; @@ -799,7 +794,7 @@ TEST( DistributedMeshTest, PVTUWriterReader ) } // load and test - CommunicatorType::Barrier(); + TNL::MPI::Barrier(); 
Readers::PVTUReader reader( mainFilePath ); reader.detectMesh(); EXPECT_EQ( reader.getMeshType(), "Meshes::DistributedMesh" ); @@ -813,8 +808,8 @@ TEST( DistributedMeshTest, PVTUWriterReader ) // cleanup EXPECT_EQ( fs::remove( subfilePath ), true ); - CommunicatorType::Barrier(); - if( CommunicatorType::GetRank() == 0 ) { + TNL::MPI::Barrier(); + if( TNL::MPI::GetRank() == 0 ) { EXPECT_EQ( fs::remove( mainFilePath ), true ); EXPECT_EQ( fs::remove( baseName ), true ); } diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTest.cpp index 0a5ab3e37da3c3f0360fad0525425df95d2d6c9e..9bdccbcdb7006c8f45f00865b2c3e6e60456095f 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTest.cpp +++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTest.cpp @@ -2,13 +2,8 @@ #include #ifdef HAVE_MPI -#include #include "DistributedVectorFieldIO_MPIIOTestBase.h" -using namespace TNL::Communicators; - -typedef MpiCommunicator CommunicatorType; - TEST( DistributedVectorFieldIO_MPIIO, Save_1D ) { TestDistributedVectorFieldMPIIO<1,2,Host>::TestSave(); diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTestBase.h b/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTestBase.h index f35ec8e089621027f065ef764631494b87500982..d6791e1df9d27d9d89ef206c0d3be45288d80c3f 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTestBase.h +++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTestBase.h @@ -1,4 +1,3 @@ -#include #include #include #include @@ -17,13 +16,10 @@ using namespace TNL::Containers; using namespace TNL::Meshes; using namespace TNL::Functions; using namespace TNL::Devices; -using namespace TNL::Communicators; using namespace TNL::Meshes::DistributedMeshes; 
//------------------------------------------------------------------------------ -typedef MpiCommunicator CommunicatorType; - template class TestDistributedVectorFieldMPIIO{ public: @@ -33,8 +29,8 @@ class TestDistributedVectorFieldMPIIO{ typedef VectorField VectorFieldType; typedef Vector DofType; typedef typename MeshType::Cell Cell; - typedef typename MeshType::IndexType IndexType; - typedef typename MeshType::PointType PointType; + typedef typename MeshType::IndexType IndexType; + typedef typename MeshType::PointType PointType; typedef DistributedMesh DistributedGridType; typedef typename DistributedGridType::CoordinatesType CoordinatesType; @@ -43,8 +39,8 @@ class TestDistributedVectorFieldMPIIO{ static void TestSave() { Pointers::SharedPointer< LinearFunctionType, Device > linearFunctionPtr; - MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator; - + MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator; + //save distributed meshfunction into file PointType globalOrigin; globalOrigin.setValue(-0.5); @@ -55,14 +51,14 @@ class TestDistributedVectorFieldMPIIO{ Pointers::SharedPointer globalGrid; globalGrid->setDimensions(globalProportions); globalGrid->setDomain(globalOrigin,globalProportions); - + DistributedGridType distributedGrid; - distributedGrid.template setGlobalGrid( *globalGrid ); + distributedGrid.setGlobalGrid( *globalGrid ); - Pointers::SharedPointer gridptr; + Pointers::SharedPointer gridptr; distributedGrid.setupGrid(*gridptr); typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); distributedGrid.setupGrid(*gridptr); @@ -74,10 +70,10 @@ class TestDistributedVectorFieldMPIIO{ 
DofType dof(vctdim*(gridptr->template getEntitiesCount< Cell >())); dof.setValue(0); vectorField.bind(gridptr,dof); - + for(int i=0;i ::save(FileName, vectorField ); /*File file; @@ -86,7 +82,7 @@ class TestDistributedVectorFieldMPIIO{ file.close(); */ //first process compare results - if(CommunicatorType::GetRank(CommunicatorType::AllGroup)==0) + if(TNL::MPI::GetRank()==0) { DofType globalEvaluatedDof(vctdim*(globalGrid->template getEntitiesCount< Cell >())); @@ -101,7 +97,7 @@ class TestDistributedVectorFieldMPIIO{ loadvct.bind(globalGrid,loadDof); loadDof.setValue(-1); - + File file; file.open( FileName, std::ios_base::in ); loadvct.boundLoad(file); @@ -111,13 +107,13 @@ class TestDistributedVectorFieldMPIIO{ } } }; - + static void TestLoad() { Pointers::SharedPointer< LinearFunctionType, Device > linearFunctionPtr; - MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator; + MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator; - //Crete distributed grid + //Crete distributed grid PointType globalOrigin; globalOrigin.setValue(-0.5); @@ -131,26 +127,26 @@ class TestDistributedVectorFieldMPIIO{ CoordinatesType overlap; overlap.setValue(1); DistributedGridType distributedGrid; - distributedGrid.template setGlobalGrid(*globalGrid); + distributedGrid.setGlobalGrid(*globalGrid); typename DistributedGridType::SubdomainOverlapsType lowerOverlap, upperOverlap; - SubdomainOverlapsGetter< MeshType, CommunicatorType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); + SubdomainOverlapsGetter< MeshType >::getOverlaps( &distributedGrid, lowerOverlap, upperOverlap, 1 ); distributedGrid.setOverlaps( lowerOverlap, upperOverlap ); - String FileName=String("/tmp/test-file.tnl"); + String FileName=String("/tmp/test-file.tnl"); - //Prepare file - if(CommunicatorType::GetRank(CommunicatorType::AllGroup)==0) - { + //Prepare file + if(TNL::MPI::GetRank()==0) + { DofType saveDof(vctdim*(globalGrid->template 
getEntitiesCount< Cell >())); VectorFieldType saveVectorField; saveVectorField.bind(globalGrid,saveDof); for(int i=0;i loadGridptr; VectorFieldType loadVectorField; distributedGrid.setupGrid(*loadGridptr); - + DofType loadDof(vctdim*(loadGridptr->template getEntitiesCount< Cell >())); loadDof.setValue(0); loadVectorField.bind(loadGridptr,loadDof); @@ -169,26 +165,26 @@ class TestDistributedVectorFieldMPIIO{ synchronizer.setDistributedGrid( &distributedGrid ); for(int i=0;i(*loadVectorField[i]); //need synchronization for overlaps to be filled corectly in loadDof + synchronizer.synchronize(*loadVectorField[i]); //need synchronization for overlaps to be filled corectly in loadDof Pointers::SharedPointer evalGridPtr; VectorFieldType evalVectorField; distributedGrid.setupGrid(*evalGridPtr); - + DofType evalDof(vctdim*(evalGridPtr->template getEntitiesCount< Cell >())); evalDof.setValue(-1); evalVectorField.bind(evalGridPtr,evalDof); - + for(int i=0;i(*evalVectorField[i]); + linearFunctionEvaluator.evaluateAllEntities(evalVectorField[i] , linearFunctionPtr); + synchronizer.synchronize(*evalVectorField[i]); } for(int i=0;i -#include -using CommunicatorType = TNL::Communicators::MpiCommunicator; +#include +#include #include @@ -37,7 +36,7 @@ public: // Called after a test ends. virtual void OnTestEnd(const ::testing::TestInfo& test_info) { - const int rank = CommunicatorType::GetRank(CommunicatorType::AllGroup); + const int rank = TNL::MPI::GetRank(); sout << test_info.test_case_name() << "." << test_info.name() << " End." < mpi(argc, argv); + TNL::MPI::ScopedInitializer mpi(argc, argv); #endif return RUN_ALL_TESTS(); #else