diff --git a/src/TNL/Containers/DistributedNDArray.h b/src/TNL/Containers/DistributedNDArray.h index c49e9e31b0250a333bf430e60612214e4d1585d0..48de78e78dd74551e79c4279c9f73269493e33da 100644 --- a/src/TNL/Containers/DistributedNDArray.h +++ b/src/TNL/Containers/DistributedNDArray.h @@ -18,8 +18,7 @@ namespace TNL { namespace Containers { -template< typename NDArray, - typename Overlaps = __ndarray_impl::make_constant_index_sequence< NDArray::getDimension(), 0 > > +template< typename NDArray > class DistributedNDArray { public: @@ -31,16 +30,13 @@ public: using PermutationType = typename NDArray::PermutationType; using LocalBeginsType = __ndarray_impl::LocalBeginsHolder< typename NDArray::SizesHolderType >; using LocalRangeType = Subrange< IndexType >; - using OverlapsType = Overlaps; - using LocalIndexerType = NDArrayIndexer< SizesHolderType, PermutationType, typename NDArray::NDBaseType, typename NDArray::StridesHolderType, Overlaps >; + using OverlapsType = typename NDArray::OverlapsType; - using ViewType = DistributedNDArrayView< typename NDArray::ViewType, Overlaps >; - using ConstViewType = DistributedNDArrayView< typename NDArray::ConstViewType, Overlaps >; + using ViewType = DistributedNDArrayView< typename NDArray::ViewType >; + using ConstViewType = DistributedNDArrayView< typename NDArray::ConstViewType >; using LocalViewType = typename NDArray::ViewType; using ConstLocalViewType = typename NDArray::ConstViewType; - static_assert( Overlaps::size() == NDArray::getDimension(), "invalid overlaps" ); - // all methods from NDArrayView DistributedNDArray() = default; @@ -134,11 +130,6 @@ public: return localArray.getStorageSize(); } - LocalIndexerType getLocalIndexer() const - { - return LocalIndexerType( localEnds - localBegins, typename NDArray::StridesHolderType{} ); - } - LocalViewType getLocalView() { return localArray.getView(); @@ -156,12 +147,12 @@ public: getStorageIndex( IndexTypes&&... indices ) const { static_assert( sizeof...( indices ) == SizesHolderType::getDimension(), "got wrong number of indices" ); - __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, OverlapsType{}, std::forward< IndexTypes >( indices )... ); auto getStorageIndex = [this]( auto&&... indices ) { return this->localArray.getStorageIndex( std::forward< decltype(indices) >( indices )... ); }; - return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, getStorageIndex, std::forward< IndexTypes >( indices )... ); + return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType >( localBegins, getStorageIndex, std::forward< IndexTypes >( indices )... ); } __cuda_callable__ @@ -183,8 +174,8 @@ public: operator()( IndexTypes&&... indices ) { static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); - __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); - return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, localArray, std::forward< IndexTypes >( indices )... ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, OverlapsType{}, std::forward< IndexTypes >( indices )... ); + return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType >( localBegins, localArray, std::forward< IndexTypes >( indices )... ); } template< typename... IndexTypes > @@ -193,8 +184,8 @@ public: operator()( IndexTypes&&... indices ) const { static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); - __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); - return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, localArray, std::forward< IndexTypes >( indices )... ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, OverlapsType{}, std::forward< IndexTypes >( indices )... ); + return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType >( localBegins, localArray, std::forward< IndexTypes >( indices )... ); } // bracket operator for 1D arrays @@ -203,8 +194,8 @@ public: operator[]( IndexType index ) { static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); - __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexType >( index ) ); - return localArray[ __ndarray_impl::get<0>( Overlaps{} ) + index - localBegins.template getSize< 0 >() ]; + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, OverlapsType{}, std::forward< IndexType >( index ) ); + return localArray[ index - localBegins.template getSize< 0 >() ]; } __cuda_callable__ @@ -212,8 +203,8 @@ public: operator[]( IndexType index ) const { static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); - __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexType >( index ) ); - return localArray[ __ndarray_impl::get<0>( Overlaps{} ) + index - localBegins.template getSize< 0 >() ]; + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, OverlapsType{}, std::forward< IndexType >( index ) ); + return localArray[ index - localBegins.template getSize< 0 >() ]; } __cuda_callable__ @@ -266,14 +257,14 @@ public: using Begins = __ndarray_impl::LocalBeginsHolder< SizesHolderType, 1 >; // add dynamic sizes Begins begins; - __ndarray_impl::SetSizesAddHelper< 1, Begins, SizesHolderType, Overlaps >::add( begins, SizesHolderType{} ); + __ndarray_impl::SetSizesAddHelper< 1, Begins, SizesHolderType, OverlapsType >::add( begins, SizesHolderType{} ); __ndarray_impl::SetSizesMaxHelper< Begins, LocalBeginsType >::max( begins, localBegins ); // subtract static sizes using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolderType, 1 >::type; // subtract dynamic sizes Ends ends; - __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolderType, Overlaps >::subtract( ends, globalSizes ); + __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolderType, OverlapsType >::subtract( ends, globalSizes ); __ndarray_impl::SetSizesMinHelper< Ends, SizesHolderType >::min( ends, localEnds ); __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; @@ -297,14 +288,14 @@ public: using SkipBegins = __ndarray_impl::LocalBeginsHolder< SizesHolderType, 1 >; // add dynamic sizes SkipBegins skipBegins; - __ndarray_impl::SetSizesAddHelper< 1, SkipBegins, SizesHolderType, Overlaps >::add( skipBegins, SizesHolderType{} ); + __ndarray_impl::SetSizesAddHelper< 1, SkipBegins, SizesHolderType, OverlapsType >::add( skipBegins, SizesHolderType{} ); __ndarray_impl::SetSizesMaxHelper< SkipBegins, LocalBeginsType >::max( skipBegins, localBegins ); // subtract static sizes using SkipEnds = typename __ndarray_impl::SubtractedSizesHolder< SizesHolderType, 1 >::type; // subtract dynamic sizes SkipEnds skipEnds; - __ndarray_impl::SetSizesSubtractHelper< 1, SkipEnds, SizesHolderType, Overlaps >::subtract( skipEnds, globalSizes ); + __ndarray_impl::SetSizesSubtractHelper< 1, SkipEnds, SizesHolderType, OverlapsType >::subtract( skipEnds, globalSizes ); __ndarray_impl::SetSizesMinHelper< SkipEnds, SizesHolderType >::min( skipEnds, localEnds ); __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; @@ -326,11 +317,11 @@ public: { // add overlaps to dynamic sizes LocalBeginsType begins; - __ndarray_impl::SetSizesAddOverlapsHelper< LocalBeginsType, SizesHolderType, Overlaps >::add( begins, localBegins ); + __ndarray_impl::SetSizesAddOverlapsHelper< LocalBeginsType, SizesHolderType, OverlapsType >::add( begins, localBegins ); // subtract overlaps from dynamic sizes SizesHolderType ends; - __ndarray_impl::SetSizesSubtractOverlapsHelper< SizesHolderType, SizesHolderType, Overlaps >::subtract( ends, localEnds ); + __ndarray_impl::SetSizesSubtractOverlapsHelper< SizesHolderType, SizesHolderType, OverlapsType >::subtract( ends, localEnds ); __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; dispatch( begins, ends, f ); @@ -342,11 +333,11 @@ public: { // add overlaps to dynamic sizes LocalBeginsType skipBegins; - __ndarray_impl::SetSizesAddOverlapsHelper< LocalBeginsType, SizesHolderType, Overlaps >::add( skipBegins, localBegins ); + __ndarray_impl::SetSizesAddOverlapsHelper< LocalBeginsType, SizesHolderType, OverlapsType >::add( skipBegins, localBegins ); // subtract overlaps from dynamic sizes SizesHolderType skipEnds; - __ndarray_impl::SetSizesSubtractOverlapsHelper< SizesHolderType, SizesHolderType, Overlaps >::subtract( skipEnds, localEnds ); + __ndarray_impl::SetSizesSubtractOverlapsHelper< SizesHolderType, SizesHolderType, OverlapsType >::subtract( skipEnds, localEnds ); __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; dispatch( localBegins, skipBegins, skipEnds, localEnds, f ); @@ -358,11 +349,11 @@ public: { // subtract overlaps from dynamic sizes LocalBeginsType begins; - __ndarray_impl::SetSizesSubtractOverlapsHelper< LocalBeginsType, SizesHolderType, Overlaps >::subtract( begins, localBegins ); + __ndarray_impl::SetSizesSubtractOverlapsHelper< LocalBeginsType, SizesHolderType, OverlapsType >::subtract( begins, localBegins ); // add overlaps to dynamic sizes SizesHolderType ends; - __ndarray_impl::SetSizesAddOverlapsHelper< SizesHolderType, SizesHolderType, Overlaps >::add( ends, localEnds ); + __ndarray_impl::SetSizesAddOverlapsHelper< SizesHolderType, SizesHolderType, OverlapsType >::add( ends, localEnds ); __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; dispatch( begins, localBegins, localEnds, ends, f ); @@ -428,12 +419,12 @@ public: getElement( IndexTypes&&... indices ) const { static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); - __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, OverlapsType{}, std::forward< IndexTypes >( indices )... ); auto getElement = [this]( auto&&... indices ) { return this->localArray.getElement( std::forward< decltype(indices) >( indices )... ); }; - return __ndarray_impl::host_call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, getElement, std::forward< IndexTypes >( indices )... ); + return __ndarray_impl::host_call_with_unshifted_indices< LocalBeginsType >( localBegins, getElement, std::forward< IndexTypes >( indices )... ); } void setValue( ValueType value ) @@ -464,8 +455,9 @@ private: if( begin == end ) localSizes.template setSize< level >( globalSizes.template getSize< level >() ); else { - TNL_ASSERT_GE( end - begin, (decltype(end)) __ndarray_impl::get( Overlaps{} ), "local size is less than the size of overlaps" ); - localSizes.template setSize< level >( end - begin + 2 * __ndarray_impl::get( Overlaps{} ) ); + TNL_ASSERT_GE( end - begin, (decltype(end)) __ndarray_impl::get( OverlapsType{} ), "local size is less than the size of overlaps" ); + //localSizes.template setSize< level >( end - begin + 2 * __ndarray_impl::get( OverlapsType{} ) ); + localSizes.template setSize< level >( end - begin ); } } }; diff --git a/src/TNL/Containers/DistributedNDArraySynchronizer.h b/src/TNL/Containers/DistributedNDArraySynchronizer.h index cea40bc21c3ec0c71ad891aafcfe620c9582754b..fcdb728cff5590368f76a7ccefbfaf83dd5628cf 100644 --- a/src/TNL/Containers/DistributedNDArraySynchronizer.h +++ b/src/TNL/Containers/DistributedNDArraySynchronizer.h @@ -13,13 +13,69 @@ #pragma once #include +// 3rd-party async library providing a thread-pool +#include #include #include +#include namespace TNL { namespace Containers { +enum class SyncDirection : std::uint8_t { + // special - sync in all directions + All = 0xff, + // sync directions like in LBM + None = 0, + Right = 1 << 0, + Left = 1 << 1, + + // TODO: for 2D distribution: + // Right = 1 << 0, + // Left = 1 << 1, + // Top = 1 << 2, + // Bottom = 1 << 3, + // TopRight = Top | Right, + // TopLeft = Top | Left, + // BottomRight = Bottom | Right, + // BottomLeft = Bottom | Left + + // TODO: for 3D distribution: + // Right = 1 << 0, + // Left = 1 << 1, + // Top = 1 << 2, + // Bottom = 1 << 3, + // Back = 1 << 4, + // Front = 1 << 5, + // TopRight = Top | Right, + // TopLeft = Top | Left, + // BottomRight = Bottom | Right, + // BottomLeft = Bottom | Left + // BackRight = Back | Right, + // BackLeft = Back | Left, + // FrontRight = Front | Right, + // FrontLeft = Front | Left, + // BackTop = Back | Top, + // BackBottom = Back | Bottom, + // FrontTop = Front | Top, + // FrontBottom = Front | Bottom, + // BackTopRight = Back | Top | Right, + // BackTopLeft = Back | Top | Left, + // BackBottomRight = Back | Bottom | Right, + // BackBottomLeft = Back | Bottom | Left, + // FrontTopRight = Front | Top | Right, + // FrontTopLeft = Front | Top | Left, + // FrontBottomRight = Front | Bottom | Right, + // FrontBottomLeft = Front | Bottom | Left, +}; + +inline bool +operator&( SyncDirection a, SyncDirection b ) +{ + return std::uint8_t(a) & std::uint8_t(b); +} + template< typename DistributedNDArray, // This can be set to false to optimize out buffering when it is not needed // (e.g. for LBM with 1D distribution and specific orientation of the ndarray) @@ -28,26 +84,68 @@ template< typename DistributedNDArray, bool LBM_HACK = false > class DistributedNDArraySynchronizer { +private: + // NOTE: async::threadpool has alignment requirements, which causes problems: + // - it may become misaligned in derived classes, see e.g. + // https://stackoverflow.com/a/46475498 + // solution: specify it as the first member of the base class + // - operator new before C++17 may not support over-aligned types, see + // https://stackoverflow.com/a/53485295 + // solution: relaxed alignment requirements to not exceed the value of + // alignof(std::max_align_t), which is the strongest alignment supported + // by plain new. See https://github.com/d36u9/async/pull/2 + async::threadpool tp; + + int gpu_id = 0; + + int tag_offset = 0; + + static int reserve_tags(int count) + { + static int offset = 0; + // we could use a post-increment, but we don't have to start from 0 either... + return offset += count; + } + public: + using RequestsVector = std::vector< MPI_Request >; + + enum class AsyncPolicy { + synchronous, + deferred, + threadpool, + async, + }; + +// DistributedNDArraySynchronizer(int max_threads = std::thread::hardware_concurrency()) + DistributedNDArraySynchronizer(int max_threads = 1) + : tp(max_threads), + tag_offset(reserve_tags(2)) // reserve 2 communication tags (for left and right) + {} + void synchronize( DistributedNDArray& array ) { - auto future = synchronizeAsync( array, std::launch::deferred ); - future.wait(); + synchronizeAsync( array, AsyncPolicy::synchronous ); } // This method is not thread-safe - only the thread which created and "owns" the // instance of this object can call this method. - // Also note that this method must not be called again until the previous - // asynchronous operation has finished. - std::shared_future synchronizeAsync( DistributedNDArray& array, std::launch policy = std::launch::async ) + // Also note that if (buffered == true), this method must not be called again until + // the previous asynchronous operation has finished. + void synchronizeAsync( DistributedNDArray& array, AsyncPolicy policy = AsyncPolicy::synchronous, SyncDirection mask = SyncDirection::All ) { + // wait for any previous synchronization (multiple objects can share the + // same synchronizer) + wait(); + + async_start_timer.start(); + // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/ #ifdef HAVE_CUDA if( std::is_same< typename DistributedNDArray::DeviceType, Devices::Cuda >::value ) cudaGetDevice(&this->gpu_id); #endif - // NOTE: the allocation cannot be done in the worker, otherwise CUDA would crash // skip allocation on repeated calls - compare only sizes, not the actual data if( array_view.getCommunicationGroup() != array.getCommunicationGroup() || array_view.getSizes() != array.getSizes() || @@ -55,6 +153,7 @@ public: array_view.getLocalEnds() != array.getLocalEnds() ) { array_view.bind( array.getView() ); + this->mask = mask; // allocate buffers Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), AllocateHelper >::execHost( buffers, array_view ); @@ -62,41 +161,98 @@ public: else { // only bind to the actual data array_view.bind( array.getView() ); + this->mask = mask; + } + + if( policy == AsyncPolicy::threadpool || policy == AsyncPolicy::async ) { + // everything offloaded to a separate thread + auto worker = [this] () { + // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/ + #ifdef HAVE_CUDA + if( std::is_same< typename DistributedNDArray::DeviceType, Devices::Cuda >::value ) + cudaSetDevice(this->gpu_id); + #endif + + auto requests = this->worker_init(); + MPI::Waitall( requests.data(), requests.size() ); + this->worker_finish(); + }; + + if( policy == AsyncPolicy::threadpool ) + async_op = tp.post( worker ); + else + async_op = std::async( std::launch::async, worker ); + } + else if( policy == AsyncPolicy::deferred ) { + // immediate start, deferred synchronization (but still in the same thread) + auto requests = worker_init(); + auto worker = [this, requests] () mutable { + MPI::Waitall( requests.data(), requests.size() ); + this->worker_finish(); + }; + this->async_op = std::async( std::launch::deferred, worker ); + } + else { + // synchronous + auto requests = this->worker_init(); + MPI::Waitall( requests.data(), requests.size() ); + this->worker_finish(); } - auto worker = [this](){ this->worker(); }; - return std::async( policy, worker ); + async_ops_count++; + async_start_timer.stop(); + } + + void wait() + { + if( async_op.valid() ) { + async_wait_timer.start(); + async_op.wait(); + async_wait_timer.stop(); + } } + ~DistributedNDArraySynchronizer() + { + if( this->async_op.valid() ) + this->async_op.wait(); + } + + /** + * \brief Can be used for checking if a synchronization started + * asynchronously has been finished. + */ + std::future< void > async_op; + + // attributes for profiling + Timer async_start_timer, async_wait_timer; + std::size_t async_ops_count = 0; + protected: using DistributedNDArrayView = typename DistributedNDArray::ViewType; using Buffers = __ndarray_impl::SynchronizerBuffers< DistributedNDArray >; DistributedNDArrayView array_view; + SyncDirection mask = SyncDirection::All; Buffers buffers; - int gpu_id = 0; - void worker() + RequestsVector worker_init() { - // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/ - #ifdef HAVE_CUDA - if( std::is_same< typename DistributedNDArray::DeviceType, Devices::Cuda >::value ) - cudaSetDevice(gpu_id); - #endif - // fill send buffers - Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), CopyHelper >::execHost( buffers, array_view, true ); + Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), CopyHelper >::execHost( buffers, array_view, true, mask ); // issue all send and receive async operations - std::vector< MPI_Request > requests; + RequestsVector requests; const MPI_Comm group = array_view.getCommunicationGroup(); - Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), SendHelper >::execHost( buffers, requests, group ); + Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), SendHelper >::execHost( buffers, requests, group, tag_offset, mask ); - // wait until send is done - MPI::Waitall( requests.data(), requests.size() ); + return requests; + } + void worker_finish() + { // copy data from receive buffers - Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), CopyHelper >::execHost( buffers, array_view, false ); + Algorithms::TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), CopyHelper >::execHost( buffers, array_view, false, mask ); } template< std::size_t dim > @@ -106,9 +262,7 @@ protected: { auto& dim_buffers = buffers.template getDimBuffers< dim >(); - constexpr std::size_t overlap = __ndarray_impl::get< dim >( typename DistributedNDArray::OverlapsType{} ); - // TODO -// constexpr std::size_t overlap = array_view.template getOverlap< dim >(); + constexpr std::size_t overlap = DistributedNDArrayView::LocalViewType::IndexerType::template getOverlap< dim >(); if( overlap == 0 ) { dim_buffers.reset(); return; @@ -163,10 +317,10 @@ protected: template< std::size_t dim > struct CopyHelper { - static void exec( Buffers& buffers, DistributedNDArrayView& array_view, bool to_buffer ) + static void exec( Buffers& buffers, DistributedNDArrayView& array_view, bool to_buffer, SyncDirection mask ) { // skip if there are no overlaps - const std::size_t overlap = __ndarray_impl::get< dim >( typename DistributedNDArray::OverlapsType{} ); + constexpr std::size_t overlap = DistributedNDArrayView::LocalViewType::IndexerType::template getOverlap< dim >(); if( overlap == 0 ) return; @@ -179,22 +333,30 @@ protected: copy_kernel.to_buffer = to_buffer; if( to_buffer ) { - copy_kernel.buffer_view.bind( dim_buffers.left_send_view ); - copy_kernel.array_offsets = dim_buffers.left_send_offsets; - dim_buffers.left_send_view.forAll( copy_kernel ); - - copy_kernel.buffer_view.bind( dim_buffers.right_send_view ); - copy_kernel.array_offsets = dim_buffers.right_send_offsets; - dim_buffers.right_send_view.forAll( copy_kernel ); + if( mask & SyncDirection::Left ) { + copy_kernel.buffer_view.bind( dim_buffers.left_send_view ); + copy_kernel.array_offsets = dim_buffers.left_send_offsets; + dim_buffers.left_send_view.forAll( copy_kernel ); + } + + if( mask & SyncDirection::Right ) { + copy_kernel.buffer_view.bind( dim_buffers.right_send_view ); + copy_kernel.array_offsets = dim_buffers.right_send_offsets; + dim_buffers.right_send_view.forAll( copy_kernel ); + } } else { - copy_kernel.buffer_view.bind( dim_buffers.left_recv_view ); - copy_kernel.array_offsets = dim_buffers.left_recv_offsets; - dim_buffers.left_recv_view.forAll( copy_kernel ); - - copy_kernel.buffer_view.bind( dim_buffers.right_recv_view ); - copy_kernel.array_offsets = dim_buffers.right_recv_offsets; - dim_buffers.right_recv_view.forAll( copy_kernel ); + if( mask & SyncDirection::Right ) { + copy_kernel.buffer_view.bind( dim_buffers.left_recv_view ); + copy_kernel.array_offsets = dim_buffers.left_recv_offsets; + dim_buffers.left_recv_view.forAll( copy_kernel ); + } + + if( mask & SyncDirection::Left ) { + copy_kernel.buffer_view.bind( dim_buffers.right_recv_view ); + copy_kernel.array_offsets = dim_buffers.right_recv_offsets; + dim_buffers.right_recv_view.forAll( copy_kernel ); + } } } else { @@ -212,41 +374,45 @@ protected: struct SendHelper { template< typename Requests, typename Group > - static void exec( Buffers& buffers, Requests& requests, Group group ) + static void exec( Buffers& buffers, Requests& requests, Group group, int tag_offset, SyncDirection mask ) { - const std::size_t overlap = __ndarray_impl::get< dim >( typename DistributedNDArray::OverlapsType{} ); + constexpr std::size_t overlap = DistributedNDArrayView::LocalViewType::IndexerType::template getOverlap< dim >(); if( overlap == 0 ) return; auto& dim_buffers = buffers.template getDimBuffers< dim >(); if( LBM_HACK == false ) { - requests.push_back( MPI::Isend( dim_buffers.left_send_view.getData(), - dim_buffers.left_send_view.getStorageSize(), - dim_buffers.left_neighbor, 0, group ) ); - requests.push_back( MPI::Irecv( dim_buffers.left_recv_view.getData(), - dim_buffers.left_recv_view.getStorageSize(), - dim_buffers.left_neighbor, 1, group ) ); - requests.push_back( MPI::Isend( dim_buffers.right_send_view.getData(), - dim_buffers.right_send_view.getStorageSize(), - dim_buffers.right_neighbor, 1, group ) ); - requests.push_back( MPI::Irecv( dim_buffers.right_recv_view.getData(), - dim_buffers.right_recv_view.getStorageSize(), - dim_buffers.right_neighbor, 0, group ) ); + if( mask & SyncDirection::Left ) { + requests.push_back( MPI::Isend( dim_buffers.left_send_view.getData(), + dim_buffers.left_send_view.getStorageSize(), + dim_buffers.left_neighbor, tag_offset + 0, group ) ); + requests.push_back( MPI::Irecv( dim_buffers.right_recv_view.getData(), + dim_buffers.right_recv_view.getStorageSize(), + dim_buffers.right_neighbor, tag_offset + 0, group ) ); + } + if( mask & SyncDirection::Right ) { + requests.push_back( MPI::Isend( dim_buffers.right_send_view.getData(), + dim_buffers.right_send_view.getStorageSize(), + dim_buffers.right_neighbor, tag_offset + 1, group ) ); + requests.push_back( MPI::Irecv( dim_buffers.left_recv_view.getData(), + dim_buffers.left_recv_view.getStorageSize(), + dim_buffers.left_neighbor, tag_offset + 1, group ) ); + } } else { requests.push_back( MPI::Isend( dim_buffers.left_send_view.getData() + 0, dim_buffers.left_send_view.getStorageSize() / 27 * 9, - dim_buffers.left_neighbor, 0, group ) ); + dim_buffers.left_neighbor, tag_offset + 0, group ) ); requests.push_back( MPI::Irecv( dim_buffers.left_recv_view.getData() + dim_buffers.left_recv_view.getStorageSize() / 27 * 18, dim_buffers.left_recv_view.getStorageSize() / 27 * 9, - dim_buffers.left_neighbor, 1, group ) ); + dim_buffers.left_neighbor, tag_offset + 1, group ) ); requests.push_back( MPI::Isend( dim_buffers.right_send_view.getData() + dim_buffers.left_recv_view.getStorageSize() / 27 * 18, dim_buffers.right_send_view.getStorageSize() / 27 * 9, - dim_buffers.right_neighbor, 1, group ) ); + dim_buffers.right_neighbor, tag_offset + 1, group ) ); requests.push_back( MPI::Irecv( dim_buffers.right_recv_view.getData() + 0, dim_buffers.right_recv_view.getStorageSize() / 27 * 9, - dim_buffers.right_neighbor, 0, group ) ); + dim_buffers.right_neighbor, tag_offset + 0, group ) ); } } }; diff --git a/src/TNL/Containers/DistributedNDArrayView.h b/src/TNL/Containers/DistributedNDArrayView.h index 4812bf5c006b24dc7ab201901338fcd8ae68337b..f3f672fa89afc5c9d4b856096299e0284eddd7e8 100644 --- a/src/TNL/Containers/DistributedNDArrayView.h +++ b/src/TNL/Containers/DistributedNDArrayView.h @@ -19,8 +19,7 @@ namespace TNL { namespace Containers { -template< typename NDArrayView, - typename Overlaps = __ndarray_impl::make_constant_index_sequence< NDArrayView::getDimension(), 0 > > +template< typename NDArrayView > class DistributedNDArrayView { public: @@ -31,16 +30,13 @@ public: using PermutationType = typename NDArrayView::PermutationType; using LocalBeginsType = __ndarray_impl::LocalBeginsHolder< typename NDArrayView::SizesHolderType >; using LocalRangeType = Subrange< IndexType >; - using OverlapsType = Overlaps; - using LocalIndexerType = NDArrayIndexer< SizesHolderType, PermutationType, typename NDArrayView::NDBaseType, typename NDArrayView::StridesHolderType, Overlaps >; + using OverlapsType = typename NDArrayView::OverlapsType; - using ViewType = DistributedNDArrayView< NDArrayView, Overlaps >; - using ConstViewType = DistributedNDArrayView< typename NDArrayView::ConstViewType, Overlaps >; + using ViewType = DistributedNDArrayView< NDArrayView >; + using ConstViewType = DistributedNDArrayView< typename NDArrayView::ConstViewType >; using LocalViewType = NDArrayView; using ConstLocalViewType = typename NDArrayView::ConstViewType; - static_assert( Overlaps::size() == NDArrayView::getDimension(), "invalid overlaps" ); - __cuda_callable__ DistributedNDArrayView() = default; @@ -92,10 +88,9 @@ public: // binds to the given raw pointer and changes the indexer __cuda_callable__ - void bind( ValueType* data, LocalIndexerType indexer ) + void bind( ValueType* data, typename LocalViewType::IndexerType indexer ) { localView.bind( data, indexer ); - localView.bind( data ); } // binds to the given raw pointer and preserves the current indexer @@ -167,11 +162,6 @@ public: return localView.getStorageSize(); } - LocalIndexerType getLocalIndexer() const - { - return LocalIndexerType( localEnds - localBegins, typename NDArrayView::StridesHolderType{} ); - } - LocalViewType getLocalView() { return localView; @@ -189,12 +179,12 @@ public: getStorageIndex( IndexTypes&&... indices ) const { static_assert( sizeof...( indices ) == SizesHolderType::getDimension(), "got wrong number of indices" ); - __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, OverlapsType{}, std::forward< IndexTypes >( indices )... ); auto getStorageIndex = [this]( auto&&... indices ) { return this->localView.getStorageIndex( std::forward< decltype(indices) >( indices )... ); }; - return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, getStorageIndex, std::forward< IndexTypes >( indices )... ); + return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType >( localBegins, getStorageIndex, std::forward< IndexTypes >( indices )... ); } __cuda_callable__ @@ -216,8 +206,8 @@ public: operator()( IndexTypes&&... indices ) { static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); - __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); - return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, localView, std::forward< IndexTypes >( indices )... ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, OverlapsType{}, std::forward< IndexTypes >( indices )... ); + return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType >( localBegins, localView, std::forward< IndexTypes >( indices )... ); } template< typename... IndexTypes > @@ -226,8 +216,8 @@ public: operator()( IndexTypes&&... indices ) const { static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); - __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); - return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, localView, std::forward< IndexTypes >( indices )... ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, OverlapsType{}, std::forward< IndexTypes >( indices )... ); + return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType >( localBegins, localView, std::forward< IndexTypes >( indices )... ); } // bracket operator for 1D arrays @@ -236,8 +226,8 @@ public: operator[]( IndexType index ) { static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); - __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexType >( index ) ); - return localView[ __ndarray_impl::get<0>( Overlaps{} ) + index - localBegins.template getSize< 0 >() ]; + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, OverlapsType{}, std::forward< IndexType >( index ) ); + return localView[ index - localBegins.template getSize< 0 >() ]; } __cuda_callable__ @@ -245,8 +235,8 @@ public: operator[]( IndexType index ) const { static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); - __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexType >( index ) ); - return localView[ __ndarray_impl::get<0>( Overlaps{} ) + index - localBegins.template getSize< 0 >() ]; + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, OverlapsType{}, std::forward< IndexType >( index ) ); + return localView[ index - localBegins.template getSize< 0 >() ]; } __cuda_callable__ @@ -299,14 +289,14 @@ public: using Begins = __ndarray_impl::LocalBeginsHolder< SizesHolderType, 1 >; // add dynamic sizes Begins begins; - __ndarray_impl::SetSizesAddHelper< 1, Begins, SizesHolderType, Overlaps >::add( begins, SizesHolderType{} ); + __ndarray_impl::SetSizesAddHelper< 1, Begins, SizesHolderType, OverlapsType >::add( begins, SizesHolderType{} ); __ndarray_impl::SetSizesMaxHelper< Begins, LocalBeginsType >::max( begins, localBegins ); // subtract static sizes using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolderType, 1 >::type; // subtract dynamic sizes Ends ends; - __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolderType, Overlaps >::subtract( ends, globalSizes ); + __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolderType, OverlapsType >::subtract( ends, globalSizes ); __ndarray_impl::SetSizesMinHelper< Ends, SizesHolderType >::min( ends, localEnds ); __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; @@ -330,14 +320,14 @@ public: using SkipBegins = __ndarray_impl::LocalBeginsHolder< SizesHolderType, 1 >; // add dynamic sizes SkipBegins skipBegins; - __ndarray_impl::SetSizesAddHelper< 1, SkipBegins, SizesHolderType, Overlaps >::add( skipBegins, SizesHolderType{} ); + __ndarray_impl::SetSizesAddHelper< 1, SkipBegins, SizesHolderType, OverlapsType >::add( skipBegins, SizesHolderType{} ); __ndarray_impl::SetSizesMaxHelper< SkipBegins, LocalBeginsType >::max( skipBegins, localBegins ); // subtract static sizes using SkipEnds = typename __ndarray_impl::SubtractedSizesHolder< SizesHolderType, 1 >::type; // subtract dynamic sizes SkipEnds skipEnds; - __ndarray_impl::SetSizesSubtractHelper< 1, SkipEnds, SizesHolderType, Overlaps >::subtract( skipEnds, globalSizes ); + __ndarray_impl::SetSizesSubtractHelper< 1, SkipEnds, SizesHolderType, OverlapsType >::subtract( skipEnds, globalSizes ); __ndarray_impl::SetSizesMinHelper< SkipEnds, SizesHolderType >::min( skipEnds, localEnds ); __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; @@ -359,11 +349,11 @@ public: { // add overlaps to dynamic sizes LocalBeginsType begins; - __ndarray_impl::SetSizesAddOverlapsHelper< LocalBeginsType, SizesHolderType, Overlaps >::add( begins, localBegins ); + __ndarray_impl::SetSizesAddOverlapsHelper< LocalBeginsType, SizesHolderType, OverlapsType >::add( begins, localBegins ); // subtract overlaps from dynamic sizes SizesHolderType ends; - __ndarray_impl::SetSizesSubtractOverlapsHelper< SizesHolderType, SizesHolderType, Overlaps >::subtract( ends, localEnds ); + __ndarray_impl::SetSizesSubtractOverlapsHelper< SizesHolderType, SizesHolderType, OverlapsType >::subtract( ends, localEnds ); __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; dispatch( begins, ends, f ); @@ -375,11 +365,11 @@ public: { // add overlaps to dynamic sizes LocalBeginsType skipBegins; - __ndarray_impl::SetSizesAddOverlapsHelper< LocalBeginsType, SizesHolderType, Overlaps >::add( skipBegins, localBegins ); + __ndarray_impl::SetSizesAddOverlapsHelper< LocalBeginsType, SizesHolderType, OverlapsType >::add( skipBegins, localBegins ); // subtract overlaps from dynamic sizes SizesHolderType skipEnds; - __ndarray_impl::SetSizesSubtractOverlapsHelper< SizesHolderType, SizesHolderType, Overlaps >::subtract( skipEnds, localEnds ); + __ndarray_impl::SetSizesSubtractOverlapsHelper< SizesHolderType, SizesHolderType, OverlapsType >::subtract( skipEnds, localEnds ); __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; dispatch( localBegins, skipBegins, skipEnds, localEnds, f ); @@ -391,11 +381,11 @@ public: { // subtract overlaps from dynamic sizes LocalBeginsType begins; - __ndarray_impl::SetSizesSubtractOverlapsHelper< LocalBeginsType, SizesHolderType, Overlaps >::subtract( begins, localBegins ); + __ndarray_impl::SetSizesSubtractOverlapsHelper< LocalBeginsType, SizesHolderType, OverlapsType >::subtract( begins, localBegins ); // add overlaps to dynamic sizes SizesHolderType ends; - __ndarray_impl::SetSizesAddOverlapsHelper< SizesHolderType, SizesHolderType, Overlaps >::add( ends, localEnds ); + __ndarray_impl::SetSizesAddOverlapsHelper< SizesHolderType, SizesHolderType, OverlapsType >::add( ends, localEnds ); __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; dispatch( begins, localBegins, localEnds, ends, f ); diff --git a/src/TNL/Containers/NDArray.h b/src/TNL/Containers/NDArray.h index f8ba157ba6ce1e8fc85c4b9a28526808e8bb2597..674e1c558cfc57e5f56cf5178512bff976ef4fdb 100644 --- a/src/TNL/Containers/NDArray.h +++ b/src/TNL/Containers/NDArray.h @@ -35,25 +35,22 @@ struct SliceInfo template< typename Array, - typename SizesHolder, - typename Permutation, - typename Base, + typename Indexer, typename Device = typename Array::DeviceType > class NDArrayStorage - : public NDArrayIndexer< SizesHolder, Permutation, Base > +: public Indexer { public: using StorageArray = Array; using ValueType = typename Array::ValueType; using DeviceType = Device; - using IndexType = typename Array::IndexType; - using SizesHolderType = SizesHolder; - using PermutationType = Permutation; - using IndexerType = NDArrayIndexer< SizesHolder, Permutation, Base >; - using ViewType = NDArrayView< ValueType, DeviceType, SizesHolder, Permutation, Base >; - using ConstViewType = NDArrayView< std::add_const_t< ValueType >, DeviceType, SizesHolder, Permutation, Base >; - - static_assert( Permutation::size() == SizesHolder::getDimension(), "invalid permutation" ); + using IndexType = typename Indexer::IndexType; + using SizesHolderType = typename Indexer::SizesHolderType; + using PermutationType = typename Indexer::PermutationType; + using OverlapsType = typename Indexer::OverlapsType; + using IndexerType = Indexer; + using ViewType = NDArrayView< ValueType, DeviceType, IndexerType >; + using ConstViewType = NDArrayView< std::add_const_t< ValueType >, DeviceType, IndexerType >; // all methods from NDArrayView @@ -114,6 +111,7 @@ public: using IndexerType::getSizes; using IndexerType::getSize; using IndexerType::getStride; + using IndexerType::getOverlap; using IndexerType::getStorageSize; using IndexerType::getStorageIndex; @@ -143,13 +141,13 @@ public: static_assert( 0 < sizeof...(Dimensions) && sizeof...(Dimensions) <= getDimension(), "got wrong number of dimensions" ); // FIXME: nvcc chokes on the variadic brace-initialization #ifndef __NVCC__ - static_assert( __ndarray_impl::all_elements_in_range( 0, Permutation::size(), {Dimensions...} ), + static_assert( __ndarray_impl::all_elements_in_range( 0, PermutationType::size(), {Dimensions...} ), "invalid dimensions" ); static_assert( __ndarray_impl::is_increasing_sequence( {Dimensions...} ), "specifying permuted dimensions is not supported" ); #endif - using Getter = __ndarray_impl::SubarrayGetter< Base, Permutation, Dimensions... >; + using Getter = __ndarray_impl::SubarrayGetter< typename Indexer::NDBaseType, PermutationType, Dimensions... >; using Subpermutation = typename Getter::Subpermutation; auto& begin = operator()( std::forward< IndexTypes >( indices )... ); auto subarray_sizes = Getter::filterSizes( getSizes(), std::forward< IndexTypes >( indices )... ); @@ -157,7 +155,9 @@ public: static_assert( Subpermutation::size() == sizeof...(Dimensions), "Bug - wrong subpermutation length." ); static_assert( decltype(subarray_sizes)::getDimension() == sizeof...(Dimensions), "Bug - wrong dimension of the new sizes." ); static_assert( decltype(strides)::getDimension() == sizeof...(Dimensions), "Bug - wrong dimension of the strides." ); - using SubarrayView = NDArrayView< ValueType, Device, decltype(subarray_sizes), Subpermutation, Base, decltype(strides) >; + // TODO: select overlaps for the subarray + using Subindexer = NDArrayIndexer< decltype(subarray_sizes), Subpermutation, typename Indexer::NDBaseType, decltype(strides) >; + using SubarrayView = NDArrayView< ValueType, Device, Subindexer >; return SubarrayView{ &begin, subarray_sizes, strides }; } @@ -167,7 +167,7 @@ public: operator()( IndexTypes&&... indices ) { static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); - __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexTypes >( indices )... ); + __ndarray_impl::assertIndicesInBounds( getSizes(), OverlapsType{}, std::forward< IndexTypes >( indices )... ); TNL_ASSERT_LT( getStorageIndex( std::forward< IndexTypes >( indices )... ), getStorageSize(), "storage index out of bounds - either input error or a bug in the indexer" ); return array[ getStorageIndex( std::forward< IndexTypes >( indices )... ) ]; @@ -179,7 +179,7 @@ public: operator()( IndexTypes&&... indices ) const { static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); - __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexTypes >( indices )... ); + __ndarray_impl::assertIndicesInBounds( getSizes(), OverlapsType{}, std::forward< IndexTypes >( indices )... ); TNL_ASSERT_LT( getStorageIndex( std::forward< IndexTypes >( indices )... ), getStorageSize(), "storage index out of bounds - either input error or a bug in the indexer" ); return array[ getStorageIndex( std::forward< IndexTypes >( indices )... ) ]; @@ -191,7 +191,7 @@ public: operator[]( IndexType index ) { static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); - __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexType >( index ) ); + __ndarray_impl::assertIndicesInBounds( getSizes(), OverlapsType{}, std::forward< IndexType >( index ) ); return array[ index ]; } @@ -200,7 +200,7 @@ public: operator[]( IndexType index ) const { static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); - __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexType >( index ) ); + __ndarray_impl::assertIndicesInBounds( getSizes(), OverlapsType{}, std::forward< IndexType >( index ) ); return array[ index ]; } @@ -218,10 +218,10 @@ public: __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 1 >; // subtract static sizes - using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type; + using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolderType, 1 >::type; // subtract dynamic sizes Ends ends; - __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolder >::subtract( ends, getSizes() ); + __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolderType >::subtract( ends, getSizes() ); dispatch( Begins{}, ends, f ); } @@ -239,10 +239,10 @@ public: using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >; using SkipBegins = ConstStaticSizesHolder< IndexType, getDimension(), 1 >; // subtract static sizes - using SkipEnds = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type; + using SkipEnds = typename __ndarray_impl::SubtractedSizesHolder< SizesHolderType, 1 >::type; // subtract dynamic sizes SkipEnds skipEnds; - __ndarray_impl::SetSizesSubtractHelper< 1, SkipEnds, SizesHolder >::subtract( skipEnds, getSizes() ); + __ndarray_impl::SetSizesSubtractHelper< 1, SkipEnds, SizesHolderType >::subtract( skipEnds, getSizes() ); __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; dispatch( Begins{}, SkipBegins{}, skipEnds, getSizes(), f ); @@ -283,7 +283,7 @@ public: void reset() { - getSizes() = SizesHolder{}; + getSizes() = SizesHolderType{}; TNL_ASSERT_EQ( getStorageSize(), 0, "Failed to reset the sizes." ); array.reset(); } @@ -294,7 +294,7 @@ public: getElement( IndexTypes&&... indices ) const { static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); - __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexTypes >( indices )... ); + __ndarray_impl::assertIndicesInBounds( getSizes(), OverlapsType{}, std::forward< IndexTypes >( indices )... ); TNL_ASSERT_LT( getStorageIndex( std::forward< IndexTypes >( indices )... ), getStorageSize(), "storage index out of bounds - either input error or a bug in the indexer" ); return array.getElement( getStorageIndex( std::forward< IndexTypes >( indices )... ) ); @@ -325,17 +325,22 @@ template< typename Value, typename Permutation = std::make_index_sequence< SizesHolder::getDimension() >, // identity by default typename Device = Devices::Host, typename Index = typename SizesHolder::IndexType, + typename Overlaps = __ndarray_impl::make_constant_index_sequence< SizesHolder::getDimension(), 0 >, typename Allocator = typename Allocators::Default< Device >::template Allocator< Value > > class NDArray : public NDArrayStorage< Array< Value, Device, Index, Allocator >, - SizesHolder, - Permutation, - __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > > + NDArrayIndexer< SizesHolder, + Permutation, + __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > >, + __ndarray_impl::DummyStrideBase< typename SizesHolder::IndexType, SizesHolder::getDimension() >, + Overlaps > > { using Base = NDArrayStorage< Array< Value, Device, Index, Allocator >, - SizesHolder, - Permutation, - __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > >; + NDArrayIndexer< SizesHolder, + Permutation, + __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > >, + __ndarray_impl::DummyStrideBase< typename SizesHolder::IndexType, SizesHolder::getDimension() >, + Overlaps > >; public: // inherit all constructors and assignment operators @@ -375,16 +380,16 @@ template< typename Value, typename Index = typename SizesHolder::IndexType > class StaticNDArray : public NDArrayStorage< StaticArray< __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get(), Value >, - SizesHolder, - Permutation, - __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > >, + NDArrayIndexer< SizesHolder, + Permutation, + __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > >, Devices::Sequential > { using Base = NDArrayStorage< StaticArray< __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get(), Value >, - SizesHolder, - Permutation, - __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > >, - Devices::Sequential >; + NDArrayIndexer< SizesHolder, + Permutation, + __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > >, + Devices::Sequential >; static_assert( __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get() > 0, "All dimensions of a static array must to be positive." ); @@ -399,17 +404,22 @@ template< typename Value, typename SliceInfo = SliceInfo<>, // no slicing by default typename Device = Devices::Host, typename Index = typename SizesHolder::IndexType, + typename Overlaps = __ndarray_impl::make_constant_index_sequence< SizesHolder::getDimension(), 0 >, typename Allocator = typename Allocators::Default< Device >::template Allocator< Value > > class SlicedNDArray : public NDArrayStorage< Array< Value, Device, Index, Allocator >, - SizesHolder, - Permutation, - __ndarray_impl::SlicedNDArrayBase< SliceInfo > > + NDArrayIndexer< SizesHolder, + Permutation, + __ndarray_impl::SlicedNDArrayBase< SliceInfo >, + __ndarray_impl::DummyStrideBase< typename SizesHolder::IndexType, SizesHolder::getDimension() >, + Overlaps > > { using Base = NDArrayStorage< Array< Value, Device, Index, Allocator >, - SizesHolder, - Permutation, - __ndarray_impl::SlicedNDArrayBase< SliceInfo > >; + NDArrayIndexer< SizesHolder, + Permutation, + __ndarray_impl::SlicedNDArrayBase< SliceInfo >, + __ndarray_impl::DummyStrideBase< typename SizesHolder::IndexType, SizesHolder::getDimension() >, + Overlaps > >; public: // inherit all constructors and assignment operators diff --git a/src/TNL/Containers/NDArrayView.h b/src/TNL/Containers/NDArrayView.h index 1d3663ecdf25837ac528ce2989749c04575f7d1a..57d166b2ef9e88f97102bdd3a15f740c7890f7ab 100644 --- a/src/TNL/Containers/NDArrayView.h +++ b/src/TNL/Containers/NDArrayView.h @@ -26,31 +26,28 @@ namespace Containers { template< typename Value, typename Device, - typename SizesHolder, - typename Permutation, - typename Base, - typename StridesHolder = __ndarray_impl::DummyStrideBase< typename SizesHolder::IndexType, SizesHolder::getDimension() > > + typename Indexer > class NDArrayView - : public NDArrayIndexer< SizesHolder, Permutation, Base, StridesHolder > +: public Indexer { public: using ValueType = Value; using DeviceType = Device; - using IndexType = typename SizesHolder::IndexType; - using SizesHolderType = SizesHolder; - using PermutationType = Permutation; - using IndexerType = NDArrayIndexer< SizesHolder, Permutation, Base, StridesHolder >; - using ViewType = NDArrayView< Value, Device, SizesHolder, Permutation, Base, StridesHolder >; - using ConstViewType = NDArrayView< std::add_const_t< Value >, Device, SizesHolder, Permutation, Base, StridesHolder >; - - static_assert( Permutation::size() == SizesHolder::getDimension(), "invalid permutation" ); + using IndexType = typename Indexer::IndexType; + using SizesHolderType = typename Indexer::SizesHolderType; + using StridesHolderType = typename Indexer::StridesHolderType; + using PermutationType = typename Indexer::PermutationType; + using OverlapsType = typename Indexer::OverlapsType; + using IndexerType = Indexer; + using ViewType = NDArrayView< ValueType, DeviceType, IndexerType >; + using ConstViewType = NDArrayView< std::add_const_t< ValueType >, DeviceType, IndexerType >; __cuda_callable__ NDArrayView() = default; // explicit initialization by raw data pointer and sizes and strides __cuda_callable__ - NDArrayView( Value* data, SizesHolder sizes, StridesHolder strides = StridesHolder{} ) + NDArrayView( Value* data, SizesHolderType sizes, StridesHolderType strides = StridesHolderType{} ) : IndexerType(sizes, strides), array(data) {} // explicit initialization by raw data pointer and indexer @@ -139,6 +136,7 @@ public: if( getSizes() != other.getSizes() ) return false; // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray + // TODO: overlaps should be skipped, otherwise it works only after synchronization return Algorithms::MemoryOperations< Device >::compare( array, other.array, getStorageSize() ); } @@ -169,6 +167,7 @@ public: using IndexerType::getSizes; using IndexerType::getSize; using IndexerType::getStride; + using IndexerType::getOverlap; using IndexerType::getStorageSize; using IndexerType::getStorageIndex; @@ -187,7 +186,7 @@ public: __cuda_callable__ ConstViewType getConstView() const { - return ConstViewType( array, getSizes(), static_cast< const StridesHolder& >( *this ) ); + return ConstViewType( array, getIndexer() ); } template< std::size_t... Dimensions, typename... IndexTypes > @@ -198,13 +197,13 @@ public: static_assert( 0 < sizeof...(Dimensions) && sizeof...(Dimensions) <= getDimension(), "got wrong number of dimensions" ); // FIXME: nvcc chokes on the variadic brace-initialization #ifndef __NVCC__ - static_assert( __ndarray_impl::all_elements_in_range( 0, Permutation::size(), {Dimensions...} ), + static_assert( __ndarray_impl::all_elements_in_range( 0, PermutationType::size(), {Dimensions...} ), "invalid dimensions" ); static_assert( __ndarray_impl::is_increasing_sequence( {Dimensions...} ), "specifying permuted dimensions is not supported" ); #endif - using Getter = __ndarray_impl::SubarrayGetter< Base, Permutation, Dimensions... >; + using Getter = __ndarray_impl::SubarrayGetter< typename Indexer::NDBaseType, PermutationType, Dimensions... >; using Subpermutation = typename Getter::Subpermutation; auto& begin = operator()( std::forward< IndexTypes >( indices )... ); auto subarray_sizes = Getter::filterSizes( getSizes(), std::forward< IndexTypes >( indices )... ); @@ -212,7 +211,9 @@ public: static_assert( Subpermutation::size() == sizeof...(Dimensions), "Bug - wrong subpermutation length." ); static_assert( decltype(subarray_sizes)::getDimension() == sizeof...(Dimensions), "Bug - wrong dimension of the new sizes." ); static_assert( decltype(strides)::getDimension() == sizeof...(Dimensions), "Bug - wrong dimension of the strides." ); - using SubarrayView = NDArrayView< ValueType, Device, decltype(subarray_sizes), Subpermutation, Base, decltype(strides) >; + // TODO: select overlaps for the subarray + using Subindexer = NDArrayIndexer< decltype(subarray_sizes), Subpermutation, typename Indexer::NDBaseType, decltype(strides) >; + using SubarrayView = NDArrayView< ValueType, Device, Subindexer >; return SubarrayView{ &begin, subarray_sizes, strides }; } @@ -222,7 +223,7 @@ public: operator()( IndexTypes&&... indices ) { static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); - __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexTypes >( indices )... ); + __ndarray_impl::assertIndicesInBounds( getSizes(), OverlapsType{}, std::forward< IndexTypes >( indices )... ); return array[ getStorageIndex( std::forward< IndexTypes >( indices )... ) ]; } @@ -232,7 +233,7 @@ public: operator()( IndexTypes&&... indices ) const { static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); - __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexTypes >( indices )... ); + __ndarray_impl::assertIndicesInBounds( getSizes(), OverlapsType{}, std::forward< IndexTypes >( indices )... ); return array[ getStorageIndex( std::forward< IndexTypes >( indices )... ) ]; } @@ -242,7 +243,7 @@ public: operator[]( IndexType&& index ) { static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); - __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexType >( index ) ); + __ndarray_impl::assertIndicesInBounds( getSizes(), OverlapsType{}, std::forward< IndexType >( index ) ); return array[ index ]; } @@ -251,7 +252,7 @@ public: operator[]( IndexType index ) const { static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); - __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexType >( index ) ); + __ndarray_impl::assertIndicesInBounds( getSizes(), OverlapsType{}, std::forward< IndexType >( index ) ); return array[ index ]; } @@ -269,10 +270,10 @@ public: __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 1 >; // subtract static sizes - using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type; + using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolderType, 1 >::type; // subtract dynamic sizes Ends ends; - __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolder >::subtract( ends, getSizes() ); + __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolderType >::subtract( ends, getSizes() ); dispatch( Begins{}, ends, f ); } @@ -290,10 +291,10 @@ public: using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >; using SkipBegins = ConstStaticSizesHolder< IndexType, getDimension(), 1 >; // subtract static sizes - using SkipEnds = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type; + using SkipEnds = typename __ndarray_impl::SubtractedSizesHolder< SizesHolderType, 1 >::type; // subtract dynamic sizes SkipEnds skipEnds; - __ndarray_impl::SetSizesSubtractHelper< 1, SkipEnds, SizesHolder >::subtract( skipEnds, getSizes() ); + __ndarray_impl::SetSizesSubtractHelper< 1, SkipEnds, SizesHolderType >::subtract( skipEnds, getSizes() ); __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; dispatch( Begins{}, SkipBegins{}, skipEnds, getSizes(), f ); diff --git a/src/TNL/Containers/ndarray/Indexing.h b/src/TNL/Containers/ndarray/Indexing.h index 7e9436cdf8b015cd1358d3b2206997cbe10d3718..2908e2a871efe7b7dac66adab0053b0174f4c96a 100644 --- a/src/TNL/Containers/ndarray/Indexing.h +++ b/src/TNL/Containers/ndarray/Indexing.h @@ -179,8 +179,10 @@ struct SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo, level, false static constexpr std::size_t idx = get< level >( Permutation{} ); static constexpr std::size_t overlap = __ndarray_impl::get< idx >( Overlaps{} ); const auto alpha = get_from_pack< idx >( std::forward< Indices >( indices )... ); + const auto size = Alignment::template getAlignedSize< idx >( sizes ) + 2 * overlap; const auto previous = SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo, level - 1 >::getIndex( sizes, strides, std::forward< Indices >( indices )... ); - return strides.template getStride< idx >( alpha ) * ( alpha + overlap + Alignment::template getAlignedSize< idx >( sizes ) * previous ); + + return strides.template getStride< idx >( alpha ) * ( alpha + overlap + size * previous ); } }; @@ -205,7 +207,7 @@ struct SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo, level, true > static constexpr std::size_t overlap = __ndarray_impl::get< idx >( Overlaps{} ); const auto alpha = get_from_pack< idx >( std::forward< Indices >( indices )... ); static constexpr std::size_t S = SliceInfo::getSliceSize( idx ); - // TODO: check the calculation with strides + // TODO: check the calculation with strides and overlaps return strides.template getStride< idx >( alpha ) * ( S * ((alpha + overlap) / S) * StorageSizeGetter< SizesHolder, Alignment, Overlaps, IndexTag< level - 1 > >::getPermuted( sizes, Permutation{} ) + (alpha + overlap) % S ) + diff --git a/src/TNL/Containers/ndarray/SizesHolderHelpers.h b/src/TNL/Containers/ndarray/SizesHolderHelpers.h index 7c200a2cd14dca5d0e3cc973508d48021d368557..4e5473c701389a25c1386778a8eab95dc5d77fe0 100644 --- a/src/TNL/Containers/ndarray/SizesHolderHelpers.h +++ b/src/TNL/Containers/ndarray/SizesHolderHelpers.h @@ -120,25 +120,28 @@ void setSizesHelper( SizesHolder& holder, // A variadic bounds-checker for indices -template< typename SizesHolder > +template< typename SizesHolder, typename Overlaps > __cuda_callable__ -void assertIndicesInBounds( const SizesHolder& ) +void assertIndicesInBounds( const SizesHolder&, const Overlaps& overlaps ) {} template< typename SizesHolder, + typename Overlaps, typename Index, typename... IndexTypes > __cuda_callable__ -void assertIndicesInBounds( const SizesHolder& sizes, Index&& i, IndexTypes&&... indices ) +void assertIndicesInBounds( const SizesHolder& sizes, const Overlaps& overlaps, Index&& i, IndexTypes&&... indices ) { #ifndef NDEBUG // sizes.template getSize<...>() cannot be inside the assert macro, but the variables // shouldn't be declared when compiling without assertions constexpr std::size_t level = SizesHolder::getDimension() - sizeof...(indices) - 1; const auto size = sizes.template getSize< level >(); - TNL_ASSERT_LT( (decltype(size)) i, size, "Input error - some index is out of bounds." ); + const decltype(size) overlap = get( overlaps ); + TNL_ASSERT_LE( - overlap, (decltype(size)) i, "Input error - some index is below the lower bound." ); + TNL_ASSERT_LT( (decltype(size)) i, size + overlap, "Input error - some index is above the upper bound." ); #endif - assertIndicesInBounds( sizes, std::forward< IndexTypes >( indices )... ); + assertIndicesInBounds( sizes, overlaps, std::forward< IndexTypes >( indices )... ); } diff --git a/src/TNL/MPI/ScopedInitializer.h b/src/TNL/MPI/ScopedInitializer.h index 82ba02bc5743611bfb4af7395142de730672d548..5f3e8f03f6dbf1d9bd4cecfb3be950e5f517c657 100644 --- a/src/TNL/MPI/ScopedInitializer.h +++ b/src/TNL/MPI/ScopedInitializer.h @@ -22,7 +22,7 @@ struct ScopedInitializer { ScopedInitializer( int& argc, char**& argv, int required_thread_level = MPI_THREAD_SINGLE ) { - Init( argc, argv ); + Init( argc, argv, required_thread_level ); } ~ScopedInitializer() diff --git a/src/TNL/MPI/Wrappers.h b/src/TNL/MPI/Wrappers.h index 8a455dcb75d4bba5d38b993cca932a6cb2c4ea2f..b2c97e0764f8597b5855fc38834dded2f7c52271 100644 --- a/src/TNL/MPI/Wrappers.h +++ b/src/TNL/MPI/Wrappers.h @@ -66,10 +66,10 @@ inline void Init( int& argc, char**& argv, int required_thread_level = MPI_THREA { #ifdef HAVE_MPI switch( required_thread_level ) { - case MPI_THREAD_SINGLE: - case MPI_THREAD_FUNNELED: - case MPI_THREAD_SERIALIZED: - case MPI_THREAD_MULTIPLE: + case MPI_THREAD_SINGLE: // application is single-threaded + case MPI_THREAD_FUNNELED: // application is multithreaded, but all MPI calls will be issued from the master thread only + case MPI_THREAD_SERIALIZED: // application is multithreaded and any thread may issue MPI calls, but different threads will never issue MPI calls at the same time + case MPI_THREAD_MULTIPLE: // application is multithreaded and any thread may issue MPI calls at any time break; default: std::cerr << "ERROR: invalid argument for the 'required' thread level support: " << required_thread_level << std::endl; diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h index 36c4ea5b7039974867f15697dc5c49f54bdcddd6..10422b0947e0f7dae6e144ab87ec8189796091d1 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h @@ -67,15 +67,17 @@ using DistributedNDArrayTypes = ::testing::Types< DistributedNDArray< NDArray< double, SizesHolder< int, 0 >, std::index_sequence< 0 >, - Devices::Host >, - std::index_sequence< 2 > > + Devices::Host, + int, + std::index_sequence< 2 > > > // overlaps #ifdef HAVE_CUDA , DistributedNDArray< NDArray< double, SizesHolder< int, 0 >, std::index_sequence< 0 >, - Devices::Cuda >, - std::index_sequence< 2 > > + Devices::Cuda, + int, + std::index_sequence< 2 > > > // overlaps #endif >; diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h index 0b6838639f0c689f132fb4077c275f5edb6e0d92..1801f64bab20137e559631f3aca9d31b7da32363 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h @@ -18,6 +18,8 @@ using namespace TNL; using namespace TNL::Containers; +static constexpr int Q = 9; + /* * Light check of DistributedNDArray. * @@ -65,17 +67,19 @@ protected: // types for which DistributedNDArrayOverlaps_semi1D_test is instantiated using DistributedNDArrayTypes = ::testing::Types< DistributedNDArray< NDArray< double, - SizesHolder< int, 9, 0, 0 >, // Q, X, Y + SizesHolder< int, Q, 0, 0 >, // Q, X, Y std::index_sequence< 0, 1, 2 >, // permutation - should not matter - Devices::Host >, - std::index_sequence< 0, 2, 0 > > + Devices::Host, + int, + std::index_sequence< 0, 2, 0 > > > // overlaps #ifdef HAVE_CUDA , DistributedNDArray< NDArray< double, - SizesHolder< int, 9, 0, 0 >, // Q, X, Y + SizesHolder< int, Q, 0, 0 >, // Q, X, Y std::index_sequence< 0, 1, 2 >, // permutation - should not matter - Devices::Cuda >, - std::index_sequence< 0, 2, 0 > > + Devices::Cuda, + int, + std::index_sequence< 0, 2, 0 > > > // overlaps #endif >; @@ -90,7 +94,7 @@ TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, checkSumOfLocalSizes ) EXPECT_EQ( sumOfLocalSizes, this->globalSize ); EXPECT_EQ( this->distributedNDArray.template getSize< 1 >(), this->globalSize ); - EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), 9 * (2 * this->overlaps + localSize) * (this->globalSize / 2) ); + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), Q * (2 * this->overlaps + localSize) * (this->globalSize / 2) ); } // separate function because nvcc does not allow __cuda_callable__ lambdas inside @@ -112,40 +116,40 @@ void test_helper_forLocalInternal( DistributedArray& a ) a.setValue( 0 ); a.forLocalInternal( setter ); - for( int q = 0; q < 9; q++ ) + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 0 ) - << "gi = " << gi; - for( int q = 0; q < 9; q++ ) + << "q = " << q << ", gi = " << gi << ", j = " << j; + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 1 ) - << "gi = " << gi; - for( int q = 0; q < 9; q++ ) + << "q = " << q << ", gi = " << gi << ", j = " << j; + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 0 ) - << "gi = " << gi; + << "q = " << q << ", gi = " << gi << ", j = " << j; a.setValue( 0 ); a_view.forLocalInternal( setter ); - for( int q = 0; q < 9; q++ ) + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 0 ) - << "gi = " << gi; - for( int q = 0; q < 9; q++ ) + << "q = " << q << ", gi = " << gi << ", j = " << j; + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 1 ) - << "gi = " << gi; - for( int q = 0; q < 9; q++ ) + << "q = " << q << ", gi = " << gi << ", j = " << j; + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 0 ) - << "gi = " << gi; + << "q = " << q << ", gi = " << gi << ", j = " << j; } TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, forLocalInternal ) @@ -172,40 +176,40 @@ void test_helper_forLocalBoundary( DistributedArray& a ) a.setValue( 0 ); a.forLocalBoundary( setter ); - for( int q = 0; q < 9; q++ ) + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 1 ) - << "gi = " << gi; - for( int q = 0; q < 9; q++ ) + << "q = " << q << ", gi = " << gi << ", j = " << j; + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 0 ) - << "gi = " << gi; - for( int q = 0; q < 9; q++ ) + << "q = " << q << ", gi = " << gi << ", j = " << j; + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 1 ) - << "gi = " << gi; + << "q = " << q << ", gi = " << gi << ", j = " << j; a.setValue( 0 ); a_view.forLocalBoundary( setter ); - for( int q = 0; q < 9; q++ ) + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 1 ) - << "gi = " << gi; - for( int q = 0; q < 9; q++ ) + << "q = " << q << ", gi = " << gi << ", j = " << j; + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 0 ) - << "gi = " << gi; - for( int q = 0; q < 9; q++ ) + << "q = " << q << ", gi = " << gi << ", j = " << j; + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 1 ) - << "gi = " << gi; + << "q = " << q << ", gi = " << gi << ", j = " << j; } TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, forLocalBoundary ) @@ -232,40 +236,40 @@ void test_helper_forOverlaps( DistributedArray& a ) a.setValue( 0 ); a.forOverlaps( setter ); - for( int q = 0; q < 9; q++ ) + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 1 ) - << "gi = " << gi; - for( int q = 0; q < 9; q++ ) + << "q = " << q << ", gi = " << gi << ", j = " << j; + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 0 ) - << "gi = " << gi; - for( int q = 0; q < 9; q++ ) + << "q = " << q << ", gi = " << gi << ", j = " << j; + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 1 ) - << "gi = " << gi; + << "q = " << q << ", gi = " << gi << ", j = " << j; a.setValue( 0 ); a_view.forOverlaps( setter ); - for( int q = 0; q < 9; q++ ) + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 1 ) - << "gi = " << gi; - for( int q = 0; q < 9; q++ ) + << "q = " << q << ", gi = " << gi << ", j = " << j; + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 0 ) - << "gi = " << gi; - for( int q = 0; q < 9; q++ ) + << "q = " << q << ", gi = " << gi << ", j = " << j; + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 1 ) - << "gi = " << gi; + << "q = " << q << ", gi = " << gi << ", j = " << j; } TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, forOverlaps ) @@ -286,7 +290,7 @@ void test_helper_synchronize( DistributedArray& a, const int rank, const int npr auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable { - a_view( i ) = i; + a_view( q, i, j ) = i; }; a.setValue( -1 ); @@ -294,48 +298,48 @@ void test_helper_synchronize( DistributedArray& a, const int rank, const int npr DistributedNDArraySynchronizer< DistributedArray > s1; s1.synchronize( a ); - for( int q = 0; q < 9; q++ ) + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), gi + ((rank == 0) ? 97 : 0) ) - << "gi = " << gi; - for( int q = 0; q < 9; q++ ) + << "q = " << q << ", gi = " << gi << ", j = " << j; + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), gi ) - << "gi = " << gi; - for( int q = 0; q < 9; q++ ) + << "q = " << q << ", gi = " << gi << ", j = " << j; + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), gi - ((rank == nproc-1) ? 97 : 0) ) - << "gi = " << gi; + << "q = " << q << ", gi = " << gi << ", j = " << j; a.setValue( -1 ); a_view.forAll( setter ); DistributedNDArraySynchronizer< decltype(a_view) > s2; s2.synchronize( a_view ); - for( int q = 0; q < 9; q++ ) + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), gi + ((rank == 0) ? 97 : 0) ) - << "gi = " << gi; - for( int q = 0; q < 9; q++ ) + << "q = " << q << ", gi = " << gi << ", j = " << j; + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), gi ) - << "gi = " << gi; - for( int q = 0; q < 9; q++ ) + << "q = " << q << ", gi = " << gi << ", j = " << j; + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), gi - ((rank == nproc-1) ? 97 : 0) ) - << "gi = " << gi; + << "q = " << q << ", gi = " << gi << ", j = " << j; } -//TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, synchronize ) -//{ -// test_helper_synchronize( this->distributedNDArray, this->rank, this->nproc ); -//} +TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, synchronize ) +{ + test_helper_synchronize( this->distributedNDArray, this->rank, this->nproc ); +} #endif // HAVE_GTEST diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h index e3cbb3223c9e411105a019dcefd4ba29c047c179..e6ad0df75bdbd939e36dd4a45441c09fdbaa0cef 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h @@ -17,6 +17,8 @@ using namespace TNL; using namespace TNL::Containers; +static constexpr int Q = 9; + /* * Light check of DistributedNDArray. * @@ -63,13 +65,13 @@ protected: // types for which DistributedNDArray_semi1D_test is instantiated using DistributedNDArrayTypes = ::testing::Types< DistributedNDArray< NDArray< double, - SizesHolder< int, 9, 0, 0 >, // Q, X, Y, Z + SizesHolder< int, Q, 0, 0 >, // Q, X, Y, Z std::index_sequence< 0, 1, 2 >, // permutation - should not matter Devices::Host > > #ifdef HAVE_CUDA , DistributedNDArray< NDArray< double, - SizesHolder< int, 9, 0, 0 >, // Q, X, Y, Z + SizesHolder< int, Q, 0, 0 >, // Q, X, Y, Z std::index_sequence< 0, 1, 2 >, // permutation - should not matter Devices::Cuda > > #endif @@ -92,17 +94,17 @@ TYPED_TEST( DistributedNDArray_semi1D_test, setLike ) using DistributedNDArrayType = typename TestFixture::DistributedNDArrayType; const auto localRange = this->distributedNDArray.template getLocalRange< 1 >(); - EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), 9 * (localRange.getEnd() - localRange.getBegin()) * (this->globalSize / 2) ); + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), Q * (localRange.getEnd() - localRange.getBegin()) * (this->globalSize / 2) ); DistributedNDArrayType copy; EXPECT_EQ( copy.getLocalStorageSize(), 0 ); copy.setLike( this->distributedNDArray ); - EXPECT_EQ( copy.getLocalStorageSize(), 9 * (localRange.getEnd() - localRange.getBegin()) * (this->globalSize / 2) ); + EXPECT_EQ( copy.getLocalStorageSize(), Q * (localRange.getEnd() - localRange.getBegin()) * (this->globalSize / 2) ); } TYPED_TEST( DistributedNDArray_semi1D_test, reset ) { const auto localRange = this->distributedNDArray.template getLocalRange< 1 >(); - EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), 9 * (localRange.getEnd() - localRange.getBegin()) * (this->globalSize / 2) ); + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), Q * (localRange.getEnd() - localRange.getBegin()) * (this->globalSize / 2) ); this->distributedNDArray.reset(); EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), 0 ); } @@ -117,7 +119,7 @@ TYPED_TEST( DistributedNDArray_semi1D_test, elementwiseAccess ) const auto localRange = this->distributedNDArray.template getLocalRange< 1 >(); // check initial value - for( int q = 0; q < 9; q++ ) + for( int q = 0; q < Q; q++ ) for( IndexType gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) for( int j = 0; j < this->distributedNDArray.template getSize< 2 >(); j++ ) { // EXPECT_EQ( localArrayView.getElement( i ), 0 ); @@ -126,14 +128,14 @@ TYPED_TEST( DistributedNDArray_semi1D_test, elementwiseAccess ) // use operator() if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value ) { - for( int q = 0; q < 9; q++ ) + for( int q = 0; q < Q; q++ ) for( IndexType gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) for( int j = 0; j < this->distributedNDArray.template getSize< 2 >(); j++ ) { this->distributedNDArray( q, gi, j ) = gi + 1; } // check set value - for( int q = 0; q < 9; q++ ) + for( int q = 0; q < Q; q++ ) for( IndexType gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) for( int j = 0; j < this->distributedNDArray.template getSize< 2 >(); j++ ) { EXPECT_EQ( this->distributedNDArray.getElement( q, gi, j ), gi + 1 ); @@ -174,7 +176,7 @@ void test_helper_comparisonOperators( DistributedArray& u, DistributedArray& v, w_view( q, gi, j ) = 2 * gi; }; Algorithms::ParallelFor3D< DeviceType >::exec( (IndexType) 0, localRange.getBegin(), (IndexType) 0, - 9, localRange.getEnd(), u.template getSize< 2 >(), + Q, localRange.getEnd(), u.template getSize< 2 >(), kernel ); } @@ -223,7 +225,7 @@ void test_helper_forAll( DistributedArray& a ) a.setValue( 0 ); a.forAll( setter ); - for( int q = 0; q < 9; q++ ) + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 1 ); @@ -231,7 +233,7 @@ void test_helper_forAll( DistributedArray& a ) a.setValue( 0 ); a_view.forAll( setter ); - for( int q = 0; q < 9; q++ ) + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 1 ); @@ -260,7 +262,7 @@ void test_helper_forInternal( DistributedArray& a ) a.setValue( 0 ); a.forInternal( setter ); - for( int q = 0; q < 9; q++ ) + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) { @@ -277,7 +279,7 @@ void test_helper_forInternal( DistributedArray& a ) a.setValue( 0 ); a_view.forInternal( setter ); - for( int q = 0; q < 9; q++ ) + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) { @@ -316,7 +318,7 @@ void test_helper_forLocalInternal( DistributedArray& a ) // equivalent to forAll because all overlaps are 0 a.forLocalInternal( setter ); - for( int q = 0; q < 9; q++ ) + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 1 ); @@ -325,7 +327,7 @@ void test_helper_forLocalInternal( DistributedArray& a ) // equivalent to forAll because all overlaps are 0 a_view.forLocalInternal( setter ); - for( int q = 0; q < 9; q++ ) + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 1 ); @@ -354,7 +356,7 @@ void test_helper_forBoundary( DistributedArray& a ) a.setValue( 0 ); a.forBoundary( setter ); - for( int q = 0; q < 9; q++ ) + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) { @@ -371,7 +373,7 @@ void test_helper_forBoundary( DistributedArray& a ) a.setValue( 0 ); a_view.forBoundary( setter ); - for( int q = 0; q < 9; q++ ) + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) { @@ -410,7 +412,7 @@ void test_helper_forLocalBoundary( DistributedArray& a ) // empty set because all overlaps are 0 a.forLocalBoundary( setter ); - for( int q = 0; q < 9; q++ ) + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 0 ); @@ -419,7 +421,7 @@ void test_helper_forLocalBoundary( DistributedArray& a ) // empty set because all overlaps are 0 a_view.forLocalBoundary( setter ); - for( int q = 0; q < 9; q++ ) + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 0 ); @@ -449,7 +451,7 @@ void test_helper_forOverlaps( DistributedArray& a ) // empty set because all overlaps are 0 a.forOverlaps( setter ); - for( int q = 0; q < 9; q++ ) + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 0 ); @@ -458,7 +460,7 @@ void test_helper_forOverlaps( DistributedArray& a ) // empty set because all overlaps are 0 a_view.forOverlaps( setter ); - for( int q = 0; q < 9; q++ ) + for( int q = 0; q < Q; q++ ) for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) for( int j = 0; j < a.template getSize< 2 >(); j++ ) EXPECT_EQ( a.getElement( q, gi, j ), 0 );