From f4d2f8c6ad963fe607dc840993d5a6a1ae53b00d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz> Date: Wed, 6 Feb 2019 15:13:50 +0100 Subject: [PATCH] NDArray: added forInternal method - fixed executors for operations: use inverse permutation when calling the wrapped lambda function - custom internal region can be specified with custom begins/ends multiindices --- src/TNL/Containers/NDArray.h | 24 +- src/TNL/Containers/NDArrayView.h | 24 +- src/TNL/Containers/ndarray/Indexing.h | 30 + src/TNL/Containers/ndarray/Meta.h | 45 +- src/TNL/Containers/ndarray/Operations.h | 232 ++++--- src/TNL/Containers/ndarray/SizesHolder.h | 63 ++ .../Containers/ndarray/NDArrayTest.cpp | 603 +++++++++++++++++- 7 files changed, 924 insertions(+), 97 deletions(-) diff --git a/src/TNL/Containers/NDArray.h b/src/TNL/Containers/NDArray.h index ed5111e6dc..e05db95cde 100644 --- a/src/TNL/Containers/NDArray.h +++ b/src/TNL/Containers/NDArray.h @@ -177,7 +177,29 @@ public: void forAll( Func f ) const { __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; - dispatch( sizes, f ); + using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >; + dispatch( Begins{}, sizes, f ); + } + + template< typename Device2 = DeviceType, typename Func > + void forInternal( Func f ) const + { + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 1 >; + // subtract static sizes + using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type; + // subtract dynamic sizes + Ends ends; + __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolder >::subtract( ends, sizes ); + dispatch( Begins{}, ends, f ); + } + + template< typename Device2 = DeviceType, typename Func, typename Begins, typename Ends > + void forInternal( Func f, const Begins& begins, const Ends& ends ) const + { + // TODO: assert "begins <= sizes", "ends <= sizes" + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, ends, f ); } diff --git a/src/TNL/Containers/NDArrayView.h b/src/TNL/Containers/NDArrayView.h index 5b5e056ff3..cafaabe9b2 100644 --- a/src/TNL/Containers/NDArrayView.h +++ b/src/TNL/Containers/NDArrayView.h @@ -231,7 +231,29 @@ public: void forAll( Func f ) const { __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; - dispatch( sizes, f ); + using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >; + dispatch( Begins{}, sizes, f ); + } + + template< typename Device2 = DeviceType, typename Func > + void forInternal( Func f ) const + { + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 1 >; + // subtract static sizes + using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type; + // subtract dynamic sizes + Ends ends; + __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolder >::subtract( ends, sizes ); + dispatch( Begins{}, ends, f ); + } + + template< typename Device2 = DeviceType, typename Func, typename Begins, typename Ends > + void forInternal( Func f, const Begins& begins, const Ends& ends ) const + { + // TODO: assert "begins <= sizes", "ends <= sizes" + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, ends, f ); } protected: diff --git a/src/TNL/Containers/ndarray/Indexing.h b/src/TNL/Containers/ndarray/Indexing.h index d156547e17..7031d88991 100644 --- a/src/TNL/Containers/ndarray/Indexing.h +++ b/src/TNL/Containers/ndarray/Indexing.h @@ -111,6 +111,36 @@ void setSizesHelper( SizesHolder& holder, } +// helper for the forInternal method +template< std::size_t ConstValue, + typename TargetHolder, + typename SourceHolder, + std::size_t level = TargetHolder::getDimension() - 1 > +struct SetSizesSubtractHelper +{ + static void subtract( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< level >() == 0 ) + target.template setSize< level >( source.template getSize< level >() - ConstValue ); + SetSizesSubtractHelper< ConstValue, TargetHolder, SourceHolder, level - 1 >::subtract( target, source ); + } +}; + +template< std::size_t ConstValue, + typename TargetHolder, + typename SourceHolder > +struct SetSizesSubtractHelper< ConstValue, TargetHolder, SourceHolder, 0 > +{ + static void subtract( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< 0 >() == 0 ) + target.template setSize< 0 >( source.template getSize< 0 >() - ConstValue ); + } +}; + + // A variadic bounds-checker for indices template< typename SizesHolder > __cuda_callable__ diff --git a/src/TNL/Containers/ndarray/Meta.h b/src/TNL/Containers/ndarray/Meta.h index 125c9e105a..3ad0372f70 100644 --- a/src/TNL/Containers/ndarray/Meta.h +++ b/src/TNL/Containers/ndarray/Meta.h @@ -93,13 +93,15 @@ is_in_sequence( Index value, std::integer_sequence< Index, vals... > ) // Get index of the first occurrence of value in a variadic pack. template< typename V > -constexpr std::size_t index_in_pack( V&& value ) +constexpr std::size_t +index_in_pack( V&& value ) { return 0; } template< typename V, typename T, typename... Ts > -constexpr std::size_t index_in_pack( V&& value, T&& arg, Ts&&... args ) +constexpr std::size_t +index_in_pack( V&& value, T&& arg, Ts&&... args ) { if( value == arg ) return 0; @@ -196,6 +198,45 @@ auto call_with_permuted_arguments( Func f, Args&&... args ) -> decltype(auto) } +template< typename Permutation, + typename Sequence > +struct CallInversePermutationHelper +{}; + +template< typename Permutation, + std::size_t... N > +struct CallInversePermutationHelper< Permutation, std::index_sequence< N... > > +{ + template< typename Func, + typename... Args > + __cuda_callable__ + static auto apply( Func&& f, Args&&... args ) -> decltype(auto) + { + return std::forward< Func >( f )( get_from_pack< + index_in_sequence( N, Permutation{} ) + >( std::forward< Args >( args )... )... ); + } +}; + +// Call specified function with permuted arguments. +// [used in ndarray_operations.h] +template< typename Permutation, + typename Func, + typename... Args > +__cuda_callable__ +// FIXME: does not compile with nvcc 10.0 +//auto call_with_unpermuted_arguments( Func&& f, Args&&... args ) -> decltype(auto) +//{ +// return CallInversePermutationHelper< Permutation, std::make_index_sequence< sizeof...( Args ) > > +// ::apply( std::forward< Func >( f ), std::forward< Args >( args )... ); +//} +auto call_with_unpermuted_arguments( Func f, Args&&... args ) -> decltype(auto) +{ + return CallInversePermutationHelper< Permutation, std::make_index_sequence< sizeof...( Args ) > > + ::apply( f, std::forward< Args >( args )... ); +} + + // Check that all elements of the initializer list are equal to the specified value. // [used in ndarray_operations.h] constexpr bool diff --git a/src/TNL/Containers/ndarray/Operations.h b/src/TNL/Containers/ndarray/Operations.h index 705687c1d6..b1f793405d 100644 --- a/src/TNL/Containers/ndarray/Operations.h +++ b/src/TNL/Containers/ndarray/Operations.h @@ -15,6 +15,7 @@ #include <TNL/ParallelFor.h> #include <TNL/Containers/ndarray/Meta.h> +#include <TNL/Containers/ndarray/SizesHolder.h> namespace TNL { namespace Containers { @@ -25,34 +26,45 @@ template< typename Permutation, typename LevelTag = IndexTag< 0 > > struct SequentialExecutor { - template< typename SizesHolder, + template< typename Begins, + typename Ends, typename Func, typename... Indices > __cuda_callable__ - void operator()( const SizesHolder& sizes, Func f, Indices&&... indices ) + void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) { + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + SequentialExecutor< Permutation, IndexTag< LevelTag::value + 1 > > exec; - const auto size = sizes.template getSize< get< LevelTag::value >( Permutation{} ) >(); - for( typename SizesHolder::IndexType i = 0; i < size; i++ ) - exec( sizes, f, std::forward< Indices >( indices )..., i ); + const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); + const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); + for( auto i = begin; i < end; i++ ) + exec( begins, ends, f, std::forward< Indices >( indices )..., i ); } }; template< typename Permutation > struct SequentialExecutor< Permutation, IndexTag< Permutation::size() - 1 > > { - template< typename SizesHolder, + template< typename Begins, + typename Ends, typename Func, typename... Indices > __cuda_callable__ - void operator()( const SizesHolder& sizes, Func f, Indices&&... indices ) + void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) { - static_assert( sizeof...(indices) == SizesHolder::getDimension() - 1, + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + static_assert( sizeof...(indices) == Begins::getDimension() - 1, "invalid number of indices in the final step of the SequentialExecutor" ); - const auto size = sizes.template getSize< get< SizesHolder::getDimension() - 1 >( Permutation{} ) >(); - for( typename SizesHolder::IndexType i = 0; i < size; i++ ) - call_with_permuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i ); + using LevelTag = IndexTag< Permutation::size() - 1 >; + + const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); + const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); + for( auto i = begin; i < end; i++ ) + call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i ); } }; @@ -61,34 +73,43 @@ template< typename Permutation, typename LevelTag = IndexTag< Permutation::size() - 1 > > struct SequentialExecutorRTL { - template< typename SizesHolder, + template< typename Begins, + typename Ends, typename Func, typename... Indices > __cuda_callable__ - void operator()( const SizesHolder& sizes, Func f, Indices&&... indices ) + void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) { + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + SequentialExecutorRTL< Permutation, IndexTag< LevelTag::value - 1 > > exec; - const auto size = sizes.template getSize< get< LevelTag::value >( Permutation{} ) >(); - for( typename SizesHolder::IndexType i = 0; i < size; i++ ) - exec( sizes, f, i, std::forward< Indices >( indices )... ); + const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); + const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); + for( auto i = begin; i < end; i++ ) + exec( begins, ends, f, i, std::forward< Indices >( indices )... ); } }; template< typename Permutation > struct SequentialExecutorRTL< Permutation, IndexTag< 0 > > { - template< typename SizesHolder, + template< typename Begins, + typename Ends, typename Func, typename... Indices > __cuda_callable__ - void operator()( const SizesHolder& sizes, Func f, Indices&&... indices ) + void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) { - static_assert( sizeof...(indices) == SizesHolder::getDimension() - 1, - "invalid number of indices in the final step of the SequentialExecutor" ); - - const auto size = sizes.template getSize< get< 0 >( Permutation{} ) >(); - for( typename SizesHolder::IndexType i = 0; i < size; i++ ) - call_with_permuted_arguments< Permutation >( f, i, std::forward< Indices >( indices )... ); + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + static_assert( sizeof...(indices) == Begins::getDimension() - 1, + "invalid number of indices in the final step of the SequentialExecutorRTL" ); + + const auto begin = begins.template getSize< get< 0 >( Permutation{} ) >(); + const auto end = ends.template getSize< get< 0 >( Permutation{} ) >(); + for( auto i = begin; i < end; i++ ) + call_with_unpermuted_arguments< Permutation >( f, i, std::forward< Indices >( indices )... ); } }; @@ -97,42 +118,58 @@ template< typename Permutation, typename Device > struct ParallelExecutorDeviceDispatch { - template< typename SizesHolder, typename Func > - void operator()( const SizesHolder& sizes, Func f ) + template< typename Begins, + typename Ends, + typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) { - using Index = typename SizesHolder::IndexType; + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + + using Index = typename Ends::IndexType; auto kernel = [=] ( Index i2, Index i1, Index i0 ) { SequentialExecutor< Permutation, IndexTag< 3 > > exec; - exec( sizes, f, i0, i1, i2 ); + exec( begins, ends, f, i0, i1, i2 ); }; - const Index size0 = sizes.template getSize< get< 0 >( Permutation{} ) >(); - const Index size1 = sizes.template getSize< get< 1 >( Permutation{} ) >(); - const Index size2 = sizes.template getSize< get< 2 >( Permutation{} ) >(); - ParallelFor3D< Device >::exec( (Index) 0, (Index) 0, (Index) 0, size2, size1, size0, kernel ); + const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); + const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); + const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >(); + const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); + const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); + const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >(); + ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); } }; template< typename Permutation > struct ParallelExecutorDeviceDispatch< Permutation, Devices::Cuda > { - template< typename SizesHolder, typename Func > - void operator()( const SizesHolder& sizes, Func f ) + template< typename Begins, + typename Ends, + typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) { - using Index = typename SizesHolder::IndexType; + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + + using Index = typename Ends::IndexType; auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) { - SequentialExecutorRTL< Permutation, IndexTag< SizesHolder::getDimension() - 4 > > exec; - exec( sizes, f, i0, i1, i2 ); + SequentialExecutorRTL< Permutation, IndexTag< Begins::getDimension() - 4 > > exec; + exec( begins, ends, f, i0, i1, i2 ); }; - const Index size0 = sizes.template getSize< get< SizesHolder::getDimension() - 3 >( Permutation{} ) >(); - const Index size1 = sizes.template getSize< get< SizesHolder::getDimension() - 2 >( Permutation{} ) >(); - const Index size2 = sizes.template getSize< get< SizesHolder::getDimension() - 1 >( Permutation{} ) >(); - ParallelFor3D< Devices::Cuda >::exec( (Index) 0, (Index) 0, (Index) 0, size2, size1, size0, kernel ); + const Index begin0 = begins.template getSize< get< Begins::getDimension() - 3 >( Permutation{} ) >(); + const Index begin1 = begins.template getSize< get< Begins::getDimension() - 2 >( Permutation{} ) >(); + const Index begin2 = begins.template getSize< get< Begins::getDimension() - 1 >( Permutation{} ) >(); + const Index end0 = ends.template getSize< get< Ends::getDimension() - 3 >( Permutation{} ) >(); + const Index end1 = ends.template getSize< get< Ends::getDimension() - 2 >( Permutation{} ) >(); + const Index end2 = ends.template getSize< get< Ends::getDimension() - 1 >( Permutation{} ) >(); + ParallelFor3D< Devices::Cuda >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); } }; @@ -141,11 +178,13 @@ template< typename Permutation, typename DimTag = IndexTag< Permutation::size() > > struct ParallelExecutor { - template< typename SizesHolder, typename Func > - void operator()( const SizesHolder& sizes, Func f ) + template< typename Begins, + typename Ends, + typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) { ParallelExecutorDeviceDispatch< Permutation, Device > dispatch; - dispatch( sizes, f ); + dispatch( begins, ends, f ); } }; @@ -153,20 +192,28 @@ template< typename Permutation, typename Device > struct ParallelExecutor< Permutation, Device, IndexTag< 3 > > { - template< typename SizesHolder, typename Func > - void operator()( const SizesHolder& sizes, Func f ) + template< typename Begins, + typename Ends, + typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) { - using Index = typename SizesHolder::IndexType; + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + + using Index = typename Ends::IndexType; auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) { - call_with_permuted_arguments< Permutation >( f, i0, i1, i2 ); + call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); }; - const Index size0 = sizes.template getSize< get< 0 >( Permutation{} ) >(); - const Index size1 = sizes.template getSize< get< 1 >( Permutation{} ) >(); - const Index size2 = sizes.template getSize< get< 2 >( Permutation{} ) >(); - ParallelFor3D< Device >::exec( (Index) 0, (Index) 0, (Index) 0, size2, size1, size0, kernel ); + const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); + const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); + const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >(); + const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); + const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); + const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >(); + ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); } }; @@ -174,19 +221,26 @@ template< typename Permutation, typename Device > struct ParallelExecutor< Permutation, Device, IndexTag< 2 > > { - template< typename SizesHolder, typename Func > - void operator()( const SizesHolder& sizes, Func f ) + template< typename Begins, + typename Ends, + typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) { - using Index = typename SizesHolder::IndexType; + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + + using Index = typename Ends::IndexType; auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 ) { - call_with_permuted_arguments< Permutation >( f, i0, i1 ); + call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); }; - const Index size0 = sizes.template getSize< get< 0 >( Permutation{} ) >(); - const Index size1 = sizes.template getSize< get< 1 >( Permutation{} ) >(); - ParallelFor2D< Device >::exec( (Index) 0, (Index) 0, size1, size0, kernel ); + const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); + const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); + const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); + const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); + ParallelFor2D< Device >::exec( begin1, begin0, end1, end0, kernel ); } }; @@ -194,18 +248,25 @@ template< typename Permutation, typename Device > struct ParallelExecutor< Permutation, Device, IndexTag< 1 > > { - template< typename SizesHolder, typename Func > - void operator()( const SizesHolder& sizes, Func f ) + template< typename Begins, + typename Ends, + typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) { - using Index = typename SizesHolder::IndexType; + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); - auto kernel = [=] __cuda_callable__ ( Index i ) - { - call_with_permuted_arguments< Permutation >( f, i ); - }; + using Index = typename Ends::IndexType; + +// auto kernel = [=] __cuda_callable__ ( Index i ) +// { +// call_with_unpermuted_arguments< Permutation >( f, i ); +// }; - const Index size = sizes.template getSize< get< 0 >( Permutation{} ) >(); - ParallelFor< Device >::exec( (Index) 0, size, kernel ); + const Index begin = begins.template getSize< get< 0 >( Permutation{} ) >(); + const Index end = ends.template getSize< get< 0 >( Permutation{} ) >(); +// ParallelFor< Device >::exec( begin, end, kernel ); + ParallelFor< Device >::exec( begin, end, f ); } }; @@ -215,33 +276,33 @@ template< typename Permutation, typename Device > struct ExecutorDispatcher { - template< typename SizesHolder, typename Func > - void operator()( const SizesHolder& sizes, Func f ) + template< typename Begins, typename Ends, typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) { - SequentialExecutor< Permutation >()( sizes, f ); + SequentialExecutor< Permutation >()( begins, ends, f ); } }; template< typename Permutation > struct ExecutorDispatcher< Permutation, Devices::Host > { - template< typename SizesHolder, typename Func > - void operator()( const SizesHolder& sizes, Func f ) + template< typename Begins, typename Ends, typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) { if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) - ParallelExecutor< Permutation, Devices::Host >()( sizes, f ); + ParallelExecutor< Permutation, Devices::Host >()( begins, ends, f ); else - SequentialExecutor< Permutation >()( sizes, f ); + SequentialExecutor< Permutation >()( begins, ends, f ); } }; template< typename Permutation > struct ExecutorDispatcher< Permutation, Devices::Cuda > { - template< typename SizesHolder, typename Func > - void operator()( const SizesHolder& sizes, Func f ) + template< typename Begins, typename Ends, typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) { - ParallelExecutor< Permutation, Devices::Cuda >()( sizes, f ); + ParallelExecutor< Permutation, Devices::Cuda >()( begins, ends, f ); } }; @@ -263,7 +324,8 @@ void nd_map_view( Output output, Func f, const Input... input ) }; ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch; - dispatch( output.getSizes(), wrapper ); + using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >; + dispatch( Begins{}, output.getSizes(), wrapper ); } #else @@ -362,7 +424,8 @@ void nd_map_view( Output output, Func f ) { nvcc_map_helper_0< Output, Func > wrapper( output, f ); ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch; - dispatch( output.getSizes(), wrapper ); + using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >; + dispatch( Begins{}, output.getSizes(), wrapper ); } template< typename Output, @@ -375,7 +438,8 @@ void nd_map_view( Output output, Func f, const Input1 input1 ) nvcc_map_helper_1< Output, Func, Input1 > wrapper( output, f, input1 ); ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch; - dispatch( output.getSizes(), wrapper ); + using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >; + dispatch( Begins{}, output.getSizes(), wrapper ); } template< typename Output, @@ -389,7 +453,8 @@ void nd_map_view( Output output, Func f, const Input1 input1, const Input2 input nvcc_map_helper_2< Output, Func, Input1, Input2 > wrapper( output, f, input1, input2 ); ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch; - dispatch( output.getSizes(), wrapper ); + using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >; + dispatch( Begins{}, output.getSizes(), wrapper ); } template< typename Output, @@ -404,7 +469,8 @@ void nd_map_view( Output output, Func f, const Input1 input1, const Input2 input nvcc_map_helper_3< Output, Func, Input1, Input2, Input3 > wrapper( output, f, input1, input2, input3 ); ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch; - dispatch( output.getSizes(), wrapper ); + using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >; + dispatch( Begins{}, output.getSizes(), wrapper ); } #endif diff --git a/src/TNL/Containers/ndarray/SizesHolder.h b/src/TNL/Containers/ndarray/SizesHolder.h index 5b6e52f5fa..306569de7b 100644 --- a/src/TNL/Containers/ndarray/SizesHolder.h +++ b/src/TNL/Containers/ndarray/SizesHolder.h @@ -201,6 +201,49 @@ public: }; +template< typename Index, + std::size_t dimension, + Index constSize > +class ConstStaticSizesHolder +{ +public: + using IndexType = Index; + + static constexpr std::size_t getDimension() + { + return dimension; + } + + template< std::size_t level > + static constexpr std::size_t getStaticSize() + { + static_assert( level < getDimension(), "Invalid level passed to getStaticSize()." ); + return constSize; + } + + template< std::size_t level > + __cuda_callable__ + Index getSize() const + { + static_assert( level < getDimension(), "Invalid dimension passed to getSize()." ); + return constSize; + } + + // methods for convenience + __cuda_callable__ + bool operator==( const ConstStaticSizesHolder& other ) const + { + return true; + } + + __cuda_callable__ + bool operator!=( const ConstStaticSizesHolder& other ) const + { + return false; + } +}; + + template< typename Index, std::size_t... sizes > std::ostream& operator<<( std::ostream& str, const SizesHolder< Index, sizes... >& holder ) @@ -213,5 +256,25 @@ std::ostream& operator<<( std::ostream& str, const SizesHolder< Index, sizes... return str; } + +// helper for the forInternal method +namespace __ndarray_impl { + +template< typename SizesHolder, + std::size_t ConstValue > +struct SubtractedSizesHolder +{}; + +template< typename Index, + std::size_t ConstValue, + std::size_t... sizes > +struct SubtractedSizesHolder< SizesHolder< Index, sizes... >, ConstValue > +{ +// using type = SizesHolder< Index, std::max( (std::size_t) 0, sizes - ConstValue )... >; + using type = SizesHolder< Index, ( (sizes >= ConstValue) ? sizes - ConstValue : 0 )... >; +}; + +} // namespace __ndarray_impl + } // namespace Containers } // namespace TNL diff --git a/src/UnitTests/Containers/ndarray/NDArrayTest.cpp b/src/UnitTests/Containers/ndarray/NDArrayTest.cpp index 385ff93da2..74221bdcbc 100644 --- a/src/UnitTests/Containers/ndarray/NDArrayTest.cpp +++ b/src/UnitTests/Containers/ndarray/NDArrayTest.cpp @@ -180,9 +180,119 @@ TEST( NDArrayTest, SizesHolderPrinter ) EXPECT_EQ( str.str(), "SizesHolder< 0, 1, 2 >( 3, 1, 2 )" ); } -TEST( NDArrayTest, forAll_dynamic ) +TEST( NDArrayTest, forAll_dynamic_1D ) { - int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2; + int I = 2; + NDArray< int, + SizesHolder< int, 0 >, + index_sequence< 0 > > a; + a.setSizes( I ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + EXPECT_EQ( a( i ), 1 ); +} + +TEST( NDArrayTest, forAll_dynamic_2D ) +{ + int I = 2, J = 3; + NDArray< int, + SizesHolder< int, 0, 0 >, + index_sequence< 1, 0 > > a; + a.setSizes( I, J ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j ) + { + a( i, j ) += 1; + }; + + a.forAll( setter ); + + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + EXPECT_EQ( a( i, j ), 1 ); +} + +TEST( NDArrayTest, forAll_dynamic_3D ) +{ + int I = 2, J = 3, K = 4; + NDArray< int, + SizesHolder< int, 0, 0, 0 >, + index_sequence< 2, 0, 1 > > a; + a.setSizes( I, J, K ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forAll( setter ); + + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + EXPECT_EQ( a( i, j, k ), 1 ); +} + +TEST( NDArrayTest, forAll_dynamic_4D ) +{ + int I = 2, J = 3, K = 4, L = 5; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0 >, + index_sequence< 3, 2, 0, 1 > > a; + a.setSizes( I, J, K, L ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l ) + { + a( i, j, k, l ) += 1; + }; + + a.forAll( setter ); + + for( int l = 0; l < L; l++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + EXPECT_EQ( a( i, j, k, l ), 1 ); +} + +TEST( NDArrayTest, forAll_dynamic_5D ) +{ + int I = 2, J = 3, K = 4, L = 5, M = 6; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0 >, + index_sequence< 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m ) + { + a( i, j, k, l, m ) += 1; + }; + + a.forAll( setter ); + + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + EXPECT_EQ( a( i, j, k, l, m ), 1 ); +} + +TEST( NDArrayTest, forAll_dynamic_6D ) +{ + int I = 2, J = 3, K = 4, L = 5, M = 6, N = 7; NDArray< int, SizesHolder< int, 0, 0, 0, 0, 0, 0 >, index_sequence< 5, 3, 4, 2, 0, 1 > > a; @@ -191,7 +301,7 @@ TEST( NDArrayTest, forAll_dynamic ) auto setter = [&] ( int i, int j, int k, int l, int m, int n ) { - a( i, j, k, l, m, n ) = 1; + a( i, j, k, l, m, n ) += 1; }; a.forAll( setter ); @@ -205,19 +315,34 @@ TEST( NDArrayTest, forAll_dynamic ) EXPECT_EQ( a( i, j, k, l, m, n ), 1 ); } -TEST( NDArrayTest, forAll_static ) +TEST( NDArrayTest, forAll_static_1D ) { - constexpr int I = 3, J = 4; - NDArray< int, SizesHolder< int, I, J > > a; - a.setSizes( 0, 0 ); + constexpr int I = 3; + StaticNDArray< int, SizesHolder< int, I > > a; +// a.setSizes( 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forAll( setter ); for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - a( i, j ) = 0; + EXPECT_EQ( a( i ), 1 ); +} + +TEST( NDArrayTest, forAll_static_2D ) +{ + constexpr int I = 3, J = 4; + StaticNDArray< int, SizesHolder< int, I, J > > a; +// a.setSizes( 0, 0 ); + a.setValue( 0 ); auto setter = [&] ( int i, int j ) { - a( i, j ) = 1; + a( i, j ) += 1; }; a.forAll( setter ); @@ -227,6 +352,464 @@ TEST( NDArrayTest, forAll_static ) EXPECT_EQ( a( i, j ), 1 ); } +TEST( NDArrayTest, forAll_static_3D ) +{ + constexpr int I = 3, J = 4, K = 5; + StaticNDArray< int, SizesHolder< int, I, J, K > > a; +// a.setSizes( 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + for( int k = 0; k < K; k++ ) + EXPECT_EQ( a( i, j, k ), 1 ); +} + +TEST( NDArrayTest, forAll_static_4D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6; + StaticNDArray< int, SizesHolder< int, I, J, K, L > > a; +// a.setSizes( 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l ) + { + a( i, j, k, l ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + for( int k = 0; k < K; k++ ) + for( int l = 0; l < L; l++ ) + EXPECT_EQ( a( i, j, k, l ), 1 ); +} + +TEST( NDArrayTest, forAll_static_5D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 7; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M > > a; +// a.setSizes( 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m ) + { + a( i, j, k, l, m ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + for( int k = 0; k < K; k++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + EXPECT_EQ( a( i, j, k, l, m ), 1 ); +} + +TEST( NDArrayTest, forAll_static_6D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M, N > > a; +// a.setSizes( 0, 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m, int n ) + { + a( i, j, k, l, m, n ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + for( int k = 0; k < K; k++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int n = 0; n < N; n++ ) + EXPECT_EQ( a( i, j, k, l, m, n ), 1 ); +} + +TEST( NDArrayTest, forInternal_dynamic_1D ) +{ + int I = 3; + NDArray< int, + SizesHolder< int, 0 >, + index_sequence< 0 > > a; + a.setSizes( I ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forInternal( setter ); + + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 ) + EXPECT_EQ( a( i ), 0 ) + << "i = " << i; + else + EXPECT_EQ( a( i ), 1 ) + << "i = " << i; + } +} + +TEST( NDArrayTest, forInternal_dynamic_2D ) +{ + int I = 3, J = 4; + NDArray< int, + SizesHolder< int, 0, 0 >, + index_sequence< 1, 0 > > a; + a.setSizes( I, J ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j ) + { + a( i, j ) += 1; + }; + + a.forInternal( setter ); + + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 ) + EXPECT_EQ( a( i, j ), 0 ) + << "i = " << i << ", j = " << j; + else + EXPECT_EQ( a( i, j ), 1 ) + << "i = " << i << ", j = " << j; + } +} + +TEST( NDArrayTest, forInternal_dynamic_3D ) +{ + int I = 3, J = 4, K = 5; + NDArray< int, + SizesHolder< int, 0, 0, 0 >, + index_sequence< 2, 0, 1 > > a; + a.setSizes( I, J, K ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forInternal( setter ); + + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 ) + EXPECT_EQ( a( i, j, k ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k; + else + EXPECT_EQ( a( i, j, k ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k; + } +} + +TEST( NDArrayTest, forInternal_dynamic_4D ) +{ + int I = 3, J = 4, K = 5, L = 6; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0 >, + index_sequence< 3, 2, 0, 1 > > a; + a.setSizes( I, J, K, L ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l ) + { + a( i, j, k, l ) += 1; + }; + + a.forInternal( setter ); + + for( int l = 0; l < L; l++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 ) + EXPECT_EQ( a( i, j, k, l ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + else + EXPECT_EQ( a( i, j, k, l ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + } +} + +TEST( NDArrayTest, forInternal_dynamic_5D ) +{ + int I = 3, J = 4, K = 5, L = 6, M = 7; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0 >, + index_sequence< 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m ) + { + a( i, j, k, l, m ) += 1; + }; + + a.forInternal( setter ); + + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 ) + EXPECT_EQ( a( i, j, k, l, m ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + else + EXPECT_EQ( a( i, j, k, l, m ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + } +} + +TEST( NDArrayTest, forInternal_dynamic_6D ) +{ + int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m, int n ) + { + a( i, j, k, l, m, n ) += 1; + }; + + a.forInternal( setter ); + + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 || + n == 0 || n == N - 1 ) + EXPECT_EQ( a( i, j, k, l, m, n ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + else + EXPECT_EQ( a( i, j, k, l, m, n ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + } +} + +TEST( NDArrayTest, forInternal_static_1D ) +{ + constexpr int I = 3; + StaticNDArray< int, SizesHolder< int, I > > a; +// a.setSizes( 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forInternal( setter ); + + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 ) + EXPECT_EQ( a( i ), 0 ) + << "i = " << i; + else + EXPECT_EQ( a( i ), 1 ) + << "i = " << i; + } +} + +TEST( NDArrayTest, forInternal_static_2D ) +{ + constexpr int I = 3, J = 4; + StaticNDArray< int, SizesHolder< int, I, J > > a; +// a.setSizes( 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j ) + { + a( i, j ) += 1; + }; + + a.forInternal( setter ); + + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 ) + EXPECT_EQ( a( i, j ), 0 ) + << "i = " << i << ", j = " << j; + else + EXPECT_EQ( a( i, j ), 1 ) + << "i = " << i << ", j = " << j; + } +} + +TEST( NDArrayTest, forInternal_static_3D ) +{ + constexpr int I = 3, J = 4, K = 5; + StaticNDArray< int, SizesHolder< int, I, J, K > > a; +// a.setSizes( 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forInternal( setter ); + + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 ) + EXPECT_EQ( a( i, j, k ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k; + else + EXPECT_EQ( a( i, j, k ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k; + } +} + +TEST( NDArrayTest, forInternal_static_4D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6; + StaticNDArray< int, SizesHolder< int, I, J, K, L > > a; +// a.setSizes( 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l ) + { + a( i, j, k, l ) += 1; + }; + + a.forInternal( setter ); + + for( int l = 0; l < L; l++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 ) + EXPECT_EQ( a( i, j, k, l ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + else + EXPECT_EQ( a( i, j, k, l ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + } +} + +TEST( NDArrayTest, forInternal_static_5D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 7; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M > > a; +// a.setSizes( 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m ) + { + a( i, j, k, l, m ) += 1; + }; + + a.forInternal( setter ); + + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 ) + EXPECT_EQ( a( i, j, k, l, m ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + else + EXPECT_EQ( a( i, j, k, l, m ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + } +} + +TEST( NDArrayTest, forInternal_static_6D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M, N > > a; +// a.setSizes( 0, 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m, int n ) + { + a( i, j, k, l, m, n ) += 1; + }; + + a.forInternal( setter ); + + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 || + n == 0 || n == N - 1 ) + EXPECT_EQ( a( i, j, k, l, m, n ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + else + EXPECT_EQ( a( i, j, k, l, m, n ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + } +} + //#include "GtestMissingError.h" int main( int argc, char* argv[] ) { -- GitLab