From f4d2f8c6ad963fe607dc840993d5a6a1ae53b00d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Wed, 6 Feb 2019 15:13:50 +0100
Subject: [PATCH] NDArray: added forInternal method

- fixed executors for operations: use inverse permutation when calling
  the wrapped lambda function
- custom internal region can be specified with custom begins/ends
  multiindices
---
 src/TNL/Containers/NDArray.h                  |  24 +-
 src/TNL/Containers/NDArrayView.h              |  24 +-
 src/TNL/Containers/ndarray/Indexing.h         |  30 +
 src/TNL/Containers/ndarray/Meta.h             |  45 +-
 src/TNL/Containers/ndarray/Operations.h       | 232 ++++---
 src/TNL/Containers/ndarray/SizesHolder.h      |  63 ++
 .../Containers/ndarray/NDArrayTest.cpp        | 603 +++++++++++++++++-
 7 files changed, 924 insertions(+), 97 deletions(-)

diff --git a/src/TNL/Containers/NDArray.h b/src/TNL/Containers/NDArray.h
index ed5111e6dc..e05db95cde 100644
--- a/src/TNL/Containers/NDArray.h
+++ b/src/TNL/Containers/NDArray.h
@@ -177,7 +177,29 @@ public:
    void forAll( Func f ) const
    {
       __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch;
-      dispatch( sizes, f );
+      using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >;
+      dispatch( Begins{}, sizes, f );
+   }
+
+   template< typename Device2 = DeviceType, typename Func >
+   void forInternal( Func f ) const
+   {
+      __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch;
+      using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 1 >;
+      // subtract static sizes
+      using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type;
+      // subtract dynamic sizes
+      Ends ends;
+      __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolder >::subtract( ends, sizes );
+      dispatch( Begins{}, ends, f );
+   }
+
+   template< typename Device2 = DeviceType, typename Func, typename Begins, typename Ends >
+   void forInternal( Func f, const Begins& begins, const Ends& ends ) const
+   {
+      // TODO: assert "begins <= sizes", "ends <= sizes"
+      __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch;
+      dispatch( begins, ends, f );
    }
 
 
diff --git a/src/TNL/Containers/NDArrayView.h b/src/TNL/Containers/NDArrayView.h
index 5b5e056ff3..cafaabe9b2 100644
--- a/src/TNL/Containers/NDArrayView.h
+++ b/src/TNL/Containers/NDArrayView.h
@@ -231,7 +231,29 @@ public:
    void forAll( Func f ) const
    {
       __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch;
-      dispatch( sizes, f );
+      using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >;
+      dispatch( Begins{}, sizes, f );
+   }
+
+   template< typename Device2 = DeviceType, typename Func >
+   void forInternal( Func f ) const
+   {
+      __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch;
+      using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 1 >;
+      // subtract static sizes
+      using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type;
+      // subtract dynamic sizes
+      Ends ends;
+      __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolder >::subtract( ends, sizes );
+      dispatch( Begins{}, ends, f );
+   }
+
+   template< typename Device2 = DeviceType, typename Func, typename Begins, typename Ends >
+   void forInternal( Func f, const Begins& begins, const Ends& ends ) const
+   {
+      // TODO: assert "begins <= sizes", "ends <= sizes"
+      __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch;
+      dispatch( begins, ends, f );
    }
 
 protected:
diff --git a/src/TNL/Containers/ndarray/Indexing.h b/src/TNL/Containers/ndarray/Indexing.h
index d156547e17..7031d88991 100644
--- a/src/TNL/Containers/ndarray/Indexing.h
+++ b/src/TNL/Containers/ndarray/Indexing.h
@@ -111,6 +111,36 @@ void setSizesHelper( SizesHolder& holder,
 }
 
 
+// helper for the forInternal method
+template< std::size_t ConstValue,
+          typename TargetHolder,
+          typename SourceHolder,
+          std::size_t level = TargetHolder::getDimension() - 1 >
+struct SetSizesSubtractHelper
+{
+   static void subtract( TargetHolder& target,
+                         const SourceHolder& source )
+   {
+      if( source.template getStaticSize< level >() == 0 )
+         target.template setSize< level >( source.template getSize< level >() - ConstValue );
+      SetSizesSubtractHelper< ConstValue, TargetHolder, SourceHolder, level - 1 >::subtract( target, source );
+   }
+};
+
+template< std::size_t ConstValue,
+          typename TargetHolder,
+          typename SourceHolder >
+struct SetSizesSubtractHelper< ConstValue, TargetHolder, SourceHolder, 0 >
+{
+   static void subtract( TargetHolder& target,
+                         const SourceHolder& source )
+   {
+      if( source.template getStaticSize< 0 >() == 0 )
+         target.template setSize< 0 >( source.template getSize< 0 >() - ConstValue );
+   }
+};
+
+
 // A variadic bounds-checker for indices
 template< typename SizesHolder >
 __cuda_callable__
diff --git a/src/TNL/Containers/ndarray/Meta.h b/src/TNL/Containers/ndarray/Meta.h
index 125c9e105a..3ad0372f70 100644
--- a/src/TNL/Containers/ndarray/Meta.h
+++ b/src/TNL/Containers/ndarray/Meta.h
@@ -93,13 +93,15 @@ is_in_sequence( Index value, std::integer_sequence< Index, vals... > )
 
 // Get index of the first occurrence of value in a variadic pack.
 template< typename V >
-constexpr std::size_t index_in_pack( V&& value )
+constexpr std::size_t
+index_in_pack( V&& value )
 {
    return 0;
 }
 
 template< typename V, typename T, typename... Ts >
-constexpr std::size_t index_in_pack( V&& value, T&& arg, Ts&&... args )
+constexpr std::size_t
+index_in_pack( V&& value, T&& arg, Ts&&... args )
 {
    if( value == arg )
       return 0;
@@ -196,6 +198,45 @@ auto call_with_permuted_arguments( Func f, Args&&... args ) -> decltype(auto)
 }
 
 
+template< typename Permutation,
+          typename Sequence >
+struct CallInversePermutationHelper
+{};
+
+template< typename Permutation,
+          std::size_t... N >
+struct CallInversePermutationHelper< Permutation, std::index_sequence< N... > >
+{
+   template< typename Func,
+             typename... Args >
+   __cuda_callable__
+   static auto apply( Func&& f, Args&&... args ) -> decltype(auto)
+   {
+      return std::forward< Func >( f )( get_from_pack<
+                  index_in_sequence( N, Permutation{} )
+                >( std::forward< Args >( args )... )... );
+   }
+};
+
+// Call specified function with permuted arguments.
+// [used in ndarray_operations.h]
+template< typename Permutation,
+          typename Func,
+          typename... Args >
+__cuda_callable__
+// FIXME: does not compile with nvcc 10.0
+//auto call_with_unpermuted_arguments( Func&& f, Args&&... args ) -> decltype(auto)
+//{
+//   return CallInversePermutationHelper< Permutation, std::make_index_sequence< sizeof...( Args ) > >
+//          ::apply( std::forward< Func >( f ), std::forward< Args >( args )... );
+//}
+auto call_with_unpermuted_arguments( Func f, Args&&... args ) -> decltype(auto)
+{
+   return CallInversePermutationHelper< Permutation, std::make_index_sequence< sizeof...( Args ) > >
+          ::apply( f, std::forward< Args >( args )... );
+}
+
+
 // Check that all elements of the initializer list are equal to the specified value.
 // [used in ndarray_operations.h]
 constexpr bool
diff --git a/src/TNL/Containers/ndarray/Operations.h b/src/TNL/Containers/ndarray/Operations.h
index 705687c1d6..b1f793405d 100644
--- a/src/TNL/Containers/ndarray/Operations.h
+++ b/src/TNL/Containers/ndarray/Operations.h
@@ -15,6 +15,7 @@
 #include <TNL/ParallelFor.h>
 
 #include <TNL/Containers/ndarray/Meta.h>
+#include <TNL/Containers/ndarray/SizesHolder.h>
 
 namespace TNL {
 namespace Containers {
@@ -25,34 +26,45 @@ template< typename Permutation,
           typename LevelTag = IndexTag< 0 > >
 struct SequentialExecutor
 {
-   template< typename SizesHolder,
+   template< typename Begins,
+             typename Ends,
              typename Func,
              typename... Indices >
    __cuda_callable__
-   void operator()( const SizesHolder& sizes, Func f, Indices&&... indices )
+   void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices )
    {
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
+
       SequentialExecutor< Permutation, IndexTag< LevelTag::value + 1 > > exec;
-      const auto size = sizes.template getSize< get< LevelTag::value >( Permutation{} ) >();
-      for( typename SizesHolder::IndexType i = 0; i < size; i++ )
-         exec( sizes, f, std::forward< Indices >( indices )..., i );
+      const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >();
+      const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >();
+      for( auto i = begin; i < end; i++ )
+         exec( begins, ends, f, std::forward< Indices >( indices )..., i );
    }
 };
 
 template< typename Permutation >
 struct SequentialExecutor< Permutation, IndexTag< Permutation::size() - 1 > >
 {
-   template< typename SizesHolder,
+   template< typename Begins,
+             typename Ends,
              typename Func,
              typename... Indices >
    __cuda_callable__
-   void operator()( const SizesHolder& sizes, Func f, Indices&&... indices )
+   void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices )
    {
-      static_assert( sizeof...(indices) == SizesHolder::getDimension() - 1,
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
+      static_assert( sizeof...(indices) == Begins::getDimension() - 1,
                      "invalid number of indices in the final step of the SequentialExecutor" );
 
-      const auto size = sizes.template getSize< get< SizesHolder::getDimension() - 1 >( Permutation{} ) >();
-      for( typename SizesHolder::IndexType i = 0; i < size; i++ )
-         call_with_permuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i );
+      using LevelTag = IndexTag< Permutation::size() - 1 >;
+
+      const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >();
+      const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >();
+      for( auto i = begin; i < end; i++ )
+         call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i );
    }
 };
 
@@ -61,34 +73,43 @@ template< typename Permutation,
           typename LevelTag = IndexTag< Permutation::size() - 1 > >
 struct SequentialExecutorRTL
 {
-   template< typename SizesHolder,
+   template< typename Begins,
+             typename Ends,
              typename Func,
              typename... Indices >
    __cuda_callable__
-   void operator()( const SizesHolder& sizes, Func f, Indices&&... indices )
+   void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices )
    {
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
+
       SequentialExecutorRTL< Permutation, IndexTag< LevelTag::value - 1 > > exec;
-      const auto size = sizes.template getSize< get< LevelTag::value >( Permutation{} ) >();
-      for( typename SizesHolder::IndexType i = 0; i < size; i++ )
-         exec( sizes, f, i, std::forward< Indices >( indices )... );
+      const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >();
+      const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >();
+      for( auto i = begin; i < end; i++ )
+         exec( begins, ends, f, i, std::forward< Indices >( indices )... );
    }
 };
 
 template< typename Permutation >
 struct SequentialExecutorRTL< Permutation, IndexTag< 0 > >
 {
-   template< typename SizesHolder,
+   template< typename Begins,
+             typename Ends,
              typename Func,
              typename... Indices >
    __cuda_callable__
-   void operator()( const SizesHolder& sizes, Func f, Indices&&... indices )
+   void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices )
    {
-      static_assert( sizeof...(indices) == SizesHolder::getDimension() - 1,
-                     "invalid number of indices in the final step of the SequentialExecutor" );
-
-      const auto size = sizes.template getSize< get< 0 >( Permutation{} ) >();
-      for( typename SizesHolder::IndexType i = 0; i < size; i++ )
-         call_with_permuted_arguments< Permutation >( f, i, std::forward< Indices >( indices )... );
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
+      static_assert( sizeof...(indices) == Begins::getDimension() - 1,
+                     "invalid number of indices in the final step of the SequentialExecutorRTL" );
+
+      const auto begin = begins.template getSize< get< 0 >( Permutation{} ) >();
+      const auto end = ends.template getSize< get< 0 >( Permutation{} ) >();
+      for( auto i = begin; i < end; i++ )
+         call_with_unpermuted_arguments< Permutation >( f, i, std::forward< Indices >( indices )... );
    }
 };
 
@@ -97,42 +118,58 @@ template< typename Permutation,
           typename Device >
 struct ParallelExecutorDeviceDispatch
 {
-   template< typename SizesHolder, typename Func >
-   void operator()( const SizesHolder& sizes, Func f )
+   template< typename Begins,
+             typename Ends,
+             typename Func >
+   void operator()( const Begins& begins, const Ends& ends, Func f )
    {
-      using Index = typename SizesHolder::IndexType;
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
+
+      using Index = typename Ends::IndexType;
 
       auto kernel = [=] ( Index i2, Index i1, Index i0 )
       {
          SequentialExecutor< Permutation, IndexTag< 3 > > exec;
-         exec( sizes, f, i0, i1, i2 );
+         exec( begins, ends, f, i0, i1, i2 );
       };
 
-      const Index size0 = sizes.template getSize< get< 0 >( Permutation{} ) >();
-      const Index size1 = sizes.template getSize< get< 1 >( Permutation{} ) >();
-      const Index size2 = sizes.template getSize< get< 2 >( Permutation{} ) >();
-      ParallelFor3D< Device >::exec( (Index) 0, (Index) 0, (Index) 0, size2, size1, size0, kernel );
+      const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
+      const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
+      const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >();
+      const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
+      const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >();
+      const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >();
+      ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel );
    }
 };
 
 template< typename Permutation >
 struct ParallelExecutorDeviceDispatch< Permutation, Devices::Cuda >
 {
-   template< typename SizesHolder, typename Func >
-   void operator()( const SizesHolder& sizes, Func f )
+   template< typename Begins,
+             typename Ends,
+             typename Func >
+   void operator()( const Begins& begins, const Ends& ends, Func f )
    {
-      using Index = typename SizesHolder::IndexType;
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
+
+      using Index = typename Ends::IndexType;
 
       auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 )
       {
-         SequentialExecutorRTL< Permutation, IndexTag< SizesHolder::getDimension() - 4 > > exec;
-         exec( sizes, f, i0, i1, i2 );
+         SequentialExecutorRTL< Permutation, IndexTag< Begins::getDimension() - 4 > > exec;
+         exec( begins, ends, f, i0, i1, i2 );
       };
 
-      const Index size0 = sizes.template getSize< get< SizesHolder::getDimension() - 3 >( Permutation{} ) >();
-      const Index size1 = sizes.template getSize< get< SizesHolder::getDimension() - 2 >( Permutation{} ) >();
-      const Index size2 = sizes.template getSize< get< SizesHolder::getDimension() - 1 >( Permutation{} ) >();
-      ParallelFor3D< Devices::Cuda >::exec( (Index) 0, (Index) 0, (Index) 0, size2, size1, size0, kernel );
+      const Index begin0 = begins.template getSize< get< Begins::getDimension() - 3 >( Permutation{} ) >();
+      const Index begin1 = begins.template getSize< get< Begins::getDimension() - 2 >( Permutation{} ) >();
+      const Index begin2 = begins.template getSize< get< Begins::getDimension() - 1 >( Permutation{} ) >();
+      const Index end0 = ends.template getSize< get< Ends::getDimension() - 3 >( Permutation{} ) >();
+      const Index end1 = ends.template getSize< get< Ends::getDimension() - 2 >( Permutation{} ) >();
+      const Index end2 = ends.template getSize< get< Ends::getDimension() - 1 >( Permutation{} ) >();
+      ParallelFor3D< Devices::Cuda >::exec( begin2, begin1, begin0, end2, end1, end0, kernel );
    }
 };
 
@@ -141,11 +178,13 @@ template< typename Permutation,
           typename DimTag = IndexTag< Permutation::size() > >
 struct ParallelExecutor
 {
-   template< typename SizesHolder, typename Func >
-   void operator()( const SizesHolder& sizes, Func f )
+   template< typename Begins,
+             typename Ends,
+             typename Func >
+   void operator()( const Begins& begins, const Ends& ends, Func f )
    {
       ParallelExecutorDeviceDispatch< Permutation, Device > dispatch;
-      dispatch( sizes, f );
+      dispatch( begins, ends, f );
    }
 };
 
@@ -153,20 +192,28 @@ template< typename Permutation,
           typename Device >
 struct ParallelExecutor< Permutation, Device, IndexTag< 3 > >
 {
-   template< typename SizesHolder, typename Func >
-   void operator()( const SizesHolder& sizes, Func f )
+   template< typename Begins,
+             typename Ends,
+             typename Func >
+   void operator()( const Begins& begins, const Ends& ends, Func f )
    {
-      using Index = typename SizesHolder::IndexType;
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
+
+      using Index = typename Ends::IndexType;
 
       auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 )
       {
-         call_with_permuted_arguments< Permutation >( f, i0, i1, i2 );
+         call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
       };
 
-      const Index size0 = sizes.template getSize< get< 0 >( Permutation{} ) >();
-      const Index size1 = sizes.template getSize< get< 1 >( Permutation{} ) >();
-      const Index size2 = sizes.template getSize< get< 2 >( Permutation{} ) >();
-      ParallelFor3D< Device >::exec( (Index) 0, (Index) 0, (Index) 0, size2, size1, size0, kernel );
+      const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
+      const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
+      const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >();
+      const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
+      const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >();
+      const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >();
+      ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel );
    }
 };
 
@@ -174,19 +221,26 @@ template< typename Permutation,
           typename Device >
 struct ParallelExecutor< Permutation, Device, IndexTag< 2 > >
 {
-   template< typename SizesHolder, typename Func >
-   void operator()( const SizesHolder& sizes, Func f )
+   template< typename Begins,
+             typename Ends,
+             typename Func >
+   void operator()( const Begins& begins, const Ends& ends, Func f )
    {
-      using Index = typename SizesHolder::IndexType;
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
+
+      using Index = typename Ends::IndexType;
 
       auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 )
       {
-         call_with_permuted_arguments< Permutation >( f, i0, i1 );
+         call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
       };
 
-      const Index size0 = sizes.template getSize< get< 0 >( Permutation{} ) >();
-      const Index size1 = sizes.template getSize< get< 1 >( Permutation{} ) >();
-      ParallelFor2D< Device >::exec( (Index) 0, (Index) 0, size1, size0, kernel );
+      const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
+      const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
+      const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
+      const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >();
+      ParallelFor2D< Device >::exec( begin1, begin0, end1, end0, kernel );
    }
 };
 
@@ -194,18 +248,25 @@ template< typename Permutation,
           typename Device >
 struct ParallelExecutor< Permutation, Device, IndexTag< 1 > >
 {
-   template< typename SizesHolder, typename Func >
-   void operator()( const SizesHolder& sizes, Func f )
+   template< typename Begins,
+             typename Ends,
+             typename Func >
+   void operator()( const Begins& begins, const Ends& ends, Func f )
    {
-      using Index = typename SizesHolder::IndexType;
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
 
-      auto kernel = [=] __cuda_callable__ ( Index i )
-      {
-         call_with_permuted_arguments< Permutation >( f, i );
-      };
+      using Index = typename Ends::IndexType;
+
+//      auto kernel = [=] __cuda_callable__ ( Index i )
+//      {
+//         call_with_unpermuted_arguments< Permutation >( f, i );
+//      };
 
-      const Index size = sizes.template getSize< get< 0 >( Permutation{} ) >();
-      ParallelFor< Device >::exec( (Index) 0, size, kernel );
+      const Index begin = begins.template getSize< get< 0 >( Permutation{} ) >();
+      const Index end = ends.template getSize< get< 0 >( Permutation{} ) >();
+//      ParallelFor< Device >::exec( begin, end, kernel );
+      ParallelFor< Device >::exec( begin, end, f );
    }
 };
 
@@ -215,33 +276,33 @@ template< typename Permutation,
           typename Device >
 struct ExecutorDispatcher
 {
-   template< typename SizesHolder, typename Func >
-   void operator()( const SizesHolder& sizes, Func f )
+   template< typename Begins, typename Ends, typename Func >
+   void operator()( const Begins& begins, const Ends& ends, Func f )
    {
-      SequentialExecutor< Permutation >()( sizes, f );
+      SequentialExecutor< Permutation >()( begins, ends, f );
    }
 };
 
 template< typename Permutation >
 struct ExecutorDispatcher< Permutation, Devices::Host >
 {
-   template< typename SizesHolder, typename Func >
-   void operator()( const SizesHolder& sizes, Func f )
+   template< typename Begins, typename Ends, typename Func >
+   void operator()( const Begins& begins, const Ends& ends, Func f )
    {
       if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 )
-         ParallelExecutor< Permutation, Devices::Host >()( sizes, f );
+         ParallelExecutor< Permutation, Devices::Host >()( begins, ends, f );
       else
-         SequentialExecutor< Permutation >()( sizes, f );
+         SequentialExecutor< Permutation >()( begins, ends, f );
    }
 };
 
 template< typename Permutation >
 struct ExecutorDispatcher< Permutation, Devices::Cuda >
 {
-   template< typename SizesHolder, typename Func >
-   void operator()( const SizesHolder& sizes, Func f )
+   template< typename Begins, typename Ends, typename Func >
+   void operator()( const Begins& begins, const Ends& ends, Func f )
    {
-      ParallelExecutor< Permutation, Devices::Cuda >()( sizes, f );
+      ParallelExecutor< Permutation, Devices::Cuda >()( begins, ends, f );
    }
 };
 
@@ -263,7 +324,8 @@ void nd_map_view( Output output, Func f, const Input... input )
    };
 
    ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch;
-   dispatch( output.getSizes(), wrapper );
+   using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >;
+   dispatch( Begins{}, output.getSizes(), wrapper );
 }
 
 #else
@@ -362,7 +424,8 @@ void nd_map_view( Output output, Func f )
 {
    nvcc_map_helper_0< Output, Func > wrapper( output, f );
    ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch;
-   dispatch( output.getSizes(), wrapper );
+   using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >;
+   dispatch( Begins{}, output.getSizes(), wrapper );
 }
 
 template< typename Output,
@@ -375,7 +438,8 @@ void nd_map_view( Output output, Func f, const Input1 input1 )
 
    nvcc_map_helper_1< Output, Func, Input1 > wrapper( output, f, input1 );
    ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch;
-   dispatch( output.getSizes(), wrapper );
+   using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >;
+   dispatch( Begins{}, output.getSizes(), wrapper );
 }
 
 template< typename Output,
@@ -389,7 +453,8 @@ void nd_map_view( Output output, Func f, const Input1 input1, const Input2 input
 
    nvcc_map_helper_2< Output, Func, Input1, Input2 > wrapper( output, f, input1, input2 );
    ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch;
-   dispatch( output.getSizes(), wrapper );
+   using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >;
+   dispatch( Begins{}, output.getSizes(), wrapper );
 }
 
 template< typename Output,
@@ -404,7 +469,8 @@ void nd_map_view( Output output, Func f, const Input1 input1, const Input2 input
 
    nvcc_map_helper_3< Output, Func, Input1, Input2, Input3 > wrapper( output, f, input1, input2, input3 );
    ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch;
-   dispatch( output.getSizes(), wrapper );
+   using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >;
+   dispatch( Begins{}, output.getSizes(), wrapper );
 }
 
 #endif
diff --git a/src/TNL/Containers/ndarray/SizesHolder.h b/src/TNL/Containers/ndarray/SizesHolder.h
index 5b6e52f5fa..306569de7b 100644
--- a/src/TNL/Containers/ndarray/SizesHolder.h
+++ b/src/TNL/Containers/ndarray/SizesHolder.h
@@ -201,6 +201,49 @@ public:
 };
 
 
+template< typename Index,
+          std::size_t dimension,
+          Index constSize >
+class ConstStaticSizesHolder
+{
+public:
+   using IndexType = Index;
+
+   static constexpr std::size_t getDimension()
+   {
+      return dimension;
+   }
+
+   template< std::size_t level >
+   static constexpr std::size_t getStaticSize()
+   {
+      static_assert( level < getDimension(), "Invalid level passed to getStaticSize()." );
+      return constSize;
+   }
+
+   template< std::size_t level >
+   __cuda_callable__
+   Index getSize() const
+   {
+      static_assert( level < getDimension(), "Invalid dimension passed to getSize()." );
+      return constSize;
+   }
+
+   // methods for convenience
+   __cuda_callable__
+   bool operator==( const ConstStaticSizesHolder& other ) const
+   {
+      return true;
+   }
+
+   __cuda_callable__
+   bool operator!=( const ConstStaticSizesHolder& other ) const
+   {
+      return false;
+   }
+};
+
+
 template< typename Index,
           std::size_t... sizes >
 std::ostream& operator<<( std::ostream& str, const SizesHolder< Index, sizes... >& holder )
@@ -213,5 +256,25 @@ std::ostream& operator<<( std::ostream& str, const SizesHolder< Index, sizes...
    return str;
 }
 
+
+// helper for the forInternal method
+namespace __ndarray_impl {
+
+template< typename SizesHolder,
+          std::size_t ConstValue >
+struct SubtractedSizesHolder
+{};
+
+template< typename Index,
+          std::size_t ConstValue,
+          std::size_t... sizes >
+struct SubtractedSizesHolder< SizesHolder< Index, sizes... >, ConstValue >
+{
+//   using type = SizesHolder< Index, std::max( (std::size_t) 0, sizes - ConstValue )... >;
+   using type = SizesHolder< Index, ( (sizes >= ConstValue) ? sizes - ConstValue : 0 )... >;
+};
+
+} // namespace __ndarray_impl
+
 } // namespace Containers
 } // namespace TNL
diff --git a/src/UnitTests/Containers/ndarray/NDArrayTest.cpp b/src/UnitTests/Containers/ndarray/NDArrayTest.cpp
index 385ff93da2..74221bdcbc 100644
--- a/src/UnitTests/Containers/ndarray/NDArrayTest.cpp
+++ b/src/UnitTests/Containers/ndarray/NDArrayTest.cpp
@@ -180,9 +180,119 @@ TEST( NDArrayTest, SizesHolderPrinter )
    EXPECT_EQ( str.str(), "SizesHolder< 0, 1, 2 >( 3, 1, 2 )" );
 }
 
-TEST( NDArrayTest, forAll_dynamic )
+TEST( NDArrayTest, forAll_dynamic_1D )
 {
-    int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2;
+    int I = 2;
+    NDArray< int,
+             SizesHolder< int, 0 >,
+             index_sequence< 0 > > a;
+    a.setSizes( I );
+    a.setValue( 0 );
+
+    auto setter = [&] ( int i )
+    {
+       a( i ) += 1;
+    };
+
+    a.forAll( setter );
+
+    for( int i = 0; i < I; i++ )
+        EXPECT_EQ( a( i ), 1 );
+}
+
+TEST( NDArrayTest, forAll_dynamic_2D )
+{
+    int I = 2, J = 3;
+    NDArray< int,
+             SizesHolder< int, 0, 0 >,
+             index_sequence< 1, 0 > > a;
+    a.setSizes( I, J );
+    a.setValue( 0 );
+
+    auto setter = [&] ( int i, int j )
+    {
+       a( i, j ) += 1;
+    };
+
+    a.forAll( setter );
+
+    for( int j = 0; j < J; j++ )
+    for( int i = 0; i < I; i++ )
+        EXPECT_EQ( a( i, j ), 1 );
+}
+
+TEST( NDArrayTest, forAll_dynamic_3D )
+{
+    int I = 2, J = 3, K = 4;
+    NDArray< int,
+             SizesHolder< int, 0, 0, 0 >,
+             index_sequence< 2, 0, 1 > > a;
+    a.setSizes( I, J, K );
+    a.setValue( 0 );
+
+    auto setter = [&] ( int i, int j, int k )
+    {
+       a( i, j, k ) += 1;
+    };
+
+    a.forAll( setter );
+
+    for( int k = 0; k < K; k++ )
+    for( int i = 0; i < I; i++ )
+    for( int j = 0; j < J; j++ )
+        EXPECT_EQ( a( i, j, k ), 1 );
+}
+
+TEST( NDArrayTest, forAll_dynamic_4D )
+{
+    int I = 2, J = 3, K = 4, L = 5;
+    NDArray< int,
+             SizesHolder< int, 0, 0, 0, 0 >,
+             index_sequence< 3, 2, 0, 1 > > a;
+    a.setSizes( I, J, K, L );
+    a.setValue( 0 );
+
+    auto setter = [&] ( int i, int j, int k, int l )
+    {
+       a( i, j, k, l ) += 1;
+    };
+
+    a.forAll( setter );
+
+    for( int l = 0; l < L; l++ )
+    for( int k = 0; k < K; k++ )
+    for( int i = 0; i < I; i++ )
+    for( int j = 0; j < J; j++ )
+        EXPECT_EQ( a( i, j, k, l ), 1 );
+}
+
+TEST( NDArrayTest, forAll_dynamic_5D )
+{
+    int I = 2, J = 3, K = 4, L = 5, M = 6;
+    NDArray< int,
+             SizesHolder< int, 0, 0, 0, 0, 0 >,
+             index_sequence< 3, 4, 2, 0, 1 > > a;
+    a.setSizes( I, J, K, L, M );
+    a.setValue( 0 );
+
+    auto setter = [&] ( int i, int j, int k, int l, int m )
+    {
+       a( i, j, k, l, m ) += 1;
+    };
+
+    a.forAll( setter );
+
+    for( int l = 0; l < L; l++ )
+    for( int m = 0; m < M; m++ )
+    for( int k = 0; k < K; k++ )
+    for( int i = 0; i < I; i++ )
+    for( int j = 0; j < J; j++ )
+        EXPECT_EQ( a( i, j, k, l, m ), 1 );
+}
+
+TEST( NDArrayTest, forAll_dynamic_6D )
+{
+    int I = 2, J = 3, K = 4, L = 5, M = 6, N = 7;
     NDArray< int,
              SizesHolder< int, 0, 0, 0, 0, 0, 0 >,
              index_sequence< 5, 3, 4, 2, 0, 1 > > a;
@@ -191,7 +301,7 @@ TEST( NDArrayTest, forAll_dynamic )
 
     auto setter = [&] ( int i, int j, int k, int l, int m, int n )
     {
-       a( i, j, k, l, m, n ) = 1;
+       a( i, j, k, l, m, n ) += 1;
     };
 
     a.forAll( setter );
@@ -205,19 +315,34 @@ TEST( NDArrayTest, forAll_dynamic )
         EXPECT_EQ( a( i, j, k, l, m, n ), 1 );
 }
 
-TEST( NDArrayTest, forAll_static )
+TEST( NDArrayTest, forAll_static_1D )
 {
-    constexpr int I = 3, J = 4;
-    NDArray< int, SizesHolder< int, I, J > > a;
-    a.setSizes( 0, 0 );
+    constexpr int I = 3;
+    StaticNDArray< int, SizesHolder< int, I > > a;
+//    a.setSizes( 0 );
+    a.setValue( 0 );
+
+    auto setter = [&] ( int i )
+    {
+       a( i ) += 1;
+    };
+
+    a.forAll( setter );
 
     for( int i = 0; i < I; i++ )
-    for( int j = 0; j < J; j++ )
-        a( i, j ) = 0;
+        EXPECT_EQ( a( i ), 1 );
+}
+
+TEST( NDArrayTest, forAll_static_2D )
+{
+    constexpr int I = 3, J = 4;
+    StaticNDArray< int, SizesHolder< int, I, J > > a;
+//    a.setSizes( 0, 0 );
+    a.setValue( 0 );
 
     auto setter = [&] ( int i, int j )
     {
-       a( i, j ) = 1;
+       a( i, j ) += 1;
     };
 
     a.forAll( setter );
@@ -227,6 +352,464 @@ TEST( NDArrayTest, forAll_static )
         EXPECT_EQ( a( i, j ), 1 );
 }
 
+TEST( NDArrayTest, forAll_static_3D )
+{
+    constexpr int I = 3, J = 4, K = 5;
+    StaticNDArray< int, SizesHolder< int, I, J, K > > a;
+//    a.setSizes( 0, 0, 0 );
+    a.setValue( 0 );
+
+    auto setter = [&] ( int i, int j, int k )
+    {
+       a( i, j, k ) += 1;
+    };
+
+    a.forAll( setter );
+
+    for( int i = 0; i < I; i++ )
+    for( int j = 0; j < J; j++ )
+    for( int k = 0; k < K; k++ )
+        EXPECT_EQ( a( i, j, k ), 1 );
+}
+
+TEST( NDArrayTest, forAll_static_4D )
+{
+    constexpr int I = 3, J = 4, K = 5, L = 6;
+    StaticNDArray< int, SizesHolder< int, I, J, K, L > > a;
+//    a.setSizes( 0, 0, 0, 0 );
+    a.setValue( 0 );
+
+    auto setter = [&] ( int i, int j, int k, int l )
+    {
+       a( i, j, k, l ) += 1;
+    };
+
+    a.forAll( setter );
+
+    for( int i = 0; i < I; i++ )
+    for( int j = 0; j < J; j++ )
+    for( int k = 0; k < K; k++ )
+    for( int l = 0; l < L; l++ )
+        EXPECT_EQ( a( i, j, k, l ), 1 );
+}
+
+TEST( NDArrayTest, forAll_static_5D )
+{
+    constexpr int I = 3, J = 4, K = 5, L = 6, M = 7;
+    StaticNDArray< int, SizesHolder< int, I, J, K, L, M > > a;
+//    a.setSizes( 0, 0, 0, 0, 0 );
+    a.setValue( 0 );
+
+    auto setter = [&] ( int i, int j, int k, int l, int m )
+    {
+       a( i, j, k, l, m ) += 1;
+    };
+
+    a.forAll( setter );
+
+    for( int i = 0; i < I; i++ )
+    for( int j = 0; j < J; j++ )
+    for( int k = 0; k < K; k++ )
+    for( int l = 0; l < L; l++ )
+    for( int m = 0; m < M; m++ )
+        EXPECT_EQ( a( i, j, k, l, m ), 1 );
+}
+
+TEST( NDArrayTest, forAll_static_6D )
+{
+    constexpr int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8;
+    StaticNDArray< int, SizesHolder< int, I, J, K, L, M, N > > a;
+//    a.setSizes( 0, 0, 0, 0, 0, 0 );
+    a.setValue( 0 );
+
+    auto setter = [&] ( int i, int j, int k, int l, int m, int n )
+    {
+       a( i, j, k, l, m, n ) += 1;
+    };
+
+    a.forAll( setter );
+
+    for( int i = 0; i < I; i++ )
+    for( int j = 0; j < J; j++ )
+    for( int k = 0; k < K; k++ )
+    for( int l = 0; l < L; l++ )
+    for( int m = 0; m < M; m++ )
+    for( int n = 0; n < N; n++ )
+        EXPECT_EQ( a( i, j, k, l, m, n ), 1 );
+}
+
+TEST( NDArrayTest, forInternal_dynamic_1D )
+{
+    int I = 3;
+    NDArray< int,
+             SizesHolder< int, 0 >,
+             index_sequence< 0 > > a;
+    a.setSizes( I );
+    a.setValue( 0 );
+
+    auto setter = [&] ( int i )
+    {
+       a( i ) += 1;
+    };
+
+    a.forInternal( setter );
+
+    for( int i = 0; i < I; i++ )
+    {
+        if( i == 0 || i == I - 1 )
+            EXPECT_EQ( a( i ), 0 )
+               << "i = " << i;
+        else
+            EXPECT_EQ( a( i ), 1 )
+               << "i = " << i;
+    }
+}
+
+TEST( NDArrayTest, forInternal_dynamic_2D )
+{
+    int I = 3, J = 4;
+    NDArray< int,
+             SizesHolder< int, 0, 0 >,
+             index_sequence< 1, 0 > > a;
+    a.setSizes( I, J );
+    a.setValue( 0 );
+
+    auto setter = [&] ( int i, int j )
+    {
+       a( i, j ) += 1;
+    };
+
+    a.forInternal( setter );
+
+    for( int j = 0; j < J; j++ )
+    for( int i = 0; i < I; i++ )
+    {
+        if( i == 0 || i == I - 1 ||
+            j == 0 || j == J - 1 )
+            EXPECT_EQ( a( i, j ), 0 )
+               << "i = " << i << ", j = " << j;
+        else
+            EXPECT_EQ( a( i, j ), 1 )
+               << "i = " << i << ", j = " << j;
+    }
+}
+
+TEST( NDArrayTest, forInternal_dynamic_3D )
+{
+    int I = 3, J = 4, K = 5;
+    NDArray< int,
+             SizesHolder< int, 0, 0, 0 >,
+             index_sequence< 2, 0, 1 > > a;
+    a.setSizes( I, J, K );
+    a.setValue( 0 );
+
+    auto setter = [&] ( int i, int j, int k )
+    {
+       a( i, j, k ) += 1;
+    };
+
+    a.forInternal( setter );
+
+    for( int k = 0; k < K; k++ )
+    for( int i = 0; i < I; i++ )
+    for( int j = 0; j < J; j++ )
+    {
+        if( i == 0 || i == I - 1 ||
+            j == 0 || j == J - 1 ||
+            k == 0 || k == K - 1 )
+            EXPECT_EQ( a( i, j, k ), 0 )
+               << "i = " << i << ", j = " << j << ", k = " << k;
+        else
+            EXPECT_EQ( a( i, j, k ), 1 )
+               << "i = " << i << ", j = " << j << ", k = " << k;
+    }
+}
+
+TEST( NDArrayTest, forInternal_dynamic_4D )
+{
+    int I = 3, J = 4, K = 5, L = 6;
+    NDArray< int,
+             SizesHolder< int, 0, 0, 0, 0 >,
+             index_sequence< 3, 2, 0, 1 > > a;
+    a.setSizes( I, J, K, L );
+    a.setValue( 0 );
+
+    auto setter = [&] ( int i, int j, int k, int l )
+    {
+       a( i, j, k, l ) += 1;
+    };
+
+    a.forInternal( setter );
+
+    for( int l = 0; l < L; l++ )
+    for( int k = 0; k < K; k++ )
+    for( int i = 0; i < I; i++ )
+    for( int j = 0; j < J; j++ )
+    {
+        if( i == 0 || i == I - 1 ||
+            j == 0 || j == J - 1 ||
+            k == 0 || k == K - 1 ||
+            l == 0 || l == L - 1 )
+            EXPECT_EQ( a( i, j, k, l ), 0 )
+               << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l;
+        else
+            EXPECT_EQ( a( i, j, k, l ), 1 )
+               << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l;
+    }
+}
+
+TEST( NDArrayTest, forInternal_dynamic_5D )
+{
+    int I = 3, J = 4, K = 5, L = 6, M = 7;
+    NDArray< int,
+             SizesHolder< int, 0, 0, 0, 0, 0 >,
+             index_sequence< 3, 4, 2, 0, 1 > > a;
+    a.setSizes( I, J, K, L, M );
+    a.setValue( 0 );
+
+    auto setter = [&] ( int i, int j, int k, int l, int m )
+    {
+       a( i, j, k, l, m ) += 1;
+    };
+
+    a.forInternal( setter );
+
+    for( int l = 0; l < L; l++ )
+    for( int m = 0; m < M; m++ )
+    for( int k = 0; k < K; k++ )
+    for( int i = 0; i < I; i++ )
+    for( int j = 0; j < J; j++ )
+    {
+        if( i == 0 || i == I - 1 ||
+            j == 0 || j == J - 1 ||
+            k == 0 || k == K - 1 ||
+            l == 0 || l == L - 1 ||
+            m == 0 || m == M - 1 )
+            EXPECT_EQ( a( i, j, k, l, m ), 0 )
+               << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m;
+        else
+            EXPECT_EQ( a( i, j, k, l, m ), 1 )
+               << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m;
+    }
+}
+
+TEST( NDArrayTest, forInternal_dynamic_6D )
+{
+    int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8;
+    NDArray< int,
+             SizesHolder< int, 0, 0, 0, 0, 0, 0 >,
+             index_sequence< 5, 3, 4, 2, 0, 1 > > a;
+    a.setSizes( I, J, K, L, M, N );
+    a.setValue( 0 );
+
+    auto setter = [&] ( int i, int j, int k, int l, int m, int n )
+    {
+       a( i, j, k, l, m, n ) += 1;
+    };
+
+    a.forInternal( setter );
+
+    for( int n = 0; n < N; n++ )
+    for( int l = 0; l < L; l++ )
+    for( int m = 0; m < M; m++ )
+    for( int k = 0; k < K; k++ )
+    for( int i = 0; i < I; i++ )
+    for( int j = 0; j < J; j++ )
+    {
+        if( i == 0 || i == I - 1 ||
+            j == 0 || j == J - 1 ||
+            k == 0 || k == K - 1 ||
+            l == 0 || l == L - 1 ||
+            m == 0 || m == M - 1 ||
+            n == 0 || n == N - 1 )
+            EXPECT_EQ( a( i, j, k, l, m, n ), 0 )
+               << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n;
+        else
+            EXPECT_EQ( a( i, j, k, l, m, n ), 1 )
+               << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n;
+    }
+}
+
+TEST( NDArrayTest, forInternal_static_1D )
+{
+    constexpr int I = 3;
+    StaticNDArray< int, SizesHolder< int, I > > a;
+//    a.setSizes( 0 );
+    a.setValue( 0 );
+
+    auto setter = [&] ( int i )
+    {
+       a( i ) += 1;
+    };
+
+    a.forInternal( setter );
+
+    for( int i = 0; i < I; i++ )
+    {
+        if( i == 0 || i == I - 1 )
+            EXPECT_EQ( a( i ), 0 )
+               << "i = " << i;
+        else
+            EXPECT_EQ( a( i ), 1 )
+               << "i = " << i;
+    }
+}
+
+TEST( NDArrayTest, forInternal_static_2D )
+{
+    constexpr int I = 3, J = 4;
+    StaticNDArray< int, SizesHolder< int, I, J > > a;
+//    a.setSizes( 0, 0 );
+    a.setValue( 0 );
+
+    auto setter = [&] ( int i, int j )
+    {
+       a( i, j ) += 1;
+    };
+
+    a.forInternal( setter );
+
+    for( int j = 0; j < J; j++ )
+    for( int i = 0; i < I; i++ )
+    {
+        if( i == 0 || i == I - 1 ||
+            j == 0 || j == J - 1 )
+            EXPECT_EQ( a( i, j ), 0 )
+               << "i = " << i << ", j = " << j;
+        else
+            EXPECT_EQ( a( i, j ), 1 )
+               << "i = " << i << ", j = " << j;
+    }
+}
+
+TEST( NDArrayTest, forInternal_static_3D )
+{
+    constexpr int I = 3, J = 4, K = 5;
+    StaticNDArray< int, SizesHolder< int, I, J, K > > a;
+//    a.setSizes( 0, 0, 0 );
+    a.setValue( 0 );
+
+    auto setter = [&] ( int i, int j, int k )
+    {
+       a( i, j, k ) += 1;
+    };
+
+    a.forInternal( setter );
+
+    for( int k = 0; k < K; k++ )
+    for( int i = 0; i < I; i++ )
+    for( int j = 0; j < J; j++ )
+    {
+        if( i == 0 || i == I - 1 ||
+            j == 0 || j == J - 1 ||
+            k == 0 || k == K - 1 )
+            EXPECT_EQ( a( i, j, k ), 0 )
+               << "i = " << i << ", j = " << j << ", k = " << k;
+        else
+            EXPECT_EQ( a( i, j, k ), 1 )
+               << "i = " << i << ", j = " << j << ", k = " << k;
+    }
+}
+
+TEST( NDArrayTest, forInternal_static_4D )
+{
+    constexpr int I = 3, J = 4, K = 5, L = 6;
+    StaticNDArray< int, SizesHolder< int, I, J, K, L > > a;
+//    a.setSizes( 0, 0, 0, 0 );
+    a.setValue( 0 );
+
+    auto setter = [&] ( int i, int j, int k, int l )
+    {
+       a( i, j, k, l ) += 1;
+    };
+
+    a.forInternal( setter );
+
+    for( int l = 0; l < L; l++ )
+    for( int k = 0; k < K; k++ )
+    for( int i = 0; i < I; i++ )
+    for( int j = 0; j < J; j++ )
+    {
+        if( i == 0 || i == I - 1 ||
+            j == 0 || j == J - 1 ||
+            k == 0 || k == K - 1 ||
+            l == 0 || l == L - 1 )
+            EXPECT_EQ( a( i, j, k, l ), 0 )
+               << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l;
+        else
+            EXPECT_EQ( a( i, j, k, l ), 1 )
+               << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l;
+    }
+}
+
+TEST( NDArrayTest, forInternal_static_5D )
+{
+    constexpr int I = 3, J = 4, K = 5, L = 6, M = 7;
+    StaticNDArray< int, SizesHolder< int, I, J, K, L, M > > a;
+//    a.setSizes( 0, 0, 0, 0, 0 );
+    a.setValue( 0 );
+
+    auto setter = [&] ( int i, int j, int k, int l, int m )
+    {
+       a( i, j, k, l, m ) += 1;
+    };
+
+    a.forInternal( setter );
+
+    for( int l = 0; l < L; l++ )
+    for( int m = 0; m < M; m++ )
+    for( int k = 0; k < K; k++ )
+    for( int i = 0; i < I; i++ )
+    for( int j = 0; j < J; j++ )
+    {
+        if( i == 0 || i == I - 1 ||
+            j == 0 || j == J - 1 ||
+            k == 0 || k == K - 1 ||
+            l == 0 || l == L - 1 ||
+            m == 0 || m == M - 1 )
+            EXPECT_EQ( a( i, j, k, l, m ), 0 )
+               << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m;
+        else
+            EXPECT_EQ( a( i, j, k, l, m ), 1 )
+               << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m;
+    }
+}
+
+TEST( NDArrayTest, forInternal_static_6D )
+{
+    constexpr int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8;
+    StaticNDArray< int, SizesHolder< int, I, J, K, L, M, N > > a;
+//    a.setSizes( 0, 0, 0, 0, 0, 0 );
+    a.setValue( 0 );
+
+    auto setter = [&] ( int i, int j, int k, int l, int m, int n )
+    {
+       a( i, j, k, l, m, n ) += 1;
+    };
+
+    a.forInternal( setter );
+
+    for( int n = 0; n < N; n++ )
+    for( int l = 0; l < L; l++ )
+    for( int m = 0; m < M; m++ )
+    for( int k = 0; k < K; k++ )
+    for( int i = 0; i < I; i++ )
+    for( int j = 0; j < J; j++ )
+    {
+        if( i == 0 || i == I - 1 ||
+            j == 0 || j == J - 1 ||
+            k == 0 || k == K - 1 ||
+            l == 0 || l == L - 1 ||
+            m == 0 || m == M - 1 ||
+            n == 0 || n == N - 1 )
+            EXPECT_EQ( a( i, j, k, l, m, n ), 0 )
+               << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n;
+        else
+            EXPECT_EQ( a( i, j, k, l, m, n ), 1 )
+               << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n;
+    }
+}
+
 //#include "GtestMissingError.h"
 int main( int argc, char* argv[] )
 {
-- 
GitLab