From e444116e6d66bf4b18efd4ecfac3bb8cc6597c5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Tue, 25 Dec 2018 11:37:06 +0100 Subject: [PATCH 01/25] Added NDArray --- src/Benchmarks/CMakeLists.txt | 1 + src/Benchmarks/NDArray/CMakeLists.txt | 9 + .../NDArray/tnl-benchmark-ndarray-cuda.cu | 1 + .../NDArray/tnl-benchmark-ndarray.cpp | 1 + .../NDArray/tnl-benchmark-ndarray.h | 464 +++++++++++++ src/TNL/Containers/NDArray.h | 314 +++++++++ src/TNL/Containers/NDArrayView.h | 291 ++++++++ src/TNL/Containers/ndarray/Indexing.h | 308 +++++++++ src/TNL/Containers/ndarray/Meta.h | 313 +++++++++ src/TNL/Containers/ndarray/Operations.h | 632 ++++++++++++++++++ src/TNL/Containers/ndarray/SizesHolder.h | 219 ++++++ src/TNL/Containers/ndarray/Subarrays.h | 331 +++++++++ src/UnitTests/Containers/CMakeLists.txt | 1 + .../Containers/ndarray/CMakeLists.txt | 26 + .../Containers/ndarray/NDArrayTest.cpp | 192 ++++++ .../Containers/ndarray/NDSubarrayTest.cpp | 405 +++++++++++ .../Containers/ndarray/SlicedNDArrayTest.cpp | 251 +++++++ .../ndarray/StaticNDArrayCudaTest.cu | 90 +++ .../Containers/ndarray/StaticNDArrayTest.cpp | 105 +++ 19 files changed, 3954 insertions(+) create mode 100644 src/Benchmarks/NDArray/CMakeLists.txt create mode 100644 src/Benchmarks/NDArray/tnl-benchmark-ndarray-cuda.cu create mode 100644 src/Benchmarks/NDArray/tnl-benchmark-ndarray.cpp create mode 100644 src/Benchmarks/NDArray/tnl-benchmark-ndarray.h create mode 100644 src/TNL/Containers/NDArray.h create mode 100644 src/TNL/Containers/NDArrayView.h create mode 100644 src/TNL/Containers/ndarray/Indexing.h create mode 100644 src/TNL/Containers/ndarray/Meta.h create mode 100644 src/TNL/Containers/ndarray/Operations.h create mode 100644 src/TNL/Containers/ndarray/SizesHolder.h create mode 100644 src/TNL/Containers/ndarray/Subarrays.h create mode 100644 src/UnitTests/Containers/ndarray/CMakeLists.txt create mode 100644 src/UnitTests/Containers/ndarray/NDArrayTest.cpp create mode 100644 
src/UnitTests/Containers/ndarray/NDSubarrayTest.cpp create mode 100644 src/UnitTests/Containers/ndarray/SlicedNDArrayTest.cpp create mode 100644 src/UnitTests/Containers/ndarray/StaticNDArrayCudaTest.cu create mode 100644 src/UnitTests/Containers/ndarray/StaticNDArrayTest.cpp diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt index b4639d60a..6f3185329 100644 --- a/src/Benchmarks/CMakeLists.txt +++ b/src/Benchmarks/CMakeLists.txt @@ -1,5 +1,6 @@ add_subdirectory( HeatEquation ) add_subdirectory( BLAS ) +add_subdirectory( NDArray ) add_subdirectory( SpMV ) add_subdirectory( DistSpMV ) add_subdirectory( LinearSolvers ) diff --git a/src/Benchmarks/NDArray/CMakeLists.txt b/src/Benchmarks/NDArray/CMakeLists.txt new file mode 100644 index 000000000..3958694e6 --- /dev/null +++ b/src/Benchmarks/NDArray/CMakeLists.txt @@ -0,0 +1,9 @@ +add_executable( tnl-benchmark-ndarray tnl-benchmark-ndarray.cpp ) +target_compile_options( tnl-benchmark-ndarray PRIVATE ${CXX_TESTS_FLAGS} ) +install( TARGETS tnl-benchmark-ndarray RUNTIME DESTINATION bin ) + +if( BUILD_CUDA ) + cuda_add_executable( tnl-benchmark-ndarray-cuda tnl-benchmark-ndarray-cuda.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + install( TARGETS tnl-benchmark-ndarray-cuda RUNTIME DESTINATION bin ) +endif() diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray-cuda.cu b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-cuda.cu new file mode 100644 index 000000000..ccbac3b38 --- /dev/null +++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-cuda.cu @@ -0,0 +1 @@ +#include "tnl-benchmark-ndarray.h" diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray.cpp b/src/Benchmarks/NDArray/tnl-benchmark-ndarray.cpp new file mode 100644 index 000000000..ccbac3b38 --- /dev/null +++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray.cpp @@ -0,0 +1 @@ +#include "tnl-benchmark-ndarray.h" diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h b/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h new file mode 100644 index 
000000000..ab7c4fa8c --- /dev/null +++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h @@ -0,0 +1,464 @@ +/*************************************************************************** + tnl-benchmark-ndarray.h - description + ------------------- + begin : Dec 24, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include +#include + +#include +#include + +#include "../Benchmarks.h" + +using namespace TNL; +using namespace TNL::Benchmarks; +using namespace TNL::Containers; +using std::index_sequence; + +using value_type = float; +//using index_type = std::size_t; +using index_type = unsigned; + +template< typename Array > +void expect_eq_chunked( Array& a, Array& b ) +{ + // TODO: use something like EXPECT_EQ + TNL_ASSERT_EQ( a.getSize(), b.getSize(), "array sizes don't match" ); + if( a.getSize() != b.getSize() ) + return; + + using IndexType = typename Array::IndexType; + + const IndexType chunk_size = 4096; + for( IndexType c = 0; c < (IndexType) roundUpDivision( a.getSize(), chunk_size ); c++ ) { + const typename Array::IndexType this_chunk_size = TNL::min( chunk_size, a.getSize() - c * chunk_size ); + Array a_chunk( &a[ c * chunk_size ], this_chunk_size ); + Array b_chunk( &b[ c * chunk_size ], this_chunk_size ); + // TODO: use something like EXPECT_EQ + TNL_ASSERT_EQ( a_chunk, b_chunk, "chunks are not equal" ); + } +} + +template< typename Array > +void expect_eq( Array& a, Array& b ) +{ + if( std::is_same< typename Array::DeviceType, TNL::Devices::Cuda >::value ) { + typename Array::HostType a_host, b_host; + a_host = a; + b_host = b; + expect_eq_chunked( a_host, b_host ); + } + else { + expect_eq_chunked( a, b ); + } +} + +template< typename Device > +const char* performer() +{ + if( std::is_same< Device, Devices::Host 
>::value ) + return "CPU"; + else if( std::is_same< Device, Devices::Cuda >::value ) + return "GPU"; + else + return "unknown"; +} + +void reset() {} + +// NOTE: having the sizes as function parameters keeps the compiler from treating them +// as "compile-time constants" and thus e.g. optimizing the 1D iterations with memcpy + +template< typename Device > +void benchmark_array( Benchmark& benchmark, index_type size = 500000000 ) +{ + Array< value_type, Device > a, b; + a.setSize( size ); + b.setSize( size ); + a.setValue( -1 ); + b.setValue( 1 ); + + auto kernel = [] __cuda_callable__ + ( int i, + value_type* a, + const value_type* b ) + { + a[ i ] = b[ i ]; + }; + + auto f = [&]() { + TNL::ParallelFor< Device >::exec( 0, (int) size, kernel, a.getData(), b.getData() ); + }; + + // warm-up for all benchmarks + f(); + + const double datasetSize = 2 * size * sizeof(value_type) / oneGB; + benchmark.setOperation( "array", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a, b ); +} + +template< typename Device > +void benchmark_1D( Benchmark& benchmark, index_type size = 500000000 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0 >, + std::make_index_sequence< 1 >, + std::make_index_sequence< 1 >, + Device > a, b; + a.setSizes( size ); + b.setSizes( size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * size * sizeof(value_type) / oneGB; + benchmark.setOperation( "1D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_2D( Benchmark& benchmark, index_type size = 22333 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0 >, + std::make_index_sequence< 2 >, + std::make_index_sequence< 2 >, + Device > a, b; + a.setSizes( size, 
size ); + b.setSizes( size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 2 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "2D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_3D( Benchmark& benchmark, index_type size = 800 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0 >, + std::make_index_sequence< 3 >, + std::make_index_sequence< 3 >, + Device > a, b; + a.setSizes( size, size, size ); + b.setSizes( size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 3 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "3D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_4D( Benchmark& benchmark, index_type size = 150 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0, 0 >, + std::make_index_sequence< 4 >, + std::make_index_sequence< 4 >, + Device > a, b; + a.setSizes( size, size, size, size ); + b.setSizes( size, size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 4 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "4D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void 
benchmark_5D( Benchmark& benchmark, index_type size = 56 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0, 0, 0 >, + std::make_index_sequence< 5 >, + std::make_index_sequence< 5 >, + Device > a, b; + a.setSizes( size, size, size, size, size ); + b.setSizes( size, size, size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 5 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "5D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_6D( Benchmark& benchmark, index_type size = 28 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0, 0, 0, 0 >, + std::make_index_sequence< 6 >, + std::make_index_sequence< 6 >, + Device > a, b; + a.setSizes( size, size, size, size, size, size ); + b.setSizes( size, size, size, size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 6 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "6D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + + +template< typename Device > +void benchmark_2D_perm( Benchmark& benchmark, index_type size = 22333 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0 >, + std::index_sequence< 1, 0 >, + std::index_sequence< 1, 0 >, + Device > a, b; + a.setSizes( size, size ); + b.setSizes( size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + 
const double datasetSize = 2 * std::pow( size, 2 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "2D permuted", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_3D_perm( Benchmark& benchmark, index_type size = 800 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0 >, + std::index_sequence< 2, 1, 0 >, + std::index_sequence< 2, 1, 0 >, + Device > a, b; + a.setSizes( size, size, size ); + b.setSizes( size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 3 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "3D permuted", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_4D_perm( Benchmark& benchmark, index_type size = 150 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0, 0 >, + std::index_sequence< 3, 2, 1, 0 >, + std::index_sequence< 3, 2, 1, 0 >, + Device > a, b; + a.setSizes( size, size, size, size ); + b.setSizes( size, size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 4 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "4D permuted", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_5D_perm( Benchmark& benchmark, index_type size = 56 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0, 0, 0 >, + std::index_sequence< 4, 3, 2, 
1, 0 >, + std::index_sequence< 4, 3, 2, 1, 0 >, + Device > a, b; + a.setSizes( size, size, size, size, size ); + b.setSizes( size, size, size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 5 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "5D permuted", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_6D_perm( Benchmark& benchmark, index_type size = 28 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0, 0, 0, 0 >, + std::index_sequence< 5, 4, 3, 2, 1, 0 >, + std::index_sequence< 5, 4, 3, 2, 1, 0 >, + Device > a, b; + a.setSizes( size, size, size, size, size, size ); + b.setSizes( size, size, size, size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 6 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "6D permuted", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void run_benchmarks( Benchmark& benchmark ) +{ + benchmark_array< Device >( benchmark ); + benchmark_1D< Device >( benchmark ); + benchmark_2D< Device >( benchmark ); + benchmark_3D< Device >( benchmark ); + benchmark_4D< Device >( benchmark ); + benchmark_5D< Device >( benchmark ); + benchmark_6D< Device >( benchmark ); + benchmark_2D_perm< Device >( benchmark ); + benchmark_3D_perm< Device >( benchmark ); + benchmark_4D_perm< Device >( benchmark ); + benchmark_5D_perm< Device >( benchmark ); + benchmark_6D_perm< Device >( benchmark ); +} + +void 
setupConfig( Config::ConfigDescription & config ) +{ + config.addDelimiter( "Benchmark settings:" ); + config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-ndarray.log"); + config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); + config.addEntryEnum( "append" ); + config.addEntryEnum( "overwrite" ); + config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); + config.addEntry< int >( "verbose", "Verbose mode.", 1 ); + config.addEntry< String >( "devices", "Run benchmarks on these devices.", "all" ); + config.addEntryEnum( "all" ); + config.addEntryEnum( "host" ); + #ifdef HAVE_CUDA + config.addEntryEnum( "cuda" ); + #endif + + config.addDelimiter( "Device settings:" ); + Devices::Host::configSetup( config ); + Devices::Cuda::configSetup( config ); +} + +int main( int argc, char* argv[] ) +{ + Config::ParameterContainer parameters; + Config::ConfigDescription conf_desc; + + setupConfig( conf_desc ); + + if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) { + conf_desc.printUsage( argv[ 0 ] ); + return EXIT_FAILURE; + } + + if( ! Devices::Host::setup( parameters ) || + ! 
Devices::Cuda::setup( parameters ) ) + return EXIT_FAILURE; + + const String & logFileName = parameters.getParameter< String >( "log-file" ); + const String & outputMode = parameters.getParameter< String >( "output-mode" ); + const int loops = parameters.getParameter< int >( "loops" ); + const int verbose = parameters.getParameter< int >( "verbose" ); + + // open log file + auto mode = std::ios::out; + if( outputMode == "append" ) + mode |= std::ios::app; + std::ofstream logFile( logFileName.getString(), mode ); + + // init benchmark and common metadata + Benchmark benchmark( loops, verbose ); + + // prepare global metadata + Benchmark::MetadataMap metadata = getHardwareMetadata(); + + const String devices = parameters.getParameter< String >( "devices" ); + if( devices == "all" || devices == "host" ) + run_benchmarks< Devices::Host >( benchmark ); +#ifdef HAVE_CUDA + if( devices == "all" || devices == "cuda" ) + run_benchmarks< Devices::Cuda >( benchmark ); +#endif + + if( ! benchmark.save( logFile ) ) { + std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl; + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} diff --git a/src/TNL/Containers/NDArray.h b/src/TNL/Containers/NDArray.h new file mode 100644 index 000000000..ee89e8786 --- /dev/null +++ b/src/TNL/Containers/NDArray.h @@ -0,0 +1,314 @@ +/*************************************************************************** + NDArray.h - description + ------------------- + begin : Dec 24, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include + +#include + +namespace TNL { +namespace Containers { + +template< std::size_t slicedDimension = 0, + std::size_t sliceSize = 0 > +struct SliceInfo +{ + // sliceSize == 0 means no slicing + static constexpr std::size_t getSliceSize( std::size_t dimension ) + { + return (dimension == slicedDimension) ? sliceSize : 0; + } +}; + + + + +template< typename Array, + typename SizesHolder, + typename Permutation, + typename Base, + typename Device = typename Array::DeviceType > +class NDArrayStorage +{ +public: + using StorageArray = Array; + using ValueType = typename Array::ValueType; + using DeviceType = Device; + using IndexType = typename Array::IndexType; + using SizesHolderType = SizesHolder; + using PermutationType = Permutation; + using ViewType = NDArrayView< ValueType, DeviceType, SizesHolder, Permutation, Base >; + using ConstViewType = NDArrayView< std::add_const_t< ValueType >, DeviceType, SizesHolder, Permutation, Base >; + + static_assert( Permutation::size() == SizesHolder::getDimension(), "invalid permutation" ); + + // all methods from NDArrayView + + NDArrayStorage() = default; + + // The copy-constructor of TNL::Containers::Array makes shallow copy so our + // copy-constructor cannot be default. Actually, we most likely don't need + // it anyway, so let's just delete it. + NDArrayStorage( const NDArrayStorage& ) = delete; + + // Standard copy-semantics with deep copy, just like regular 1D array. + // Mismatched sizes cause reallocations. 
+ NDArrayStorage& operator=( const NDArrayStorage& other ) = default; + + // default move-semantics + NDArrayStorage( NDArrayStorage&& ) = default; + NDArrayStorage& operator=( NDArrayStorage&& ) = default; + + bool operator==( const NDArrayStorage& other ) const + { + // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray + return sizes == other.sizes && array == other.array; + } + + bool operator!=( const NDArrayStorage& other ) const + { + // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray + return sizes != other.sizes || array != other.array; + } + + static constexpr std::size_t getDimension() + { + return SizesHolder::getDimension(); + } + + const SizesHolderType& getSizes() const + { + return sizes; + } + + template< std::size_t level > + __cuda_callable__ + IndexType getSize() const + { + return sizes.template getSize< level >(); + } + + // returns the product of the aligned sizes + __cuda_callable__ + IndexType getStorageSize() const + { + using Alignment = typename Base::template Alignment< Permutation >; + return __ndarray_impl::StorageSizeGetter< SizesHolder, Alignment >::get( sizes ); + } + + template< typename... IndexTypes > + __cuda_callable__ + IndexType + getStorageIndex( IndexTypes&&... indices ) const + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + return Base::template getStorageIndex< Permutation >( sizes, + StrideBase{}, + std::forward< IndexTypes >( indices )... ); + } + + template< typename... IndexTypes > + __cuda_callable__ + ValueType& + operator()( IndexTypes&&... indices ) + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInBounds( sizes, std::forward< IndexTypes >( indices )... ); + TNL_ASSERT_LT( getStorageIndex( std::forward< IndexTypes >( indices )... 
), getStorageSize(), + "storage index out of bounds - either input error or a bug in the indexer" ); + return array[ getStorageIndex( std::forward< IndexTypes >( indices )... ) ]; + } + + template< typename... IndexTypes > + __cuda_callable__ + const ValueType& + operator()( IndexTypes&&... indices ) const + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInBounds( sizes, std::forward< IndexTypes >( indices )... ); + TNL_ASSERT_LT( getStorageIndex( std::forward< IndexTypes >( indices )... ), getStorageSize(), + "storage index out of bounds - either input error or a bug in the indexer" ); + return array[ getStorageIndex( std::forward< IndexTypes >( indices )... ) ]; + } + + // bracket operator for 1D arrays + __cuda_callable__ + ValueType& + operator[]( IndexType index ) + { + static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); + __ndarray_impl::assertIndicesInBounds( sizes, std::forward< IndexType >( index ) ); + return array[ index ]; + } + + __cuda_callable__ + const ValueType& + operator[]( IndexType index ) const + { + static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); + __ndarray_impl::assertIndicesInBounds( sizes, std::forward< IndexType >( index ) ); + return array[ index ]; + } + + __cuda_callable__ + ViewType getView() + { + return ViewType( array.getData(), sizes ); + } + + __cuda_callable__ + ConstViewType getConstView() const + { + return ConstViewType( array.getData(), sizes ); + } + + // extra methods + + // TODO: rename to setSizes and make sure that overloading with the following method works + void setSize( const SizesHolderType& sizes ) + { + this->sizes = sizes; + array.setSize( getStorageSize() ); + } + + template< typename... IndexTypes > + void setSizes( IndexTypes&&... 
sizes ) + { + static_assert( sizeof...( sizes ) == getDimension(), "got wrong number of sizes" ); + __ndarray_impl::setSizesHelper( this->sizes, std::forward< IndexTypes >( sizes )... ); + array.setSize( getStorageSize() ); + } + + void setLike( const NDArrayStorage& other ) + { + this->sizes = other.getSizes(); + array.setSize( getStorageSize() ); + } + + void reset() + { + this->sizes = SizesHolder{}; + TNL_ASSERT_EQ( getStorageSize(), 0, "Failed to reset the sizes." ); + array.reset(); + } + + // "safe" accessor - will do slow copy from device + template< typename... IndexTypes > + ValueType + getElement( IndexTypes&&... indices ) const + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInBounds( sizes, std::forward< IndexTypes >( indices )... ); + TNL_ASSERT_LT( getStorageIndex( std::forward< IndexTypes >( indices )... ), getStorageSize(), + "storage index out of bounds - either input error or a bug in the indexer" ); + return array.getElement( getStorageIndex( std::forward< IndexTypes >( indices )... 
) ); + } + + const StorageArray& getStorageArray() const + { + return array; + } + + StorageArray& getStorageArray() + { + return array; + } + + void setValue( ValueType value ) + { + array.setValue( value ); + } + +protected: + StorageArray array; + SizesHolder sizes; + + using StrideBase = __ndarray_impl::DummyStrideBase< typename SizesHolder::IndexType, SizesHolder::getDimension() >; +}; + +template< typename Value, + typename SizesHolder, + typename PermutationHost = std::make_index_sequence< SizesHolder::getDimension() >, // identity by default + typename PermutationCuda = std::make_index_sequence< SizesHolder::getDimension() >, // identity by default + typename Device = Devices::Host, + typename Index = typename SizesHolder::IndexType > +class NDArray +: public NDArrayStorage< Array< Value, Device, Index >, + SizesHolder, + typename std::conditional< std::is_same< Device, Devices::Host >::value, + PermutationHost, + PermutationCuda >::type, + __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > > +{}; + +template< typename Value, + typename SizesHolder, + typename Permutation = std::make_index_sequence< SizesHolder::getDimension() >, // identity by default + typename Index = typename SizesHolder::IndexType > +class StaticNDArray +: public NDArrayStorage< StaticArray< __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get(), Value >, + SizesHolder, + Permutation, + __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > >, + void > +{ + static_assert( __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get() > 0, + "All dimensions of a static array must to be positive." 
); +}; + +template< typename Value, + std::size_t Rows, + std::size_t Columns, + typename Permutation = std::index_sequence< 0, 1 > > // identity by default +class StaticMatrix +: public StaticNDArray< Value, + SizesHolder< std::size_t, Rows, Columns >, + Permutation > +{ +public: + static constexpr std::size_t getRows() + { + return Rows; + } + + __cuda_callable__ + static constexpr std::size_t getColumns() + { + return Columns; + } +}; + +template< typename Value, + typename SizesHolder, + typename PermutationHost = std::make_index_sequence< SizesHolder::getDimension() >, // identity by default + typename SliceInfoHost = SliceInfo<>, // no slicing by default + typename PermutationCuda = std::make_index_sequence< SizesHolder::getDimension() >, // identity by default + typename SliceInfoCuda = SliceInfo<>, // no slicing by default + typename Device = Devices::Host, + typename Index = typename SizesHolder::IndexType > +class SlicedNDArray +: public NDArrayStorage< Array< Value, Device, Index >, + SizesHolder, + typename std::conditional< std::is_same< Device, Devices::Host >::value, + PermutationHost, + PermutationCuda >::type, + __ndarray_impl::SlicedNDArrayBase< + typename std::conditional< std::is_same< Device, Devices::Host >::value, + SliceInfoHost, + SliceInfoCuda >::type > + > +{}; + +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/NDArrayView.h b/src/TNL/Containers/NDArrayView.h new file mode 100644 index 000000000..fe75bdadd --- /dev/null +++ b/src/TNL/Containers/NDArrayView.h @@ -0,0 +1,291 @@ +/*************************************************************************** + NDArrayView.h - description + ------------------- + begin : Dec 24, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include +#include + +namespace TNL { +namespace Containers { + +template< typename Value, + typename Device, + typename SizesHolder, + typename Permutation, + typename Base, + typename StrideBase = __ndarray_impl::DummyStrideBase< typename SizesHolder::IndexType, SizesHolder::getDimension() > > +class NDArrayView + : public StrideBase +{ +public: + using ValueType = Value; + using DeviceType = Device; + using IndexType = typename SizesHolder::IndexType; + using SizesHolderType = SizesHolder; + using PermutationType = Permutation; + using ViewType = NDArrayView< ValueType, Device, SizesHolder, Permutation, Base, StrideBase >; + using ConstViewType = NDArrayView< std::add_const_t< ValueType >, Device, SizesHolder, Permutation, Base, StrideBase >; + + static_assert( Permutation::size() == SizesHolder::getDimension(), "invalid permutation" ); + + __cuda_callable__ + NDArrayView() = default; + + // explicit initialization by raw data pointer and sizes + __cuda_callable__ + NDArrayView( Value* data, SizesHolder sizes ) : array(data), sizes(sizes) {} + + // explicit initialization by raw data pointer and sizes and strides + __cuda_callable__ + NDArrayView( Value* data, SizesHolder sizes, StrideBase strides ) + : StrideBase(strides), array(data), sizes(sizes) {} + + // Copy-constructor does shallow copy, so views can be passed-by-value into + // CUDA kernels and they can be captured-by-value in __cuda_callable__ + // lambda functions. + __cuda_callable__ + NDArrayView( const NDArrayView& ) = default; + + // default move-constructor + __cuda_callable__ + NDArrayView( NDArrayView&& ) = default; + + // Copy-assignment does deep copy, just like regular array, but the sizes + // must match (i.e. copy-assignment cannot resize). 
+ __cuda_callable__ + NDArrayView& operator=( const NDArrayView& other ) + { + TNL_ASSERT_EQ( sizes, other.sizes, "The sizes of the array views must be equal, views are not resizable." ); + if( getStorageSize() > 0 ) + ArrayOpsHelper< Device >::copy( array, other.array, getStorageSize() ); + return *this; + } + + // There is no move-assignment operator, so expressions like `a = b.getView()` + // are resolved as copy-assignment. + + // method for rebinding (reinitialization) + __cuda_callable__ + void bind( NDArrayView view ) + { + array = view.array; + sizes = view.sizes; + StrideBase::operator=( view ); + } + + __cuda_callable__ + void reset() + { + array = nullptr; + sizes = SizesHolder{}; + StrideBase::operator=( StrideBase{} ); + } + + __cuda_callable__ + bool operator==( const NDArrayView& other ) const + { + if( sizes != other.sizes ) + return false; + // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray + return ArrayOpsHelper< Device, Device >::compare( array, other.array, getStorageSize() ); + } + + __cuda_callable__ + bool operator!=( const NDArrayView& other ) const + { + if( sizes != other.sizes ) + return true; + // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray + return ! 
ArrayOpsHelper< Device, Device >::compare( array, other.array, getStorageSize() ); + } + + static constexpr std::size_t getDimension() + { + return SizesHolder::getDimension(); + } + + const SizesHolderType& getSizes() const + { + return sizes; + } + + template< std::size_t level > + __cuda_callable__ + IndexType getSize() const + { + return sizes.template getSize< level >(); + } + + // method template from base class + using StrideBase::getStride; + + // returns the product of the aligned sizes + __cuda_callable__ + IndexType getStorageSize() const + { + using Alignment = typename Base::template Alignment< Permutation >; + return __ndarray_impl::StorageSizeGetter< SizesHolder, Alignment >::get( sizes ); + } + + template< typename... IndexTypes > + __cuda_callable__ + IndexType + getStorageIndex( IndexTypes&&... indices ) const + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + return Base::template getStorageIndex< Permutation >( sizes, + static_cast< const StrideBase& >( *this ), + std::forward< IndexTypes >( indices )... ); + } + + template< typename... IndexTypes > + __cuda_callable__ + ValueType& + operator()( IndexTypes&&... indices ) + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInBounds( sizes, std::forward< IndexTypes >( indices )... ); + return array[ getStorageIndex( std::forward< IndexTypes >( indices )... ) ]; + } + + template< typename... IndexTypes > + __cuda_callable__ + const ValueType& + operator()( IndexTypes&&... indices ) const + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInBounds( sizes, std::forward< IndexTypes >( indices )... ); + return array[ getStorageIndex( std::forward< IndexTypes >( indices )... 
) ]; + } + + // bracket operator for 1D arrays + __cuda_callable__ + ValueType& + operator[]( IndexType&& index ) + { + static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); + __ndarray_impl::assertIndicesInBounds( sizes, std::forward< IndexType >( index ) ); + return array[ index ]; + } + + __cuda_callable__ + const ValueType& + operator[]( IndexType index ) const + { + static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); + __ndarray_impl::assertIndicesInBounds( sizes, std::forward< IndexType >( index ) ); + return array[ index ]; + } + + __cuda_callable__ + ViewType getView() + { + return ViewType( *this ); + } + + __cuda_callable__ + ConstViewType getConstView() const + { + return ConstViewType( array, sizes ); + } + + template< std::size_t... Dimensions, typename... IndexTypes > + __cuda_callable__ + auto getSubarrayView( IndexTypes&&... indices ) + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + static_assert( 0 < sizeof...(Dimensions) && sizeof...(Dimensions) <= getDimension(), "got wrong number of dimensions" ); + static_assert( __ndarray_impl::all_elements_in_range( 0, Permutation::size(), {Dimensions...} ), + "invalid dimensions" ); +// FIXME: nvcc chokes on the variadic brace-initialization +#ifndef __NVCC__ + static_assert( __ndarray_impl::is_increasing_sequence( {Dimensions...} ), + "specifying permuted dimensions is not supported" ); +#endif + + using Getter = __ndarray_impl::SubarrayGetter< Base, Permutation, Dimensions... >; + using Subpermutation = typename Getter::Subpermutation; + auto& begin = operator()( std::forward< IndexTypes >( indices )... ); + auto subarray_sizes = Getter::filterSizes( sizes, std::forward< IndexTypes >( indices )... ); + auto strides = Getter::getStrides( sizes, std::forward< IndexTypes >( indices )... 
); + static_assert( Subpermutation::size() == sizeof...(Dimensions), "Bug - wrong subpermutation length." ); + static_assert( decltype(subarray_sizes)::getDimension() == sizeof...(Dimensions), "Bug - wrong dimension of the new sizes." ); + static_assert( decltype(strides)::getDimension() == sizeof...(Dimensions), "Bug - wrong dimension of the strides." ); + using SubarrayView = NDArrayView< ValueType, Device, decltype(subarray_sizes), Subpermutation, Base, decltype(strides) >; + return SubarrayView{ &begin, subarray_sizes, strides }; + } + +protected: + Value* array = nullptr; + SizesHolder sizes; + + // TODO: establish the concept of a "void device" for static computations in the whole TNL + + template< typename DestinationDevice, typename SourceDevice = DestinationDevice, typename _unused = void > + struct ArrayOpsHelper + { + template< typename DestinationValue, + typename SourceValue, + typename Index > + static void copy( DestinationValue* destination, + const SourceValue* source, + const Index size ) + { + Algorithms::ArrayOperations< DestinationDevice, SourceDevice >::copy( destination, source, size ); + } + + template< typename Value1, + typename Value2, + typename Index > + static bool compare( const Value1* destination, + const Value2* source, + const Index size ) + { + return Algorithms::ArrayOperations< DestinationDevice, SourceDevice >::compare( destination, source, size ); + } + }; + + template< typename _unused > + struct ArrayOpsHelper< void, void, _unused > + { + template< typename DestinationValue, + typename SourceValue, + typename Index > + __cuda_callable__ + static void copy( DestinationValue* destination, + const SourceValue* source, + const Index size ) + { + for( Index i = 0; i < size; i ++ ) + destination[ i ] = source[ i ]; + } + + template< typename Value1, + typename Value2, + typename Index > + __cuda_callable__ + static bool compare( const Value1* destination, + const Value2* source, + const Index size ) + { + for( Index i = 0; i < 
size; i++ ) + if( ! ( destination[ i ] == source[ i ] ) ) + return false; + return true; + } + }; +}; + +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/ndarray/Indexing.h b/src/TNL/Containers/ndarray/Indexing.h new file mode 100644 index 000000000..d156547e1 --- /dev/null +++ b/src/TNL/Containers/ndarray/Indexing.h @@ -0,0 +1,308 @@ +/*************************************************************************** + Indexing.h - description + ------------------- + begin : Dec 24, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include + +#include + +namespace TNL { +namespace Containers { +namespace __ndarray_impl { + +// Dynamic storage size with alignment +template< typename SizesHolder, + typename Alignment, + typename LevelTag = IndexTag< SizesHolder::getDimension() - 1 > > +struct StorageSizeGetter +{ + static typename SizesHolder::IndexType + __cuda_callable__ + get( const SizesHolder& sizes ) + { + const auto size = Alignment::template getAlignedSize< LevelTag::value >( sizes ); + return size * StorageSizeGetter< SizesHolder, Alignment, IndexTag< LevelTag::value - 1 > >::get( sizes ); + } + + template< typename Permutation > + __cuda_callable__ + static typename SizesHolder::IndexType + getPermuted( const SizesHolder& sizes, Permutation ) + { + constexpr std::size_t idx = __ndarray_impl::get< LevelTag::value >( Permutation{} ); + const auto size = Alignment::template getAlignedSize< idx >( sizes ); + return size * StorageSizeGetter< SizesHolder, Alignment, IndexTag< LevelTag::value - 1 > >::get( sizes ); + } +}; + +template< typename SizesHolder, typename Alignment > +struct StorageSizeGetter< SizesHolder, Alignment, IndexTag< 0 > > +{ + static typename SizesHolder::IndexType + __cuda_callable__ + 
get( const SizesHolder& sizes ) + { + return Alignment::template getAlignedSize< 0 >( sizes ); + } + + template< typename Permutation > + __cuda_callable__ + static typename SizesHolder::IndexType + getPermuted( const SizesHolder& sizes, Permutation ) + { + constexpr std::size_t idx = __ndarray_impl::get< 0 >( Permutation{} ); + return Alignment::template getAlignedSize< idx >( sizes ); + } +}; + + +// Static storage size without alignment, used in StaticNDArray +template< typename SizesHolder, + typename LevelTag = IndexTag< SizesHolder::getDimension() - 1 > > +struct StaticStorageSizeGetter +{ + constexpr static std::size_t get() + { + return SizesHolder::template getStaticSize< LevelTag::value >() * + StaticStorageSizeGetter< SizesHolder, IndexTag< LevelTag::value - 1 > >::get(); + } +}; + +template< typename SizesHolder > +struct StaticStorageSizeGetter< SizesHolder, IndexTag< 0 > > +{ + constexpr static std::size_t get() + { + return SizesHolder::template getStaticSize< 0 >(); + } +}; + + +template< std::size_t level = 0, + typename SizesHolder, + typename Index, + typename... IndexTypes > +void setSizesHelper( SizesHolder& holder, + Index&& size, + IndexTypes&&... otherSizes ) +{ + holder.template setSize< level >( std::forward< Index >( size ) ); + setSizesHelper< level + 1 >( holder, std::forward< IndexTypes >( otherSizes )... ); +} + +template< std::size_t level = 0, + typename SizesHolder, + typename Index > +void setSizesHelper( SizesHolder& holder, + Index&& size ) +{ + holder.template setSize< level >( std::forward< Index >( size ) ); +} + + +// A variadic bounds-checker for indices +template< typename SizesHolder > +__cuda_callable__ +void assertIndicesInBounds( const SizesHolder& ) +{} + +template< typename SizesHolder, + typename Index, + typename... IndexTypes > +__cuda_callable__ +void assertIndicesInBounds( const SizesHolder& sizes, Index&& i, IndexTypes&&... 
indices ) +{ +#ifndef NDEBUG + // sizes.template getSize<...>() cannot be inside the assert macro, but the variables + // shouldn't be declared when compiling without assertions + constexpr std::size_t level = SizesHolder::getDimension() - sizeof...(indices) - 1; + const auto size = sizes.template getSize< level >(); + TNL_ASSERT_LT( i, size, "Input error - some index is out of bounds." ); +#endif + assertIndicesInBounds( sizes, std::forward< IndexTypes >( indices )... ); +} + + +template< typename Permutation, + typename Alignment, + typename SliceInfo, + std::size_t level = Permutation::size() - 1, + bool _sliced_level = ( SliceInfo::getSliceSize( get< level >( Permutation{} ) ) > 0 ) > +struct SlicedIndexer +{}; + +template< typename Permutation, + typename Alignment, + typename SliceInfo, + std::size_t level > +struct SlicedIndexer< Permutation, Alignment, SliceInfo, level, false > +{ + template< typename SizesHolder, typename StridesHolder, typename... Indices > + __cuda_callable__ + static typename SizesHolder::IndexType + getIndex( const SizesHolder& sizes, + const StridesHolder& strides, + Indices&&... indices ) + { + static constexpr std::size_t idx = get< level >( Permutation{} ); + const auto alpha = get_from_pack< idx >( std::forward< Indices >( indices )... ); + const auto previous = SlicedIndexer< Permutation, Alignment, SliceInfo, level - 1 >::getIndex( sizes, strides, std::forward< Indices >( indices )... ); + return strides.template getStride< idx >( alpha ) * ( alpha + Alignment::template getAlignedSize< idx >( sizes ) * previous ); + } +}; + +template< typename Permutation, + typename Alignment, + typename SliceInfo, + std::size_t level > +struct SlicedIndexer< Permutation, Alignment, SliceInfo, level, true > +{ + template< typename SizesHolder, typename StridesHolder, typename... Indices > + __cuda_callable__ + static typename SizesHolder::IndexType + getIndex( const SizesHolder& sizes, + const StridesHolder& strides, + Indices&&... 
indices ) + { + static_assert( SizesHolder::template getStaticSize< get< level >( Permutation{} ) >() == 0, + "Invalid SliceInfo: static dimension cannot be sliced." ); + + static constexpr std::size_t idx = get< level >( Permutation{} ); + const auto alpha = get_from_pack< idx >( std::forward< Indices >( indices )... ); + static constexpr std::size_t S = SliceInfo::getSliceSize( idx ); + // TODO: check the calculation with strides + return strides.template getStride< idx >( alpha ) * + ( S * (alpha / S) * StorageSizeGetter< SizesHolder, Alignment, IndexTag< level - 1 > >::getPermuted( sizes, Permutation{} ) + + alpha % S ) + + S * SlicedIndexer< Permutation, Alignment, SliceInfo, level - 1 >::getIndex( sizes, strides, std::forward< Indices >( indices )... ); + } +}; + +template< typename Permutation, + typename Alignment, + typename SliceInfo > +struct SlicedIndexer< Permutation, Alignment, SliceInfo, 0, false > +{ + template< typename SizesHolder, typename StridesHolder, typename... Indices > + __cuda_callable__ + static typename SizesHolder::IndexType + getIndex( const SizesHolder& sizes, + const StridesHolder& strides, + Indices&&... indices ) + { + static constexpr std::size_t idx = get< 0 >( Permutation{} ); + const auto alpha = get_from_pack< idx >( std::forward< Indices >( indices )... ); + return strides.template getStride< idx >( alpha ) * alpha; + } +}; + +template< typename Permutation, + typename Alignment, + typename SliceInfo > +struct SlicedIndexer< Permutation, Alignment, SliceInfo, 0, true > +{ + template< typename SizesHolder, typename StridesHolder, typename... Indices > + __cuda_callable__ + static typename SizesHolder::IndexType + getIndex( const SizesHolder& sizes, + const StridesHolder& strides, + Indices&&... indices ) + { + static constexpr std::size_t idx = get< 0 >( Permutation{} ); + const auto alpha = get_from_pack< idx >( std::forward< Indices >( indices )... 
); + return strides.template getStride< idx >( alpha ) * alpha; + } +}; + + +// SliceInfo should be always empty (i.e. sliceSize == 0) +template< typename SliceInfo > +struct NDArrayBase +{ + template< typename Permutation > + struct Alignment + { + template< std::size_t dimension, typename SizesHolder > + __cuda_callable__ + static typename SizesHolder::IndexType + getAlignedSize( const SizesHolder& sizes ) + { + const auto size = sizes.template getSize< dimension >(); + // round up the last dynamic dimension to improve performance + // TODO: aligning is good for GPU, but bad for CPU +// static constexpr decltype(size) mult = 32; +// if( dimension == get< Permutation::size() - 1 >( Permutation{} ) +// && SizesHolder::template getStaticSize< dimension >() == 0 ) +// return mult * ( size / mult + ( size % mult != 0 ) ); + return size; + } + }; + + template< typename Permutation, typename SizesHolder, typename StridesHolder, typename... Indices > + __cuda_callable__ + typename SizesHolder::IndexType + static getStorageIndex( const SizesHolder& sizes, const StridesHolder& strides, Indices&&... indices ) + { + static_assert( check_slice_size( SizesHolder::getDimension(), 0 ), "BUG - invalid SliceInfo type passed to NDArrayBase" ); + using Alignment = Alignment< Permutation >; + return SlicedIndexer< Permutation, Alignment, SliceInfo >::getIndex( sizes, strides, std::forward< Indices >( indices )... 
); + } + +private: + static constexpr bool check_slice_size( std::size_t dim, std::size_t sliceSize ) + { + for( std::size_t i = 0; i < dim; i++ ) + if( SliceInfo::getSliceSize( i ) != sliceSize ) + return false; + return true; + } +}; + + +template< typename SliceInfo > +struct SlicedNDArrayBase +{ + template< typename Permutation > + struct Alignment + { + template< std::size_t dimension, typename SizesHolder > + __cuda_callable__ + static typename SizesHolder::IndexType + getAlignedSize( const SizesHolder& sizes ) + { + const auto size = sizes.template getSize< dimension >(); + if( SliceInfo::getSliceSize(dimension) > 0 ) + // round to multiple of SliceSize + return SliceInfo::getSliceSize(dimension) * ( + size / SliceInfo::getSliceSize(dimension) + + ( size % SliceInfo::getSliceSize(dimension) != 0 ) + ); + // unmodified + return size; + } + }; + + template< typename Permutation, typename SizesHolder, typename StridesHolder, typename... Indices > + __cuda_callable__ + static typename SizesHolder::IndexType + getStorageIndex( const SizesHolder& sizes, const StridesHolder& strides, Indices&&... indices ) + { + using Alignment = Alignment< Permutation >; + return SlicedIndexer< Permutation, Alignment, SliceInfo >::getIndex( sizes, strides, std::forward< Indices >( indices )... ); + } +}; + +} // namespace __ndarray_impl +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/ndarray/Meta.h b/src/TNL/Containers/ndarray/Meta.h new file mode 100644 index 000000000..6807b8dc7 --- /dev/null +++ b/src/TNL/Containers/ndarray/Meta.h @@ -0,0 +1,313 @@ +/*************************************************************************** + Meta.h - description + ------------------- + begin : Dec 24, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include + +#include + +namespace TNL { +namespace Containers { +namespace __ndarray_impl { + +/* + * Generic function to get the N-th element from a variadic pack. + * Reference: + * http://stackoverflow.com/questions/20162903/template-parameter-packs-access-nth-type-and-nth-element/37836252#37836252 + */ +template< std::size_t index, typename T, typename... Ts, + typename = typename std::enable_if< index == 0 >::type > +constexpr T +get_from_pack( T&& arg, Ts&&... args ) +{ + return arg; +} + +template< std::size_t index, typename T, typename... Ts, + typename = typename std::enable_if< (index > 0) && index <= sizeof...( Ts ) >::type > +constexpr auto +get_from_pack( T&& arg, Ts&&... args ) +{ + return get_from_pack< index-1 >( std::forward< Ts >( args )... ); +} + +// complementary specialization for getting a more readable compilation error +// in case calling get with a bad index +template< long long index, typename T, typename... Ts, + typename = typename std::enable_if< (index < 0) || (index > sizeof...( Ts )) >::type > +constexpr T +get_from_pack( T&& arg, Ts&&... args ) +{ + static_assert( index >= 0 && index <= sizeof...( Ts ), + "invalid index passed to the get function" ); + return arg; +} + + +// Get N-th element from std::integer_sequence. +template< std::size_t N, typename Index, Index... vals > +constexpr Index +get( std::integer_sequence< Index, vals... > ) +{ + return get_from_pack< N >( vals... ); +} + + +// Test if a variadic pack contains a value. +template< typename Index, typename T > +constexpr bool +is_in_pack( Index value, T&& pack_value ) +{ + return value == pack_value; +} + +template< typename Index, typename T, typename... Ts > +constexpr bool +is_in_pack( Index value, T&& pack_value, Ts&&... 
vals ) +{ + if( value == pack_value ) + return true; + return is_in_pack( value, std::forward< Ts >( vals )... ); +} + + +// Test if an std::integer_sequence contains an element. +template< typename Index, Index... vals > +constexpr bool +is_in_sequence( Index value, std::integer_sequence< Index, vals... > ) +{ + return is_in_pack( value, vals... ); +} + + +// Get index of the first occurrence of value in a variadic pack. +template< typename V > +constexpr std::size_t index_in_pack( V&& value ) +{ + return 0; +} + +template< typename V, typename T, typename... Ts > +constexpr std::size_t index_in_pack( V&& value, T&& arg, Ts&&... args ) +{ + if( value == arg ) + return 0; + return 1 + index_in_pack( value, std::forward< Ts >( args )... ); +} + + +// Get index of the first occurrence of value in a std::integer_sequence +template< typename V, typename Index, Index... vals > +constexpr std::size_t +index_in_sequence( V&& value, std::integer_sequence< Index, vals... > ) +{ + return index_in_pack( std::forward< V >( value ), vals... ); +} + + +/* + * Generic function to concatenate an arbitrary number of std::integer_sequence instances. + * Useful mainly for getting the type of the resulting sequence with `decltype`. + */ +// concatenate a single, potentially empty sequence +template< typename Index, Index... s > +constexpr auto +concat_sequences( std::integer_sequence< Index, s... > ) +{ + return std::integer_sequence< Index, s... >{}; +} + +// concatenate two sequences, each potentially empty +template< typename Index, Index... s, Index... t> +constexpr auto +concat_sequences( std::integer_sequence< Index, s... >, std::integer_sequence< Index, t... > ) +{ + return std::integer_sequence< Index, s... , t... >{}; +} + +// concatenate more than 2 sequences +template< typename Index, Index... s, Index... t, typename... R > +constexpr auto +concat_sequences( std::integer_sequence< Index, s... >, std::integer_sequence< Index, t...>, R... 
) +{ + return concat_sequences( std::integer_sequence< Index, s..., t... >{}, R{}... ); +} + + +// Integer wrapper necessary for C++ templates specializations. +// As the C++ standard says: +// A partially specialized non-type argument expression shall not involve +// a template parameter of the partial specialization except when the argument +// expression is a simple identifier. +template< std::size_t v > +struct IndexTag +{ + static constexpr std::size_t value = v; +}; + + +template< typename Permutation, + typename Sequence > +struct CallPermutationHelper +{}; + +template< typename Permutation, + std::size_t... N > +struct CallPermutationHelper< Permutation, std::index_sequence< N... > > +{ + template< typename Func, + typename... Args > + __cuda_callable__ + static auto apply( Func f, Args&&... args ) + { + return f( get_from_pack< + get< N >( Permutation{} ) + >( args... )... ); + } +}; + +// Call specified function with permuted arguments. +// [used in ndarray_operations.h] +template< typename Permutation, + typename Func, + typename... Args > +__cuda_callable__ +auto call_with_permuted_arguments( Func f, Args&&... args ) +{ + return CallPermutationHelper< Permutation, std::make_index_sequence< sizeof...( Args ) > > + ::apply( f, std::forward< Args >( args )... ); +} + + +// Check that all elements of the initializer list are equal to the specified value. +// [used in ndarray_operations.h] +constexpr bool +all_elements_equal_to_value( std::size_t value, std::initializer_list< std::size_t > list ) +{ + for( auto elem : list ) + if( elem != value ) + return false; + return true; +} + + +// Check that all elements of the initializer list are in the specified range [begin, end). 
+// [used in ndarray.h -- static assertions on permutations] +constexpr bool +all_elements_in_range( std::size_t begin, std::size_t end, std::initializer_list< std::size_t > list ) +{ + for( auto elem : list ) + if( elem < begin || elem >= end ) + return false; + return true; +} + + +// Check that the elements of the initializer list form an increasing sequence. +// [used in ndarray.h -- static assertion in getSubarrayView()] +constexpr bool +is_increasing_sequence( std::initializer_list< std::size_t > list ) +{ + std::size_t prev = *list.begin(); + for( auto& elem : list ) { + if( &elem == list.begin() ) + continue; + if( elem <= prev ) + return false; + prev = elem; + } + return true; +} + + +// Count elements of a variadic pack smaller than a specified value +// [used in ndarray_subarray.h to generate a subpermutation] +template< typename T, typename V > +constexpr std::size_t +count_smaller( T threshold, V&& value ) +{ + return value < threshold ? 1 : 0; +} + +template< typename T, typename V, typename... Values > +constexpr std::size_t +count_smaller( T threshold, V&& value, Values&&... vals ) +{ + if( value < threshold ) + return 1 + count_smaller( threshold, vals... ); + return count_smaller( threshold, vals... ); +} + + +// C++17 version using "if constexpr" and a general predicate (lambda function) +// Reference: https://stackoverflow.com/a/41723705 +//template< typename Index, Index a, typename Predicate > +//constexpr auto +//FilterSingle( std::integer_sequence< Index, a >, Predicate pred ) +//{ +// if constexpr (pred(a)) +// return std::integer_sequence< Index, a >{}; +// else +// return std::integer_sequence< Index >{}; +//} +// +//// empty sequence case +//template< typename Index, typename Predicate > +//constexpr auto +//filter_sequence( std::integer_sequence< Index >, [[maybe_unused]] Predicate pred ) +//{ +// return std::integer_sequence< Index >{}; +//} +// +//// non empty sequence case +//template< typename Index, Index... 
vals, typename Predicate > +//constexpr auto +//filter_sequence( std::integer_sequence< Index, vals... >, [[maybe_unused]] Predicate pred ) +//{ +// return concat_sequences( FilterSingle( std::integer_sequence< Index, vals >{}, pred )... ); +//} + +// C++14 version, with hard-coded predicate +template< typename Mask, typename Index, Index val > +constexpr typename std::conditional_t< is_in_sequence( val, Mask{} ), + std::integer_sequence< Index, val >, + std::integer_sequence< Index > > +FilterSingle( std::integer_sequence< Index, val > ) +{ + return {}; +} + +/* + * Generic function returning a subsequence of a sequence obtained by omitting + * the elements not contained in the specified mask. + */ +// empty sequence case +template< typename Mask, typename Index > +constexpr auto +filter_sequence( std::integer_sequence< Index > ) +{ + return std::integer_sequence< Index >{}; +} + +// non empty sequence case +template< typename Mask, typename Index, Index... vals > +constexpr auto +filter_sequence( std::integer_sequence< Index, vals... > ) +{ + return concat_sequences( FilterSingle< Mask >( std::integer_sequence< Index, vals >{} )... ); +} + +} // namespace __ndarray_impl +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/ndarray/Operations.h b/src/TNL/Containers/ndarray/Operations.h new file mode 100644 index 000000000..2462771d3 --- /dev/null +++ b/src/TNL/Containers/ndarray/Operations.h @@ -0,0 +1,632 @@ +/*************************************************************************** + Operations.h - description + ------------------- + begin : Dec 24, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include + +#include + +namespace TNL { +namespace Containers { + +namespace __ndarray_impl { + +template< typename Array, + typename LevelTag = IndexTag< 0 > > +struct SequentialExecutor +{ + template< typename Func, + typename... Indices > + __cuda_callable__ + void operator()( const Array& array, Func f, Indices&&... indices ) + { + SequentialExecutor< Array, IndexTag< LevelTag::value + 1 > > exec; + const auto size = array.template getSize< get< LevelTag::value >( typename Array::PermutationType{} ) >(); + for( typename Array::IndexType i = 0; i < size; i++ ) + exec( array, f, std::forward< Indices >( indices )..., i ); + } +}; + +template< typename Array > +struct SequentialExecutor< Array, IndexTag< Array::getDimension() - 1 > > +{ + template< typename Func, + typename... Indices > + __cuda_callable__ + void operator()( const Array& array, Func f, Indices&&... indices ) + { + static_assert( sizeof...(indices) == Array::getDimension() - 1, + "invalid number of indices in the final step of the SequentialExecutor" ); + + const auto size = array.template getSize< get< Array::getDimension() - 1 >( typename Array::PermutationType{} ) >(); + for( typename Array::IndexType i = 0; i < size; i++ ) + call_with_permuted_arguments< typename Array::PermutationType >( f, std::forward< Indices >( indices )..., i ); + } +}; + + +template< typename Array, + typename LevelTag = IndexTag< Array::getDimension() - 1 > > +struct SequentialExecutorRTL +{ + template< typename Func, + typename... Indices > + __cuda_callable__ + void operator()( const Array& array, Func f, Indices&&... 
indices ) + { + SequentialExecutorRTL< Array, IndexTag< LevelTag::value - 1 > > exec; + const auto size = array.template getSize< get< LevelTag::value >( typename Array::PermutationType{} ) >(); + for( typename Array::IndexType i = 0; i < size; i++ ) + exec( array, f, i, std::forward< Indices >( indices )... ); + } +}; + +template< typename Array > +struct SequentialExecutorRTL< Array, IndexTag< 0 > > +{ + template< typename Func, + typename... Indices > + __cuda_callable__ + void operator()( const Array& array, Func f, Indices&&... indices ) + { + static_assert( sizeof...(indices) == Array::getDimension() - 1, + "invalid number of indices in the final step of the SequentialExecutor" ); + + const auto size = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >(); + for( typename Array::IndexType i = 0; i < size; i++ ) + call_with_permuted_arguments< typename Array::PermutationType >( f, i, std::forward< Indices >( indices )... ); + } +}; + + +template< typename Array, + typename DimTag = IndexTag< Array::getDimension() > > +struct OpenMPExecutor +{ + template< typename Func > + void operator()( const Array& array, Func f ) + { + SequentialExecutor< Array, IndexTag< 3 > > exec; + + const auto size0 = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >(); + const auto size1 = array.template getSize< get< 1 >( typename Array::PermutationType{} ) >(); + const auto size2 = array.template getSize< get< 2 >( typename Array::PermutationType{} ) >(); + + #ifdef HAVE_OPENMP + #pragma omp parallel for collapse(3) + #endif + for( typename Array::IndexType i0 = 0; i0 < size0; i0++ ) + for( typename Array::IndexType i1 = 0; i1 < size1; i1++ ) + for( typename Array::IndexType i2 = 0; i2 < size2; i2++ ) + exec( array, f, i0, i1, i2 ); + } +}; + +template< typename Array > +struct OpenMPExecutor< Array, IndexTag< 3 > > +{ + template< typename Func > + void operator()( const Array& array, Func f ) + { + const auto size0 = array.template 
getSize< get< 0 >( typename Array::PermutationType{} ) >(); + const auto size1 = array.template getSize< get< 1 >( typename Array::PermutationType{} ) >(); + const auto size2 = array.template getSize< get< 2 >( typename Array::PermutationType{} ) >(); + + #ifdef HAVE_OPENMP + #pragma omp parallel for collapse(2) + #endif + for( typename Array::IndexType i0 = 0; i0 < size0; i0++ ) + for( typename Array::IndexType i1 = 0; i1 < size1; i1++ ) + for( typename Array::IndexType i2 = 0; i2 < size2; i2++ ) + call_with_permuted_arguments< typename Array::PermutationType >( f, i0, i1, i2 ); + } +}; + +template< typename Array > +struct OpenMPExecutor< Array, IndexTag< 2 > > +{ + template< typename Func > + void operator()( const Array& array, Func f ) + { + const auto size0 = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >(); + const auto size1 = array.template getSize< get< 1 >( typename Array::PermutationType{} ) >(); + + #ifdef HAVE_OPENMP + #pragma omp parallel for + #endif + for( typename Array::IndexType i0 = 0; i0 < size0; i0++ ) + for( typename Array::IndexType i1 = 0; i1 < size1; i1++ ) + call_with_permuted_arguments< typename Array::PermutationType >( f, i0, i1 ); + } +}; + +template< typename Array > +struct OpenMPExecutor< Array, IndexTag< 1 > > +{ + template< typename Func > + void operator()( const Array& array, Func f ) + { + const auto size0 = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >(); + + #ifdef HAVE_OPENMP + #pragma omp parallel for + #endif + for( typename Array::IndexType i0 = 0; i0 < size0; i0++ ) + call_with_permuted_arguments< typename Array::PermutationType >( f, i0 ); + } +}; + + +template< typename Array, + typename DimTag = IndexTag< Array::getDimension() > > +struct CudaExecutor +{ + template< typename Func > + void operator()( const Array& array, Func f ) + { + using Index = typename Array::IndexType; + + auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) + { + 
SequentialExecutorRTL< Array, IndexTag< Array::getDimension() - 4 > > exec; + exec( array, f, i0, i1, i2 ); + }; + + const Index size0 = array.template getSize< get< Array::getDimension() - 3 >( typename Array::PermutationType{} ) >(); + const Index size1 = array.template getSize< get< Array::getDimension() - 2 >( typename Array::PermutationType{} ) >(); + const Index size2 = array.template getSize< get< Array::getDimension() - 1 >( typename Array::PermutationType{} ) >(); + ParallelFor3D< Devices::Cuda >::exec( (Index) 0, (Index) 0, (Index) 0, size2, size1, size0, kernel ); + } +}; + +template< typename Array > +struct CudaExecutor< Array, IndexTag< 3 > > +{ + template< typename Func > + void operator()( const Array& array, Func f ) + { + using Index = typename Array::IndexType; + + auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) + { + call_with_permuted_arguments< typename Array::PermutationType >( f, i0, i1, i2 ); + }; + + const Index size0 = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >(); + const Index size1 = array.template getSize< get< 1 >( typename Array::PermutationType{} ) >(); + const Index size2 = array.template getSize< get< 2 >( typename Array::PermutationType{} ) >(); + ParallelFor3D< Devices::Cuda >::exec( (Index) 0, (Index) 0, (Index) 0, size2, size1, size0, kernel ); + } +}; + +template< typename Array > +struct CudaExecutor< Array, IndexTag< 2 > > +{ + template< typename Func > + void operator()( const Array& array, Func f ) + { + using Index = typename Array::IndexType; + + auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 ) + { + call_with_permuted_arguments< typename Array::PermutationType >( f, i0, i1 ); + }; + + const Index size0 = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >(); + const Index size1 = array.template getSize< get< 1 >( typename Array::PermutationType{} ) >(); + ParallelFor2D< Devices::Cuda >::exec( (Index) 0, (Index) 0, size1, size0, kernel ); + 
} +}; + +template< typename Array > +struct CudaExecutor< Array, IndexTag< 1 > > +{ + template< typename Func > + void operator()( const Array& array, Func f ) + { + using Index = typename Array::IndexType; + + auto kernel = [=] __cuda_callable__ ( Index i ) + { + call_with_permuted_arguments< typename Array::PermutationType >( f, i ); + }; + + const Index size = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >(); + ParallelFor< Devices::Cuda >::exec( (Index) 0, size, kernel ); + } +}; + + +// Device may be void which stands for StaticNDArray +template< typename Array, typename Device = typename Array::DeviceType > +struct ExecutorDispatcher +{ + template< typename Func > + void operator()( const Array& array, Func f ) + { + SequentialExecutor< Array >()( array, f ); + } +}; + +template< typename Array > +struct ExecutorDispatcher< Array, Devices::Host > +{ + template< typename Func > + void operator()( const Array& array, Func f ) + { + if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) + OpenMPExecutor< Array >()( array, f ); + else + SequentialExecutor< Array >()( array, f ); + } +}; + +template< typename Array > +struct ExecutorDispatcher< Array, Devices::Cuda > +{ + template< typename Func > + void operator()( const Array& array, Func f ) + { + CudaExecutor< Array >()( array, f ); + } +}; + +#ifndef __NVCC__ +template< typename Output, + typename Func, + typename... Input > +void nd_map_view( Output output, Func f, const Input... input ) +{ + static_assert( all_elements_equal_to_value( Output::getDimension(), {Input::getDimension()...} ), + "all arrays must be of the same dimension" ); + + // without mutable, the operator() would be const so output would be const as well + // https://stackoverflow.com/a/2835645/4180822 + auto wrapper = [=] __cuda_callable__ ( auto... 
indices ) mutable { + static_assert( sizeof...( indices ) == Output::getDimension(), + "wrong number of indices passed to the wrapper lambda function" ); + output( indices... ) = f( input( indices... )... ); + }; + + // From here on, the output array is used only for getting the sizes, + // the writing of the result is done inside the wrapper. + ExecutorDispatcher< Output >()( output, wrapper ); +} + +#else + + template< typename Output, + typename Func > + struct nvcc_map_helper_0 + { + Output output; + Func f; + + nvcc_map_helper_0( Output o, Func f ) : output(o), f(f) {} + + template< typename... Ts > + __cuda_callable__ + void operator()( Ts... indices ) + { + static_assert( sizeof...( indices ) == Output::getDimension(), + "wrong number of indices passed to the wrapper operator() function" ); + output( indices... ) = f(); + } + }; + + template< typename Output, + typename Func, + typename Input1 > + struct nvcc_map_helper_1 + { + Output output; + Func f; + Input1 input1; + + nvcc_map_helper_1( Output o, Func f, Input1 i1 ) : output(o), f(f), input1(i1) {} + + template< typename... Ts > + __cuda_callable__ + void operator()( Ts... indices ) + { + static_assert( sizeof...( indices ) == Output::getDimension(), + "wrong number of indices passed to the wrapper operator() function" ); + output( indices... ) = f( input1( indices... ) ); + } + }; + + template< typename Output, + typename Func, + typename Input1, + typename Input2 > + struct nvcc_map_helper_2 + { + Output output; + Func f; + Input1 input1; + Input2 input2; + + nvcc_map_helper_2( Output o, Func f, Input1 i1, Input2 i2 ) : output(o), f(f), input1(i1), input2(i2) {} + + template< typename... Ts > + __cuda_callable__ + void operator()( Ts... indices ) + { + static_assert( sizeof...( indices ) == Output::getDimension(), + "wrong number of indices passed to the wrapper operator() function" ); + output( indices... ) = f( input1( indices... ), input2( indices... 
) ); + } + }; + + template< typename Output, + typename Func, + typename Input1, + typename Input2, + typename Input3 > + struct nvcc_map_helper_3 + { + Output output; + Func f; + Input1 input1; + Input2 input2; + Input3 input3; + + nvcc_map_helper_3( Output o, Func f, Input1 i1, Input2 i2, Input3 i3 ) : output(o), f(f), input1(i1), input2(i2), input3(i3) {} + + template< typename... Ts > + __cuda_callable__ + void operator()( Ts... indices ) + { + static_assert( sizeof...( indices ) == Output::getDimension(), + "wrong number of indices passed to the wrapper operator() function" ); + output( indices... ) = f( input1( indices... ), input2( indices... ), input3( indices... ) ); + } + }; + +template< typename Output, + typename Func > +void nd_map_view( Output output, Func f ) +{ + nvcc_map_helper_0< Output, Func > wrapper( output, f ); + + // From here on, the output array is used only for getting the sizes, + // the writing of the result is done inside the wrapper. + ExecutorDispatcher< Output >()( output, wrapper ); +} + +template< typename Output, + typename Func, + typename Input1 > +void nd_map_view( Output output, Func f, const Input1 input1 ) +{ + static_assert( all_elements_equal_to_value( Output::getDimension(), {Input1::getDimension()} ), + "all arrays must be of the same dimension" ); + + nvcc_map_helper_1< Output, Func, Input1 > wrapper( output, f, input1 ); + + // From here on, the output array is used only for getting the sizes, + // the writing of the result is done inside the wrapper. 
+ ExecutorDispatcher< Output >()( output, wrapper ); +} + +template< typename Output, + typename Func, + typename Input1, + typename Input2 > +void nd_map_view( Output output, Func f, const Input1 input1, const Input2 input2 ) +{ + static_assert( all_elements_equal_to_value( Output::getDimension(), {Input1::getDimension(), Input2::getDimension()} ), + "all arrays must be of the same dimension" ); + + nvcc_map_helper_2< Output, Func, Input1, Input2 > wrapper( output, f, input1, input2 ); + + // From here on, the output array is used only for getting the sizes, + // the writing of the result is done inside the wrapper. + ExecutorDispatcher< Output >()( output, wrapper ); +} + +template< typename Output, + typename Func, + typename Input1, + typename Input2, + typename Input3 > +void nd_map_view( Output output, Func f, const Input1 input1, const Input2 input2, const Input3 input3 ) +{ + static_assert( all_elements_equal_to_value( Output::getDimension(), {Input1::getDimension(), Input2::getDimension(), Input3::getDimension()} ), + "all arrays must be of the same dimension" ); + + nvcc_map_helper_3< Output, Func, Input1, Input2, Input3 > wrapper( output, f, input1, input2, input3 ); + + // From here on, the output array is used only for getting the sizes, + // the writing of the result is done inside the wrapper. + ExecutorDispatcher< Output >()( output, wrapper ); +} + +#endif + +} // namespace __ndarray_impl + + +// f must be an N-ary function, where N is the dimension of the output and input arrays: +// output( i1, ..., iN ) = f( input1( i1, ..., iN ), ... inputM( i1, ..., iN ) ) +template< typename Output, + typename Func, + typename... Input > +void nd_map( Output& output, Func f, const Input&... input ) +{ + __ndarray_impl::nd_map_view( output.getView(), f, input.getConstView()... 
); +} + +template< typename Output, + typename Input > +void nd_assign( Output& output, const Input& input ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v ){ return v; }, input ); +#else + using value_type = typename Input::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type v ){ return v; }, input ); +#endif +} + +// Some mathematical functions, inspired by NumPy: +// https://docs.scipy.org/doc/numpy/reference/ufuncs.html#math-operations + +template< typename Output, + typename Input1, + typename Input2 > +void nd_add( Output& output, const Input1& input1, const Input2& input2 ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v1, auto v2 ){ return v1 + v2; }, input1, input2 ); +#else + using value_type_1 = typename Input1::ValueType; + using value_type_2 = typename Input2::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type_1 v1, value_type_2 v2 ){ return v1 + v2; }, input1, input2 ); +#endif +} + +template< typename Output, + typename Input1, + typename Input2 > +void nd_subtract( Output& output, const Input1& input1, const Input2& input2 ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v1, auto v2 ){ return v1 - v2; }, input1, input2 ); +#else + using value_type_1 = typename Input1::ValueType; + using value_type_2 = typename Input2::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type_1 v1, value_type_2 v2 ){ return v1 - v2; }, input1, input2 ); +#endif +} + +template< typename Output, + typename Input1, + typename Input2 > +void nd_multiply( Output& output, const Input1& input1, const Input2& input2 ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v1, auto v2 ){ return v1 * v2; }, input1, input2 ); +#else + using value_type_1 = typename Input1::ValueType; + using value_type_2 = typename Input2::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type_1 v1, value_type_2 v2 ){ return v1 * v2; }, input1, input2 ); +#endif +} + +template< typename 
Output, + typename Input1, + typename Input2 > +void nd_divide( Output& output, const Input1& input1, const Input2& input2 ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v1, auto v2 ){ return v1 / v2; }, input1, input2 ); +#else + using value_type_1 = typename Input1::ValueType; + using value_type_2 = typename Input2::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type_1 v1, value_type_2 v2 ){ return v1 / v2; }, input1, input2 ); +#endif +} + +template< typename Output, + typename Input1, + typename Input2 > +void nd_maximum( Output& output, const Input1& input1, const Input2& input2 ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v1, auto v2 ){ return TNL::max( v1, v2 ); }, input1, input2 ); +#else + using value_type_1 = typename Input1::ValueType; + using value_type_2 = typename Input2::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type_1 v1, value_type_2 v2 ){ return TNL::max( v1, v2 ); }, input1, input2 ); +#endif +} + +template< typename Output, + typename Input1, + typename Input2 > +void nd_minimum( Output& output, const Input1& input1, const Input2& input2 ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v1, auto v2 ){ return TNL::min( v1, v2 ); }, input1, input2 ); +#else + using value_type_1 = typename Input1::ValueType; + using value_type_2 = typename Input2::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type_1 v1, value_type_2 v2 ){ return TNL::min( v1, v2 ); }, input1, input2 ); +#endif +} + +template< typename Output, + typename Input > +void nd_absolute( Output& output, const Input& input ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v ){ return TNL::abs( v ); }, input ); +#else + using value_type = typename Input::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type v ){ return TNL::abs( v ); }, input ); +#endif +} + +template< typename Output, + typename Input > +void nd_sign( Output& output, const Input& input ) +{ 
+#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v ){ return TNL::sign( v ); }, input ); +#else + using value_type = typename Input::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type v ){ return TNL::sign( v ); }, input ); +#endif +} + +template< typename Output, + typename Input1, + typename Input2 > +void nd_pow( Output& output, const Input1& input1, const Input2& input2 ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v1, auto v2 ){ return TNL::pow( v1, v2 ); }, input1, input2 ); +#else + using value_type_1 = typename Input1::ValueType; + using value_type_2 = typename Input2::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type_1 v1, value_type_2 v2 ){ return TNL::pow( v1, v2 ); }, input1, input2 ); +#endif +} + +template< typename Output, + typename Input > +void nd_sqrt( Output& output, const Input& input ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v ){ return TNL::sqrt( v ); }, input ); +#else + using value_type = typename Input::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type v ){ return TNL::sqrt( v ); }, input ); +#endif +} + +template< typename Output, + typename Input > +void nd_square( Output& output, const Input& input ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v ){ return v*v; }, input ); +#else + using value_type = typename Input::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type v ){ return v*v; }, input ); +#endif +} + +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/ndarray/SizesHolder.h b/src/TNL/Containers/ndarray/SizesHolder.h new file mode 100644 index 000000000..69a032c7f --- /dev/null +++ b/src/TNL/Containers/ndarray/SizesHolder.h @@ -0,0 +1,219 @@ +/*************************************************************************** + SizesHolder.h - description + ------------------- + begin : Dec 24, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include +#include + +#include + +namespace TNL { +namespace Containers { + +namespace __ndarray_impl { + +template< typename Index, + typename LevelTag, + std::size_t size > +class SizeHolder +{ +public: + __cuda_callable__ + constexpr Index getSize( LevelTag ) const + { + return size; + } + + void setSize( LevelTag, Index newSize ) + { + TNL_ASSERT( newSize == 0, ); + } + + __cuda_callable__ + bool operator==( const SizeHolder& ) const + { + return true; + } +}; + +template< typename Index, + typename LevelTag > +class SizeHolder< Index, LevelTag, 0 > +{ +public: + __cuda_callable__ + Index getSize( LevelTag ) const + { + return size; + } + + void setSize( LevelTag, Index size ) + { + TNL_ASSERT( size >= 0, ); + this->size = size; + } + + __cuda_callable__ + bool operator==( const SizeHolder& other ) const + { + return size == other.size; + } + +private: + Index size = 0; +}; + +template< typename Index, + std::size_t currentSize, + std::size_t... otherSizes > +class SizesHolderLayer +: public SizesHolderLayer< Index, otherSizes... >, + public SizeHolder< Index, + IndexTag< sizeof...( otherSizes ) >, // LevelTag + currentSize > +{ + using BaseType = SizesHolderLayer< Index, otherSizes... 
>; + using Layer = SizeHolder< Index, + IndexTag< sizeof...( otherSizes ) >, // LevelTag + currentSize >; +protected: + using BaseType::getSize; + using BaseType::setSize; + using Layer::getSize; + using Layer::setSize; + + __cuda_callable__ + bool operator==( const SizesHolderLayer& other ) const + { + return BaseType::operator==( other ) && + Layer::operator==( other ); + } +}; + +// specializations to terminate the recursive inheritance +template< typename Index, + std::size_t currentSize > +class SizesHolderLayer< Index, currentSize > +: public SizeHolder< Index, + IndexTag< 0 >, // LevelTag + currentSize > +{ + using Layer = SizeHolder< Index, + IndexTag< 0 >, // LevelTag + currentSize >; +protected: + using Layer::getSize; + using Layer::setSize; + + __cuda_callable__ + bool operator==( const SizesHolderLayer& other ) const + { + return Layer::operator==( other ); + } +}; + +} // namespace __ndarray_impl + + +// dimensions and static sizes are specified as std::size_t, +// the type of dynamic sizes is configurable with Index + +template< typename Index, + std::size_t... sizes > +class SizesHolder +: public __ndarray_impl::SizesHolderLayer< Index, sizes... > +{ + using BaseType = __ndarray_impl::SizesHolderLayer< Index, sizes... >; + +public: + using IndexType = Index; + + static constexpr std::size_t getDimension() + { + return sizeof...( sizes ); + } + + template< std::size_t dimension > + static constexpr std::size_t getStaticSize() + { + static_assert( dimension < sizeof...(sizes), "Invalid dimension passed to getStaticSize()." ); + return __ndarray_impl::get_from_pack< dimension >( sizes... ); + } + + template< std::size_t level > + __cuda_callable__ + Index getSize() const + { + static_assert( level < sizeof...(sizes), "Invalid level passed to getSize()." 
); + return BaseType::getSize( __ndarray_impl::IndexTag< getDimension() - level - 1 >() ); + } + + template< std::size_t level > + void setSize( Index size ) + { + static_assert( level < sizeof...(sizes), "Invalid level passed to setSize()." ); + BaseType::setSize( __ndarray_impl::IndexTag< getDimension() - level - 1 >(), size ); + } + + // methods for convenience + __cuda_callable__ + bool operator==( const SizesHolder& other ) const + { + return BaseType::operator==( other ); + } + + __cuda_callable__ + bool operator!=( const SizesHolder& other ) const + { + return ! operator==( other ); + } +}; + + +template< std::size_t dimension > +struct SizesHolderStaticSizePrinter +{ + template< typename Index, + std::size_t... sizes > + static void exec( std::ostream& str, const SizesHolder< Index, sizes... >& holder ) + { + str << holder.template getStaticSize< dimension >() << ", "; + } +}; + +template< std::size_t dimension > +struct SizesHolderSizePrinter +{ + template< typename Index, + std::size_t... sizes > + static void exec( std::ostream& str, const SizesHolder< Index, sizes... >& holder ) + { + str << holder.template getSize< dimension >() << ", "; + } +}; + +template< typename Index, + std::size_t... sizes > +std::ostream& operator<<( std::ostream& str, const SizesHolder< Index, sizes... 
>& holder ) +{ + str << "SizesHolder< "; + TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, SizesHolderStaticSizePrinter >::execHost( str, holder ); + str << holder.template getStaticSize< sizeof...(sizes) - 1 >() << " >( "; + TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, SizesHolderSizePrinter >::execHost( str, holder ); + str << holder.template getSize< sizeof...(sizes) - 1 >() << " )"; + return str; +} + +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/ndarray/Subarrays.h b/src/TNL/Containers/ndarray/Subarrays.h new file mode 100644 index 000000000..5fc9554e4 --- /dev/null +++ b/src/TNL/Containers/ndarray/Subarrays.h @@ -0,0 +1,331 @@ +/*************************************************************************** + Subarrays.h - description + ------------------- + begin : Dec 24, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include +#include + +namespace TNL { +namespace Containers { +namespace __ndarray_impl { + +template< typename Dimensions, typename Permutation > +class SubpermutationGetter; + +template< std::size_t... dims, std::size_t... vals > +class SubpermutationGetter< std::index_sequence< dims... >, std::index_sequence< vals... > > +{ +private: + using Dimensions = std::index_sequence< dims... >; + using Permutation = std::index_sequence< vals... >; + using Subsequence = decltype( + filter_sequence< Dimensions >( Permutation{} ) + ); + + template< std::size_t... v > + static constexpr auto + get_subpermutation( std::index_sequence< v... > ) + { + using Subpermutation = std::index_sequence< count_smaller( v, v... )... 
>; + return Subpermutation{}; + } + +public: + using Subpermutation = decltype( + get_subpermutation( Subsequence{} ) + ); +}; + + +template< typename Dimensions, typename SihesHolder > +class SizesFilter; + +template< std::size_t... dims, typename Index, std::size_t... sizes > +class SizesFilter< std::index_sequence< dims... >, SizesHolder< Index, sizes... > > +{ +private: + using Dimensions = std::index_sequence< dims... >; + using SizesSequence = std::index_sequence< sizes... >; + using Subsequence = decltype( + concat_sequences( std::index_sequence< get_from_pack< dims >( sizes... ) >{} ... ) + ); + + template< std::size_t... v > + static constexpr auto + get_sizesholder( std::index_sequence< v... > ) + { + using Sizes = SizesHolder< Index, v... >; + return Sizes{}; + } + + template< std::size_t level = 0, typename = void > + struct SizeSetterHelper + { + template< typename NewSizes, + typename OldSizes > + static void setSizes( NewSizes& newSizes, + const OldSizes& oldSizes ) + { + if( oldSizes.template getStaticSize< level >() == 0 ) + newSizes.template setSize< level >( oldSizes.template getSize< get< level >( Dimensions{} ) >() ); + SizeSetterHelper< level + 1 >::setSizes( newSizes, oldSizes ); + } + }; + + template< typename _unused > + struct SizeSetterHelper< Dimensions::size() - 1, _unused > + { + template< typename NewSizes, + typename OldSizes > + static void setSizes( NewSizes& newSizes, + const OldSizes& oldSizes ) + { + static constexpr std::size_t level = Dimensions::size() - 1; + if( oldSizes.template getStaticSize< level >() == 0 ) + newSizes.template setSize< level >( oldSizes.template getSize< get< level >( Dimensions{} ) >() ); + } + }; + + template< std::size_t level = 0, typename = void > + struct IndexChecker + { + template< typename... IndexTypes > + static bool check( IndexTypes&&... indices ) + { + static constexpr std::size_t d = get< level >( Dimensions{} ); + if( get_from_pack< d >( std::forward< IndexTypes >( indices )... 
) != 0 ) + return false; + return IndexChecker< level + 1 >::check( std::forward< IndexTypes >( indices )... ); + } + }; + + template< typename _unused > + struct IndexChecker< Dimensions::size() - 1, _unused > + { + template< typename... IndexTypes > + static bool check( IndexTypes&&... indices ) + { + static constexpr std::size_t d = get< Dimensions::size() - 1 >( Dimensions{} ); + if( get_from_pack< d >( std::forward< IndexTypes >( indices )... ) != 0 ) + return false; + return true; + } + }; + +public: + using Sizes = decltype( + get_sizesholder( Subsequence{} ) + ); + + template< typename... IndexTypes > + static Sizes filterSizes( const SizesHolder< Index, sizes... >& oldSizes, IndexTypes&&... indices ) + { + Sizes newSizes; + + // assert that indices are 0 for the dimensions in the subarray + // (contraction of dimensions is not supported yet, and it does not + // make sense for static dimensions anyway) + TNL_ASSERT_TRUE( IndexChecker<>::check( std::forward< IndexTypes >( indices )... ), + "Static dimensions of the subarray must start at index 0 of the array." ); + + // set dynamic sizes + // pseudo-python-code: + // for d, D in enumerate(dims...): + // newSizes.setSize< d >( oldSizes.getSize< D >() ) + SizeSetterHelper<>::setSizes( newSizes, oldSizes ); + + return newSizes; + } +}; + + +template< typename Index, std::size_t Dimension > +struct DummyStrideBase +{ + template< std::size_t level > + constexpr Index getStride( Index i = 0 ) const + { + return 1; + } +}; + +template< typename Index, + std::size_t... sizes > +class StridesHolder +: private SizesHolder< Index, sizes... > +{ + using BaseType = SizesHolder< Index, sizes... 
>; + +public: + using BaseType::getDimension; + + template< std::size_t level > + static constexpr std::size_t getStaticStride( Index i = 0 ) + { + return BaseType::template getStaticSize< level >(); + } + + template< std::size_t level > + __cuda_callable__ + Index getStride( Index i = 0 ) const + { + return BaseType::template getSize< level >(); + } + + template< std::size_t level > + void setStride( Index size ) + { + BaseType::template setSize< level >( size ); + } +}; + +template< typename Base, typename Permutation, std::size_t... Dimensions > +class SubarrayGetter; + +template< typename SliceInfo, typename Permutation, std::size_t... Dimensions > +class SubarrayGetter< NDArrayBase< SliceInfo >, Permutation, Dimensions... > +{ + // returns the number of factors in the stride product + template< std::size_t dim, std::size_t... vals > + static constexpr std::size_t get_end( std::index_sequence< vals... > _perm ) + { + if( dim == get< Permutation::size() - 1 >( Permutation{} ) ) + return 0; + std::size_t i = 0; + std::size_t count = 0; +// FIXME: nvcc chokes on the variadic brace-initialization +#ifndef __NVCC__ + for( auto v : std::initializer_list< std::size_t >{ vals... } ) +#else + for( auto v : (std::size_t [sizeof...(vals)]){ vals... } ) +#endif + { + if( i++ <= index_in_pack( dim, vals... ) ) + continue; + if( is_in_sequence( v, std::index_sequence< Dimensions... 
>{} ) ) + break; + count++; + } + return count; + } + + // static calculation of the stride product + template< typename SizesHolder, + std::size_t start_dim, + std::size_t end = get_end< start_dim >( Permutation{} ), + std::size_t level = 0, + typename = void > + struct StaticStrideGetter + { + static constexpr std::size_t get() + { + constexpr std::size_t start_offset = index_in_sequence( start_dim, Permutation{} ); + constexpr std::size_t dim = __ndarray_impl::get< start_offset + level + 1 >( Permutation{} ); + return SizesHolder::template getStaticSize< dim >() * StaticStrideGetter< SizesHolder, start_dim, end, level + 1 >::get(); + } + }; + + template< typename SizesHolder, std::size_t start_dim, std::size_t end, typename _unused > + struct StaticStrideGetter< SizesHolder, start_dim, end, end, _unused > + { + static constexpr std::size_t get() + { + return 1; + } + }; + + // dynamic calculation of the stride product + template< std::size_t start_dim, + std::size_t end = get_end< start_dim >( Permutation{} ), + std::size_t level = 0, + typename = void > + struct DynamicStrideGetter + { + template< typename SizesHolder > + static constexpr std::size_t get( const SizesHolder& sizes ) + { + constexpr std::size_t start_offset = index_in_sequence( start_dim, Permutation{} ); + constexpr std::size_t dim = __ndarray_impl::get< start_offset + level + 1 >( Permutation{} ); + return sizes.template getSize< dim >() * DynamicStrideGetter< start_dim, end, level + 1 >::get( sizes ); + } + }; + + template< std::size_t start_dim, std::size_t end, typename _unused > + struct DynamicStrideGetter< start_dim, end, end, _unused > + { + template< typename SizesHolder > + static constexpr std::size_t get( const SizesHolder& sizes ) + { + return 1; + } + }; + + // helper class for setting dynamic strides + template< std::size_t level = 0, typename = void > + struct StrideSetterHelper + { + template< typename StridesHolder, typename SizesHolder > + static void setStrides( 
StridesHolder& strides, const SizesHolder& sizes ) + { + static constexpr std::size_t dim = get_from_pack< level >( Dimensions... ); + if( StridesHolder::template getStaticStride< level >() == 0 ) + strides.template setStride< level >( DynamicStrideGetter< dim >::get( sizes ) ); + StrideSetterHelper< level + 1 >::setStrides( strides, sizes ); + } + }; + + template< typename _unused > + struct StrideSetterHelper< sizeof...(Dimensions) - 1, _unused > + { + template< typename StridesHolder, typename SizesHolder > + static void setStrides( StridesHolder& strides, const SizesHolder& sizes ) + { + static constexpr std::size_t level = sizeof...(Dimensions) - 1; + static constexpr std::size_t dim = get_from_pack< level >( Dimensions... ); + if( StridesHolder::template getStaticStride< level >() == 0 ) + strides.template setStride< level >( DynamicStrideGetter< dim >::get( sizes ) ); + } + }; + +public: + using Subpermutation = typename SubpermutationGetter< std::index_sequence< Dimensions... >, Permutation >::Subpermutation; + + template< typename SizesHolder, typename... IndexTypes > + static auto filterSizes( const SizesHolder& sizes, IndexTypes&&... indices ) + { + using Filter = SizesFilter< std::index_sequence< Dimensions... >, SizesHolder >; + return Filter::filterSizes( sizes, std::forward< IndexTypes >( indices )... ); + } + + template< typename SizesHolder, typename... IndexTypes > + static auto getStrides( const SizesHolder& sizes, IndexTypes&&... indices ) + { + using Strides = StridesHolder< typename SizesHolder::IndexType, + StaticStrideGetter< SizesHolder, Dimensions >::get()... 
>; + Strides strides; + + // set dynamic strides + // pseudo-python-code: + // for i, d in enumerate(Dimensions): + // if is_dynamic_dimension(d): + // strides.setStride< i >( dynamic_stride(d, sizes) ) + StrideSetterHelper<>::setStrides( strides, sizes ); + + return strides; + } +}; + +} // namespace __ndarray_impl +} // namespace Containers +} // namespace TNL diff --git a/src/UnitTests/Containers/CMakeLists.txt b/src/UnitTests/Containers/CMakeLists.txt index d33f5d263..c8cd88af9 100644 --- a/src/UnitTests/Containers/CMakeLists.txt +++ b/src/UnitTests/Containers/CMakeLists.txt @@ -120,6 +120,7 @@ ADD_TEST( StaticVectorOperationsTest ${EXECUTABLE_OUTPUT_PATH}/StaticVectorOpera ADD_SUBDIRECTORY( Multimaps ) +ADD_SUBDIRECTORY( ndarray ) if( ${BUILD_MPI} ) diff --git a/src/UnitTests/Containers/ndarray/CMakeLists.txt b/src/UnitTests/Containers/ndarray/CMakeLists.txt new file mode 100644 index 000000000..cbdbe328a --- /dev/null +++ b/src/UnitTests/Containers/ndarray/CMakeLists.txt @@ -0,0 +1,26 @@ +add_executable( NDArrayTest NDArrayTest.cpp ) +target_compile_options( NDArrayTest PRIVATE ${CXX_TESTS_FLAGS} ) +target_link_libraries( NDArrayTest ${GTEST_BOTH_LIBRARIES} ) +add_test( NDArrayTest ${EXECUTABLE_OUTPUT_PATH}/NDArrayTest${CMAKE_EXECUTABLE_SUFFIX} ) + +add_executable( NDSubarrayTest NDSubarrayTest.cpp ) +target_compile_options( NDSubarrayTest PRIVATE ${CXX_TESTS_FLAGS} ) +target_link_libraries( NDSubarrayTest ${GTEST_BOTH_LIBRARIES} ) +add_test( NDSubarrayTest ${EXECUTABLE_OUTPUT_PATH}/NDSubarrayTest${CMAKE_EXECUTABLE_SUFFIX} ) + +add_executable( SlicedNDArrayTest SlicedNDArrayTest.cpp ) +target_compile_options( SlicedNDArrayTest PRIVATE ${CXX_TESTS_FLAGS} ) +target_link_libraries( SlicedNDArrayTest ${GTEST_BOTH_LIBRARIES} ) +add_test( SlicedNDArrayTest ${EXECUTABLE_OUTPUT_PATH}/SlicedNDArrayTest${CMAKE_EXECUTABLE_SUFFIX} ) + +add_executable( StaticNDArrayTest StaticNDArrayTest.cpp ) +target_compile_options( StaticNDArrayTest PRIVATE ${CXX_TESTS_FLAGS} ) 
+target_link_libraries( StaticNDArrayTest ${GTEST_BOTH_LIBRARIES} ) +add_test( StaticNDArrayTest ${EXECUTABLE_OUTPUT_PATH}/StaticNDArrayTest${CMAKE_EXECUTABLE_SUFFIX} ) + +if( BUILD_CUDA ) + cuda_add_executable( StaticNDArrayCudaTest StaticNDArrayCudaTest.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + target_link_libraries( StaticNDArrayCudaTest ${GTEST_BOTH_LIBRARIES} ) + add_test( StaticNDArrayCudaTest ${EXECUTABLE_OUTPUT_PATH}/StaticNDArrayCudaTest${CMAKE_EXECUTABLE_SUFFIX} ) +endif() diff --git a/src/UnitTests/Containers/ndarray/NDArrayTest.cpp b/src/UnitTests/Containers/ndarray/NDArrayTest.cpp new file mode 100644 index 000000000..2a98e71f1 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/NDArrayTest.cpp @@ -0,0 +1,192 @@ +#include "gtest/gtest.h" + +#include + +using namespace TNL::Containers; +using std::index_sequence; + +template< typename Array > +void expect_identity( const Array& a ) +{ + Array identity; + identity.setLike( a ); + int last = 0; + for( int i = 0; i < identity.getSize(); i++ ) { + // skip negative/invalid entries due to alignment + if( a[ i ] < 0 ) + identity[ i ] = a[ i ]; + else + identity[ i ] = last++; + } + EXPECT_EQ( a, identity ); +} + +TEST( NDArrayTest, setLike ) +{ + int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + + decltype(a) b; + EXPECT_EQ( b.template getSize< 0 >(), 0 ); + EXPECT_EQ( b.template getSize< 1 >(), 0 ); + EXPECT_EQ( b.template getSize< 2 >(), 0 ); + EXPECT_EQ( b.template getSize< 3 >(), 0 ); + EXPECT_EQ( b.template getSize< 4 >(), 0 ); + EXPECT_EQ( b.template getSize< 5 >(), 0 ); + b.setLike( a ); + EXPECT_EQ( b.template getSize< 0 >(), I ); + EXPECT_EQ( b.template getSize< 1 >(), J ); + EXPECT_EQ( b.template getSize< 2 >(), K ); + EXPECT_EQ( b.template getSize< 3 >(), L ); + EXPECT_EQ( b.template getSize< 4 >(), M ); + EXPECT_EQ( b.template getSize< 5 >(), N ); +} + +TEST( NDArrayTest, 
reset ) +{ + int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + EXPECT_EQ( a.template getSize< 0 >(), I ); + EXPECT_EQ( a.template getSize< 1 >(), J ); + EXPECT_EQ( a.template getSize< 2 >(), K ); + EXPECT_EQ( a.template getSize< 3 >(), L ); + EXPECT_EQ( a.template getSize< 4 >(), M ); + EXPECT_EQ( a.template getSize< 5 >(), N ); + + a.reset(); + EXPECT_EQ( a.template getSize< 0 >(), 0 ); + EXPECT_EQ( a.template getSize< 1 >(), 0 ); + EXPECT_EQ( a.template getSize< 2 >(), 0 ); + EXPECT_EQ( a.template getSize< 3 >(), 0 ); + EXPECT_EQ( a.template getSize< 4 >(), 0 ); + EXPECT_EQ( a.template getSize< 5 >(), 0 ); +} + +TEST( NDArrayTest, Static_1D ) +{ + constexpr int I = 3; + NDArray< int, SizesHolder< int, I > > a; + a.setSizes( 0 ); + + int v = 0; + for( int i = 0; i < I; i++ ) { + a( i ) = v++; + EXPECT_EQ( a[ i ], a( i ) ); + } + + expect_identity( a.getStorageArray() ); +} + +TEST( NDArrayTest, Static_2D_Identity ) +{ + constexpr int I = 3, J = 5; + NDArray< int, SizesHolder< int, I, J > > a; + a.setSizes( 0, 0 ); + + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( NDArrayTest, Static_2D_Permuted ) +{ + constexpr int I = 3, J = 5; + NDArray< int, + SizesHolder< int, I, J >, + index_sequence< 1, 0 > > a; + a.setSizes( 0, 0 ); + + int v = 0; + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( NDArrayTest, Dynamic_6D ) +{ + int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + + // initialize entries invalid due to alignment to -1 + a.getStorageArray().setValue( -1 ); + + int v = 0; + for( int n = 0; n < N; n++ ) + for( int l = 0; l < 
L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j, k, l, m, n ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( NDArrayTest, CopySemantics ) +{ + constexpr int I = 3, J = 4; + NDArray< int, SizesHolder< int, I, J > > a, b, c; + a.setSizes( 0, 0 ); + + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); + + b = a; + EXPECT_EQ( a, b ); + + auto a_view = a.getView(); + auto b_view = b.getView(); + EXPECT_EQ( a_view, b_view ); + EXPECT_EQ( a_view.getView(), b_view ); + EXPECT_EQ( a_view.getConstView(), b_view.getConstView() ); + EXPECT_EQ( a.getConstView(), b.getConstView() ); + EXPECT_EQ( a.getConstView(), b_view.getConstView() ); + + c.setSizes( 0, 0 ); + auto c_view = c.getView(); + c_view = b_view; + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() ); +} + +TEST( NDArrayTest, SizesHolderPrinter ) +{ + SizesHolder< int, 0, 1, 2 > holder; + holder.setSize< 0 >( 3 ); + + std::stringstream str; + str << holder; + EXPECT_EQ( str.str(), "SizesHolder< 0, 1, 2 >( 3, 1, 2 )" ); +} + +//#include "GtestMissingError.h" +int main( int argc, char* argv[] ) +{ +//#ifdef HAVE_GTEST + ::testing::InitGoogleTest( &argc, argv ); + return RUN_ALL_TESTS(); +//#else +// throw GtestMissingError(); +//#endif +} diff --git a/src/UnitTests/Containers/ndarray/NDSubarrayTest.cpp b/src/UnitTests/Containers/ndarray/NDSubarrayTest.cpp new file mode 100644 index 000000000..1b57eed28 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/NDSubarrayTest.cpp @@ -0,0 +1,405 @@ +#include "gtest/gtest.h" + +#include + +using namespace TNL::Containers; +using std::index_sequence; + +// wrapper around static_assert to get the type names 
in the error message +template< typename Permutation, typename ExpectedPermutation > +void check_permutation() +{ + static_assert( std::is_same< Permutation, ExpectedPermutation >::value, + "The permutation is not the same as the expected permutation." ); +} + +TEST( NDArraySubarrayTest, StaticAsserts ) +{ + using namespace TNL::Containers::__ndarray_impl; + +// auto is_even = [](int _in) {return _in % 2 == 0;}; + using expected_type = std::integer_sequence; + using test_type = std::integer_sequence; +// constexpr auto result = filter_sequence(test_type{}, is_even); + constexpr auto result = filter_sequence< expected_type >(test_type{}); + using result_type = std::decay_t; + static_assert(std::is_same::value, "Integer sequences should be equal"); + + + + using Permutation = std::integer_sequence< std::size_t, 5, 3, 1, 4, 2, 6, 0 >; + { + using Dimensions = std::integer_sequence< std::size_t, 3, 4, 6 >; + using Subpermutation = typename SubpermutationGetter< Dimensions, Permutation >::Subpermutation; + check_permutation< Subpermutation, + std::integer_sequence< std::size_t, 0, 1, 2 > >(); + } + { + using Dimensions = std::integer_sequence< std::size_t, 1, 4, 2 >; + using Subpermutation = typename SubpermutationGetter< Dimensions, Permutation >::Subpermutation; + check_permutation< Subpermutation, + std::integer_sequence< std::size_t, 0, 2, 1 > >(); + } + { + using Dimensions = std::integer_sequence< std::size_t, 5, 1, 6 >; + using Subpermutation = typename SubpermutationGetter< Dimensions, Permutation >::Subpermutation; + check_permutation< Subpermutation, + std::integer_sequence< std::size_t, 1, 0, 2 > >(); + } + { + using Dimensions = std::integer_sequence< std::size_t, 5, 1, 2 >; + using Subpermutation = typename SubpermutationGetter< Dimensions, Permutation >::Subpermutation; + check_permutation< Subpermutation, + std::integer_sequence< std::size_t, 2, 0, 1 > >(); + } + { + using Dimensions = std::integer_sequence< std::size_t, 2, 3, 4 >; + using Subpermutation 
= typename SubpermutationGetter< Dimensions, Permutation >::Subpermutation; + check_permutation< Subpermutation, + std::integer_sequence< std::size_t, 1, 2, 0 > >(); + } + { + using Dimensions = std::integer_sequence< std::size_t, 0, 1, 5 >; + using Subpermutation = typename SubpermutationGetter< Dimensions, Permutation >::Subpermutation; + check_permutation< Subpermutation, + std::integer_sequence< std::size_t, 2, 1, 0 > >(); + } + + static_assert( is_increasing_sequence( {0, 1, 2, 3, 4} ), "bug" ); + static_assert( ! is_increasing_sequence( {0, 1, 2, 0, 4} ), "bug" ); + static_assert( ! is_increasing_sequence( {1, 0, 2, 3, 4} ), "bug" ); +} + +TEST( NDArraySubarrayTest, Dynamic_6D ) +{ + int I = 2, J = 3, K = 4, L = 5, M = 6, N = 7; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + a.setValue( 0 ); + + auto v = a.getView(); + + auto s1 = v.template getSubarrayView< 0 >( 0, 0, 0, 0, 0, 0 ); + const int size1 = s1.template getSize< 0 >(); + const int stride1 = s1.template getStride< 0 >(); + EXPECT_EQ( size1, I ); + EXPECT_EQ( stride1, J ); + for( int i = 0; i < I; i++ ) { + s1( i ) = 1 + i; + EXPECT_EQ( v( i, 0, 0, 0, 0, 0 ), 1 + i ); + } + a.setValue( 0 ); + + auto s2 = v.template getSubarrayView< 1 >( 0, 0, 0, 0, 0, 0 ); + const int size2 = s2.template getSize< 0 >(); + const int stride2 = s2.template getStride< 0 >(); + EXPECT_EQ( size2, J ); + EXPECT_EQ( stride2, 1 ); + for( int j = 0; j < J; j++ ) { + s2( j ) = 1 + j; + EXPECT_EQ( v( 0, j, 0, 0, 0, 0 ), 1 + j ); + } + a.setValue( 0 ); + + auto s3 = v.template getSubarrayView< 2 >( 0, 0, 0, 0, 0, 0 ); + const int size3 = s3.template getSize< 0 >(); + const int stride3 = s3.template getStride< 0 >(); + EXPECT_EQ( size3, K ); + EXPECT_EQ( stride3, I*J ); + for( int k = 0; k < K; k++ ) { + s3( k ) = 1 + k; + EXPECT_EQ( v( 0, 0, k, 0, 0, 0 ), 1 + k ); + } + a.setValue( 0 ); + + auto s4 = v.template getSubarrayView< 3 >( 0, 0, 0, 0, 
0, 0 ); + const int size4 = s4.template getSize< 0 >(); + const int stride4 = s4.template getStride< 0 >(); + EXPECT_EQ( size4, L ); + EXPECT_EQ( stride4, I*J*K*M ); + for( int l = 0; l < L; l++ ) { + s4( l ) = 1 + l; + EXPECT_EQ( v( 0, 0, 0, l, 0, 0 ), 1 + l ); + } + a.setValue( 0 ); + + auto s5 = v.template getSubarrayView< 4 >( 0, 0, 0, 0, 0, 0 ); + const int size5 = s5.template getSize< 0 >(); + const int stride5 = s5.template getStride< 0 >(); + EXPECT_EQ( size5, M ); + EXPECT_EQ( stride5, I*J*K ); + for( int m = 0; m < M; m++ ) { + s5( m ) = 1 + m; + EXPECT_EQ( v( 0, 0, 0, 0, m, 0 ), 1 + m ); + } + a.setValue( 0 ); + + auto s6 = v.template getSubarrayView< 5 >( 0, 0, 0, 0, 0, 0 ); + const int size6 = s6.template getSize< 0 >(); + const int stride6 = s6.template getStride< 0 >(); + EXPECT_EQ( size6, N ); + EXPECT_EQ( stride6, I*J*K*L*M ); + for( int n = 0; n < N; n++ ) { + s6( n ) = 1 + n; + EXPECT_EQ( v( 0, 0, 0, 0, 0, n ), 1 + n ); + } + a.setValue( 0 ); + + + auto s_ij = v.template getSubarrayView< 0, 1 >( 0, 0, 0, 0, 0, 0 ); + const int size_ij_0 = s_ij.template getSize< 0 >(); + const int size_ij_1 = s_ij.template getSize< 1 >(); + const int stride_ij_0 = s_ij.template getStride< 0 >(); + const int stride_ij_1 = s_ij.template getStride< 1 >(); + EXPECT_EQ( size_ij_0, I ); + EXPECT_EQ( size_ij_1, J ); + EXPECT_EQ( stride_ij_0, 1 ); + EXPECT_EQ( stride_ij_1, 1 ); + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) { + s_ij( i, j ) = 1; + EXPECT_EQ( v( i, j, 0, 0, 0, 0 ), 1 ); + } + a.setValue( 0 ); + + auto s_ik = v.template getSubarrayView< 0, 2 >( 0, 0, 0, 0, 0, 0 ); + const int size_ik_0 = s_ik.template getSize< 0 >(); + const int size_ik_1 = s_ik.template getSize< 1 >(); + const int stride_ik_0 = s_ik.template getStride< 0 >(); + const int stride_ik_1 = s_ik.template getStride< 1 >(); + EXPECT_EQ( size_ik_0, I ); + EXPECT_EQ( size_ik_1, K ); + EXPECT_EQ( stride_ik_0, J ); + EXPECT_EQ( stride_ik_1, 1 ); + for( int i = 0; i < I; i++ ) + for( int 
k = 0; k < K; k++ ) { + s_ik( i, k ) = 1 + k; + EXPECT_EQ( v( i, 0, k, 0, 0, 0 ), 1 + k ); + } + a.setValue( 0 ); + + auto s_il = v.template getSubarrayView< 0, 3 >( 0, 0, 0, 0, 0, 0 ); + const int size_il_0 = s_il.template getSize< 0 >(); + const int size_il_1 = s_il.template getSize< 1 >(); + const int stride_il_0 = s_il.template getStride< 0 >(); + const int stride_il_1 = s_il.template getStride< 1 >(); + EXPECT_EQ( size_il_0, I ); + EXPECT_EQ( size_il_1, L ); + EXPECT_EQ( stride_il_0, J ); + EXPECT_EQ( stride_il_1, K*M ); + for( int i = 0; i < I; i++ ) + for( int l = 0; l < L; l++ ) { + s_il( i, l ) = 1 + l; + EXPECT_EQ( v( i, 0, 0, l, 0, 0 ), 1 + l ); + } + a.setValue( 0 ); + + auto s_im = v.template getSubarrayView< 0, 4 >( 0, 0, 0, 0, 0, 0 ); + const int size_im_0 = s_im.template getSize< 0 >(); + const int size_im_1 = s_im.template getSize< 1 >(); + const int stride_im_0 = s_im.template getStride< 0 >(); + const int stride_im_1 = s_im.template getStride< 1 >(); + EXPECT_EQ( size_im_0, I ); + EXPECT_EQ( size_im_1, M ); + EXPECT_EQ( stride_im_0, J ); + EXPECT_EQ( stride_im_1, K ); + for( int i = 0; i < I; i++ ) + for( int m = 0; m < M; m++ ) { + s_im( i, m ) = 1 + m; + EXPECT_EQ( v( i, 0, 0, 0, m, 0 ), 1 + m ); + } + a.setValue( 0 ); + + auto s_in = v.template getSubarrayView< 0, 5 >( 0, 0, 0, 0, 0, 0 ); + const int size_in_0 = s_in.template getSize< 0 >(); + const int size_in_1 = s_in.template getSize< 1 >(); + const int stride_in_0 = s_in.template getStride< 0 >(); + const int stride_in_1 = s_in.template getStride< 1 >(); + EXPECT_EQ( size_in_0, I ); + EXPECT_EQ( size_in_1, N ); + EXPECT_EQ( stride_in_0, J ); + EXPECT_EQ( stride_in_1, K*L*M ); + for( int i = 0; i < I; i++ ) + for( int n = 0; n < N; n++ ) { + s_in( i, n ) = 1 + n; + EXPECT_EQ( v( i, 0, 0, 0, 0, n ), 1 + n ); + } + a.setValue( 0 ); + + + auto s_jk = v.template getSubarrayView< 1, 2 >( 0, 0, 0, 0, 0, 0 ); + const int size_jk_0 = s_jk.template getSize< 0 >(); + const int size_jk_1 = 
s_jk.template getSize< 1 >(); + const int stride_jk_0 = s_jk.template getStride< 0 >(); + const int stride_jk_1 = s_jk.template getStride< 1 >(); + EXPECT_EQ( size_jk_0, J ); + EXPECT_EQ( size_jk_1, K ); + EXPECT_EQ( stride_jk_0, 1 ); + EXPECT_EQ( stride_jk_1, I ); + for( int j = 0; j < J; j++ ) + for( int k = 0; k < K; k++ ) { + s_jk( j, k ) = 1 + k; + EXPECT_EQ( v( 0, j, k, 0, 0, 0 ), 1 + k ); + } + a.setValue( 0 ); + + auto s_jl = v.template getSubarrayView< 1, 3 >( 0, 0, 0, 0, 0, 0 ); + const int size_jl_0 = s_jl.template getSize< 0 >(); + const int size_jl_1 = s_jl.template getSize< 1 >(); + const int stride_jl_0 = s_jl.template getStride< 0 >(); + const int stride_jl_1 = s_jl.template getStride< 1 >(); + EXPECT_EQ( size_jl_0, J ); + EXPECT_EQ( size_jl_1, L ); + EXPECT_EQ( stride_jl_0, 1 ); + EXPECT_EQ( stride_jl_1, I*K*M ); + for( int j = 0; j < J; j++ ) + for( int l = 0; l < L; l++ ) { + s_jl( j, l ) = 1 + l; + EXPECT_EQ( v( 0, j, 0, l, 0, 0 ), 1 + l ); + } + a.setValue( 0 ); + + auto s_jm = v.template getSubarrayView< 1, 4 >( 0, 0, 0, 0, 0, 0 ); + const int size_jm_0 = s_jm.template getSize< 0 >(); + const int size_jm_1 = s_jm.template getSize< 1 >(); + const int stride_jm_0 = s_jm.template getStride< 0 >(); + const int stride_jm_1 = s_jm.template getStride< 1 >(); + EXPECT_EQ( size_jm_0, J ); + EXPECT_EQ( size_jm_1, M ); + EXPECT_EQ( stride_jm_0, 1 ); + EXPECT_EQ( stride_jm_1, I*K ); + for( int j = 0; j < J; j++ ) + for( int m = 0; m < M; m++ ) { + s_jm( j, m ) = 1 + m; + EXPECT_EQ( v( 0, j, 0, 0, m, 0 ), 1 + m ); + } + a.setValue( 0 ); + + auto s_jn = v.template getSubarrayView< 1, 5 >( 0, 0, 0, 0, 0, 0 ); + const int size_jn_0 = s_jn.template getSize< 0 >(); + const int size_jn_1 = s_jn.template getSize< 1 >(); + const int stride_jn_0 = s_jn.template getStride< 0 >(); + const int stride_jn_1 = s_jn.template getStride< 1 >(); + EXPECT_EQ( size_jn_0, J ); + EXPECT_EQ( size_jn_1, N ); + EXPECT_EQ( stride_jn_0, 1 ); + EXPECT_EQ( stride_jn_1, I*K*L*M ); + 
for( int j = 0; j < J; j++ ) + for( int n = 0; n < N; n++ ) { + s_jn( j, n ) = 1 + n; + EXPECT_EQ( v( 0, j, 0, 0, 0, n ), 1 + n ); + } + a.setValue( 0 ); + + + auto s_kl = v.template getSubarrayView< 2, 3 >( 0, 0, 0, 0, 0, 0 ); + const int size_kl_0 = s_kl.template getSize< 0 >(); + const int size_kl_1 = s_kl.template getSize< 1 >(); + const int stride_kl_0 = s_kl.template getStride< 0 >(); + const int stride_kl_1 = s_kl.template getStride< 1 >(); + EXPECT_EQ( size_kl_0, K ); + EXPECT_EQ( size_kl_1, L ); + EXPECT_EQ( stride_kl_0, I*J ); + EXPECT_EQ( stride_kl_1, M ); + for( int k = 0; k < K; k++ ) + for( int l = 0; l < L; l++ ) { + s_kl( k, l ) = 1 + l; + EXPECT_EQ( v( 0, 0, k, l, 0, 0 ), 1 + l ); + } + a.setValue( 0 ); + + auto s_km = v.template getSubarrayView< 2, 4 >( 0, 0, 0, 0, 0, 0 ); + const int size_km_0 = s_km.template getSize< 0 >(); + const int size_km_1 = s_km.template getSize< 1 >(); + const int stride_km_0 = s_km.template getStride< 0 >(); + const int stride_km_1 = s_km.template getStride< 1 >(); + EXPECT_EQ( size_km_0, K ); + EXPECT_EQ( size_km_1, M ); + EXPECT_EQ( stride_km_0, I*J ); + EXPECT_EQ( stride_km_1, 1 ); + for( int k = 0; k < K; k++ ) + for( int m = 0; m < M; m++ ) { + s_km( k, m ) = 1 + m; + EXPECT_EQ( v( 0, 0, k, 0, m, 0 ), 1 + m ); + } + a.setValue( 0 ); + + auto s_kn = v.template getSubarrayView< 2, 5 >( 0, 0, 0, 0, 0, 0 ); + const int size_kn_0 = s_kn.template getSize< 0 >(); + const int size_kn_1 = s_kn.template getSize< 1 >(); + const int stride_kn_0 = s_kn.template getStride< 0 >(); + const int stride_kn_1 = s_kn.template getStride< 1 >(); + EXPECT_EQ( size_kn_0, K ); + EXPECT_EQ( size_kn_1, N ); + EXPECT_EQ( stride_kn_0, I*J ); + EXPECT_EQ( stride_kn_1, L*M ); + for( int k = 0; k < K; k++ ) + for( int n = 0; n < N; n++ ) { + s_kn( k, n ) = 1 + n; + EXPECT_EQ( v( 0, 0, k, 0, 0, n ), 1 + n ); + } + a.setValue( 0 ); + + + auto s_lm = v.template getSubarrayView< 3, 4 >( 0, 0, 0, 0, 0, 0 ); + const int size_lm_0 = s_lm.template 
getSize< 0 >(); + const int size_lm_1 = s_lm.template getSize< 1 >(); + const int stride_lm_0 = s_lm.template getStride< 0 >(); + const int stride_lm_1 = s_lm.template getStride< 1 >(); + EXPECT_EQ( size_lm_0, L ); + EXPECT_EQ( size_lm_1, M ); + EXPECT_EQ( stride_lm_0, 1 ); + EXPECT_EQ( stride_lm_1, I*J*K ); + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) { + s_lm( l, m ) = 1 + m; + EXPECT_EQ( v( 0, 0, 0, l, m, 0 ), 1 + m ); + } + a.setValue( 0 ); + + auto s_ln = v.template getSubarrayView< 3, 5 >( 0, 0, 0, 0, 0, 0 ); + const int size_ln_0 = s_ln.template getSize< 0 >(); + const int size_ln_1 = s_ln.template getSize< 1 >(); + const int stride_ln_0 = s_ln.template getStride< 0 >(); + const int stride_ln_1 = s_ln.template getStride< 1 >(); + EXPECT_EQ( size_ln_0, L ); + EXPECT_EQ( size_ln_1, N ); + EXPECT_EQ( stride_ln_0, I*J*K*M ); + EXPECT_EQ( stride_ln_1, 1 ); + for( int l = 0; l < L; l++ ) + for( int n = 0; n < N; n++ ) { + s_ln( l, n ) = 1 + n; + EXPECT_EQ( v( 0, 0, 0, l, 0, n ), 1 + n ); + } + a.setValue( 0 ); + + + auto s_mn = v.template getSubarrayView< 4, 5 >( 0, 0, 0, 0, 0, 0 ); + const int size_mn_0 = s_mn.template getSize< 0 >(); + const int size_mn_1 = s_mn.template getSize< 1 >(); + const int stride_mn_0 = s_mn.template getStride< 0 >(); + const int stride_mn_1 = s_mn.template getStride< 1 >(); + EXPECT_EQ( size_mn_0, M ); + EXPECT_EQ( size_mn_1, N ); + EXPECT_EQ( stride_mn_0, I*J*K ); + EXPECT_EQ( stride_mn_1, L ); + for( int m = 0; m < M; m++ ) + for( int n = 0; n < N; n++ ) { + s_mn( m, n ) = 1 + n; + EXPECT_EQ( v( 0, 0, 0, 0, m, n ), 1 + n ); + } + a.setValue( 0 ); +} + +//#include "GtestMissingError.h" +int main( int argc, char* argv[] ) +{ +//#ifdef HAVE_GTEST + ::testing::InitGoogleTest( &argc, argv ); + return RUN_ALL_TESTS(); +//#else +// throw GtestMissingError(); +//#endif +} diff --git a/src/UnitTests/Containers/ndarray/SlicedNDArrayTest.cpp b/src/UnitTests/Containers/ndarray/SlicedNDArrayTest.cpp new file mode 100644 index 
000000000..8574a5602 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/SlicedNDArrayTest.cpp @@ -0,0 +1,251 @@ +#include "gtest/gtest.h" + +#include + +using namespace TNL::Containers; +using std::index_sequence; + +template< typename Array > +void expect_identity( const Array& a ) +{ + Array identity; + identity.setLike( a ); + for( int i = 0; i < identity.getSize(); i++ ) + identity[ i ] = i; + EXPECT_EQ( a, identity ); +} + +template< typename Array, typename Seq > +void expect_seq( const Array& a, const Seq& seq ) +{ + for( int i = 0; i < a.getSize(); i++ ) + EXPECT_EQ( a[ i ], seq[ i ] ); +} + +TEST( SlicedNDArrayTest, 2D_Static_Identity ) +{ + constexpr int I = 3, J = 5; + SlicedNDArray< int, SizesHolder< int, I, J > > a; + a.setSizes( 0, 0 ); + + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( SlicedNDArrayTest, 2D_Static_Permuted ) +{ + constexpr int I = 3, J = 5; + SlicedNDArray< int, + SizesHolder< int, I, J >, + index_sequence< 1, 0 > > a; + a.setSizes( 0, 0 ); + + int v = 0; + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( SlicedNDArrayTest, 6D_Dynamic ) +{ + int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2; + SlicedNDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + + int v = 0; + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j, k, l, m, n ) = v++; + + expect_identity( a.getStorageArray() ); +} + + +TEST( SlicedNDArrayTest, Sliced2D_Dynamic_Identity ) +{ + const int I = 3, J = 5; + SlicedNDArray< int, + SizesHolder< int, 0, 0 >, + index_sequence< 0, 1 >, + SliceInfo< 1, 2 > > a; // J is sliced + a.setSizes( I, J ); + + 
a.getStorageArray().setValue(-1); + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + const std::vector< int > seq({ + // first slice + 0, 1, + 5, 6, + 10, 11, + // second slice + 2, 3, + 7, 8, + 12, 13, + // third slice + 4, -1, + 9, -1, + 14, -1, + }); + expect_seq( a.getStorageArray(), seq ); +} + +TEST( SlicedNDArrayTest, Sliced2D_HalfStatic_Identity ) +{ + constexpr int I = 3; + const int J = 5; + SlicedNDArray< int, + SizesHolder< int, I, 0 >, + index_sequence< 0, 1 >, + SliceInfo< 1, 2 > > a; // J is sliced + a.setSizes( 0, J ); + + a.getStorageArray().setValue(-1); + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + const std::vector< int > seq({ + // first slice + 0, 1, + 5, 6, + 10, 11, + // second slice + 2, 3, + 7, 8, + 12, 13, + // third slice + 4, -1, + 9, -1, + 14, -1, + }); + expect_seq( a.getStorageArray(), seq ); +} + +TEST( SlicedNDArrayTest, Sliced2D_Dynamic_Permuted ) +{ + const int I = 3, J = 5; + SlicedNDArray< int, + SizesHolder< int, 0, 0 >, + index_sequence< 1, 0 >, + SliceInfo< 0, 2 > > a; // I is sliced + a.setSizes( I, J ); + + a.getStorageArray().setValue(-1); + int v = 0; + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + a( i, j ) = v++; + + const std::vector< int > seq({ + // first slice (transposed) + 0, 1, + 3, 4, + 6, 7, + 9, 10, + 12, 13, + // second slice (transposed) + 2, -1, + 5, -1, + 8, -1, + 11, -1, + 14, -1, + }); + expect_seq( a.getStorageArray(), seq ); +} + +TEST( SlicedNDArrayTest, Sliced2D_HalfStatic_Permuted ) +{ + const int I = 3; + constexpr int J = 5; + SlicedNDArray< int, + SizesHolder< int, 0, J >, + index_sequence< 1, 0 >, + SliceInfo< 0, 2 > > a; // I is sliced + a.setSizes( I, 0 ); + + a.getStorageArray().setValue(-1); + int v = 0; + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + a( i, j ) = v++; + + const std::vector< int > seq({ + // first slice (transposed) + 0, 1, + 3, 4, + 6, 7, + 9, 
10, + 12, 13, + // second slice (transposed) + 2, -1, + 5, -1, + 8, -1, + 11, -1, + 14, -1, + }); + expect_seq( a.getStorageArray(), seq ); +} + + +TEST( SlicedNDArrayTest, CopySemantics ) +{ + const int I = 3, J = 4; + SlicedNDArray< int, + SizesHolder< int, 0, 0 >, + index_sequence< 0, 1 >, + SliceInfo< 1, 2 > > a, b, c; // J is sliced + a.setSizes( I, J ); + + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + b = a; + EXPECT_EQ( a, b ); + + auto a_view = a.getView(); + auto b_view = b.getView(); + EXPECT_EQ( a_view, b_view ); + EXPECT_EQ( a_view.getView(), b_view ); + EXPECT_EQ( a_view.getConstView(), b_view.getConstView() ); + EXPECT_EQ( a.getConstView(), b.getConstView() ); + EXPECT_EQ( a.getConstView(), b_view.getConstView() ); + + c.setSizes( I, J ); + auto c_view = c.getView(); + c_view = b_view; + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() ); +} + +//#include "GtestMissingError.h" +int main( int argc, char* argv[] ) +{ +//#ifdef HAVE_GTEST + ::testing::InitGoogleTest( &argc, argv ); + return RUN_ALL_TESTS(); +//#else +// throw GtestMissingError(); +//#endif +} diff --git a/src/UnitTests/Containers/ndarray/StaticNDArrayCudaTest.cu b/src/UnitTests/Containers/ndarray/StaticNDArrayCudaTest.cu new file mode 100644 index 000000000..71ff572be --- /dev/null +++ b/src/UnitTests/Containers/ndarray/StaticNDArrayCudaTest.cu @@ -0,0 +1,90 @@ +#include "gtest/gtest.h" + +#include + +#include + +using namespace TNL::Containers; +using std::index_sequence; + +template< typename Array > +void expect_identity( const Array& a ) +{ + Array identity; + identity.setSize( a.getSize() ); + for( int i = 0; i < identity.getSize(); i++ ) + identity.setElement( i, i ); + EXPECT_EQ( a, identity ); +} + +// nvcc fuck-up: __host__ __device__ lambdas 
cannot be inside protected/private class methods +void __test_SetThroughView() +{ + constexpr int I = 3, J = 5; + using ViewType = typename StaticNDArray< int, SizesHolder< int, I, J > >::ViewType; + NDArray< int, + SizesHolder< int, I, J >, + std::make_index_sequence< 2 >, + std::make_index_sequence< 2 >, + TNL::Devices::Cuda > a; + a.setSizes( 0, 0 ); + ViewType a_view( a.getStorageArray().getData(), SizesHolder< int, I, J >{} ); + + auto kernel = [] __cuda_callable__ ( int, ViewType a ) { + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + }; + + a.setValue(0); + TNL::ParallelFor< TNL::Devices::Cuda >::exec( 0, 1, kernel, a_view ); + expect_identity( a.getStorageArray() ); +} +TEST( StaticNDArrayCudaTest, SetThroughView ) +{ + __test_SetThroughView(); +} + +// nvcc fuck-up: __host__ __device__ lambdas cannot be inside protected/private class methods +void __test_CopyFromArray() +{ + constexpr int I = 3, J = 5; + using ViewType = typename StaticNDArray< int, SizesHolder< int, I, J > >::ViewType; + NDArray< int, + SizesHolder< int, I, J >, + std::make_index_sequence< 2 >, + std::make_index_sequence< 2 >, + TNL::Devices::Cuda > a; + a.setSizes( 0, 0 ); + ViewType a_view( a.getStorageArray().getData(), SizesHolder< int, I, J >{} ); + + auto kernel = [] __cuda_callable__ ( int, ViewType a ) { + StaticNDArray< int, SizesHolder< int, I, J > > b; + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + b( i, j ) = v++; + a = b.getView(); + a( 0, 0 ) = a != b.getView(); + }; + + a.setValue(0); + TNL::ParallelFor< TNL::Devices::Cuda >::exec( 0, 1, kernel, a_view ); + expect_identity( a.getStorageArray() ); +} +TEST( StaticNDArrayCudaTest, CopyFromArray ) +{ + __test_CopyFromArray(); +} + +//#include "GtestMissingError.h" +int main( int argc, char* argv[] ) +{ +//#ifdef HAVE_GTEST + ::testing::InitGoogleTest( &argc, argv ); + return RUN_ALL_TESTS(); +//#else +// throw GtestMissingError(); +//#endif +} diff 
--git a/src/UnitTests/Containers/ndarray/StaticNDArrayTest.cpp b/src/UnitTests/Containers/ndarray/StaticNDArrayTest.cpp new file mode 100644 index 000000000..e3ea290f2 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/StaticNDArrayTest.cpp @@ -0,0 +1,105 @@ +#include "gtest/gtest.h" + +#include + +using namespace TNL::Containers; +using std::index_sequence; + +template< typename Array > +void expect_identity( const Array& a ) +{ + Array identity; + for( int i = 0; i < identity.getSize(); i++ ) + identity[ i ] = i; + EXPECT_EQ( a, identity ); +} + +TEST( StaticNDArrayTest, Static_2D_Identity ) +{ + constexpr int I = 3, J = 5; + StaticNDArray< int, SizesHolder< int, I, J > > a; + + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( StaticNDArrayTest, Static_2D_Permuted ) +{ + constexpr int I = 3, J = 5; + StaticNDArray< int, + SizesHolder< int, I, J >, + index_sequence< 1, 0 > > a; + + int v = 0; + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( StaticNDArrayTest, Static_6D_Permuted ) +{ + constexpr int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2; + StaticNDArray< int, + SizesHolder< int, I, J, K, L, M, N >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + + int v = 0; + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j, k, l, m, n ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( StaticNDArrayTest, CopySemantics ) +{ + constexpr int I = 3, J = 5; + StaticNDArray< int, SizesHolder< int, I, J > > a, b, c; + + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); + + b = a; + EXPECT_EQ( a, b ); + + auto a_view = a.getView(); + auto b_view = b.getView(); + 
EXPECT_EQ( a_view, b_view ); + EXPECT_EQ( a_view.getView(), b_view ); + EXPECT_EQ( a_view.getConstView(), b_view.getConstView() ); + EXPECT_EQ( a.getConstView(), b.getConstView() ); + EXPECT_EQ( a.getConstView(), b_view.getConstView() ); + + auto c_view = c.getView(); + c_view = b_view; + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() ); +} + +//#include "GtestMissingError.h" +int main( int argc, char* argv[] ) +{ +//#ifdef HAVE_GTEST + ::testing::InitGoogleTest( &argc, argv ); + return RUN_ALL_TESTS(); +//#else +// throw GtestMissingError(); +//#endif +} -- GitLab From f6d08f4b71b23a45e278e56c14bdb532b4865b09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sat, 2 Feb 2019 23:45:46 +0100 Subject: [PATCH 02/25] NDArray: generalized call_with_permuted_arguments --- src/TNL/Containers/ndarray/Meta.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/TNL/Containers/ndarray/Meta.h b/src/TNL/Containers/ndarray/Meta.h index 6807b8dc7..125c9e105 100644 --- a/src/TNL/Containers/ndarray/Meta.h +++ b/src/TNL/Containers/ndarray/Meta.h @@ -169,11 +169,11 @@ struct CallPermutationHelper< Permutation, std::index_sequence< N... > > template< typename Func, typename... Args > __cuda_callable__ - static auto apply( Func f, Args&&... args ) + static auto apply( Func&& f, Args&&... args ) -> decltype(auto) { - return f( get_from_pack< + return std::forward< Func >( f )( get_from_pack< get< N >( Permutation{} ) - >( args... )... ); + >( std::forward< Args >( args )... )... ); } }; @@ -183,7 +183,13 @@ template< typename Permutation, typename Func, typename... Args > __cuda_callable__ -auto call_with_permuted_arguments( Func f, Args&&... args ) +// FIXME: does not compile with nvcc 10.0 +//auto call_with_permuted_arguments( Func&& f, Args&&... 
args ) -> decltype(auto) +//{ +// return CallPermutationHelper< Permutation, std::make_index_sequence< sizeof...( Args ) > > +// ::apply( std::forward< Func >( f ), std::forward< Args >( args )... ); +//} +auto call_with_permuted_arguments( Func f, Args&&... args ) -> decltype(auto) { return CallPermutationHelper< Permutation, std::make_index_sequence< sizeof...( Args ) > > ::apply( f, std::forward< Args >( args )... ); -- GitLab From ffc002609b5d8e91446113161982473e477773ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 3 Feb 2019 16:33:48 +0100 Subject: [PATCH 03/25] NDArray: simplified executors for operations --- src/TNL/Containers/ndarray/Operations.h | 114 ++++++++---------------- 1 file changed, 36 insertions(+), 78 deletions(-) diff --git a/src/TNL/Containers/ndarray/Operations.h b/src/TNL/Containers/ndarray/Operations.h index 2462771d3..04c66a7f4 100644 --- a/src/TNL/Containers/ndarray/Operations.h +++ b/src/TNL/Containers/ndarray/Operations.h @@ -90,86 +90,29 @@ struct SequentialExecutorRTL< Array, IndexTag< 0 > > template< typename Array, - typename DimTag = IndexTag< Array::getDimension() > > -struct OpenMPExecutor + typename Device = typename Array::DeviceType > +struct ParallelExecutorDeviceDispatch { template< typename Func > void operator()( const Array& array, Func f ) { - SequentialExecutor< Array, IndexTag< 3 > > exec; - - const auto size0 = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >(); - const auto size1 = array.template getSize< get< 1 >( typename Array::PermutationType{} ) >(); - const auto size2 = array.template getSize< get< 2 >( typename Array::PermutationType{} ) >(); - - #ifdef HAVE_OPENMP - #pragma omp parallel for collapse(3) - #endif - for( typename Array::IndexType i0 = 0; i0 < size0; i0++ ) - for( typename Array::IndexType i1 = 0; i1 < size1; i1++ ) - for( typename Array::IndexType i2 = 0; i2 < size2; i2++ ) - exec( array, f, i0, i1, i2 ); - } -}; + using Index = typename 
Array::IndexType; -template< typename Array > -struct OpenMPExecutor< Array, IndexTag< 3 > > -{ - template< typename Func > - void operator()( const Array& array, Func f ) - { - const auto size0 = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >(); - const auto size1 = array.template getSize< get< 1 >( typename Array::PermutationType{} ) >(); - const auto size2 = array.template getSize< get< 2 >( typename Array::PermutationType{} ) >(); - - #ifdef HAVE_OPENMP - #pragma omp parallel for collapse(2) - #endif - for( typename Array::IndexType i0 = 0; i0 < size0; i0++ ) - for( typename Array::IndexType i1 = 0; i1 < size1; i1++ ) - for( typename Array::IndexType i2 = 0; i2 < size2; i2++ ) - call_with_permuted_arguments< typename Array::PermutationType >( f, i0, i1, i2 ); - } -}; + auto kernel = [=] ( Index i2, Index i1, Index i0 ) + { + SequentialExecutor< Array, IndexTag< 3 > > exec; + exec( array, f, i0, i1, i2 ); + }; -template< typename Array > -struct OpenMPExecutor< Array, IndexTag< 2 > > -{ - template< typename Func > - void operator()( const Array& array, Func f ) - { - const auto size0 = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >(); - const auto size1 = array.template getSize< get< 1 >( typename Array::PermutationType{} ) >(); - - #ifdef HAVE_OPENMP - #pragma omp parallel for - #endif - for( typename Array::IndexType i0 = 0; i0 < size0; i0++ ) - for( typename Array::IndexType i1 = 0; i1 < size1; i1++ ) - call_with_permuted_arguments< typename Array::PermutationType >( f, i0, i1 ); + const Index size0 = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >(); + const Index size1 = array.template getSize< get< 1 >( typename Array::PermutationType{} ) >(); + const Index size2 = array.template getSize< get< 2 >( typename Array::PermutationType{} ) >(); + ParallelFor3D< Device >::exec( (Index) 0, (Index) 0, (Index) 0, size2, size1, size0, kernel ); } }; template< typename Array > -struct 
OpenMPExecutor< Array, IndexTag< 1 > > -{ - template< typename Func > - void operator()( const Array& array, Func f ) - { - const auto size0 = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >(); - - #ifdef HAVE_OPENMP - #pragma omp parallel for - #endif - for( typename Array::IndexType i0 = 0; i0 < size0; i0++ ) - call_with_permuted_arguments< typename Array::PermutationType >( f, i0 ); - } -}; - - -template< typename Array, - typename DimTag = IndexTag< Array::getDimension() > > -struct CudaExecutor +struct ParallelExecutorDeviceDispatch< Array, Devices::Cuda > { template< typename Func > void operator()( const Array& array, Func f ) @@ -189,12 +132,25 @@ struct CudaExecutor } }; +template< typename Array, + typename DimTag = IndexTag< Array::getDimension() > > +struct ParallelExecutor +{ + template< typename Func > + void operator()( const Array& array, Func f ) + { + ParallelExecutorDeviceDispatch< Array > dispatch; + dispatch( array, f ); + } +}; + template< typename Array > -struct CudaExecutor< Array, IndexTag< 3 > > +struct ParallelExecutor< Array, IndexTag< 3 > > { template< typename Func > void operator()( const Array& array, Func f ) { + using Device = typename Array::DeviceType; using Index = typename Array::IndexType; auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) @@ -205,16 +161,17 @@ struct CudaExecutor< Array, IndexTag< 3 > > const Index size0 = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >(); const Index size1 = array.template getSize< get< 1 >( typename Array::PermutationType{} ) >(); const Index size2 = array.template getSize< get< 2 >( typename Array::PermutationType{} ) >(); - ParallelFor3D< Devices::Cuda >::exec( (Index) 0, (Index) 0, (Index) 0, size2, size1, size0, kernel ); + ParallelFor3D< Device >::exec( (Index) 0, (Index) 0, (Index) 0, size2, size1, size0, kernel ); } }; template< typename Array > -struct CudaExecutor< Array, IndexTag< 2 > > +struct ParallelExecutor< 
Array, IndexTag< 2 > > { template< typename Func > void operator()( const Array& array, Func f ) { + using Device = typename Array::DeviceType; using Index = typename Array::IndexType; auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 ) @@ -224,16 +181,17 @@ struct CudaExecutor< Array, IndexTag< 2 > > const Index size0 = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >(); const Index size1 = array.template getSize< get< 1 >( typename Array::PermutationType{} ) >(); - ParallelFor2D< Devices::Cuda >::exec( (Index) 0, (Index) 0, size1, size0, kernel ); + ParallelFor2D< Device >::exec( (Index) 0, (Index) 0, size1, size0, kernel ); } }; template< typename Array > -struct CudaExecutor< Array, IndexTag< 1 > > +struct ParallelExecutor< Array, IndexTag< 1 > > { template< typename Func > void operator()( const Array& array, Func f ) { + using Device = typename Array::DeviceType; using Index = typename Array::IndexType; auto kernel = [=] __cuda_callable__ ( Index i ) @@ -242,7 +200,7 @@ struct CudaExecutor< Array, IndexTag< 1 > > }; const Index size = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >(); - ParallelFor< Devices::Cuda >::exec( (Index) 0, size, kernel ); + ParallelFor< Device >::exec( (Index) 0, size, kernel ); } }; @@ -265,7 +223,7 @@ struct ExecutorDispatcher< Array, Devices::Host > void operator()( const Array& array, Func f ) { if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) - OpenMPExecutor< Array >()( array, f ); + ParallelExecutor< Array >()( array, f ); else SequentialExecutor< Array >()( array, f ); } @@ -277,7 +235,7 @@ struct ExecutorDispatcher< Array, Devices::Cuda > template< typename Func > void operator()( const Array& array, Func f ) { - CudaExecutor< Array >()( array, f ); + ParallelExecutor< Array >()( array, f ); } }; -- GitLab From 024829623aee58feb70b660b2a7d722179642019 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Tue, 5 Feb 2019 
20:45:31 +0100 Subject: [PATCH 04/25] NDArray: added forAll method --- src/TNL/Containers/NDArray.h | 8 ++++ src/TNL/Containers/NDArrayView.h | 8 ++++ .../Containers/ndarray/NDArrayTest.cpp | 47 +++++++++++++++++++ 3 files changed, 63 insertions(+) diff --git a/src/TNL/Containers/NDArray.h b/src/TNL/Containers/NDArray.h index ee89e8786..d9e4cb09b 100644 --- a/src/TNL/Containers/NDArray.h +++ b/src/TNL/Containers/NDArray.h @@ -173,6 +173,14 @@ public: return ConstViewType( array.getData(), sizes ); } + template< typename Device2 = DeviceType, typename Func > + void forAll( Func f ) const + { + __ndarray_impl::ExecutorDispatcher< ConstViewType, Device2 > dispatch; + dispatch( getConstView(), f ); + } + + // extra methods // TODO: rename to setSizes and make sure that overloading with the following method works diff --git a/src/TNL/Containers/NDArrayView.h b/src/TNL/Containers/NDArrayView.h index fe75bdadd..73af6713f 100644 --- a/src/TNL/Containers/NDArrayView.h +++ b/src/TNL/Containers/NDArrayView.h @@ -15,6 +15,7 @@ #include #include #include +#include namespace TNL { namespace Containers { @@ -226,6 +227,13 @@ public: return SubarrayView{ &begin, subarray_sizes, strides }; } + template< typename Device2 = DeviceType, typename Func > + void forAll( Func f ) const + { + __ndarray_impl::ExecutorDispatcher< NDArrayView, Device2 > dispatch; + dispatch( *this, f ); + } + protected: Value* array = nullptr; SizesHolder sizes; diff --git a/src/UnitTests/Containers/ndarray/NDArrayTest.cpp b/src/UnitTests/Containers/ndarray/NDArrayTest.cpp index 2a98e71f1..385ff93da 100644 --- a/src/UnitTests/Containers/ndarray/NDArrayTest.cpp +++ b/src/UnitTests/Containers/ndarray/NDArrayTest.cpp @@ -180,6 +180,53 @@ TEST( NDArrayTest, SizesHolderPrinter ) EXPECT_EQ( str.str(), "SizesHolder< 0, 1, 2 >( 3, 1, 2 )" ); } +TEST( NDArrayTest, forAll_dynamic ) +{ + int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 
1 > > a; + a.setSizes( I, J, K, L, M, N ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m, int n ) + { + a( i, j, k, l, m, n ) = 1; + }; + + a.forAll( setter ); + + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + EXPECT_EQ( a( i, j, k, l, m, n ), 1 ); +} + +TEST( NDArrayTest, forAll_static ) +{ + constexpr int I = 3, J = 4; + NDArray< int, SizesHolder< int, I, J > > a; + a.setSizes( 0, 0 ); + + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = 0; + + auto setter = [&] ( int i, int j ) + { + a( i, j ) = 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + EXPECT_EQ( a( i, j ), 1 ); +} + //#include "GtestMissingError.h" int main( int argc, char* argv[] ) { -- GitLab From 3144060def3839ff66e4d16cdb1cdd6676220774 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Tue, 5 Feb 2019 21:54:35 +0100 Subject: [PATCH 05/25] NDArray: simplified executors for operations using sizes instead of the whole array view --- src/TNL/Containers/NDArray.h | 4 +- src/TNL/Containers/NDArrayView.h | 4 +- src/TNL/Containers/ndarray/Operations.h | 233 ++++++++++++------------ 3 files changed, 119 insertions(+), 122 deletions(-) diff --git a/src/TNL/Containers/NDArray.h b/src/TNL/Containers/NDArray.h index d9e4cb09b..ed5111e6d 100644 --- a/src/TNL/Containers/NDArray.h +++ b/src/TNL/Containers/NDArray.h @@ -176,8 +176,8 @@ public: template< typename Device2 = DeviceType, typename Func > void forAll( Func f ) const { - __ndarray_impl::ExecutorDispatcher< ConstViewType, Device2 > dispatch; - dispatch( getConstView(), f ); + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( sizes, f ); } diff --git a/src/TNL/Containers/NDArrayView.h b/src/TNL/Containers/NDArrayView.h index 73af6713f..5b5e056ff 100644 --- 
a/src/TNL/Containers/NDArrayView.h +++ b/src/TNL/Containers/NDArrayView.h @@ -230,8 +230,8 @@ public: template< typename Device2 = DeviceType, typename Func > void forAll( Func f ) const { - __ndarray_impl::ExecutorDispatcher< NDArrayView, Device2 > dispatch; - dispatch( *this, f ); + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( sizes, f ); } protected: diff --git a/src/TNL/Containers/ndarray/Operations.h b/src/TNL/Containers/ndarray/Operations.h index 04c66a7f4..705687c1d 100644 --- a/src/TNL/Containers/ndarray/Operations.h +++ b/src/TNL/Containers/ndarray/Operations.h @@ -21,221 +21,227 @@ namespace Containers { namespace __ndarray_impl { -template< typename Array, +template< typename Permutation, typename LevelTag = IndexTag< 0 > > struct SequentialExecutor { - template< typename Func, + template< typename SizesHolder, + typename Func, typename... Indices > __cuda_callable__ - void operator()( const Array& array, Func f, Indices&&... indices ) + void operator()( const SizesHolder& sizes, Func f, Indices&&... indices ) { - SequentialExecutor< Array, IndexTag< LevelTag::value + 1 > > exec; - const auto size = array.template getSize< get< LevelTag::value >( typename Array::PermutationType{} ) >(); - for( typename Array::IndexType i = 0; i < size; i++ ) - exec( array, f, std::forward< Indices >( indices )..., i ); + SequentialExecutor< Permutation, IndexTag< LevelTag::value + 1 > > exec; + const auto size = sizes.template getSize< get< LevelTag::value >( Permutation{} ) >(); + for( typename SizesHolder::IndexType i = 0; i < size; i++ ) + exec( sizes, f, std::forward< Indices >( indices )..., i ); } }; -template< typename Array > -struct SequentialExecutor< Array, IndexTag< Array::getDimension() - 1 > > +template< typename Permutation > +struct SequentialExecutor< Permutation, IndexTag< Permutation::size() - 1 > > { - template< typename Func, + template< typename SizesHolder, + typename Func, typename... 
Indices > __cuda_callable__ - void operator()( const Array& array, Func f, Indices&&... indices ) + void operator()( const SizesHolder& sizes, Func f, Indices&&... indices ) { - static_assert( sizeof...(indices) == Array::getDimension() - 1, + static_assert( sizeof...(indices) == SizesHolder::getDimension() - 1, "invalid number of indices in the final step of the SequentialExecutor" ); - const auto size = array.template getSize< get< Array::getDimension() - 1 >( typename Array::PermutationType{} ) >(); - for( typename Array::IndexType i = 0; i < size; i++ ) - call_with_permuted_arguments< typename Array::PermutationType >( f, std::forward< Indices >( indices )..., i ); + const auto size = sizes.template getSize< get< SizesHolder::getDimension() - 1 >( Permutation{} ) >(); + for( typename SizesHolder::IndexType i = 0; i < size; i++ ) + call_with_permuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i ); } }; -template< typename Array, - typename LevelTag = IndexTag< Array::getDimension() - 1 > > +template< typename Permutation, + typename LevelTag = IndexTag< Permutation::size() - 1 > > struct SequentialExecutorRTL { - template< typename Func, + template< typename SizesHolder, + typename Func, typename... Indices > __cuda_callable__ - void operator()( const Array& array, Func f, Indices&&... indices ) + void operator()( const SizesHolder& sizes, Func f, Indices&&... indices ) { - SequentialExecutorRTL< Array, IndexTag< LevelTag::value - 1 > > exec; - const auto size = array.template getSize< get< LevelTag::value >( typename Array::PermutationType{} ) >(); - for( typename Array::IndexType i = 0; i < size; i++ ) - exec( array, f, i, std::forward< Indices >( indices )... 
); + SequentialExecutorRTL< Permutation, IndexTag< LevelTag::value - 1 > > exec; + const auto size = sizes.template getSize< get< LevelTag::value >( Permutation{} ) >(); + for( typename SizesHolder::IndexType i = 0; i < size; i++ ) + exec( sizes, f, i, std::forward< Indices >( indices )... ); } }; -template< typename Array > -struct SequentialExecutorRTL< Array, IndexTag< 0 > > +template< typename Permutation > +struct SequentialExecutorRTL< Permutation, IndexTag< 0 > > { - template< typename Func, + template< typename SizesHolder, + typename Func, typename... Indices > __cuda_callable__ - void operator()( const Array& array, Func f, Indices&&... indices ) + void operator()( const SizesHolder& sizes, Func f, Indices&&... indices ) { - static_assert( sizeof...(indices) == Array::getDimension() - 1, + static_assert( sizeof...(indices) == SizesHolder::getDimension() - 1, "invalid number of indices in the final step of the SequentialExecutor" ); - const auto size = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >(); - for( typename Array::IndexType i = 0; i < size; i++ ) - call_with_permuted_arguments< typename Array::PermutationType >( f, i, std::forward< Indices >( indices )... ); + const auto size = sizes.template getSize< get< 0 >( Permutation{} ) >(); + for( typename SizesHolder::IndexType i = 0; i < size; i++ ) + call_with_permuted_arguments< Permutation >( f, i, std::forward< Indices >( indices )... 
); } }; -template< typename Array, - typename Device = typename Array::DeviceType > +template< typename Permutation, + typename Device > struct ParallelExecutorDeviceDispatch { - template< typename Func > - void operator()( const Array& array, Func f ) + template< typename SizesHolder, typename Func > + void operator()( const SizesHolder& sizes, Func f ) { - using Index = typename Array::IndexType; + using Index = typename SizesHolder::IndexType; auto kernel = [=] ( Index i2, Index i1, Index i0 ) { - SequentialExecutor< Array, IndexTag< 3 > > exec; - exec( array, f, i0, i1, i2 ); + SequentialExecutor< Permutation, IndexTag< 3 > > exec; + exec( sizes, f, i0, i1, i2 ); }; - const Index size0 = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >(); - const Index size1 = array.template getSize< get< 1 >( typename Array::PermutationType{} ) >(); - const Index size2 = array.template getSize< get< 2 >( typename Array::PermutationType{} ) >(); + const Index size0 = sizes.template getSize< get< 0 >( Permutation{} ) >(); + const Index size1 = sizes.template getSize< get< 1 >( Permutation{} ) >(); + const Index size2 = sizes.template getSize< get< 2 >( Permutation{} ) >(); ParallelFor3D< Device >::exec( (Index) 0, (Index) 0, (Index) 0, size2, size1, size0, kernel ); } }; -template< typename Array > -struct ParallelExecutorDeviceDispatch< Array, Devices::Cuda > +template< typename Permutation > +struct ParallelExecutorDeviceDispatch< Permutation, Devices::Cuda > { - template< typename Func > - void operator()( const Array& array, Func f ) + template< typename SizesHolder, typename Func > + void operator()( const SizesHolder& sizes, Func f ) { - using Index = typename Array::IndexType; + using Index = typename SizesHolder::IndexType; auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) { - SequentialExecutorRTL< Array, IndexTag< Array::getDimension() - 4 > > exec; - exec( array, f, i0, i1, i2 ); + SequentialExecutorRTL< Permutation, IndexTag< 
SizesHolder::getDimension() - 4 > > exec; + exec( sizes, f, i0, i1, i2 ); }; - const Index size0 = array.template getSize< get< Array::getDimension() - 3 >( typename Array::PermutationType{} ) >(); - const Index size1 = array.template getSize< get< Array::getDimension() - 2 >( typename Array::PermutationType{} ) >(); - const Index size2 = array.template getSize< get< Array::getDimension() - 1 >( typename Array::PermutationType{} ) >(); + const Index size0 = sizes.template getSize< get< SizesHolder::getDimension() - 3 >( Permutation{} ) >(); + const Index size1 = sizes.template getSize< get< SizesHolder::getDimension() - 2 >( Permutation{} ) >(); + const Index size2 = sizes.template getSize< get< SizesHolder::getDimension() - 1 >( Permutation{} ) >(); ParallelFor3D< Devices::Cuda >::exec( (Index) 0, (Index) 0, (Index) 0, size2, size1, size0, kernel ); } }; -template< typename Array, - typename DimTag = IndexTag< Array::getDimension() > > +template< typename Permutation, + typename Device, + typename DimTag = IndexTag< Permutation::size() > > struct ParallelExecutor { - template< typename Func > - void operator()( const Array& array, Func f ) + template< typename SizesHolder, typename Func > + void operator()( const SizesHolder& sizes, Func f ) { - ParallelExecutorDeviceDispatch< Array > dispatch; - dispatch( array, f ); + ParallelExecutorDeviceDispatch< Permutation, Device > dispatch; + dispatch( sizes, f ); } }; -template< typename Array > -struct ParallelExecutor< Array, IndexTag< 3 > > +template< typename Permutation, + typename Device > +struct ParallelExecutor< Permutation, Device, IndexTag< 3 > > { - template< typename Func > - void operator()( const Array& array, Func f ) + template< typename SizesHolder, typename Func > + void operator()( const SizesHolder& sizes, Func f ) { - using Device = typename Array::DeviceType; - using Index = typename Array::IndexType; + using Index = typename SizesHolder::IndexType; auto kernel = [=] __cuda_callable__ ( Index i2, 
Index i1, Index i0 ) { - call_with_permuted_arguments< typename Array::PermutationType >( f, i0, i1, i2 ); + call_with_permuted_arguments< Permutation >( f, i0, i1, i2 ); }; - const Index size0 = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >(); - const Index size1 = array.template getSize< get< 1 >( typename Array::PermutationType{} ) >(); - const Index size2 = array.template getSize< get< 2 >( typename Array::PermutationType{} ) >(); + const Index size0 = sizes.template getSize< get< 0 >( Permutation{} ) >(); + const Index size1 = sizes.template getSize< get< 1 >( Permutation{} ) >(); + const Index size2 = sizes.template getSize< get< 2 >( Permutation{} ) >(); ParallelFor3D< Device >::exec( (Index) 0, (Index) 0, (Index) 0, size2, size1, size0, kernel ); } }; -template< typename Array > -struct ParallelExecutor< Array, IndexTag< 2 > > +template< typename Permutation, + typename Device > +struct ParallelExecutor< Permutation, Device, IndexTag< 2 > > { - template< typename Func > - void operator()( const Array& array, Func f ) + template< typename SizesHolder, typename Func > + void operator()( const SizesHolder& sizes, Func f ) { - using Device = typename Array::DeviceType; - using Index = typename Array::IndexType; + using Index = typename SizesHolder::IndexType; auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 ) { - call_with_permuted_arguments< typename Array::PermutationType >( f, i0, i1 ); + call_with_permuted_arguments< Permutation >( f, i0, i1 ); }; - const Index size0 = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >(); - const Index size1 = array.template getSize< get< 1 >( typename Array::PermutationType{} ) >(); + const Index size0 = sizes.template getSize< get< 0 >( Permutation{} ) >(); + const Index size1 = sizes.template getSize< get< 1 >( Permutation{} ) >(); ParallelFor2D< Device >::exec( (Index) 0, (Index) 0, size1, size0, kernel ); } }; -template< typename Array > -struct ParallelExecutor< 
Array, IndexTag< 1 > > +template< typename Permutation, + typename Device > +struct ParallelExecutor< Permutation, Device, IndexTag< 1 > > { - template< typename Func > - void operator()( const Array& array, Func f ) + template< typename SizesHolder, typename Func > + void operator()( const SizesHolder& sizes, Func f ) { - using Device = typename Array::DeviceType; - using Index = typename Array::IndexType; + using Index = typename SizesHolder::IndexType; auto kernel = [=] __cuda_callable__ ( Index i ) { - call_with_permuted_arguments< typename Array::PermutationType >( f, i ); + call_with_permuted_arguments< Permutation >( f, i ); }; - const Index size = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >(); + const Index size = sizes.template getSize< get< 0 >( Permutation{} ) >(); ParallelFor< Device >::exec( (Index) 0, size, kernel ); } }; // Device may be void which stands for StaticNDArray -template< typename Array, typename Device = typename Array::DeviceType > +template< typename Permutation, + typename Device > struct ExecutorDispatcher { - template< typename Func > - void operator()( const Array& array, Func f ) + template< typename SizesHolder, typename Func > + void operator()( const SizesHolder& sizes, Func f ) { - SequentialExecutor< Array >()( array, f ); + SequentialExecutor< Permutation >()( sizes, f ); } }; -template< typename Array > -struct ExecutorDispatcher< Array, Devices::Host > +template< typename Permutation > +struct ExecutorDispatcher< Permutation, Devices::Host > { - template< typename Func > - void operator()( const Array& array, Func f ) + template< typename SizesHolder, typename Func > + void operator()( const SizesHolder& sizes, Func f ) { if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) - ParallelExecutor< Array >()( array, f ); + ParallelExecutor< Permutation, Devices::Host >()( sizes, f ); else - SequentialExecutor< Array >()( array, f ); + SequentialExecutor< Permutation >()( 
sizes, f ); } }; -template< typename Array > -struct ExecutorDispatcher< Array, Devices::Cuda > +template< typename Permutation > +struct ExecutorDispatcher< Permutation, Devices::Cuda > { - template< typename Func > - void operator()( const Array& array, Func f ) + template< typename SizesHolder, typename Func > + void operator()( const SizesHolder& sizes, Func f ) { - ParallelExecutor< Array >()( array, f ); + ParallelExecutor< Permutation, Devices::Cuda >()( sizes, f ); } }; @@ -256,9 +262,8 @@ void nd_map_view( Output output, Func f, const Input... input ) output( indices... ) = f( input( indices... )... ); }; - // From here on, the output array is used only for getting the sizes, - // the writing of the result is done inside the wrapper. - ExecutorDispatcher< Output >()( output, wrapper ); + ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch; + dispatch( output.getSizes(), wrapper ); } #else @@ -356,10 +361,8 @@ template< typename Output, void nd_map_view( Output output, Func f ) { nvcc_map_helper_0< Output, Func > wrapper( output, f ); - - // From here on, the output array is used only for getting the sizes, - // the writing of the result is done inside the wrapper. - ExecutorDispatcher< Output >()( output, wrapper ); + ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch; + dispatch( output.getSizes(), wrapper ); } template< typename Output, @@ -371,10 +374,8 @@ void nd_map_view( Output output, Func f, const Input1 input1 ) "all arrays must be of the same dimension" ); nvcc_map_helper_1< Output, Func, Input1 > wrapper( output, f, input1 ); - - // From here on, the output array is used only for getting the sizes, - // the writing of the result is done inside the wrapper. 
- ExecutorDispatcher< Output >()( output, wrapper ); + ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch; + dispatch( output.getSizes(), wrapper ); } template< typename Output, @@ -387,10 +388,8 @@ void nd_map_view( Output output, Func f, const Input1 input1, const Input2 input "all arrays must be of the same dimension" ); nvcc_map_helper_2< Output, Func, Input1, Input2 > wrapper( output, f, input1, input2 ); - - // From here on, the output array is used only for getting the sizes, - // the writing of the result is done inside the wrapper. - ExecutorDispatcher< Output >()( output, wrapper ); + ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch; + dispatch( output.getSizes(), wrapper ); } template< typename Output, @@ -404,10 +403,8 @@ void nd_map_view( Output output, Func f, const Input1 input1, const Input2 input "all arrays must be of the same dimension" ); nvcc_map_helper_3< Output, Func, Input1, Input2, Input3 > wrapper( output, f, input1, input2, input3 ); - - // From here on, the output array is used only for getting the sizes, - // the writing of the result is done inside the wrapper. 
- ExecutorDispatcher< Output >()( output, wrapper ); + ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch; + dispatch( output.getSizes(), wrapper ); } #endif -- GitLab From 0ef45b13550922fd1b793bef8a4ddcaf6f046069 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Tue, 5 Feb 2019 22:05:11 +0100 Subject: [PATCH 06/25] NDArray: moved SizesHolderStaticSizePrinter and SizesHolderSizePrinter into the implementation namespace --- src/TNL/Containers/ndarray/SizesHolder.h | 46 ++++++++++++------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/src/TNL/Containers/ndarray/SizesHolder.h b/src/TNL/Containers/ndarray/SizesHolder.h index 69a032c7f..5b6e52f5f 100644 --- a/src/TNL/Containers/ndarray/SizesHolder.h +++ b/src/TNL/Containers/ndarray/SizesHolder.h @@ -123,6 +123,26 @@ protected: } }; +template< std::size_t dimension > +struct SizesHolderStaticSizePrinter +{ + template< typename SizesHolder > + static void exec( std::ostream& str, const SizesHolder& holder ) + { + str << holder.template getStaticSize< dimension >() << ", "; + } +}; + +template< std::size_t dimension > +struct SizesHolderSizePrinter +{ + template< typename SizesHolder > + static void exec( std::ostream& str, const SizesHolder& holder ) + { + str << holder.template getSize< dimension >() << ", "; + } +}; + } // namespace __ndarray_impl @@ -181,36 +201,14 @@ public: }; -template< std::size_t dimension > -struct SizesHolderStaticSizePrinter -{ - template< typename Index, - std::size_t... sizes > - static void exec( std::ostream& str, const SizesHolder< Index, sizes... >& holder ) - { - str << holder.template getStaticSize< dimension >() << ", "; - } -}; - -template< std::size_t dimension > -struct SizesHolderSizePrinter -{ - template< typename Index, - std::size_t... sizes > - static void exec( std::ostream& str, const SizesHolder< Index, sizes... 
>& holder ) - { - str << holder.template getSize< dimension >() << ", "; - } -}; - template< typename Index, std::size_t... sizes > std::ostream& operator<<( std::ostream& str, const SizesHolder< Index, sizes... >& holder ) { str << "SizesHolder< "; - TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, SizesHolderStaticSizePrinter >::execHost( str, holder ); + TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, __ndarray_impl::SizesHolderStaticSizePrinter >::execHost( str, holder ); str << holder.template getStaticSize< sizeof...(sizes) - 1 >() << " >( "; - TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, SizesHolderSizePrinter >::execHost( str, holder ); + TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, __ndarray_impl::SizesHolderSizePrinter >::execHost( str, holder ); str << holder.template getSize< sizeof...(sizes) - 1 >() << " )"; return str; } -- GitLab From f4d2f8c6ad963fe607dc840993d5a6a1ae53b00d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 6 Feb 2019 15:13:50 +0100 Subject: [PATCH 07/25] NDArray: added forInternal method - fixed executors for operations: use inverse permutation when calling the wrapped lambda function - custom internal region can be specified with custom begins/ends multiindices --- src/TNL/Containers/NDArray.h | 24 +- src/TNL/Containers/NDArrayView.h | 24 +- src/TNL/Containers/ndarray/Indexing.h | 30 + src/TNL/Containers/ndarray/Meta.h | 45 +- src/TNL/Containers/ndarray/Operations.h | 232 ++++--- src/TNL/Containers/ndarray/SizesHolder.h | 63 ++ .../Containers/ndarray/NDArrayTest.cpp | 603 +++++++++++++++++- 7 files changed, 924 insertions(+), 97 deletions(-) diff --git a/src/TNL/Containers/NDArray.h b/src/TNL/Containers/NDArray.h index ed5111e6d..e05db95cd 100644 --- a/src/TNL/Containers/NDArray.h +++ b/src/TNL/Containers/NDArray.h @@ -177,7 +177,29 @@ public: void forAll( Func f ) const { __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; - dispatch( sizes, 
f ); + using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >; + dispatch( Begins{}, sizes, f ); + } + + template< typename Device2 = DeviceType, typename Func > + void forInternal( Func f ) const + { + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 1 >; + // subtract static sizes + using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type; + // subtract dynamic sizes + Ends ends; + __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolder >::subtract( ends, sizes ); + dispatch( Begins{}, ends, f ); + } + + template< typename Device2 = DeviceType, typename Func, typename Begins, typename Ends > + void forInternal( Func f, const Begins& begins, const Ends& ends ) const + { + // TODO: assert "begins <= sizes", "ends <= sizes" + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, ends, f ); } diff --git a/src/TNL/Containers/NDArrayView.h b/src/TNL/Containers/NDArrayView.h index 5b5e056ff..cafaabe9b 100644 --- a/src/TNL/Containers/NDArrayView.h +++ b/src/TNL/Containers/NDArrayView.h @@ -231,7 +231,29 @@ public: void forAll( Func f ) const { __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; - dispatch( sizes, f ); + using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >; + dispatch( Begins{}, sizes, f ); + } + + template< typename Device2 = DeviceType, typename Func > + void forInternal( Func f ) const + { + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 1 >; + // subtract static sizes + using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type; + // subtract dynamic sizes + Ends ends; + __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolder >::subtract( ends, sizes ); + dispatch( Begins{}, ends, f ); + } + + template< typename 
Device2 = DeviceType, typename Func, typename Begins, typename Ends > + void forInternal( Func f, const Begins& begins, const Ends& ends ) const + { + // TODO: assert "begins <= sizes", "ends <= sizes" + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, ends, f ); } protected: diff --git a/src/TNL/Containers/ndarray/Indexing.h b/src/TNL/Containers/ndarray/Indexing.h index d156547e1..7031d8899 100644 --- a/src/TNL/Containers/ndarray/Indexing.h +++ b/src/TNL/Containers/ndarray/Indexing.h @@ -111,6 +111,36 @@ void setSizesHelper( SizesHolder& holder, } +// helper for the forInternal method +template< std::size_t ConstValue, + typename TargetHolder, + typename SourceHolder, + std::size_t level = TargetHolder::getDimension() - 1 > +struct SetSizesSubtractHelper +{ + static void subtract( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< level >() == 0 ) + target.template setSize< level >( source.template getSize< level >() - ConstValue ); + SetSizesSubtractHelper< ConstValue, TargetHolder, SourceHolder, level - 1 >::subtract( target, source ); + } +}; + +template< std::size_t ConstValue, + typename TargetHolder, + typename SourceHolder > +struct SetSizesSubtractHelper< ConstValue, TargetHolder, SourceHolder, 0 > +{ + static void subtract( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< 0 >() == 0 ) + target.template setSize< 0 >( source.template getSize< 0 >() - ConstValue ); + } +}; + + // A variadic bounds-checker for indices template< typename SizesHolder > __cuda_callable__ diff --git a/src/TNL/Containers/ndarray/Meta.h b/src/TNL/Containers/ndarray/Meta.h index 125c9e105..3ad0372f7 100644 --- a/src/TNL/Containers/ndarray/Meta.h +++ b/src/TNL/Containers/ndarray/Meta.h @@ -93,13 +93,15 @@ is_in_sequence( Index value, std::integer_sequence< Index, vals... > ) // Get index of the first occurrence of value in a variadic pack. 
template< typename V > -constexpr std::size_t index_in_pack( V&& value ) +constexpr std::size_t +index_in_pack( V&& value ) { return 0; } template< typename V, typename T, typename... Ts > -constexpr std::size_t index_in_pack( V&& value, T&& arg, Ts&&... args ) +constexpr std::size_t +index_in_pack( V&& value, T&& arg, Ts&&... args ) { if( value == arg ) return 0; @@ -196,6 +198,45 @@ auto call_with_permuted_arguments( Func f, Args&&... args ) -> decltype(auto) } +template< typename Permutation, + typename Sequence > +struct CallInversePermutationHelper +{}; + +template< typename Permutation, + std::size_t... N > +struct CallInversePermutationHelper< Permutation, std::index_sequence< N... > > +{ + template< typename Func, + typename... Args > + __cuda_callable__ + static auto apply( Func&& f, Args&&... args ) -> decltype(auto) + { + return std::forward< Func >( f )( get_from_pack< + index_in_sequence( N, Permutation{} ) + >( std::forward< Args >( args )... )... ); + } +}; + +// Call specified function with permuted arguments. +// [used in ndarray_operations.h] +template< typename Permutation, + typename Func, + typename... Args > +__cuda_callable__ +// FIXME: does not compile with nvcc 10.0 +//auto call_with_unpermuted_arguments( Func&& f, Args&&... args ) -> decltype(auto) +//{ +// return CallInversePermutationHelper< Permutation, std::make_index_sequence< sizeof...( Args ) > > +// ::apply( std::forward< Func >( f ), std::forward< Args >( args )... ); +//} +auto call_with_unpermuted_arguments( Func f, Args&&... args ) -> decltype(auto) +{ + return CallInversePermutationHelper< Permutation, std::make_index_sequence< sizeof...( Args ) > > + ::apply( f, std::forward< Args >( args )... ); +} + + // Check that all elements of the initializer list are equal to the specified value. 
// [used in ndarray_operations.h] constexpr bool diff --git a/src/TNL/Containers/ndarray/Operations.h b/src/TNL/Containers/ndarray/Operations.h index 705687c1d..b1f793405 100644 --- a/src/TNL/Containers/ndarray/Operations.h +++ b/src/TNL/Containers/ndarray/Operations.h @@ -15,6 +15,7 @@ #include #include +#include namespace TNL { namespace Containers { @@ -25,34 +26,45 @@ template< typename Permutation, typename LevelTag = IndexTag< 0 > > struct SequentialExecutor { - template< typename SizesHolder, + template< typename Begins, + typename Ends, typename Func, typename... Indices > __cuda_callable__ - void operator()( const SizesHolder& sizes, Func f, Indices&&... indices ) + void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) { + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + SequentialExecutor< Permutation, IndexTag< LevelTag::value + 1 > > exec; - const auto size = sizes.template getSize< get< LevelTag::value >( Permutation{} ) >(); - for( typename SizesHolder::IndexType i = 0; i < size; i++ ) - exec( sizes, f, std::forward< Indices >( indices )..., i ); + const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); + const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); + for( auto i = begin; i < end; i++ ) + exec( begins, ends, f, std::forward< Indices >( indices )..., i ); } }; template< typename Permutation > struct SequentialExecutor< Permutation, IndexTag< Permutation::size() - 1 > > { - template< typename SizesHolder, + template< typename Begins, + typename Ends, typename Func, typename... Indices > __cuda_callable__ - void operator()( const SizesHolder& sizes, Func f, Indices&&... indices ) + void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... 
indices ) { - static_assert( sizeof...(indices) == SizesHolder::getDimension() - 1, + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + static_assert( sizeof...(indices) == Begins::getDimension() - 1, "invalid number of indices in the final step of the SequentialExecutor" ); - const auto size = sizes.template getSize< get< SizesHolder::getDimension() - 1 >( Permutation{} ) >(); - for( typename SizesHolder::IndexType i = 0; i < size; i++ ) - call_with_permuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i ); + using LevelTag = IndexTag< Permutation::size() - 1 >; + + const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); + const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); + for( auto i = begin; i < end; i++ ) + call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i ); } }; @@ -61,34 +73,43 @@ template< typename Permutation, typename LevelTag = IndexTag< Permutation::size() - 1 > > struct SequentialExecutorRTL { - template< typename SizesHolder, + template< typename Begins, + typename Ends, typename Func, typename... Indices > __cuda_callable__ - void operator()( const SizesHolder& sizes, Func f, Indices&&... indices ) + void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) { + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + SequentialExecutorRTL< Permutation, IndexTag< LevelTag::value - 1 > > exec; - const auto size = sizes.template getSize< get< LevelTag::value >( Permutation{} ) >(); - for( typename SizesHolder::IndexType i = 0; i < size; i++ ) - exec( sizes, f, i, std::forward< Indices >( indices )... 
); + const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); + const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); + for( auto i = begin; i < end; i++ ) + exec( begins, ends, f, i, std::forward< Indices >( indices )... ); } }; template< typename Permutation > struct SequentialExecutorRTL< Permutation, IndexTag< 0 > > { - template< typename SizesHolder, + template< typename Begins, + typename Ends, typename Func, typename... Indices > __cuda_callable__ - void operator()( const SizesHolder& sizes, Func f, Indices&&... indices ) + void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) { - static_assert( sizeof...(indices) == SizesHolder::getDimension() - 1, - "invalid number of indices in the final step of the SequentialExecutor" ); - - const auto size = sizes.template getSize< get< 0 >( Permutation{} ) >(); - for( typename SizesHolder::IndexType i = 0; i < size; i++ ) - call_with_permuted_arguments< Permutation >( f, i, std::forward< Indices >( indices )... ); + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + static_assert( sizeof...(indices) == Begins::getDimension() - 1, + "invalid number of indices in the final step of the SequentialExecutorRTL" ); + + const auto begin = begins.template getSize< get< 0 >( Permutation{} ) >(); + const auto end = ends.template getSize< get< 0 >( Permutation{} ) >(); + for( auto i = begin; i < end; i++ ) + call_with_unpermuted_arguments< Permutation >( f, i, std::forward< Indices >( indices )... 
); } }; @@ -97,42 +118,58 @@ template< typename Permutation, typename Device > struct ParallelExecutorDeviceDispatch { - template< typename SizesHolder, typename Func > - void operator()( const SizesHolder& sizes, Func f ) + template< typename Begins, + typename Ends, + typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) { - using Index = typename SizesHolder::IndexType; + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + + using Index = typename Ends::IndexType; auto kernel = [=] ( Index i2, Index i1, Index i0 ) { SequentialExecutor< Permutation, IndexTag< 3 > > exec; - exec( sizes, f, i0, i1, i2 ); + exec( begins, ends, f, i0, i1, i2 ); }; - const Index size0 = sizes.template getSize< get< 0 >( Permutation{} ) >(); - const Index size1 = sizes.template getSize< get< 1 >( Permutation{} ) >(); - const Index size2 = sizes.template getSize< get< 2 >( Permutation{} ) >(); - ParallelFor3D< Device >::exec( (Index) 0, (Index) 0, (Index) 0, size2, size1, size0, kernel ); + const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); + const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); + const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >(); + const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); + const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); + const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >(); + ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); } }; template< typename Permutation > struct ParallelExecutorDeviceDispatch< Permutation, Devices::Cuda > { - template< typename SizesHolder, typename Func > - void operator()( const SizesHolder& sizes, Func f ) + template< typename Begins, + typename Ends, + typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) { - using Index = typename SizesHolder::IndexType; + 
static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + + using Index = typename Ends::IndexType; auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) { - SequentialExecutorRTL< Permutation, IndexTag< SizesHolder::getDimension() - 4 > > exec; - exec( sizes, f, i0, i1, i2 ); + SequentialExecutorRTL< Permutation, IndexTag< Begins::getDimension() - 4 > > exec; + exec( begins, ends, f, i0, i1, i2 ); }; - const Index size0 = sizes.template getSize< get< SizesHolder::getDimension() - 3 >( Permutation{} ) >(); - const Index size1 = sizes.template getSize< get< SizesHolder::getDimension() - 2 >( Permutation{} ) >(); - const Index size2 = sizes.template getSize< get< SizesHolder::getDimension() - 1 >( Permutation{} ) >(); - ParallelFor3D< Devices::Cuda >::exec( (Index) 0, (Index) 0, (Index) 0, size2, size1, size0, kernel ); + const Index begin0 = begins.template getSize< get< Begins::getDimension() - 3 >( Permutation{} ) >(); + const Index begin1 = begins.template getSize< get< Begins::getDimension() - 2 >( Permutation{} ) >(); + const Index begin2 = begins.template getSize< get< Begins::getDimension() - 1 >( Permutation{} ) >(); + const Index end0 = ends.template getSize< get< Ends::getDimension() - 3 >( Permutation{} ) >(); + const Index end1 = ends.template getSize< get< Ends::getDimension() - 2 >( Permutation{} ) >(); + const Index end2 = ends.template getSize< get< Ends::getDimension() - 1 >( Permutation{} ) >(); + ParallelFor3D< Devices::Cuda >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); } }; @@ -141,11 +178,13 @@ template< typename Permutation, typename DimTag = IndexTag< Permutation::size() > > struct ParallelExecutor { - template< typename SizesHolder, typename Func > - void operator()( const SizesHolder& sizes, Func f ) + template< typename Begins, + typename Ends, + typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) { ParallelExecutorDeviceDispatch< Permutation, 
Device > dispatch; - dispatch( sizes, f ); + dispatch( begins, ends, f ); } }; @@ -153,20 +192,28 @@ template< typename Permutation, typename Device > struct ParallelExecutor< Permutation, Device, IndexTag< 3 > > { - template< typename SizesHolder, typename Func > - void operator()( const SizesHolder& sizes, Func f ) + template< typename Begins, + typename Ends, + typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) { - using Index = typename SizesHolder::IndexType; + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + + using Index = typename Ends::IndexType; auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) { - call_with_permuted_arguments< Permutation >( f, i0, i1, i2 ); + call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); }; - const Index size0 = sizes.template getSize< get< 0 >( Permutation{} ) >(); - const Index size1 = sizes.template getSize< get< 1 >( Permutation{} ) >(); - const Index size2 = sizes.template getSize< get< 2 >( Permutation{} ) >(); - ParallelFor3D< Device >::exec( (Index) 0, (Index) 0, (Index) 0, size2, size1, size0, kernel ); + const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); + const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); + const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >(); + const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); + const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); + const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >(); + ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); } }; @@ -174,19 +221,26 @@ template< typename Permutation, typename Device > struct ParallelExecutor< Permutation, Device, IndexTag< 2 > > { - template< typename SizesHolder, typename Func > - void operator()( const SizesHolder& sizes, Func f ) + template< typename Begins, + 
typename Ends, + typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) { - using Index = typename SizesHolder::IndexType; + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + + using Index = typename Ends::IndexType; auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 ) { - call_with_permuted_arguments< Permutation >( f, i0, i1 ); + call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); }; - const Index size0 = sizes.template getSize< get< 0 >( Permutation{} ) >(); - const Index size1 = sizes.template getSize< get< 1 >( Permutation{} ) >(); - ParallelFor2D< Device >::exec( (Index) 0, (Index) 0, size1, size0, kernel ); + const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); + const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); + const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); + const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); + ParallelFor2D< Device >::exec( begin1, begin0, end1, end0, kernel ); } }; @@ -194,18 +248,25 @@ template< typename Permutation, typename Device > struct ParallelExecutor< Permutation, Device, IndexTag< 1 > > { - template< typename SizesHolder, typename Func > - void operator()( const SizesHolder& sizes, Func f ) + template< typename Begins, + typename Ends, + typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) { - using Index = typename SizesHolder::IndexType; + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); - auto kernel = [=] __cuda_callable__ ( Index i ) - { - call_with_permuted_arguments< Permutation >( f, i ); - }; + using Index = typename Ends::IndexType; + +// auto kernel = [=] __cuda_callable__ ( Index i ) +// { +// call_with_unpermuted_arguments< Permutation >( f, i ); +// }; - const Index size = sizes.template getSize< get< 0 >( Permutation{} ) >(); - ParallelFor< Device >::exec( 
(Index) 0, size, kernel ); + const Index begin = begins.template getSize< get< 0 >( Permutation{} ) >(); + const Index end = ends.template getSize< get< 0 >( Permutation{} ) >(); +// ParallelFor< Device >::exec( begin, end, kernel ); + ParallelFor< Device >::exec( begin, end, f ); } }; @@ -215,33 +276,33 @@ template< typename Permutation, typename Device > struct ExecutorDispatcher { - template< typename SizesHolder, typename Func > - void operator()( const SizesHolder& sizes, Func f ) + template< typename Begins, typename Ends, typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) { - SequentialExecutor< Permutation >()( sizes, f ); + SequentialExecutor< Permutation >()( begins, ends, f ); } }; template< typename Permutation > struct ExecutorDispatcher< Permutation, Devices::Host > { - template< typename SizesHolder, typename Func > - void operator()( const SizesHolder& sizes, Func f ) + template< typename Begins, typename Ends, typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) { if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) - ParallelExecutor< Permutation, Devices::Host >()( sizes, f ); + ParallelExecutor< Permutation, Devices::Host >()( begins, ends, f ); else - SequentialExecutor< Permutation >()( sizes, f ); + SequentialExecutor< Permutation >()( begins, ends, f ); } }; template< typename Permutation > struct ExecutorDispatcher< Permutation, Devices::Cuda > { - template< typename SizesHolder, typename Func > - void operator()( const SizesHolder& sizes, Func f ) + template< typename Begins, typename Ends, typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) { - ParallelExecutor< Permutation, Devices::Cuda >()( sizes, f ); + ParallelExecutor< Permutation, Devices::Cuda >()( begins, ends, f ); } }; @@ -263,7 +324,8 @@ void nd_map_view( Output output, Func f, const Input... 
input ) }; ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch; - dispatch( output.getSizes(), wrapper ); + using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >; + dispatch( Begins{}, output.getSizes(), wrapper ); } #else @@ -362,7 +424,8 @@ void nd_map_view( Output output, Func f ) { nvcc_map_helper_0< Output, Func > wrapper( output, f ); ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch; - dispatch( output.getSizes(), wrapper ); + using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >; + dispatch( Begins{}, output.getSizes(), wrapper ); } template< typename Output, @@ -375,7 +438,8 @@ void nd_map_view( Output output, Func f, const Input1 input1 ) nvcc_map_helper_1< Output, Func, Input1 > wrapper( output, f, input1 ); ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch; - dispatch( output.getSizes(), wrapper ); + using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >; + dispatch( Begins{}, output.getSizes(), wrapper ); } template< typename Output, @@ -389,7 +453,8 @@ void nd_map_view( Output output, Func f, const Input1 input1, const Input2 input nvcc_map_helper_2< Output, Func, Input1, Input2 > wrapper( output, f, input1, input2 ); ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch; - dispatch( output.getSizes(), wrapper ); + using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >; + dispatch( Begins{}, output.getSizes(), wrapper ); } template< typename Output, @@ -404,7 +469,8 @@ void nd_map_view( Output output, Func f, const Input1 input1, const Input2 input nvcc_map_helper_3< Output, Func, Input1, Input2, Input3 > wrapper( output, f, input1, input2, input3 ); ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > 
dispatch; - dispatch( output.getSizes(), wrapper ); + using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >; + dispatch( Begins{}, output.getSizes(), wrapper ); } #endif diff --git a/src/TNL/Containers/ndarray/SizesHolder.h b/src/TNL/Containers/ndarray/SizesHolder.h index 5b6e52f5f..306569de7 100644 --- a/src/TNL/Containers/ndarray/SizesHolder.h +++ b/src/TNL/Containers/ndarray/SizesHolder.h @@ -201,6 +201,49 @@ public: }; +template< typename Index, + std::size_t dimension, + Index constSize > +class ConstStaticSizesHolder +{ +public: + using IndexType = Index; + + static constexpr std::size_t getDimension() + { + return dimension; + } + + template< std::size_t level > + static constexpr std::size_t getStaticSize() + { + static_assert( level < getDimension(), "Invalid level passed to getStaticSize()." ); + return constSize; + } + + template< std::size_t level > + __cuda_callable__ + Index getSize() const + { + static_assert( level < getDimension(), "Invalid dimension passed to getSize()." ); + return constSize; + } + + // methods for convenience + __cuda_callable__ + bool operator==( const ConstStaticSizesHolder& other ) const + { + return true; + } + + __cuda_callable__ + bool operator!=( const ConstStaticSizesHolder& other ) const + { + return false; + } +}; + + template< typename Index, std::size_t... sizes > std::ostream& operator<<( std::ostream& str, const SizesHolder< Index, sizes... >& holder ) @@ -213,5 +256,25 @@ std::ostream& operator<<( std::ostream& str, const SizesHolder< Index, sizes... return str; } + +// helper for the forInternal method +namespace __ndarray_impl { + +template< typename SizesHolder, + std::size_t ConstValue > +struct SubtractedSizesHolder +{}; + +template< typename Index, + std::size_t ConstValue, + std::size_t... sizes > +struct SubtractedSizesHolder< SizesHolder< Index, sizes... >, ConstValue > +{ +// using type = SizesHolder< Index, std::max( (std::size_t) 0, sizes - ConstValue )... 
>; + using type = SizesHolder< Index, ( (sizes >= ConstValue) ? sizes - ConstValue : 0 )... >; +}; + +} // namespace __ndarray_impl + } // namespace Containers } // namespace TNL diff --git a/src/UnitTests/Containers/ndarray/NDArrayTest.cpp b/src/UnitTests/Containers/ndarray/NDArrayTest.cpp index 385ff93da..74221bdcb 100644 --- a/src/UnitTests/Containers/ndarray/NDArrayTest.cpp +++ b/src/UnitTests/Containers/ndarray/NDArrayTest.cpp @@ -180,9 +180,119 @@ TEST( NDArrayTest, SizesHolderPrinter ) EXPECT_EQ( str.str(), "SizesHolder< 0, 1, 2 >( 3, 1, 2 )" ); } -TEST( NDArrayTest, forAll_dynamic ) +TEST( NDArrayTest, forAll_dynamic_1D ) { - int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2; + int I = 2; + NDArray< int, + SizesHolder< int, 0 >, + index_sequence< 0 > > a; + a.setSizes( I ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + EXPECT_EQ( a( i ), 1 ); +} + +TEST( NDArrayTest, forAll_dynamic_2D ) +{ + int I = 2, J = 3; + NDArray< int, + SizesHolder< int, 0, 0 >, + index_sequence< 1, 0 > > a; + a.setSizes( I, J ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j ) + { + a( i, j ) += 1; + }; + + a.forAll( setter ); + + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + EXPECT_EQ( a( i, j ), 1 ); +} + +TEST( NDArrayTest, forAll_dynamic_3D ) +{ + int I = 2, J = 3, K = 4; + NDArray< int, + SizesHolder< int, 0, 0, 0 >, + index_sequence< 2, 0, 1 > > a; + a.setSizes( I, J, K ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forAll( setter ); + + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + EXPECT_EQ( a( i, j, k ), 1 ); +} + +TEST( NDArrayTest, forAll_dynamic_4D ) +{ + int I = 2, J = 3, K = 4, L = 5; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0 >, + index_sequence< 3, 2, 0, 1 > > a; + a.setSizes( I, J, K, L ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, 
int l ) + { + a( i, j, k, l ) += 1; + }; + + a.forAll( setter ); + + for( int l = 0; l < L; l++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + EXPECT_EQ( a( i, j, k, l ), 1 ); +} + +TEST( NDArrayTest, forAll_dynamic_5D ) +{ + int I = 2, J = 3, K = 4, L = 5, M = 6; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0 >, + index_sequence< 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m ) + { + a( i, j, k, l, m ) += 1; + }; + + a.forAll( setter ); + + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + EXPECT_EQ( a( i, j, k, l, m ), 1 ); +} + +TEST( NDArrayTest, forAll_dynamic_6D ) +{ + int I = 2, J = 3, K = 4, L = 5, M = 6, N = 7; NDArray< int, SizesHolder< int, 0, 0, 0, 0, 0, 0 >, index_sequence< 5, 3, 4, 2, 0, 1 > > a; @@ -191,7 +301,7 @@ TEST( NDArrayTest, forAll_dynamic ) auto setter = [&] ( int i, int j, int k, int l, int m, int n ) { - a( i, j, k, l, m, n ) = 1; + a( i, j, k, l, m, n ) += 1; }; a.forAll( setter ); @@ -205,19 +315,34 @@ TEST( NDArrayTest, forAll_dynamic ) EXPECT_EQ( a( i, j, k, l, m, n ), 1 ); } -TEST( NDArrayTest, forAll_static ) +TEST( NDArrayTest, forAll_static_1D ) { - constexpr int I = 3, J = 4; - NDArray< int, SizesHolder< int, I, J > > a; - a.setSizes( 0, 0 ); + constexpr int I = 3; + StaticNDArray< int, SizesHolder< int, I > > a; +// a.setSizes( 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forAll( setter ); for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - a( i, j ) = 0; + EXPECT_EQ( a( i ), 1 ); +} + +TEST( NDArrayTest, forAll_static_2D ) +{ + constexpr int I = 3, J = 4; + StaticNDArray< int, SizesHolder< int, I, J > > a; +// a.setSizes( 0, 0 ); + a.setValue( 0 ); auto setter = [&] ( int i, int j ) { - a( i, j ) = 1; + a( i, j ) += 1; }; a.forAll( setter ); 
@@ -227,6 +352,464 @@ TEST( NDArrayTest, forAll_static ) EXPECT_EQ( a( i, j ), 1 ); } +TEST( NDArrayTest, forAll_static_3D ) +{ + constexpr int I = 3, J = 4, K = 5; + StaticNDArray< int, SizesHolder< int, I, J, K > > a; +// a.setSizes( 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + for( int k = 0; k < K; k++ ) + EXPECT_EQ( a( i, j, k ), 1 ); +} + +TEST( NDArrayTest, forAll_static_4D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6; + StaticNDArray< int, SizesHolder< int, I, J, K, L > > a; +// a.setSizes( 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l ) + { + a( i, j, k, l ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + for( int k = 0; k < K; k++ ) + for( int l = 0; l < L; l++ ) + EXPECT_EQ( a( i, j, k, l ), 1 ); +} + +TEST( NDArrayTest, forAll_static_5D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 7; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M > > a; +// a.setSizes( 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m ) + { + a( i, j, k, l, m ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + for( int k = 0; k < K; k++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + EXPECT_EQ( a( i, j, k, l, m ), 1 ); +} + +TEST( NDArrayTest, forAll_static_6D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M, N > > a; +// a.setSizes( 0, 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m, int n ) + { + a( i, j, k, l, m, n ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + for( int k = 0; k < K; k++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) 
+ for( int n = 0; n < N; n++ ) + EXPECT_EQ( a( i, j, k, l, m, n ), 1 ); +} + +TEST( NDArrayTest, forInternal_dynamic_1D ) +{ + int I = 3; + NDArray< int, + SizesHolder< int, 0 >, + index_sequence< 0 > > a; + a.setSizes( I ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forInternal( setter ); + + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 ) + EXPECT_EQ( a( i ), 0 ) + << "i = " << i; + else + EXPECT_EQ( a( i ), 1 ) + << "i = " << i; + } +} + +TEST( NDArrayTest, forInternal_dynamic_2D ) +{ + int I = 3, J = 4; + NDArray< int, + SizesHolder< int, 0, 0 >, + index_sequence< 1, 0 > > a; + a.setSizes( I, J ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j ) + { + a( i, j ) += 1; + }; + + a.forInternal( setter ); + + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 ) + EXPECT_EQ( a( i, j ), 0 ) + << "i = " << i << ", j = " << j; + else + EXPECT_EQ( a( i, j ), 1 ) + << "i = " << i << ", j = " << j; + } +} + +TEST( NDArrayTest, forInternal_dynamic_3D ) +{ + int I = 3, J = 4, K = 5; + NDArray< int, + SizesHolder< int, 0, 0, 0 >, + index_sequence< 2, 0, 1 > > a; + a.setSizes( I, J, K ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forInternal( setter ); + + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 ) + EXPECT_EQ( a( i, j, k ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k; + else + EXPECT_EQ( a( i, j, k ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k; + } +} + +TEST( NDArrayTest, forInternal_dynamic_4D ) +{ + int I = 3, J = 4, K = 5, L = 6; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0 >, + index_sequence< 3, 2, 0, 1 > > a; + a.setSizes( I, J, K, L ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l ) + { + a( i, j, k, l ) += 
1; + }; + + a.forInternal( setter ); + + for( int l = 0; l < L; l++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 ) + EXPECT_EQ( a( i, j, k, l ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + else + EXPECT_EQ( a( i, j, k, l ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + } +} + +TEST( NDArrayTest, forInternal_dynamic_5D ) +{ + int I = 3, J = 4, K = 5, L = 6, M = 7; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0 >, + index_sequence< 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m ) + { + a( i, j, k, l, m ) += 1; + }; + + a.forInternal( setter ); + + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 ) + EXPECT_EQ( a( i, j, k, l, m ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + else + EXPECT_EQ( a( i, j, k, l, m ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + } +} + +TEST( NDArrayTest, forInternal_dynamic_6D ) +{ + int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m, int n ) + { + a( i, j, k, l, m, n ) += 1; + }; + + a.forInternal( setter ); + + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i 
== I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 || + n == 0 || n == N - 1 ) + EXPECT_EQ( a( i, j, k, l, m, n ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + else + EXPECT_EQ( a( i, j, k, l, m, n ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + } +} + +TEST( NDArrayTest, forInternal_static_1D ) +{ + constexpr int I = 3; + StaticNDArray< int, SizesHolder< int, I > > a; +// a.setSizes( 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forInternal( setter ); + + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 ) + EXPECT_EQ( a( i ), 0 ) + << "i = " << i; + else + EXPECT_EQ( a( i ), 1 ) + << "i = " << i; + } +} + +TEST( NDArrayTest, forInternal_static_2D ) +{ + constexpr int I = 3, J = 4; + StaticNDArray< int, SizesHolder< int, I, J > > a; +// a.setSizes( 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j ) + { + a( i, j ) += 1; + }; + + a.forInternal( setter ); + + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 ) + EXPECT_EQ( a( i, j ), 0 ) + << "i = " << i << ", j = " << j; + else + EXPECT_EQ( a( i, j ), 1 ) + << "i = " << i << ", j = " << j; + } +} + +TEST( NDArrayTest, forInternal_static_3D ) +{ + constexpr int I = 3, J = 4, K = 5; + StaticNDArray< int, SizesHolder< int, I, J, K > > a; +// a.setSizes( 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forInternal( setter ); + + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 ) + EXPECT_EQ( a( i, j, k ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k; + else + EXPECT_EQ( a( i, j, k ), 1 ) + << "i = " << i << 
", j = " << j << ", k = " << k; + } +} + +TEST( NDArrayTest, forInternal_static_4D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6; + StaticNDArray< int, SizesHolder< int, I, J, K, L > > a; +// a.setSizes( 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l ) + { + a( i, j, k, l ) += 1; + }; + + a.forInternal( setter ); + + for( int l = 0; l < L; l++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 ) + EXPECT_EQ( a( i, j, k, l ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + else + EXPECT_EQ( a( i, j, k, l ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + } +} + +TEST( NDArrayTest, forInternal_static_5D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 7; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M > > a; +// a.setSizes( 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m ) + { + a( i, j, k, l, m ) += 1; + }; + + a.forInternal( setter ); + + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 ) + EXPECT_EQ( a( i, j, k, l, m ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + else + EXPECT_EQ( a( i, j, k, l, m ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + } +} + +TEST( NDArrayTest, forInternal_static_6D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M, N > > a; +// a.setSizes( 0, 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m, int n ) + { 
+ a( i, j, k, l, m, n ) += 1; + }; + + a.forInternal( setter ); + + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 || + n == 0 || n == N - 1 ) + EXPECT_EQ( a( i, j, k, l, m, n ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + else + EXPECT_EQ( a( i, j, k, l, m, n ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + } +} + //#include "GtestMissingError.h" int main( int argc, char* argv[] ) { -- GitLab From 061263a6412a51314282418be75e3ac9b61b9307 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sat, 9 Feb 2019 15:17:39 +0100 Subject: [PATCH 08/25] NDArray: split implementation of executors into a separate header file --- src/TNL/Containers/NDArrayView.h | 1 + src/TNL/Containers/ndarray/Executors.h | 310 ++++++++++++++++++++++++ src/TNL/Containers/ndarray/Operations.h | 289 +--------------------- 3 files changed, 312 insertions(+), 288 deletions(-) create mode 100644 src/TNL/Containers/ndarray/Executors.h diff --git a/src/TNL/Containers/NDArrayView.h b/src/TNL/Containers/NDArrayView.h index cafaabe9b..50119eda4 100644 --- a/src/TNL/Containers/NDArrayView.h +++ b/src/TNL/Containers/NDArrayView.h @@ -15,6 +15,7 @@ #include #include #include +#include #include namespace TNL { diff --git a/src/TNL/Containers/ndarray/Executors.h b/src/TNL/Containers/ndarray/Executors.h new file mode 100644 index 000000000..ba37fe345 --- /dev/null +++ b/src/TNL/Containers/ndarray/Executors.h @@ -0,0 +1,310 @@ +/*************************************************************************** + Executors.h - description + ------------------- + begin : Dec 24, 2018 + copyright : (C) 2018 
by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include + +#include +#include + +namespace TNL { +namespace Containers { +namespace __ndarray_impl { + +template< typename Permutation, + typename LevelTag = IndexTag< 0 > > +struct SequentialExecutor +{ + template< typename Begins, + typename Ends, + typename Func, + typename... Indices > + __cuda_callable__ + void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) + { + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + + SequentialExecutor< Permutation, IndexTag< LevelTag::value + 1 > > exec; + const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); + const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); + for( auto i = begin; i < end; i++ ) + exec( begins, ends, f, std::forward< Indices >( indices )..., i ); + } +}; + +template< typename Permutation > +struct SequentialExecutor< Permutation, IndexTag< Permutation::size() - 1 > > +{ + template< typename Begins, + typename Ends, + typename Func, + typename... Indices > + __cuda_callable__ + void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... 
indices ) + { + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + static_assert( sizeof...(indices) == Begins::getDimension() - 1, + "invalid number of indices in the final step of the SequentialExecutor" ); + + using LevelTag = IndexTag< Permutation::size() - 1 >; + + const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); + const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); + for( auto i = begin; i < end; i++ ) + call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i ); + } +}; + + +template< typename Permutation, + typename LevelTag = IndexTag< Permutation::size() - 1 > > +struct SequentialExecutorRTL +{ + template< typename Begins, + typename Ends, + typename Func, + typename... Indices > + __cuda_callable__ + void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) + { + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + + SequentialExecutorRTL< Permutation, IndexTag< LevelTag::value - 1 > > exec; + const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); + const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); + for( auto i = begin; i < end; i++ ) + exec( begins, ends, f, i, std::forward< Indices >( indices )... ); + } +}; + +template< typename Permutation > +struct SequentialExecutorRTL< Permutation, IndexTag< 0 > > +{ + template< typename Begins, + typename Ends, + typename Func, + typename... Indices > + __cuda_callable__ + void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... 
indices ) + { + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + static_assert( sizeof...(indices) == Begins::getDimension() - 1, + "invalid number of indices in the final step of the SequentialExecutorRTL" ); + + const auto begin = begins.template getSize< get< 0 >( Permutation{} ) >(); + const auto end = ends.template getSize< get< 0 >( Permutation{} ) >(); + for( auto i = begin; i < end; i++ ) + call_with_unpermuted_arguments< Permutation >( f, i, std::forward< Indices >( indices )... ); + } +}; + + +template< typename Permutation, + typename Device > +struct ParallelExecutorDeviceDispatch +{ + template< typename Begins, + typename Ends, + typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) + { + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + + using Index = typename Ends::IndexType; + + auto kernel = [=] ( Index i2, Index i1, Index i0 ) + { + SequentialExecutor< Permutation, IndexTag< 3 > > exec; + exec( begins, ends, f, i0, i1, i2 ); + }; + + const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); + const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); + const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >(); + const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); + const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); + const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >(); + ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); + } +}; + +template< typename Permutation > +struct ParallelExecutorDeviceDispatch< Permutation, Devices::Cuda > +{ + template< typename Begins, + typename Ends, + typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) + { + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + + using Index = 
typename Ends::IndexType; + + auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) + { + SequentialExecutorRTL< Permutation, IndexTag< Begins::getDimension() - 4 > > exec; + exec( begins, ends, f, i0, i1, i2 ); + }; + + const Index begin0 = begins.template getSize< get< Begins::getDimension() - 3 >( Permutation{} ) >(); + const Index begin1 = begins.template getSize< get< Begins::getDimension() - 2 >( Permutation{} ) >(); + const Index begin2 = begins.template getSize< get< Begins::getDimension() - 1 >( Permutation{} ) >(); + const Index end0 = ends.template getSize< get< Ends::getDimension() - 3 >( Permutation{} ) >(); + const Index end1 = ends.template getSize< get< Ends::getDimension() - 2 >( Permutation{} ) >(); + const Index end2 = ends.template getSize< get< Ends::getDimension() - 1 >( Permutation{} ) >(); + ParallelFor3D< Devices::Cuda >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); + } +}; + +template< typename Permutation, + typename Device, + typename DimTag = IndexTag< Permutation::size() > > +struct ParallelExecutor +{ + template< typename Begins, + typename Ends, + typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) + { + ParallelExecutorDeviceDispatch< Permutation, Device > dispatch; + dispatch( begins, ends, f ); + } +}; + +template< typename Permutation, + typename Device > +struct ParallelExecutor< Permutation, Device, IndexTag< 3 > > +{ + template< typename Begins, + typename Ends, + typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) + { + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + + using Index = typename Ends::IndexType; + + auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) + { + call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); + }; + + const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); + const Index begin1 = begins.template getSize< get< 1 >( 
Permutation{} ) >(); + const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >(); + const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); + const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); + const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >(); + ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); + } +}; + +template< typename Permutation, + typename Device > +struct ParallelExecutor< Permutation, Device, IndexTag< 2 > > +{ + template< typename Begins, + typename Ends, + typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) + { + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + + using Index = typename Ends::IndexType; + + auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 ) + { + call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); + }; + + const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); + const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); + const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); + const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); + ParallelFor2D< Device >::exec( begin1, begin0, end1, end0, kernel ); + } +}; + +template< typename Permutation, + typename Device > +struct ParallelExecutor< Permutation, Device, IndexTag< 1 > > +{ + template< typename Begins, + typename Ends, + typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) + { + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + + using Index = typename Ends::IndexType; + +// auto kernel = [=] __cuda_callable__ ( Index i ) +// { +// call_with_unpermuted_arguments< Permutation >( f, i ); +// }; + + const Index begin = begins.template getSize< get< 0 >( Permutation{} ) >(); + const Index end = ends.template getSize< get< 0 >( 
Permutation{} ) >(); +// ParallelFor< Device >::exec( begin, end, kernel ); + ParallelFor< Device >::exec( begin, end, f ); + } +}; + + +// Device may be void which stands for StaticNDArray +template< typename Permutation, + typename Device > +struct ExecutorDispatcher +{ + template< typename Begins, typename Ends, typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) + { + SequentialExecutor< Permutation >()( begins, ends, f ); + } +}; + +template< typename Permutation > +struct ExecutorDispatcher< Permutation, Devices::Host > +{ + template< typename Begins, typename Ends, typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) + { + if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) + ParallelExecutor< Permutation, Devices::Host >()( begins, ends, f ); + else + SequentialExecutor< Permutation >()( begins, ends, f ); + } +}; + +template< typename Permutation > +struct ExecutorDispatcher< Permutation, Devices::Cuda > +{ + template< typename Begins, typename Ends, typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) + { + ParallelExecutor< Permutation, Devices::Cuda >()( begins, ends, f ); + } +}; + +} // namespace __ndarray_impl +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/ndarray/Operations.h b/src/TNL/Containers/ndarray/Operations.h index b1f793405..eb219b6e0 100644 --- a/src/TNL/Containers/ndarray/Operations.h +++ b/src/TNL/Containers/ndarray/Operations.h @@ -12,300 +12,13 @@ #pragma once -#include - -#include -#include +#include namespace TNL { namespace Containers { namespace __ndarray_impl { -template< typename Permutation, - typename LevelTag = IndexTag< 0 > > -struct SequentialExecutor -{ - template< typename Begins, - typename Ends, - typename Func, - typename... Indices > - __cuda_callable__ - void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... 
indices ) - { - static_assert( Begins::getDimension() == Ends::getDimension(), - "wrong begins or ends" ); - - SequentialExecutor< Permutation, IndexTag< LevelTag::value + 1 > > exec; - const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); - const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); - for( auto i = begin; i < end; i++ ) - exec( begins, ends, f, std::forward< Indices >( indices )..., i ); - } -}; - -template< typename Permutation > -struct SequentialExecutor< Permutation, IndexTag< Permutation::size() - 1 > > -{ - template< typename Begins, - typename Ends, - typename Func, - typename... Indices > - __cuda_callable__ - void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) - { - static_assert( Begins::getDimension() == Ends::getDimension(), - "wrong begins or ends" ); - static_assert( sizeof...(indices) == Begins::getDimension() - 1, - "invalid number of indices in the final step of the SequentialExecutor" ); - - using LevelTag = IndexTag< Permutation::size() - 1 >; - - const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); - const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); - for( auto i = begin; i < end; i++ ) - call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i ); - } -}; - - -template< typename Permutation, - typename LevelTag = IndexTag< Permutation::size() - 1 > > -struct SequentialExecutorRTL -{ - template< typename Begins, - typename Ends, - typename Func, - typename... Indices > - __cuda_callable__ - void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... 
indices ) - { - static_assert( Begins::getDimension() == Ends::getDimension(), - "wrong begins or ends" ); - - SequentialExecutorRTL< Permutation, IndexTag< LevelTag::value - 1 > > exec; - const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); - const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); - for( auto i = begin; i < end; i++ ) - exec( begins, ends, f, i, std::forward< Indices >( indices )... ); - } -}; - -template< typename Permutation > -struct SequentialExecutorRTL< Permutation, IndexTag< 0 > > -{ - template< typename Begins, - typename Ends, - typename Func, - typename... Indices > - __cuda_callable__ - void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) - { - static_assert( Begins::getDimension() == Ends::getDimension(), - "wrong begins or ends" ); - static_assert( sizeof...(indices) == Begins::getDimension() - 1, - "invalid number of indices in the final step of the SequentialExecutorRTL" ); - - const auto begin = begins.template getSize< get< 0 >( Permutation{} ) >(); - const auto end = ends.template getSize< get< 0 >( Permutation{} ) >(); - for( auto i = begin; i < end; i++ ) - call_with_unpermuted_arguments< Permutation >( f, i, std::forward< Indices >( indices )... 
); - } -}; - - -template< typename Permutation, - typename Device > -struct ParallelExecutorDeviceDispatch -{ - template< typename Begins, - typename Ends, - typename Func > - void operator()( const Begins& begins, const Ends& ends, Func f ) - { - static_assert( Begins::getDimension() == Ends::getDimension(), - "wrong begins or ends" ); - - using Index = typename Ends::IndexType; - - auto kernel = [=] ( Index i2, Index i1, Index i0 ) - { - SequentialExecutor< Permutation, IndexTag< 3 > > exec; - exec( begins, ends, f, i0, i1, i2 ); - }; - - const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); - const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); - const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >(); - const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); - const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); - const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >(); - ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); - } -}; - -template< typename Permutation > -struct ParallelExecutorDeviceDispatch< Permutation, Devices::Cuda > -{ - template< typename Begins, - typename Ends, - typename Func > - void operator()( const Begins& begins, const Ends& ends, Func f ) - { - static_assert( Begins::getDimension() == Ends::getDimension(), - "wrong begins or ends" ); - - using Index = typename Ends::IndexType; - - auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) - { - SequentialExecutorRTL< Permutation, IndexTag< Begins::getDimension() - 4 > > exec; - exec( begins, ends, f, i0, i1, i2 ); - }; - - const Index begin0 = begins.template getSize< get< Begins::getDimension() - 3 >( Permutation{} ) >(); - const Index begin1 = begins.template getSize< get< Begins::getDimension() - 2 >( Permutation{} ) >(); - const Index begin2 = begins.template getSize< get< Begins::getDimension() - 1 >( Permutation{} ) 
>(); - const Index end0 = ends.template getSize< get< Ends::getDimension() - 3 >( Permutation{} ) >(); - const Index end1 = ends.template getSize< get< Ends::getDimension() - 2 >( Permutation{} ) >(); - const Index end2 = ends.template getSize< get< Ends::getDimension() - 1 >( Permutation{} ) >(); - ParallelFor3D< Devices::Cuda >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); - } -}; - -template< typename Permutation, - typename Device, - typename DimTag = IndexTag< Permutation::size() > > -struct ParallelExecutor -{ - template< typename Begins, - typename Ends, - typename Func > - void operator()( const Begins& begins, const Ends& ends, Func f ) - { - ParallelExecutorDeviceDispatch< Permutation, Device > dispatch; - dispatch( begins, ends, f ); - } -}; - -template< typename Permutation, - typename Device > -struct ParallelExecutor< Permutation, Device, IndexTag< 3 > > -{ - template< typename Begins, - typename Ends, - typename Func > - void operator()( const Begins& begins, const Ends& ends, Func f ) - { - static_assert( Begins::getDimension() == Ends::getDimension(), - "wrong begins or ends" ); - - using Index = typename Ends::IndexType; - - auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) - { - call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); - }; - - const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); - const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); - const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >(); - const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); - const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); - const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >(); - ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); - } -}; - -template< typename Permutation, - typename Device > -struct ParallelExecutor< Permutation, Device, IndexTag< 2 > > -{ - 
template< typename Begins, - typename Ends, - typename Func > - void operator()( const Begins& begins, const Ends& ends, Func f ) - { - static_assert( Begins::getDimension() == Ends::getDimension(), - "wrong begins or ends" ); - - using Index = typename Ends::IndexType; - - auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 ) - { - call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); - }; - - const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); - const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); - const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); - const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); - ParallelFor2D< Device >::exec( begin1, begin0, end1, end0, kernel ); - } -}; - -template< typename Permutation, - typename Device > -struct ParallelExecutor< Permutation, Device, IndexTag< 1 > > -{ - template< typename Begins, - typename Ends, - typename Func > - void operator()( const Begins& begins, const Ends& ends, Func f ) - { - static_assert( Begins::getDimension() == Ends::getDimension(), - "wrong begins or ends" ); - - using Index = typename Ends::IndexType; - -// auto kernel = [=] __cuda_callable__ ( Index i ) -// { -// call_with_unpermuted_arguments< Permutation >( f, i ); -// }; - - const Index begin = begins.template getSize< get< 0 >( Permutation{} ) >(); - const Index end = ends.template getSize< get< 0 >( Permutation{} ) >(); -// ParallelFor< Device >::exec( begin, end, kernel ); - ParallelFor< Device >::exec( begin, end, f ); - } -}; - - -// Device may be void which stands for StaticNDArray -template< typename Permutation, - typename Device > -struct ExecutorDispatcher -{ - template< typename Begins, typename Ends, typename Func > - void operator()( const Begins& begins, const Ends& ends, Func f ) - { - SequentialExecutor< Permutation >()( begins, ends, f ); - } -}; - -template< typename Permutation > -struct ExecutorDispatcher< Permutation, 
Devices::Host > -{ - template< typename Begins, typename Ends, typename Func > - void operator()( const Begins& begins, const Ends& ends, Func f ) - { - if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) - ParallelExecutor< Permutation, Devices::Host >()( begins, ends, f ); - else - SequentialExecutor< Permutation >()( begins, ends, f ); - } -}; - -template< typename Permutation > -struct ExecutorDispatcher< Permutation, Devices::Cuda > -{ - template< typename Begins, typename Ends, typename Func > - void operator()( const Begins& begins, const Ends& ends, Func f ) - { - ParallelExecutor< Permutation, Devices::Cuda >()( begins, ends, f ); - } -}; - #ifndef __NVCC__ template< typename Output, typename Func, -- GitLab From 6c8c608e5b9a9e6cc74e907a38c16221d390e5f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sat, 9 Feb 2019 21:47:04 +0100 Subject: [PATCH 09/25] NDArray: added forBoundary method --- src/Benchmarks/NDArray/CMakeLists.txt | 8 + .../tnl-benchmark-ndarray-boundary-cuda.cu | 1 + .../tnl-benchmark-ndarray-boundary.cpp | 1 + .../NDArray/tnl-benchmark-ndarray-boundary.h | 477 ++++++++++++++++++ src/TNL/Containers/NDArray.h | 24 + src/TNL/Containers/NDArrayView.h | 25 + .../Containers/ndarray/BoundaryExecutors.h | 367 ++++++++++++++ .../Containers/ndarray/NDArrayTest.cpp | 373 ++++++++++++++ 8 files changed, 1276 insertions(+) create mode 100644 src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary-cuda.cu create mode 100644 src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.cpp create mode 100644 src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h create mode 100644 src/TNL/Containers/ndarray/BoundaryExecutors.h diff --git a/src/Benchmarks/NDArray/CMakeLists.txt b/src/Benchmarks/NDArray/CMakeLists.txt index 3958694e6..e6291c5f3 100644 --- a/src/Benchmarks/NDArray/CMakeLists.txt +++ b/src/Benchmarks/NDArray/CMakeLists.txt @@ -2,8 +2,16 @@ add_executable( tnl-benchmark-ndarray 
tnl-benchmark-ndarray.cpp ) target_compile_options( tnl-benchmark-ndarray PRIVATE ${CXX_TESTS_FLAGS} ) install( TARGETS tnl-benchmark-ndarray RUNTIME DESTINATION bin ) +add_executable( tnl-benchmark-ndarray-boundary tnl-benchmark-ndarray-boundary.cpp ) +target_compile_options( tnl-benchmark-ndarray-boundary PRIVATE ${CXX_TESTS_FLAGS} ) +install( TARGETS tnl-benchmark-ndarray-boundary RUNTIME DESTINATION bin ) + if( BUILD_CUDA ) cuda_add_executable( tnl-benchmark-ndarray-cuda tnl-benchmark-ndarray-cuda.cu OPTIONS ${CXX_TESTS_FLAGS} ) install( TARGETS tnl-benchmark-ndarray-cuda RUNTIME DESTINATION bin ) + + cuda_add_executable( tnl-benchmark-ndarray-boundary-cuda tnl-benchmark-ndarray-boundary-cuda.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + install( TARGETS tnl-benchmark-ndarray-boundary-cuda RUNTIME DESTINATION bin ) endif() diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary-cuda.cu b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary-cuda.cu new file mode 100644 index 000000000..b5a2622a4 --- /dev/null +++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary-cuda.cu @@ -0,0 +1 @@ +#include "tnl-benchmark-ndarray-boundary.h" diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.cpp b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.cpp new file mode 100644 index 000000000..b5a2622a4 --- /dev/null +++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.cpp @@ -0,0 +1 @@ +#include "tnl-benchmark-ndarray-boundary.h" diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h new file mode 100644 index 000000000..e47149d84 --- /dev/null +++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h @@ -0,0 +1,477 @@ +/*************************************************************************** + tnl-benchmark-ndarray-boundary.h - description + ------------------- + begin : Feb 9, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include +#include + +#include + +#include "../Benchmarks.h" + +using namespace TNL; +using namespace TNL::Benchmarks; +using namespace TNL::Containers; +using std::index_sequence; + +using value_type = float; +//using index_type = std::size_t; +using index_type = unsigned; + +template< typename Array > +void expect_eq_chunked( Array& a, Array& b ) +{ + // TODO: use something like EXPECT_EQ + TNL_ASSERT_EQ( a.getSize(), b.getSize(), "array sizes don't match" ); + if( a.getSize() != b.getSize() ) + return; + + using IndexType = typename Array::IndexType; + + const IndexType chunk_size = 4096; + for( IndexType c = 0; c < (IndexType) roundUpDivision( a.getSize(), chunk_size ); c++ ) { + const typename Array::IndexType this_chunk_size = TNL::min( chunk_size, a.getSize() - c * chunk_size ); + Array a_chunk( &a[ c * chunk_size ], this_chunk_size ); + Array b_chunk( &b[ c * chunk_size ], this_chunk_size ); + // TODO: use something like EXPECT_EQ + TNL_ASSERT_EQ( a_chunk, b_chunk, "chunks are not equal" ); + } +} + +template< typename Array > +void expect_eq( Array& a, Array& b ) +{ + if( std::is_same< typename Array::DeviceType, TNL::Devices::Cuda >::value ) { + typename Array::HostType a_host, b_host; + a_host = a; + b_host = b; + expect_eq_chunked( a_host, b_host ); + } + else { + expect_eq_chunked( a, b ); + } +} + +template< typename Device > +const char* performer() +{ + if( std::is_same< Device, Devices::Host >::value ) + return "CPU"; + else if( std::is_same< Device, Devices::Cuda >::value ) + return "GPU"; + else + return "unknown"; +} + +void reset() {} + +// NOTE: having the sizes as function parameters keeps the compiler from treating them +// as "compile-time constants" and thus e.g. 
optimizing the 1D iterations with memcpy + +template< typename Device > +void benchmark_1D( Benchmark& benchmark, index_type size = 500000000 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0 >, + std::make_index_sequence< 1 >, + std::make_index_sequence< 1 >, + Device > a, b; + a.setSizes( size ); + b.setSizes( size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto a_view = a.getView(); + auto b_view = b.getView(); + + auto f = [&]() { + a.forBoundary( [=] __cuda_callable__ ( index_type i ) mutable { a_view( i ) = b_view( i ); } ); + a.forInternal( [=] __cuda_callable__ ( index_type i ) mutable { a_view( i ) = b_view( i ); } ); + }; + + const double datasetSize = 2 * size * sizeof(value_type) / oneGB; + benchmark.setOperation( "1D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_2D( Benchmark& benchmark, index_type size = 22333 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0 >, + std::make_index_sequence< 2 >, + std::make_index_sequence< 2 >, + Device > a, b; + a.setSizes( size, size ); + b.setSizes( size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto a_view = a.getView(); + auto b_view = b.getView(); + + auto f = [&]() { + a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j ) mutable { a_view( i, j ) = b_view( i, j ); } ); + a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j ) mutable { a_view( i, j ) = b_view( i, j ); } ); + }; + + const double datasetSize = 2 * std::pow( size, 2 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "2D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_3D( Benchmark& benchmark, index_type size = 800 ) +{ + NDArray< 
value_type, + SizesHolder< index_type, 0, 0, 0 >, + std::make_index_sequence< 3 >, + std::make_index_sequence< 3 >, + Device > a, b; + a.setSizes( size, size, size ); + b.setSizes( size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto a_view = a.getView(); + auto b_view = b.getView(); + + auto f = [&]() { + a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k ) mutable { a_view( i, j, k ) = b_view( i, j, k ); } ); + a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k ) mutable { a_view( i, j, k ) = b_view( i, j, k ); } ); + }; + + const double datasetSize = 2 * std::pow( size, 3 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "3D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +// TODO: implement general ParallelBoundaryExecutor +//template< typename Device > +//void benchmark_4D( Benchmark& benchmark, index_type size = 150 ) +//{ +// NDArray< value_type, +// SizesHolder< index_type, 0, 0, 0, 0 >, +// std::make_index_sequence< 4 >, +// std::make_index_sequence< 4 >, +// Device > a, b; +// a.setSizes( size, size, size, size ); +// b.setSizes( size, size, size, size ); +// a.getStorageArray().setValue( -1 ); +// b.getStorageArray().setValue( 1 ); +// +// auto a_view = a.getView(); +// auto b_view = b.getView(); +// +// auto f = [&]() { +// a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l ) mutable { a_view( i, j, k, l ) = b_view( i, j, k, l ); } ); +// a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l ) mutable { a_view( i, j, k, l ) = b_view( i, j, k, l ); } ); +// }; +// +// const double datasetSize = 2 * std::pow( size, 4 ) * sizeof(value_type) / oneGB; +// benchmark.setOperation( "4D", datasetSize ); +// benchmark.time< Device >( reset, performer< Device >(), f ); +// 
+// expect_eq( a.getStorageArray(), b.getStorageArray() ); +//} +// +//template< typename Device > +//void benchmark_5D( Benchmark& benchmark, index_type size = 56 ) +//{ +// NDArray< value_type, +// SizesHolder< index_type, 0, 0, 0, 0, 0 >, +// std::make_index_sequence< 5 >, +// std::make_index_sequence< 5 >, +// Device > a, b; +// a.setSizes( size, size, size, size, size ); +// b.setSizes( size, size, size, size, size ); +// a.getStorageArray().setValue( -1 ); +// b.getStorageArray().setValue( 1 ); +// +// auto a_view = a.getView(); +// auto b_view = b.getView(); +// +// auto f = [&]() { +// a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m ) mutable { a_view( i, j, k, l, m ) = b_view( i, j, k, l, m ); } ); +// a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m ) mutable { a_view( i, j, k, l, m ) = b_view( i, j, k, l, m ); } ); +// }; +// +// const double datasetSize = 2 * std::pow( size, 5 ) * sizeof(value_type) / oneGB; +// benchmark.setOperation( "5D", datasetSize ); +// benchmark.time< Device >( reset, performer< Device >(), f ); +// +// expect_eq( a.getStorageArray(), b.getStorageArray() ); +//} +// +//template< typename Device > +//void benchmark_6D( Benchmark& benchmark, index_type size = 28 ) +//{ +// NDArray< value_type, +// SizesHolder< index_type, 0, 0, 0, 0, 0, 0 >, +// std::make_index_sequence< 6 >, +// std::make_index_sequence< 6 >, +// Device > a, b; +// a.setSizes( size, size, size, size, size, size ); +// b.setSizes( size, size, size, size, size, size ); +// a.getStorageArray().setValue( -1 ); +// b.getStorageArray().setValue( 1 ); +// +// auto a_view = a.getView(); +// auto b_view = b.getView(); +// +// auto f = [&]() { +// a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m, index_type n ) mutable { a_view( i, j, k, l, m, n ) = b_view( i, j, k, l, m, n ); } ); +// 
a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m, index_type n ) mutable { a_view( i, j, k, l, m, n ) = b_view( i, j, k, l, m, n ); } ); +// }; +// +// const double datasetSize = 2 * std::pow( size, 6 ) * sizeof(value_type) / oneGB; +// benchmark.setOperation( "6D", datasetSize ); +// benchmark.time< Device >( reset, performer< Device >(), f ); +// +// expect_eq( a.getStorageArray(), b.getStorageArray() ); +//} + + +template< typename Device > +void benchmark_2D_perm( Benchmark& benchmark, index_type size = 22333 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0 >, + std::index_sequence< 1, 0 >, + std::index_sequence< 1, 0 >, + Device > a, b; + a.setSizes( size, size ); + b.setSizes( size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto a_view = a.getView(); + auto b_view = b.getView(); + + auto f = [&]() { + a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j ) mutable { a_view( i, j ) = b_view( i, j ); } ); + a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j ) mutable { a_view( i, j ) = b_view( i, j ); } ); + }; + + const double datasetSize = 2 * std::pow( size, 2 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "2D permuted", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_3D_perm( Benchmark& benchmark, index_type size = 800 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0 >, + std::index_sequence< 2, 1, 0 >, + std::index_sequence< 2, 1, 0 >, + Device > a, b; + a.setSizes( size, size, size ); + b.setSizes( size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto a_view = a.getView(); + auto b_view = b.getView(); + + auto f = [&]() { + a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k ) 
mutable { a_view( i, j, k ) = b_view( i, j, k ); } ); + a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k ) mutable { a_view( i, j, k ) = b_view( i, j, k ); } ); + }; + + const double datasetSize = 2 * std::pow( size, 3 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "3D permuted", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +// TODO: implement general ParallelBoundaryExecutor +//template< typename Device > +//void benchmark_4D_perm( Benchmark& benchmark, index_type size = 150 ) +//{ +// NDArray< value_type, +// SizesHolder< index_type, 0, 0, 0, 0 >, +// std::index_sequence< 3, 2, 1, 0 >, +// std::index_sequence< 3, 2, 1, 0 >, +// Device > a, b; +// a.setSizes( size, size, size, size ); +// b.setSizes( size, size, size, size ); +// a.getStorageArray().setValue( -1 ); +// b.getStorageArray().setValue( 1 ); +// +// auto a_view = a.getView(); +// auto b_view = b.getView(); +// +// auto f = [&]() { +// a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l ) mutable { a_view( i, j, k, l ) = b_view( i, j, k, l ); } ); +// a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l ) mutable { a_view( i, j, k, l ) = b_view( i, j, k, l ); } ); +// }; +// +// const double datasetSize = 2 * std::pow( size, 4 ) * sizeof(value_type) / oneGB; +// benchmark.setOperation( "4D permuted", datasetSize ); +// benchmark.time< Device >( reset, performer< Device >(), f ); +// +// expect_eq( a.getStorageArray(), b.getStorageArray() ); +//} +// +//template< typename Device > +//void benchmark_5D_perm( Benchmark& benchmark, index_type size = 56 ) +//{ +// NDArray< value_type, +// SizesHolder< index_type, 0, 0, 0, 0, 0 >, +// std::index_sequence< 4, 3, 2, 1, 0 >, +// std::index_sequence< 4, 3, 2, 1, 0 >, +// Device > a, b; +// a.setSizes( size, size, size, size, size ); +// 
b.setSizes( size, size, size, size, size ); +// a.getStorageArray().setValue( -1 ); +// b.getStorageArray().setValue( 1 ); +// +// auto a_view = a.getView(); +// auto b_view = b.getView(); +// +// auto f = [&]() { +// a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m ) mutable { a_view( i, j, k, l, m ) = b_view( i, j, k, l, m ); } ); +// a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m ) mutable { a_view( i, j, k, l, m ) = b_view( i, j, k, l, m ); } ); +// }; +// +// const double datasetSize = 2 * std::pow( size, 5 ) * sizeof(value_type) / oneGB; +// benchmark.setOperation( "5D permuted", datasetSize ); +// benchmark.time< Device >( reset, performer< Device >(), f ); +// +// expect_eq( a.getStorageArray(), b.getStorageArray() ); +//} +// +//template< typename Device > +//void benchmark_6D_perm( Benchmark& benchmark, index_type size = 28 ) +//{ +// NDArray< value_type, +// SizesHolder< index_type, 0, 0, 0, 0, 0, 0 >, +// std::index_sequence< 5, 4, 3, 2, 1, 0 >, +// std::index_sequence< 5, 4, 3, 2, 1, 0 >, +// Device > a, b; +// a.setSizes( size, size, size, size, size, size ); +// b.setSizes( size, size, size, size, size, size ); +// a.getStorageArray().setValue( -1 ); +// b.getStorageArray().setValue( 1 ); +// +// auto a_view = a.getView(); +// auto b_view = b.getView(); +// +// auto f = [&]() { +// a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m, index_type n ) mutable { a_view( i, j, k, l, m, n ) = b_view( i, j, k, l, m, n ); } ); +// a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m, index_type n ) mutable { a_view( i, j, k, l, m, n ) = b_view( i, j, k, l, m, n ); } ); +// }; +// +// const double datasetSize = 2 * std::pow( size, 6 ) * sizeof(value_type) / oneGB; +// benchmark.setOperation( "6D permuted", datasetSize ); +// 
benchmark.time< Device >( reset, performer< Device >(), f );
+//
+//   expect_eq( a.getStorageArray(), b.getStorageArray() );
+//}
+
+// Runs all enabled benchmark cases for one device type.  The 4D-6D cases
+// stay disabled until a general ParallelBoundaryExecutor is implemented
+// (see the TODO comments above).
+template< typename Device >
+void run_benchmarks( Benchmark& benchmark )
+{
+   benchmark_1D< Device >( benchmark );
+   benchmark_2D< Device >( benchmark );
+   benchmark_3D< Device >( benchmark );
+//   benchmark_4D< Device >( benchmark );
+//   benchmark_5D< Device >( benchmark );
+//   benchmark_6D< Device >( benchmark );
+   benchmark_2D_perm< Device >( benchmark );
+   benchmark_3D_perm< Device >( benchmark );
+//   benchmark_4D_perm< Device >( benchmark );
+//   benchmark_5D_perm< Device >( benchmark );
+//   benchmark_6D_perm< Device >( benchmark );
+}
+
+// Declares the command-line options understood by this benchmark binary.
+void setupConfig( Config::ConfigDescription & config )
+{
+   config.addDelimiter( "Benchmark settings:" );
+   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-ndarray-boundary.log");
+   config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
+   config.addEntryEnum( "append" );
+   config.addEntryEnum( "overwrite" );
+   config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
+   config.addEntry< int >( "verbose", "Verbose mode.", 1 );
+   config.addEntry< String >( "devices", "Run benchmarks on these devices.", "all" );
+   config.addEntryEnum( "all" );
+   config.addEntryEnum( "host" );
+   #ifdef HAVE_CUDA
+   config.addEntryEnum( "cuda" );
+   #endif
+
+   config.addDelimiter( "Device settings:" );
+   Devices::Host::configSetup( config );
+   Devices::Cuda::configSetup( config );
+}
+
+// Entry point: parses options, opens the log file, runs the benchmarks on
+// the selected devices and saves the results.
+int main( int argc, char* argv[] )
+{
+   Config::ParameterContainer parameters;
+   Config::ConfigDescription conf_desc;
+
+   setupConfig( conf_desc );
+
+   if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) {
+      conf_desc.printUsage( argv[ 0 ] );
+      return EXIT_FAILURE;
+   }
+
+   if( ! Devices::Host::setup( parameters ) ||
+       ! Devices::Cuda::setup( parameters ) )
+      return EXIT_FAILURE;
+
+   const String & logFileName = parameters.getParameter< String >( "log-file" );
+   const String & outputMode = parameters.getParameter< String >( "output-mode" );
+   const int loops = parameters.getParameter< int >( "loops" );
+   const int verbose = parameters.getParameter< int >( "verbose" );
+
+   // open log file
+   auto mode = std::ios::out;
+   if( outputMode == "append" )
+      mode |= std::ios::app;
+   std::ofstream logFile( logFileName.getString(), mode );
+
+   // init benchmark and common metadata
+   Benchmark benchmark( loops, verbose );
+
+   // prepare global metadata
+   // NOTE(review): `metadata` is collected here but never written to the
+   // log file below -- confirm whether it should be passed to the log
+   // writer as in the other tnl-benchmark-* tools.
+   Benchmark::MetadataMap metadata = getHardwareMetadata();
+
+   const String devices = parameters.getParameter< String >( "devices" );
+   if( devices == "all" || devices == "host" )
+      run_benchmarks< Devices::Host >( benchmark );
+#ifdef HAVE_CUDA
+   if( devices == "all" || devices == "cuda" )
+      run_benchmarks< Devices::Cuda >( benchmark );
+#endif
+
+   if( ! benchmark.save( logFile ) ) {
+      std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
+      return EXIT_FAILURE;
+   }
+
+   return EXIT_SUCCESS;
+}
diff --git a/src/TNL/Containers/NDArray.h b/src/TNL/Containers/NDArray.h
index e05db95cd..89315fc6f 100644
--- a/src/TNL/Containers/NDArray.h
+++ b/src/TNL/Containers/NDArray.h
@@ -202,6 +202,30 @@ public:
       dispatch( begins, ends, f );
    }
 
+   // Calls f( indices... ) for every element on the boundary of the array,
+   // i.e. every element with at least one index equal to 0 or size-1 in
+   // some dimension (the interior box [1, size-1) is skipped).  Iteration
+   // is dispatched to Device2, which defaults to the array's own device.
+   template< typename Device2 = DeviceType, typename Func >
+   void forBoundary( Func f ) const
+   {
+      using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >;
+      using SkipBegins = ConstStaticSizesHolder< IndexType, getDimension(), 1 >;
+      // subtract static sizes
+      using SkipEnds = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type;
+      // subtract dynamic sizes
+      SkipEnds skipEnds;
+      __ndarray_impl::SetSizesSubtractHelper< 1, SkipEnds, SizesHolder >::subtract( skipEnds, sizes );
+
+      __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch;
+      dispatch( Begins{}, SkipBegins{}, skipEnds, sizes, f );
+   }
+
+   // Generalized overload: the skipped interior is given explicitly as the
+   // half-open box [skipBegins, skipEnds).
+   template< typename Device2 = DeviceType, typename Func, typename SkipBegins, typename SkipEnds >
+   void forBoundary( Func f, const SkipBegins& skipBegins, const SkipEnds& skipEnds ) const
+   {
+      // TODO: assert "skipBegins <= sizes", "skipEnds <= sizes"
+      using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >;
+      __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch;
+      dispatch( Begins{}, skipBegins, skipEnds, sizes, f );
+   }
+
    // extra methods
diff --git a/src/TNL/Containers/NDArrayView.h b/src/TNL/Containers/NDArrayView.h
index 50119eda4..e367f0a06 100644
--- a/src/TNL/Containers/NDArrayView.h
+++ b/src/TNL/Containers/NDArrayView.h
@@ -16,6 +16,7 @@
 #include
 #include
 #include
+#include
 #include
 
 namespace TNL {
@@ -257,6 +258,30 @@ public:
      dispatch( begins, ends, f );
   }
 
+   // Same contract as NDArray::forBoundary -- visits the boundary elements
+   // of the viewed array.
+   template< typename Device2 = DeviceType, typename Func >
+   void forBoundary( Func f ) const
+   {
+      using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >;
+      using SkipBegins =
ConstStaticSizesHolder< IndexType, getDimension(), 1 >;
+      // subtract static sizes
+      using SkipEnds = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type;
+      // subtract dynamic sizes
+      SkipEnds skipEnds;
+      __ndarray_impl::SetSizesSubtractHelper< 1, SkipEnds, SizesHolder >::subtract( skipEnds, sizes );
+
+      __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch;
+      dispatch( Begins{}, SkipBegins{}, skipEnds, sizes, f );
+   }
+
+   // Overload with an explicitly given skipped interior box [skipBegins, skipEnds).
+   template< typename Device2 = DeviceType, typename Func, typename SkipBegins, typename SkipEnds >
+   void forBoundary( Func f, const SkipBegins& skipBegins, const SkipEnds& skipEnds ) const
+   {
+      // TODO: assert "skipBegins <= sizes", "skipEnds <= sizes"
+      using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >;
+      __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch;
+      dispatch( Begins{}, skipBegins, skipEnds, sizes, f );
+   }
+
 protected:
    Value* array = nullptr;
    SizesHolder sizes;
diff --git a/src/TNL/Containers/ndarray/BoundaryExecutors.h b/src/TNL/Containers/ndarray/BoundaryExecutors.h
new file mode 100644
index 000000000..08970b46a
--- /dev/null
+++ b/src/TNL/Containers/ndarray/BoundaryExecutors.h
@@ -0,0 +1,367 @@
+/***************************************************************************
+                          BoundaryExecutors.h  -  description
+                             -------------------
+    begin                : Feb 09, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
+#pragma once
+
+#include
+
+#include
+#include
+
+namespace TNL {
+namespace Containers {
+namespace __ndarray_impl {
+
+// Recursive sequential executor for boundary iteration.  The boundary is
+// decomposed into disjoint "levels": for a given target `level`, the
+// dimensions handled before `level` (in permutation order) iterate only the
+// interior [skipBegin, skipEnd), the dimension at `level` iterates the two
+// boundary strips [begin, skipBegin) and [skipEnd, end), and the dimensions
+// after `level` iterate the full range [begin, end).  Running all levels
+// therefore visits every boundary element exactly once.
+template< typename Permutation,
+          typename LevelTag = IndexTag< 0 > >
+struct SequentialBoundaryExecutor_inner
+{
+   template< typename Begins,
+             typename SkipBegins,
+             typename SkipEnds,
+             typename Ends,
+             typename Func,
+             typename... Indices >
+   __cuda_callable__
+   void operator()( const Begins& begins,
+                    const SkipBegins& skipBegins,
+                    const SkipEnds& skipEnds,
+                    const Ends& ends,
+                    std::size_t level,
+                    Func f,
+                    Indices&&... indices )
+   {
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
+
+      SequentialBoundaryExecutor_inner< Permutation, IndexTag< LevelTag::value + 1 > > exec;
+      const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >();
+      const auto skipBegin = skipBegins.template getSize< get< LevelTag::value >( Permutation{} ) >();
+      const auto skipEnd = skipEnds.template getSize< get< LevelTag::value >( Permutation{} ) >();
+      const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >();
+      if( level == LevelTag::value ) {
+         // this dimension contributes the two boundary strips
+         for( auto i = begin; i < skipBegin; i++ )
+            exec( begins, skipBegins, skipEnds, ends, level, f, std::forward< Indices >( indices )..., i );
+         for( auto i = skipEnd; i < end; i++ )
+            exec( begins, skipBegins, skipEnds, ends, level, f, std::forward< Indices >( indices )..., i );
+      }
+      else if( level > LevelTag::value ) {
+         // dimensions before the target level stay strictly in the interior
+         for( auto i = skipBegin; i < skipEnd; i++ )
+            exec( begins, skipBegins, skipEnds, ends, level, f, std::forward< Indices >( indices )..., i );
+      }
+      else {
+         // dimensions after the target level cover the full range
+         for( auto i = begin; i < end; i++ )
+            exec( begins, skipBegins, skipEnds, ends, level, f, std::forward< Indices >( indices )..., i );
+      }
+   }
+};
+
+// Innermost dimension: same decomposition, but here f is finally invoked
+// (via call_with_unpermuted_arguments, which maps the collected indices
+// back from permuted to array order).
+template< typename Permutation >
+struct SequentialBoundaryExecutor_inner< Permutation, IndexTag< Permutation::size() - 1 > >
+{
+   template< typename Begins,
+             typename SkipBegins,
+             typename SkipEnds,
+             typename Ends,
+             typename Func,
+             typename... Indices >
+   __cuda_callable__
+   void operator()( const Begins& begins,
+                    const SkipBegins& skipBegins,
+                    const SkipEnds& skipEnds,
+                    const Ends& ends,
+                    std::size_t level,
+                    Func f,
+                    Indices&&... indices )
+   {
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
+      static_assert( sizeof...(indices) == Begins::getDimension() - 1,
+                     "invalid number of indices in the final step of the SequentialBoundaryExecutor" );
+
+      using LevelTag = IndexTag< Permutation::size() - 1 >;
+
+      const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >();
+      const auto skipBegin = skipBegins.template getSize< get< LevelTag::value >( Permutation{} ) >();
+      const auto skipEnd = skipEnds.template getSize< get< LevelTag::value >( Permutation{} ) >();
+      const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >();
+      if( level == LevelTag::value ) {
+         for( auto i = begin; i < skipBegin; i++ )
+            call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i );
+         for( auto i = skipEnd; i < end; i++ )
+            call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i );
+      }
+      else if( level > LevelTag::value ) {
+         for( auto i = skipBegin; i < skipEnd; i++ )
+            call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i );
+      }
+      else {
+         for( auto i = begin; i < end; i++ )
+            call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i );
+      }
+   }
+};
+
+// Entry point: runs the inner executor once per level; the levels together
+// cover the whole boundary without visiting any element twice.
+template< typename Permutation,
+          std::size_t dim = Permutation::size() >
+struct SequentialBoundaryExecutor
+{
+   template< typename Begins,
+             typename SkipBegins,
+             typename SkipEnds,
+             typename Ends,
+             typename Func >
+   __cuda_callable__
+   void operator()( const Begins& begins,
+                    const SkipBegins& skipBegins,
+                    const SkipEnds& skipEnds,
+                    const Ends& ends,
+                    Func f )
+   {
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
+
+      SequentialBoundaryExecutor_inner< Permutation > exec;
+      for( std::size_t level = 0; level < Permutation::size(); level++ )
+         exec( begins, skipBegins, skipEnds, ends, level, f );
+   }
+};
+
+// NOTE(review): with `dim = Permutation::size()` this specialization is
+// selected only for Permutation::size() == 0, yet it reads
+// get< 0 >( Permutation{} ) -- it looks like it was intended for size 1
+// (which is already handled by the primary template); confirm.
+template< typename Permutation >
+struct SequentialBoundaryExecutor< Permutation, 0 >
+{
+   template< typename Begins,
+             typename SkipBegins,
+             typename SkipEnds,
+             typename Ends,
+             typename Func >
+   __cuda_callable__
+   void operator()( const Begins& begins,
+                    const SkipBegins& skipBegins,
+                    const SkipEnds& skipEnds,
+                    const Ends& ends,
+                    Func f )
+   {
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
+
+      const auto begin = begins.template getSize< get< 0 >( Permutation{} ) >();
+      const auto skipBegin = skipBegins.template getSize< get< 0 >( Permutation{} ) >();
+      const auto skipEnd = skipEnds.template getSize< get< 0 >( Permutation{} ) >();
+      const auto end = ends.template getSize< get< 0 >( Permutation{} ) >();
+      for( auto i = begin; i < skipBegin; i++ )
+         f( i );
+      for( auto i = skipEnd; i < end; i++ )
+         f( i );
+   }
+};
+
+
+// Parallel boundary iteration.  Only specializations for 1D, 2D and 3D
+// exist; instantiating the primary template (4D and higher) fails with the
+// static_assert below.  TODO: implement a general ParallelBoundaryExecutor.
+template< typename Permutation,
+          typename Device,
+          typename DimTag = IndexTag< Permutation::size() > >
+struct ParallelBoundaryExecutor
+{
+   template< typename Begins,
+             typename SkipBegins,
+             typename SkipEnds,
+             typename Ends,
+             typename Func >
+   void operator()( const Begins& begins,
+                    const SkipBegins& skipBegins,
+                    const SkipEnds& skipEnds,
+                    const Ends& ends,
+                    Func f )
+   {
+      static_assert( Permutation::size() <= 3, "ParallelBoundaryExecutor is implemented only for 1D, 2D, and 3D." );
+   }
+};
+
+template< typename Permutation,
+          typename Device >
+struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 3 > >
+{
+   template< typename Begins,
+             typename SkipBegins,
+             typename SkipEnds,
+             typename Ends,
+             typename Func >
+   void operator()( const Begins& begins,
+                    const SkipBegins& skipBegins,
+                    const SkipEnds& skipEnds,
+                    const Ends& ends,
+                    Func f )
+   {
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
+
+      using Index = typename Ends::IndexType;
+
+      // the kernel receives permuted indices and maps them back for f
+      auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 )
+      {
+         call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
+      };
+
+      const auto begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
+      const auto begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
+      const auto begin2 = begins.template getSize< get< 2 >( Permutation{} ) >();
+      const auto skipBegin0 = skipBegins.template getSize< get< 0 >( Permutation{} ) >();
+      const auto skipBegin1 = skipBegins.template getSize< get< 1 >( Permutation{} ) >();
+      const auto skipBegin2 = skipBegins.template getSize< get< 2 >( Permutation{} ) >();
+      const auto skipEnd0 = skipEnds.template getSize< get< 0 >( Permutation{} ) >();
+      const auto skipEnd1 = skipEnds.template getSize< get< 1 >( Permutation{} ) >();
+      const auto skipEnd2 = skipEnds.template getSize< get< 2 >( Permutation{} ) >();
+      const auto end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
+      const auto end1 = ends.template getSize< get< 1 >( Permutation{} ) >();
+      const auto end2 = ends.template getSize< get< 2 >( Permutation{} ) >();
+
+      // The boundary is covered by six pairwise-disjoint boxes: two full
+      // slabs where index 2 is in a boundary strip, two slabs where index 2
+      // is interior and index 1 is in a strip, and two caps where indices
+      // 2 and 1 are interior and index 0 is in a strip.
+      ParallelFor3D< Device >::exec( begin2, begin1, begin0, skipBegin2, end1, end0, kernel );
+      ParallelFor3D< Device >::exec( skipEnd2, begin1, begin0, end2, end1, end0, kernel );
+      ParallelFor3D< Device >::exec( skipBegin2, begin1, begin0, skipEnd2, skipBegin1, end0, kernel );
+      ParallelFor3D< Device >::exec( skipBegin2, skipEnd1, begin0, skipEnd2, end1, end0, kernel );
+      ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, begin0, skipEnd2, skipEnd1, skipBegin0, kernel );
+      ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, skipEnd0, skipEnd2, skipEnd1, end0, kernel );
+   }
+};
+
+template< typename Permutation,
+          typename Device >
+struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 2 > >
+{
+   template< typename Begins,
+             typename SkipBegins,
+             typename SkipEnds,
+             typename Ends,
+             typename Func >
+   void operator()( const Begins& begins,
+                    const SkipBegins& skipBegins,
+                    const SkipEnds& skipEnds,
+                    const Ends& ends,
+                    Func f )
+   {
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
+
+      using Index = typename Ends::IndexType;
+
+      auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 )
+      {
+         call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
+      };
+
+      const auto begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
+      const auto begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
+      const auto skipBegin0 = skipBegins.template getSize< get< 0 >( Permutation{} ) >();
+      const auto skipBegin1 = skipBegins.template getSize< get< 1 >( Permutation{} ) >();
+      const auto skipEnd0 = skipEnds.template getSize< get< 0 >( Permutation{} ) >();
+      const auto skipEnd1 = skipEnds.template getSize< get< 1 >( Permutation{} ) >();
+      const auto end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
+      const auto end1 = ends.template getSize< get< 1 >( Permutation{} ) >();
+
+      // four disjoint boxes: two full rows, two side columns
+      ParallelFor2D< Device >::exec( begin1, begin0, skipBegin1, end0, kernel );
+      ParallelFor2D< Device >::exec( skipEnd1, begin0, end1, end0, kernel );
+      ParallelFor2D< Device >::exec( skipBegin1, begin0, skipEnd1, skipBegin0, kernel );
+      ParallelFor2D< Device >::exec( skipBegin1, skipEnd0, skipEnd1, end0, kernel );
+   }
+};
+
+template< typename Permutation,
+          typename Device >
+struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 1 > >
+{
+   template< typename Begins,
+             typename SkipBegins,
+             typename SkipEnds,
+             typename Ends,
+             typename Func >
+   void operator()( const Begins& begins,
+                    const SkipBegins& skipBegins,
+                    const SkipEnds& skipEnds,
+                    const Ends& ends,
+                    Func f )
+   {
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
+
+      // 1D: no index permutation is needed, f is passed straight through
+      const auto begin = begins.template getSize< get< 0 >( Permutation{} ) >();
+      const auto skipBegin = skipBegins.template getSize< get< 0 >( Permutation{} ) >();
+      const auto skipEnd = skipEnds.template getSize< get< 0 >( Permutation{} ) >();
+      const auto end = ends.template getSize< get< 0 >( Permutation{} ) >();
+
+      ParallelFor< Device >::exec( begin, skipBegin, f );
+      ParallelFor< Device >::exec( skipEnd, end, f );
+   }
+};
+
+
+// Device may be void which stands for StaticNDArray
+template< typename Permutation,
+          typename Device >
+struct BoundaryExecutorDispatcher
+{
+   template< typename Begins,
+             typename SkipBegins,
+             typename SkipEnds,
+             typename Ends,
+             typename Func >
+   void operator()( const Begins& begins,
+                    const SkipBegins& skipBegins,
+                    const SkipEnds& skipEnds,
+                    const Ends& ends,
+                    Func f )
+   {
+      SequentialBoundaryExecutor< Permutation >()( begins, skipBegins, skipEnds, ends, f );
+   }
+};
+
+template< typename Permutation >
+struct BoundaryExecutorDispatcher< Permutation, Devices::Host >
+{
+   template< typename Begins,
+             typename SkipBegins,
+             typename SkipEnds,
+             typename Ends,
+             typename Func >
+   void operator()( const Begins& begins,
+                    const SkipBegins& skipBegins,
+                    const SkipEnds& skipEnds,
+                    const Ends& ends,
+                    Func f )
+   {
+      // parallel execution only pays off when OpenMP is actually available
+      if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 )
+         ParallelBoundaryExecutor< Permutation, Devices::Host >()( begins, skipBegins, skipEnds, ends, f );
+      else
+         SequentialBoundaryExecutor< Permutation >()( begins, skipBegins, skipEnds, ends, f );
+   }
+};
+
+template< typename Permutation >
+struct BoundaryExecutorDispatcher< Permutation, Devices::Cuda >
+{
+   template< typename Begins,
+             typename SkipBegins,
+             typename SkipEnds,
+             typename Ends,
+             typename Func >
+   void operator()( const Begins& begins,
+                    const SkipBegins& skipBegins,
+                    const SkipEnds& skipEnds,
+                    const Ends& ends,
+                    Func f )
+   {
+      // CUDA arrays always iterate the boundary in parallel (<= 3D only)
+      ParallelBoundaryExecutor< Permutation, Devices::Cuda >()( begins, skipBegins, skipEnds, ends, f );
+   }
+};
+
+} // namespace __ndarray_impl
+} // namespace Containers
+} // namespace TNL
diff --git a/src/UnitTests/Containers/ndarray/NDArrayTest.cpp b/src/UnitTests/Containers/ndarray/NDArrayTest.cpp
index 74221bdcb..5790a80cf 100644
--- a/src/UnitTests/Containers/ndarray/NDArrayTest.cpp
+++ b/src/UnitTests/Containers/ndarray/NDArrayTest.cpp
@@ -810,6 +810,379 @@ TEST( NDArrayTest, forInternal_static_6D )
    }
 }
 
+// Each forBoundary test zero-initializes the array, increments every
+// visited element, and then checks that exactly the elements with some
+// index equal to 0 or size-1 were visited exactly once.
+TEST( NDArrayTest, forBoundary_dynamic_1D )
+{
+   int I = 3;
+   NDArray< int,
+            SizesHolder< int, 0 >,
+            index_sequence< 0 > > a;
+   a.setSizes( I );
+   a.setValue( 0 );
+
+   auto setter = [&] ( int i )
+   {
+      a( i ) += 1;
+   };
+
+   a.forBoundary( setter );
+
+   for( int i = 0; i < I; i++ )
+   {
+      if( i == 0 || i == I - 1 )
+         EXPECT_EQ( a( i ), 1 )
+            << "i = " << i;
+      else
+         EXPECT_EQ( a( i ), 0 )
+            << "i = " << i;
+   }
+}
+
+TEST( NDArrayTest, forBoundary_dynamic_2D )
+{
+   int I = 3, J = 4;
+   NDArray< int,
+            SizesHolder< int, 0, 0 >,
+            index_sequence< 1, 0 > > a;
+   a.setSizes( I, J );
+   a.setValue( 0 );
+
+   auto setter = [&] ( int i, int j )
+   {
+      a( i, j ) += 1;
+   };
+
+   a.forBoundary( setter );
+
+   for( int j = 0; j < J; j++ )
+   for( int i = 0; i < I; i++ )
+   {
+      if( i == 0 || i == I - 1 ||
+          j == 0 || j == J - 1 )
+         EXPECT_EQ( a( i, j ), 1 )
+            << "i = " << i << ", j = " << j;
+      else
+         EXPECT_EQ( a( i, j ), 0 )
+            << "i = " << i << ", j = " << j;
+   }
+}
+
+TEST( NDArrayTest, forBoundary_dynamic_3D )
+{
+   int I = 3, J = 4, K = 5;
+   NDArray< int,
+            SizesHolder< int, 0, 0, 0 >,
+            index_sequence< 2, 0, 1 > > a;
+   a.setSizes( I, J, K );
+   a.setValue( 0 );
+
+   auto setter = [&] ( int i, int j, int k )
+   {
+      a( i, j, k ) += 1;
+   };
+
+   a.forBoundary( setter );
+
+   for( int k = 0; k < K; k++ )
+   for( int i = 0; i < I; i++ )
+   for( int j = 0; j < J; j++ )
+   {
+      if( i == 0 || i == I - 1 ||
+          j == 0 || j == J - 1 ||
+          k == 0 || k == K - 1 )
+         EXPECT_EQ( a( i, j, k ), 1 )
+            << "i = " << i << ", j = " << j << ", k = " << k;
+      else
+         EXPECT_EQ( a( i, j, k ), 0 )
+            << "i = " << i << ", j = " << j << ", k = " << k;
+   }
+}
+
+// TODO: implement general ParallelBoundaryExecutor
+//TEST( NDArrayTest, forBoundary_dynamic_4D )
+//{
+//   int I = 3, J = 4, K = 5, L = 6;
+//   NDArray< int,
+//            SizesHolder< int, 0, 0, 0, 0 >,
+//            index_sequence< 3, 2, 0, 1 > > a;
+//   a.setSizes( I, J, K, L );
+//   a.setValue( 0 );
+//
+//   auto setter = [&] ( int i, int j, int k, int l )
+//   {
+//      a( i, j, k, l ) += 1;
+//   };
+//
+//   a.forBoundary( setter );
+//
+//   for( int l = 0; l < L; l++ )
+//   for( int k = 0; k < K; k++ )
+//   for( int i = 0; i < I; i++ )
+//   for( int j = 0; j < J; j++ )
+//   {
+//      if( i == 0 || i == I - 1 ||
+//          j == 0 || j == J - 1 ||
+//          k == 0 || k == K - 1 ||
+//          l == 0 || l == L - 1 )
+//         EXPECT_EQ( a( i, j, k, l ), 1 )
+//            << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l;
+//      else
+//         EXPECT_EQ( a( i, j, k, l ), 0 )
+//            << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l;
+//   }
+//}
+//
+//TEST( NDArrayTest, forBoundary_dynamic_5D )
+//{
+//   int I = 3, J = 4, K = 5, L = 6, M = 7;
+//   NDArray< int,
+//            SizesHolder< int, 0, 0, 0, 0, 0 >,
+//            index_sequence< 3, 4, 2, 0, 1 > > a;
+//   a.setSizes( I, J, K, L, M );
+//   a.setValue( 0 );
+//
+//   auto setter = [&] ( int i, int j, int k, int l, int m )
+//   {
+//      a( i, j, k, l, m ) += 1;
+//   };
+//
+//   a.forBoundary( setter );
+//
+//   for( int l = 0; l < L; l++ )
+//   for( int m = 0; m < M; m++ )
+//   for( int k = 0; k < K; k++ )
+//   for( int i = 0; i < I; i++ )
+//   for( int j = 0; j < J; j++ )
+//   {
+//      if( i == 0 || i == I - 1 ||
+//          j == 0 || j == J - 1 ||
+//          k == 0 || k == K - 1 ||
+//          l == 0 || l == L - 1 ||
+//          m == 0 || m == M - 1 )
+//         EXPECT_EQ( a( i, j, k, l, m ), 1 )
+//            << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m;
+//      else
+//         EXPECT_EQ( a( i, j, k, l, m ), 0 )
+//            << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m;
+//   }
+//}
+//
+//TEST( NDArrayTest, forBoundary_dynamic_6D )
+//{
+//   int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8;
+//   NDArray< int,
+//            SizesHolder< int, 0, 0, 0, 0, 0, 0 >,
+//            index_sequence< 5, 3, 4, 2, 0, 1 > > a;
+//   a.setSizes( I, J, K, L, M, N );
+//   a.setValue( 0 );
+//
+//   auto setter = [&] ( int i, int j, int k, int l, int m, int n )
+//   {
+//      a( i, j, k, l, m, n ) += 1;
+//   };
+//
+//   a.forBoundary( setter );
+//
+//   for( int n = 0; n < N; n++ )
+//   for( int l = 0; l < L; l++ )
+//   for( int m = 0; m < M; m++ )
+//   for( int k = 0; k < K; k++ )
+//   for( int i = 0; i < I; i++ )
+//   for( int j = 0; j < J; j++ )
+//   {
+//      if( i == 0 || i == I - 1 ||
+//          j == 0 || j == J - 1 ||
+//          k == 0 || k == K - 1 ||
+//          l == 0 || l == L - 1 ||
+//          m == 0 || m == M - 1 ||
+//          n == 0 || n == N - 1 )
+//         EXPECT_EQ( a( i, j, k, l, m, n ), 1 )
+//            << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n;
+//      else
+//         EXPECT_EQ( a( i, j, k, l, m, n ), 0 )
+//            << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n;
+//   }
+//}
+
+// Static arrays have compile-time sizes, hence the setSizes calls are
+// commented out; the sequential boundary executor is used here.
+TEST( NDArrayTest, forBoundary_static_1D )
+{
+   constexpr int I = 3;
+   StaticNDArray< int, SizesHolder< int, I > > a;
+//   a.setSizes( 0 );
+   a.setValue( 0 );
+
+   auto setter = [&] ( int i )
+   {
+      a( i ) += 1;
+   };
+
+   a.forBoundary( setter );
+
+   for( int i = 0; i < I; i++ )
+   {
+      if( i == 0 || i == I - 1 )
+         EXPECT_EQ( a( i ), 1 )
+            << "i = " << i;
+      else
+         EXPECT_EQ( a( i ), 0 )
+            << "i = " << i;
+   }
+}
+
+TEST( NDArrayTest, forBoundary_static_2D )
+{
+   constexpr int I = 3, J = 4;
+   StaticNDArray< int, SizesHolder< int, I, J > > a;
+//   a.setSizes( 0, 0 );
+   a.setValue( 0 );
+
+   auto setter = [&] ( int i, int j )
+   {
+      a( i, j ) += 1;
+   };
+
+   a.forBoundary( setter );
+
+   for( int j = 0; j < J; j++ )
+   for( int i = 0; i < I; i++ )
+   {
+      if( i == 0 || i == I - 1 ||
+          j == 0 || j == J - 1 )
+         EXPECT_EQ( a( i, j ), 1 )
+            << "i = " << i << ", j = " << j;
+      else
+         EXPECT_EQ( a( i, j ), 0 )
+            << "i = " << i << ", j = " << j;
+   }
+}
+
+TEST( NDArrayTest, forBoundary_static_3D )
+{
+   constexpr int I = 3, J = 4, K = 5;
+   StaticNDArray< int, SizesHolder< int, I, J, K > > a;
+//   a.setSizes( 0, 0, 0 );
+   a.setValue( 0 );
+
+   auto setter = [&] ( int i, int j, int k )
+   {
+      a( i, j, k ) += 1;
+   };
+
+   a.forBoundary( setter );
+
+   for( int k = 0; k < K; k++ )
+   for( int i = 0; i < I; i++ )
+   for( int j = 0; j < J; j++ )
+   {
+      if( i == 0 || i == I - 1 ||
+          j == 0 || j == J - 1 ||
+          k == 0 || k == K - 1 )
+         EXPECT_EQ( a( i, j, k ), 1 )
+            << "i = " << i << ", j = " << j << ", k = " << k;
+      else
+         EXPECT_EQ( a( i, j, k ), 0 )
+            << "i = " << i << ", j = " << j << ", k = " << k;
+   }
+}
+
+TEST( NDArrayTest, forBoundary_static_4D )
+{
+   constexpr int I = 3, J = 4, K = 5, L = 6;
+   StaticNDArray< int, SizesHolder< int, I, J, K, L > > a;
+//   a.setSizes( 0, 0, 0, 0 );
+   a.setValue( 0 );
+
+   auto setter = [&] ( int i, int j, int k, int l )
+   {
+      a( i, j, k, l ) += 1;
+   };
+
+   a.forBoundary( setter );
+
+   for( int l = 0; l < L; l++ )
+   for( int k = 0; k < K; k++ )
+   for( int i = 0; i < I; i++ )
+   for( int j = 0; j < J; j++ )
+   {
+      if( i == 0 || i == I - 1 ||
+          j == 0 || j == J - 1 ||
+          k == 0 || k == K - 1 ||
+          l == 0 || l == L - 1 )
+         EXPECT_EQ( a( i, j, k, l ), 1 )
+            << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l;
+      else
+         EXPECT_EQ( a( i, j, k, l ), 0 )
+            << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l;
+   }
+}
+
+TEST( NDArrayTest, forBoundary_static_5D )
+{
+   constexpr int I = 3, J = 4, K = 5, L = 6, M = 7;
+   StaticNDArray< int, SizesHolder< int, I, J, K, L, M > > a;
+//   a.setSizes( 0, 0, 0, 0, 0 );
+   a.setValue( 0 );
+
+   auto setter = [&] ( int i, int j, int k, int l, int m )
+   {
a( i, j, k, l, m ) += 1; + }; + + a.forBoundary( setter ); + + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 ) + EXPECT_EQ( a( i, j, k, l, m ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + else + EXPECT_EQ( a( i, j, k, l, m ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + } +} + +TEST( NDArrayTest, forBoundary_static_6D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M, N > > a; +// a.setSizes( 0, 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m, int n ) + { + a( i, j, k, l, m, n ) += 1; + }; + + a.forBoundary( setter ); + + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 || + n == 0 || n == N - 1 ) + EXPECT_EQ( a( i, j, k, l, m, n ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + else + EXPECT_EQ( a( i, j, k, l, m, n ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + } +} + //#include "GtestMissingError.h" int main( int argc, char* argv[] ) { -- GitLab From 07d933dc9b642199765266763a583de166ac3aee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 2 Jan 2019 11:44:15 +0100 Subject: [PATCH 10/25] Basic implementation of the distributed NDArray --- src/TNL/Containers/DistributedNDArray.h | 291 ++++++++++++++++ 
src/TNL/Containers/DistributedNDArrayView.h | 226 ++++++++++++ src/TNL/Containers/ndarray/Indexing.h | 81 +++++ src/TNL/Containers/ndarray/Meta.h | 36 ++ src/TNL/Containers/ndarray/SizesHolder.h | 36 +- .../Containers/ndarray/CMakeLists.txt | 15 + .../ndarray/DistributedNDArrayTest.cpp | 1 + .../ndarray/DistributedNDArrayTest.cu | 1 + .../ndarray/DistributedNDArrayTest.h | 323 ++++++++++++++++++ 9 files changed, 1009 insertions(+), 1 deletion(-) create mode 100644 src/TNL/Containers/DistributedNDArray.h create mode 100644 src/TNL/Containers/DistributedNDArrayView.h create mode 100644 src/UnitTests/Containers/ndarray/DistributedNDArrayTest.cpp create mode 100644 src/UnitTests/Containers/ndarray/DistributedNDArrayTest.cu create mode 100644 src/UnitTests/Containers/ndarray/DistributedNDArrayTest.h diff --git a/src/TNL/Containers/DistributedNDArray.h b/src/TNL/Containers/DistributedNDArray.h new file mode 100644 index 000000000..fcf9b5c8c --- /dev/null +++ b/src/TNL/Containers/DistributedNDArray.h @@ -0,0 +1,291 @@ +/*************************************************************************** + DistributedNDArray.h - description + ------------------- + begin : Dec 27, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include +#include +#include + +namespace TNL { +namespace Containers { + +template< typename NDArray, + typename Communicator = Communicators::MpiCommunicator, + typename Overlaps = __ndarray_impl::make_constant_index_sequence< NDArray::getDimension(), 0 > > +class DistributedNDArray +{ + using CommunicationGroup = typename Communicator::CommunicationGroup; +public: + using ValueType = typename NDArray::ValueType; + using DeviceType = typename NDArray::DeviceType; + using IndexType = typename NDArray::IndexType; + using SizesHolderType = typename NDArray::SizesHolderType; + using PermutationType = typename NDArray::PermutationType; + using CommunicatorType = Communicator; + using LocalBeginsType = __ndarray_impl::LocalBeginsHolder< typename NDArray::SizesHolderType >; + using LocalRangeType = Subrange< IndexType >; + using OverlapsType = Overlaps; + + using ViewType = DistributedNDArrayView< typename NDArray::ViewType, Communicator, Overlaps >; + using ConstViewType = DistributedNDArrayView< typename NDArray::ConstViewType, Communicator, Overlaps >; + + static_assert( Overlaps::size() == NDArray::getDimension(), "invalid overlaps" ); + + // all methods from NDArrayView + + DistributedNDArray() = default; + + // The copy-constructor of TNL::Containers::Array makes shallow copy so our + // copy-constructor cannot be default. Actually, we most likely don't need + // it anyway, so let's just delete it. + DistributedNDArray( const DistributedNDArray& ) = delete; + + // Standard copy-semantics with deep copy, just like regular 1D array. + // Mismatched sizes cause reallocations. 
+ DistributedNDArray& operator=( const DistributedNDArray& other ) = default; + + // default move-semantics + DistributedNDArray( DistributedNDArray&& ) = default; + DistributedNDArray& operator=( DistributedNDArray&& ) = default; + + static constexpr std::size_t getDimension() + { + return NDArray::getDimension(); + } + + __cuda_callable__ + CommunicationGroup getCommunicationGroup() const + { + return group; + } + + // Returns the *global* sizes + __cuda_callable__ + const SizesHolderType& getSizes() const + { + return globalSizes; + } + + // Returns the *global* size + template< std::size_t level > + __cuda_callable__ + IndexType getSize() const + { + return globalSizes.template getSize< level >(); + } + + __cuda_callable__ + LocalBeginsType getLocalBegins() const + { + return localBegins; + } + + __cuda_callable__ + SizesHolderType getLocalEnds() const + { + return localEnds; + } + + template< std::size_t level > + __cuda_callable__ + LocalRangeType getLocalRange() const + { + return LocalRangeType( localBegins.template getSize< level >(), localEnds.template getSize< level >() ); + } + + // returns the local storage size + __cuda_callable__ + IndexType getLocalStorageSize() const + { + return localArray.getStorageSize(); + } + + template< typename... IndexTypes > + __cuda_callable__ + ValueType& + operator()( IndexTypes&&... indices ) + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); + return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, localArray, std::forward< IndexTypes >( indices )... ); + } + + template< typename... IndexTypes > + __cuda_callable__ + const ValueType& + operator()( IndexTypes&&... 
indices ) const + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); + return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, localArray, std::forward< IndexTypes >( indices )... ); + } + + // bracket operator for 1D arrays + __cuda_callable__ + ValueType& + operator[]( IndexType index ) + { + static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexType >( index ) ); + return localArray[ __ndarray_impl::get<0>( Overlaps{} ) + index - localBegins.template getSize< 0 >() ]; + } + + __cuda_callable__ + const ValueType& + operator[]( IndexType index ) const + { + static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexType >( index ) ); + return localArray[ __ndarray_impl::get<0>( Overlaps{} ) + index - localBegins.template getSize< 0 >() ]; + } + + __cuda_callable__ + ViewType getView() + { + return ViewType( localArray.getView(), globalSizes, localBegins, localEnds, group ); + } + + __cuda_callable__ + ConstViewType getConstView() const + { + return ConstViewType( localArray.getConstView(), globalSizes, localBegins, localEnds, group ); + } + + // TODO: overlaps should be skipped, otherwise it works only after synchronization + bool operator==( const DistributedNDArray& other ) const + { + // we can't run allreduce if the communication groups are different + if( group != other.getCommunicationGroup() ) + return false; + const bool localResult = + globalSizes == other.globalSizes && + localBegins == other.localBegins && + localEnds == other.localEnds && + localArray == other.localArray; + bool result = 
true; + if( group != CommunicatorType::NullGroup ) + CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group ); + return result; + } + + bool operator!=( const DistributedNDArray& other ) const + { + return ! (*this == other); + } + + + // extra methods + + // Sets the *global* size, but does not allocate storage + template< typename... IndexTypes > + void setSizes( IndexTypes&&... sizes ) + { + static_assert( sizeof...( sizes ) == getDimension(), "got wrong number of sizes" ); + __ndarray_impl::setSizesHelper( globalSizes, std::forward< IndexTypes >( sizes )... ); + // initialize localBegins and localEnds + localBegins = LocalBeginsType{}; + localEnds = globalSizes; + } + + template< std::size_t level > + void setDistribution( IndexType begin, IndexType end, CommunicationGroup group = Communicator::AllGroup ) + { + static_assert( SizesHolderType::template getStaticSize< level >() == 0, "NDArray cannot be distributed in static dimensions." ); + TNL_ASSERT_GE( begin, 0, "begin must be non-negative" ); + TNL_ASSERT_LE( end, globalSizes.template getSize< level >(), "end must not be greater than global size" ); + TNL_ASSERT_LT( begin, end, "begin must be lesser than end" ); + localBegins.template setSize< level >( begin ); + localEnds.template setSize< level >( end ); + TNL_ASSERT( this->group == Communicator::NullGroup || this->group == group, + std::cerr << "different groups cannot be combined for different dimensions" ); + this->group = group; + } + + // Computes the distributed storage size and allocates the local array + void allocate() + { + SizesHolderType localSizes; + TemplateStaticFor< std::size_t, 0, SizesHolderType::getDimension(), LocalSizesSetter >::execHost( localSizes, globalSizes, localBegins, localEnds ); + localArray.setSize( localSizes ); + } + + void setLike( const DistributedNDArray& other ) + { + localArray.setLike( other.localArray ); + group = other.getCommunicationGroup(); + globalSizes = other.getSizes(); + localBegins = 
other.localBegins; + localEnds = other.localEnds; + } + + void reset() + { + localArray.reset(); + group = CommunicatorType::NullGroup; + globalSizes = SizesHolderType{}; + localBegins = LocalBeginsType{}; + localEnds = SizesHolderType{}; + } + + // "safe" accessor - will do slow copy from device + template< typename... IndexTypes > + ValueType + getElement( IndexTypes&&... indices ) const + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); + auto getElement = [this]( auto&&... indices ) + { + return this->localArray.getElement( std::forward< decltype(indices) >( indices )... ); + }; + return __ndarray_impl::host_call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, getElement, std::forward< IndexTypes >( indices )... ); + } + + void setValue( ValueType value ) + { + localArray.setValue( value ); + } + +protected: + NDArray localArray; + CommunicationGroup group = Communicator::NullGroup; + SizesHolderType globalSizes; + // static sizes should have different type: localBegin is always 0, localEnd is always the full size + LocalBeginsType localBegins; + SizesHolderType localEnds; + +private: + template< std::size_t level > + struct LocalSizesSetter + { + template< typename SizesHolder, typename LocalBegins > + static void exec( SizesHolder& localSizes, const SizesHolder& globalSizes, const LocalBegins& localBegins, const SizesHolder& localEnds ) + { + if( SizesHolder::template getStaticSize< level >() != 0 ) + return; + + const auto begin = localBegins.template getSize< level >(); + const auto end = localEnds.template getSize< level >(); + if( begin == end ) + localSizes.template setSize< level >( globalSizes.template getSize< level >() ); + else { + TNL_ASSERT_GE( end - begin, (decltype(end)) __ndarray_impl::get( Overlaps{} ), "local size is less than the size of overlaps" ); + 
localSizes.template setSize< level >( end - begin + 2 * __ndarray_impl::get( Overlaps{} ) ); + } + } + }; +}; + +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/DistributedNDArrayView.h b/src/TNL/Containers/DistributedNDArrayView.h new file mode 100644 index 000000000..8c2415092 --- /dev/null +++ b/src/TNL/Containers/DistributedNDArrayView.h @@ -0,0 +1,226 @@ +/*************************************************************************** + DistributedNDArrayView.h - description + ------------------- + begin : Dec 27, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include +#include + +namespace TNL { +namespace Containers { + +template< typename NDArrayView, + typename Communicator = Communicators::MpiCommunicator, + typename Overlaps = __ndarray_impl::make_constant_index_sequence< NDArrayView::getDimension(), 0 > > +class DistributedNDArrayView +{ + using CommunicationGroup = typename Communicator::CommunicationGroup; +public: + using ValueType = typename NDArrayView::ValueType; + using DeviceType = typename NDArrayView::DeviceType; + using IndexType = typename NDArrayView::IndexType; + using SizesHolderType = typename NDArrayView::SizesHolderType; + using PermutationType = typename NDArrayView::PermutationType; + using CommunicatorType = Communicator; + using LocalBeginsType = __ndarray_impl::LocalBeginsHolder< typename NDArrayView::SizesHolderType >; + using LocalRangeType = Subrange< IndexType >; + using OverlapsType = Overlaps; + + using ViewType = DistributedNDArrayView< NDArrayView, Communicator, Overlaps >; + using ConstViewType = DistributedNDArrayView< typename NDArrayView::ConstViewType, Communicator, Overlaps >; + + static_assert( Overlaps::size() == NDArrayView::getDimension(), "invalid 
overlaps" ); + + __cuda_callable__ + DistributedNDArrayView() = default; + + // explicit initialization by local array view, global sizes and local begins and ends + __cuda_callable__ + DistributedNDArrayView( NDArrayView localView, SizesHolderType globalSizes, LocalBeginsType localBegins, SizesHolderType localEnds, CommunicationGroup group ) + : localView(localView), group(group), globalSizes(globalSizes), localBegins(localBegins), localEnds(localEnds) {} + + // Copy-constructor does shallow copy, so views can be passed-by-value into + // CUDA kernels and they can be captured-by-value in __cuda_callable__ + // lambda functions. + __cuda_callable__ + DistributedNDArrayView( const DistributedNDArrayView& ) = default; + + // default move-constructor + __cuda_callable__ + DistributedNDArrayView( DistributedNDArrayView&& ) = default; + + // Copy-assignment does deep copy, just like regular array, but the sizes + // must match (i.e. copy-assignment cannot resize). + __cuda_callable__ + DistributedNDArrayView& operator=( const DistributedNDArrayView& other ) = default; + + // There is no move-assignment operator, so expressions like `a = b.getView()` + // are resolved as copy-assignment. 
+ + // method for rebinding (reinitialization) + __cuda_callable__ + void bind( DistributedNDArrayView view ) + { + localView.bind( view.localView ); + group = view.group; + globalSizes = view.globalSizes; + localBegins = view.localBegins; + localEnds = view.localEnds; + } + + __cuda_callable__ + void reset() + { + localView.reset(); + group = CommunicatorType::NullGroup; + globalSizes = SizesHolderType{}; + localBegins = LocalBeginsType{}; + localEnds = SizesHolderType{}; + } + + static constexpr std::size_t getDimension() + { + return NDArrayView::getDimension(); + } + + __cuda_callable__ + CommunicationGroup getCommunicationGroup() const + { + return group; + } + + // Returns the *global* sizes + __cuda_callable__ + const SizesHolderType& getSizes() const + { + return globalSizes; + } + + // Returns the *global* size + template< std::size_t level > + __cuda_callable__ + IndexType getSize() const + { + return globalSizes.template getSize< level >(); + } + + __cuda_callable__ + LocalBeginsType getLocalBegins() const + { + return localBegins; + } + + __cuda_callable__ + SizesHolderType getLocalEnds() const + { + return localEnds; + } + + template< std::size_t level > + __cuda_callable__ + LocalRangeType getLocalRange() const + { + return LocalRangeType( localBegins.template getSize< level >(), localEnds.template getSize< level >() ); + } + + // returns the local storage size + __cuda_callable__ + IndexType getLocalStorageSize() const + { + return localView.getStorageSize(); + } + + template< typename... IndexTypes > + __cuda_callable__ + ValueType& + operator()( IndexTypes&&... indices ) + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); + return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, localView, std::forward< IndexTypes >( indices )... 
); + } + + template< typename... IndexTypes > + __cuda_callable__ + const ValueType& + operator()( IndexTypes&&... indices ) const + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); + return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, localView, std::forward< IndexTypes >( indices )... ); + } + + // bracket operator for 1D arrays + __cuda_callable__ + ValueType& + operator[]( IndexType index ) + { + static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexType >( index ) ); + return localView[ __ndarray_impl::get<0>( Overlaps{} ) + index - localBegins.template getSize< 0 >() ]; + } + + __cuda_callable__ + const ValueType& + operator[]( IndexType index ) const + { + static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexType >( index ) ); + return localView[ __ndarray_impl::get<0>( Overlaps{} ) + index - localBegins.template getSize< 0 >() ]; + } + + __cuda_callable__ + ViewType getView() + { + return ViewType( *this ); + } + + __cuda_callable__ + ConstViewType getConstView() const + { + return ConstViewType( localView, globalSizes, localBegins, localEnds, group ); + } + + // TODO: overlaps should be skipped, otherwise it works only after synchronization + bool operator==( const DistributedNDArrayView& other ) const + { + // we can't run allreduce if the communication groups are different + if( group != other.getCommunicationGroup() ) + return false; + const bool localResult = + globalSizes == other.globalSizes && + localBegins == other.localBegins && + localEnds == other.localEnds && + 
localView == other.localView; + bool result = true; + if( group != CommunicatorType::NullGroup ) + CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group ); + return result; + } + + bool operator!=( const DistributedNDArrayView& other ) const + { + return ! (*this == other); + } + +protected: + NDArrayView localView; + CommunicationGroup group = Communicator::NullGroup; + SizesHolderType globalSizes; + // static sizes should have different type: localBegin is always 0, localEnd is always the full size + LocalBeginsType localBegins; + SizesHolderType localEnds; +}; + +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/ndarray/Indexing.h b/src/TNL/Containers/ndarray/Indexing.h index 7031d8899..0b9783ea3 100644 --- a/src/TNL/Containers/ndarray/Indexing.h +++ b/src/TNL/Containers/ndarray/Indexing.h @@ -12,8 +12,11 @@ #pragma once +#include + #include #include +#include #include @@ -164,6 +167,84 @@ void assertIndicesInBounds( const SizesHolder& sizes, Index&& i, IndexTypes&&... } +// A variadic bounds-checker for distributed indices with overlaps +template< typename SizesHolder1, typename SizesHolder2, typename Overlaps > +__cuda_callable__ +void assertIndicesInRange( const SizesHolder1&, const SizesHolder2&, const Overlaps& ) +{} + +template< typename SizesHolder1, + typename SizesHolder2, + typename Overlaps, + typename Index, + typename... IndexTypes > +__cuda_callable__ +void assertIndicesInRange( const SizesHolder1& begins, const SizesHolder2& ends, const Overlaps& overlaps, Index&& i, IndexTypes&&... indices ) +{ + static_assert( SizesHolder1::getDimension() == SizesHolder2::getDimension(), + "Inconsistent begins and ends." 
); +#ifndef NDEBUG + // sizes.template getSize<...>() cannot be inside the assert macro, but the variables + // shouldn't be declared when compiling without assertions + constexpr std::size_t level = SizesHolder1::getDimension() - sizeof...(indices) - 1; + const auto begin = begins.template getSize< level >(); + const auto end = ends.template getSize< level >(); + TNL_ASSERT_LE( begin - get( overlaps ), i, "Input error - some index is below the lower bound." ); + TNL_ASSERT_LT( i, end + get( overlaps ), "Input error - some index is above the upper bound." ); +#endif + assertIndicesInRange( begins, ends, overlaps, std::forward< IndexTypes >( indices )... ); +} + + +template< typename SizesHolder, + typename Overlaps, + typename Sequence > +struct IndexUnshiftHelper +{}; + +template< typename SizesHolder, + typename Overlaps, + std::size_t... N > +struct IndexUnshiftHelper< SizesHolder, Overlaps, std::index_sequence< N... > > +{ + template< typename Func, + typename... Indices > + __cuda_callable__ + static auto apply( const SizesHolder& begins, Func&& f, Indices&&... indices ) -> decltype(auto) + { + return f( ( get( Overlaps{} ) + std::forward< Indices >( indices ) - begins.template getSize< N >() )... ); + } + + template< typename Func, + typename... Indices > + static auto apply_host( const SizesHolder& begins, Func&& f, Indices&&... indices ) -> decltype(auto) + { + return f( ( get( Overlaps{} ) + std::forward< Indices >( indices ) - begins.template getSize< N >() )... ); + } +}; + +template< typename SizesHolder, + typename Overlaps = make_constant_index_sequence< SizesHolder::getDimension(), 0 >, + typename Func, + typename... Indices > +__cuda_callable__ +auto call_with_unshifted_indices( const SizesHolder& begins, Func&& f, Indices&&... indices ) -> decltype(auto) +{ + return IndexUnshiftHelper< SizesHolder, Overlaps, std::make_index_sequence< sizeof...( Indices ) > > + ::apply( begins, std::forward< Func >( f ), std::forward< Indices >( indices )... 
); +} + +template< typename SizesHolder, + typename Overlaps = make_constant_index_sequence< SizesHolder::getDimension(), 0 >, + typename Func, + typename... Indices > +auto host_call_with_unshifted_indices( const SizesHolder& begins, Func&& f, Indices&&... indices ) -> decltype(auto) +{ + return IndexUnshiftHelper< SizesHolder, Overlaps, std::make_index_sequence< sizeof...( Indices ) > > + ::apply_host( begins, std::forward< Func >( f ), std::forward< Indices >( indices )... ); +} + + template< typename Permutation, typename Alignment, typename SliceInfo, diff --git a/src/TNL/Containers/ndarray/Meta.h b/src/TNL/Containers/ndarray/Meta.h index 3ad0372f7..ccff5a329 100644 --- a/src/TNL/Containers/ndarray/Meta.h +++ b/src/TNL/Containers/ndarray/Meta.h @@ -355,6 +355,42 @@ filter_sequence( std::integer_sequence< Index, vals... > ) return concat_sequences( FilterSingle< Mask >( std::integer_sequence< Index, vals >{} )... ); } + +/* + * make_constant_integer_sequence, make_constant_index_sequence - helper + * templates for the generation of constant sequences like + * std::make_integer_sequence, std::make_index_sequence + */ +template< typename T, typename N, T v > struct gen_const_seq; +template< typename T, typename N, T v > using gen_const_seq_t = typename gen_const_seq< T, N, v >::type; + +template< typename T, typename N, T v > +struct gen_const_seq +{ + using type = decltype(concat_sequences( + gen_const_seq_t, v>{}, + gen_const_seq_t, v>{} + )); +}; + +template< typename T, T v > +struct gen_const_seq< T, std::integral_constant, v > +{ + using type = std::integer_sequence; +}; + +template< typename T, T v > +struct gen_const_seq< T, std::integral_constant, v > +{ + using type = std::integer_sequence; +}; + +template< typename T, T N, T value > +using make_constant_integer_sequence = gen_const_seq_t< T, std::integral_constant, value >; + +template< std::size_t N, std::size_t value > +using make_constant_index_sequence = gen_const_seq_t< std::size_t, 
std::integral_constant, value >; + } // namespace __ndarray_impl } // namespace Containers } // namespace TNL diff --git a/src/TNL/Containers/ndarray/SizesHolder.h b/src/TNL/Containers/ndarray/SizesHolder.h index 306569de7..40e4b3143 100644 --- a/src/TNL/Containers/ndarray/SizesHolder.h +++ b/src/TNL/Containers/ndarray/SizesHolder.h @@ -257,9 +257,9 @@ std::ostream& operator<<( std::ostream& str, const SizesHolder< Index, sizes... } -// helper for the forInternal method namespace __ndarray_impl { +// helper for the forInternal method template< typename SizesHolder, std::size_t ConstValue > struct SubtractedSizesHolder @@ -274,6 +274,40 @@ struct SubtractedSizesHolder< SizesHolder< Index, sizes... >, ConstValue > using type = SizesHolder< Index, ( (sizes >= ConstValue) ? sizes - ConstValue : 0 )... >; }; + +// wrapper for localBegins in DistributedNDArray (static sizes cannot be distributed, begins are always 0) +template< typename SizesHolder > +struct LocalBeginsHolder : public SizesHolder +{ + template< std::size_t dimension > + static constexpr std::size_t getStaticSize() + { + static_assert( dimension < SizesHolder::getDimension(), "Invalid dimension passed to getStaticSize()." ); + return 0; + } + + template< std::size_t level > + __cuda_callable__ + typename SizesHolder::IndexType getSize() const + { + if( SizesHolder::template getStaticSize< level >() != 0 ) + return 0; + return SizesHolder::template getSize< level >(); + } +}; + +template< typename Index, + std::size_t... sizes > +std::ostream& operator<<( std::ostream& str, const __ndarray_impl::LocalBeginsHolder< SizesHolder< Index, sizes... > >& holder ) +{ + str << "LocalBeginsHolder< SizesHolder< "; + TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, __ndarray_impl::SizesHolderStaticSizePrinter >::execHost( str, (SizesHolder< Index, sizes... 
>) holder ); + str << holder.template getStaticSize< sizeof...(sizes) - 1 >() << " > >( "; + TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, __ndarray_impl::SizesHolderSizePrinter >::execHost( str, holder ); + str << holder.template getSize< sizeof...(sizes) - 1 >() << " )"; + return str; +} + } // namespace __ndarray_impl } // namespace Containers diff --git a/src/UnitTests/Containers/ndarray/CMakeLists.txt b/src/UnitTests/Containers/ndarray/CMakeLists.txt index cbdbe328a..9e7e0ff4e 100644 --- a/src/UnitTests/Containers/ndarray/CMakeLists.txt +++ b/src/UnitTests/Containers/ndarray/CMakeLists.txt @@ -24,3 +24,18 @@ if( BUILD_CUDA ) target_link_libraries( StaticNDArrayCudaTest ${GTEST_BOTH_LIBRARIES} ) add_test( StaticNDArrayCudaTest ${EXECUTABLE_OUTPUT_PATH}/StaticNDArrayCudaTest${CMAKE_EXECUTABLE_SUFFIX} ) endif() + +if( ${BUILD_MPI} ) + if( BUILD_CUDA ) + CUDA_ADD_EXECUTABLE( DistributedNDArrayTest DistributedNDArrayTest.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArrayTest ${GTEST_BOTH_LIBRARIES} ) + else() + ADD_EXECUTABLE( DistributedNDArrayTest DistributedNDArrayTest.cpp ) + TARGET_COMPILE_OPTIONS( DistributedNDArrayTest PRIVATE ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArrayTest ${GTEST_BOTH_LIBRARIES} ) + endif() + + SET( mpi_test_parameters -np 4 -H localhost:4 "${EXECUTABLE_OUTPUT_PATH}/DistributedNDArrayTest${CMAKE_EXECUTABLE_SUFFIX}" ) + ADD_TEST( NAME DistributedNDArrayTest COMMAND "mpirun" ${mpi_test_parameters}) +endif() diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.cpp b/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.cpp new file mode 100644 index 000000000..d526d56d0 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.cpp @@ -0,0 +1 @@ +#include "DistributedNDArrayTest.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.cu b/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.cu new file mode 100644 index 
000000000..d526d56d0 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.cu @@ -0,0 +1 @@ +#include "DistributedNDArrayTest.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.h new file mode 100644 index 000000000..62c3d2496 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.h @@ -0,0 +1,323 @@ +/*************************************************************************** + DistributedNDArrayTest.h - description + ------------------- + begin : Dec 27, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +#ifdef HAVE_GTEST +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace TNL; +using namespace TNL::Containers; + +/* + * Light check of DistributedNDArray. + * + * - Number of processes is not limited. + * - Global size is hardcoded as 97 to force non-uniform distribution. + * - Communication group is hardcoded as AllGroup -- it may be changed as needed. 
+ */ +template< typename DistributedNDArray > +class DistributedNDArrayTest +: public ::testing::Test +{ +protected: + using ValueType = typename DistributedNDArray::ValueType; + using DeviceType = typename DistributedNDArray::DeviceType; + using CommunicatorType = typename DistributedNDArray::CommunicatorType; + using IndexType = typename DistributedNDArray::IndexType; + using DistributedNDArrayType = DistributedNDArray; + + // TODO: use ndarray + using LocalArrayType = Array< ValueType, DeviceType, IndexType >; + using LocalArrayViewType = ArrayView< ValueType, DeviceType, IndexType >; + + const int globalSize = 97; // prime number to force non-uniform distribution + + const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + + DistributedNDArrayType distributedNDArray; + + const int rank = CommunicatorType::GetRank(group); + const int nproc = CommunicatorType::GetSize(group); + + DistributedNDArrayTest() + { + using LocalRangeType = typename DistributedNDArray::LocalRangeType; + const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); + distributedNDArray.setSizes( globalSize ); + distributedNDArray.template setDistribution< 0 >( localRange.getBegin(), localRange.getEnd(), group ); + distributedNDArray.allocate(); + + EXPECT_EQ( distributedNDArray.template getLocalRange< 0 >(), localRange ); + EXPECT_EQ( distributedNDArray.getCommunicationGroup(), group ); + } +}; + +// types for which DistributedNDArrayTest is instantiated +using DistributedNDArrayTypes = ::testing::Types< + DistributedNDArray< NDArray< double, + SizesHolder< int, 0 >, + std::index_sequence< 0 >, + std::index_sequence< 0 >, + Devices::Host >, + Communicators::MpiCommunicator >, + DistributedNDArray< NDArray< double, + SizesHolder< int, 0 >, + std::index_sequence< 0 >, + std::index_sequence< 0 >, + Devices::Host >, + Communicators::NoDistrCommunicator > +#ifdef HAVE_CUDA + , + DistributedNDArray< NDArray< 
double, + SizesHolder< int, 0 >, + std::index_sequence< 0 >, + std::index_sequence< 0 >, + Devices::Cuda >, + Communicators::MpiCommunicator >, + DistributedNDArray< NDArray< double, + SizesHolder< int, 0 >, + std::index_sequence< 0 >, + std::index_sequence< 0 >, + Devices::Cuda >, + Communicators::NoDistrCommunicator > +#endif +>; + +TYPED_TEST_SUITE( DistributedNDArrayTest, DistributedNDArrayTypes ); + +TYPED_TEST( DistributedNDArrayTest, checkSumOfLocalSizes ) +{ + using CommunicatorType = typename TestFixture::CommunicatorType; + + const auto localRange = this->distributedNDArray.template getLocalRange< 0 >(); + const int localSize = localRange.getEnd() - localRange.getBegin(); + int sumOfLocalSizes = 0; + CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); + EXPECT_EQ( sumOfLocalSizes, this->globalSize ); + EXPECT_EQ( this->distributedNDArray.template getSize< 0 >(), this->globalSize ); +} + +TYPED_TEST( DistributedNDArrayTest, setLike ) +{ + using DistributedNDArrayType = typename TestFixture::DistributedNDArrayType; + + const auto localRange = this->distributedNDArray.template getLocalRange< 0 >(); + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), localRange.getEnd() - localRange.getBegin() ); + DistributedNDArrayType copy; + EXPECT_EQ( copy.getLocalStorageSize(), 0 ); + copy.setLike( this->distributedNDArray ); + EXPECT_EQ( copy.getLocalStorageSize(), localRange.getEnd() - localRange.getBegin() ); +} + +TYPED_TEST( DistributedNDArrayTest, reset ) +{ + const auto localRange = this->distributedNDArray.template getLocalRange< 0 >(); + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), localRange.getEnd() - localRange.getBegin() ); + this->distributedNDArray.reset(); + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), 0 ); +} + +// TODO: swap + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) 
+template< typename DistributedArray, typename BufferView > +void test_helper_setValue( DistributedArray& array, BufferView& buffer_view ) +{ + using DeviceType = typename DistributedArray::DeviceType; + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = array.template getLocalRange< 0 >(); + auto array_view = array.getConstView(); + auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable + { + buffer_view[ i - localRange.getBegin() ] = array_view( i ); + }; + ParallelFor< DeviceType >::exec( localRange.getBegin(), localRange.getEnd(), kernel ); +} + +TYPED_TEST( DistributedNDArrayTest, setValue ) +{ + using LocalArrayType = typename TestFixture::LocalArrayType; + using LocalArrayViewType = typename TestFixture::LocalArrayViewType; + + this->distributedNDArray.setValue( 1.0 ); + + const auto localRange = this->distributedNDArray.template getLocalRange< 0 >(); + LocalArrayType buffer( localRange.getEnd() - localRange.getBegin() ); + LocalArrayViewType buffer_view( buffer ); + test_helper_setValue( this->distributedNDArray, buffer_view ); + + LocalArrayType expected( localRange.getEnd() - localRange.getBegin() ); + expected.setValue( 1.0 ); + EXPECT_EQ( buffer, expected ); +} + +TYPED_TEST( DistributedNDArrayTest, elementwiseAccess ) +{ +// using ArrayViewType = typename TestFixture::ArrayViewType; + using IndexType = typename TestFixture::IndexType; + + this->distributedNDArray.setValue( 0 ); +// ArrayViewType localArrayView = this->distributedNDArray.getLocalArrayView(); + const auto localRange = this->distributedNDArray.template getLocalRange< 0 >(); + + // check initial value + for( IndexType gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) { +// EXPECT_EQ( localArrayView.getElement( i ), 0 ); + EXPECT_EQ( this->distributedNDArray.getElement( gi ), 0 ); + if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value ) + EXPECT_EQ( this->distributedNDArray[ gi ], 0 ); + } + + // use operator() + if( 
std::is_same< typename TestFixture::DeviceType, Devices::Host >::value ) { + for( IndexType gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) { + this->distributedNDArray( gi ) = gi + 1; + } + + // check set value + for( IndexType gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) { +// EXPECT_EQ( localArrayView.getElement( i ), gi + 1 ); + EXPECT_EQ( this->distributedNDArray.getElement( gi ), gi + 1 ); + EXPECT_EQ( this->distributedNDArray( gi ), gi + 1 ); + EXPECT_EQ( this->distributedNDArray[ gi ], gi + 1 ); + } + } +} + +TYPED_TEST( DistributedNDArrayTest, copyAssignment ) +{ + using DistributedNDArrayType = typename TestFixture::DistributedNDArrayType; + + this->distributedNDArray.setValue( 1 ); + DistributedNDArrayType copy; + copy = this->distributedNDArray; + // no binding, but deep copy +// EXPECT_NE( copy.getLocalArrayView().getData(), this->distributedNDArray.getLocalArrayView().getData() ); +// EXPECT_EQ( copy.getLocalArrayView(), this->distributedNDArray.getLocalArrayView() ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_comparisonOperators( DistributedArray& u, DistributedArray& v, DistributedArray& w ) +{ + using DeviceType = typename DistributedArray::DeviceType; + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = u.template getLocalRange< 0 >(); + auto u_view = u.getView(); + auto v_view = v.getView(); + auto w_view = w.getView(); + + auto kernel = [=] __cuda_callable__ ( IndexType gi ) mutable + { + u_view( gi ) = gi; + v_view( gi ) = gi; + w_view( gi ) = 2 * gi; + }; + ParallelFor< DeviceType >::exec( localRange.getBegin(), localRange.getEnd(), kernel ); +} + +TYPED_TEST( DistributedNDArrayTest, comparisonOperators ) +{ + using DistributedNDArrayType = typename TestFixture::DistributedNDArrayType; + + 
DistributedNDArrayType& u = this->distributedNDArray; + DistributedNDArrayType v, w; + v.setLike( u ); + w.setLike( u ); + + test_helper_comparisonOperators( u, v, w ); + + EXPECT_TRUE( u == u ); + EXPECT_TRUE( u == v ); + EXPECT_TRUE( v == u ); + EXPECT_FALSE( u != v ); + EXPECT_FALSE( v != u ); + EXPECT_TRUE( u != w ); + EXPECT_TRUE( w != u ); + EXPECT_FALSE( u == w ); + EXPECT_FALSE( w == u ); + + v.reset(); + EXPECT_FALSE( u == v ); + u.reset(); + EXPECT_TRUE( u == v ); +} + +#endif // HAVE_GTEST + + +#if (defined(HAVE_GTEST) && defined(HAVE_MPI)) +using CommunicatorType = Communicators::MpiCommunicator; + +#include + +class MinimalistBufferedPrinter +: public ::testing::EmptyTestEventListener +{ +private: + std::stringstream sout; + +public: + // Called before a test starts. + virtual void OnTestStart(const ::testing::TestInfo& test_info) + { + sout << test_info.test_case_name() << "." << test_info.name() << " Start." << std::endl; + } + + // Called after a failed assertion or a SUCCEED() invocation. + virtual void OnTestPartResult(const ::testing::TestPartResult& test_part_result) + { + sout << (test_part_result.failed() ? 
"====Failure=== " : "===Success=== ") + << test_part_result.file_name() << " " + << test_part_result.line_number() <listeners(); + + delete listeners.Release(listeners.default_result_printer()); + listeners.Append(new MinimalistBufferedPrinter); + + Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv); + #endif + return RUN_ALL_TESTS(); +#else + throw GtestMissingError(); +#endif +} -- GitLab From 8b91dfcc7c9e55c8451f0b7fb835eb9971b5b011 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 22 Mar 2019 16:30:34 +0100 Subject: [PATCH 11/25] DistributedNDArray: added forAll method --- src/TNL/Containers/DistributedNDArray.h | 8 +++++ src/TNL/Containers/DistributedNDArrayView.h | 8 +++++ .../ndarray/DistributedNDArrayTest.h | 33 +++++++++++++++++++ 3 files changed, 49 insertions(+) diff --git a/src/TNL/Containers/DistributedNDArray.h b/src/TNL/Containers/DistributedNDArray.h index fcf9b5c8c..5edee6353 100644 --- a/src/TNL/Containers/DistributedNDArray.h +++ b/src/TNL/Containers/DistributedNDArray.h @@ -184,6 +184,14 @@ public: return ! (*this == other); } + // iterate over all local elements + template< typename Device2 = DeviceType, typename Func > + void forAll( Func f ) const + { + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( localBegins, localEnds, f ); + } + // extra methods diff --git a/src/TNL/Containers/DistributedNDArrayView.h b/src/TNL/Containers/DistributedNDArrayView.h index 8c2415092..ae791b54c 100644 --- a/src/TNL/Containers/DistributedNDArrayView.h +++ b/src/TNL/Containers/DistributedNDArrayView.h @@ -213,6 +213,14 @@ public: return ! 
(*this == other); } + // iterate over all local elements + template< typename Device2 = DeviceType, typename Func > + void forAll( Func f ) const + { + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( localBegins, localEnds, f ); + } + protected: NDArrayView localView; CommunicationGroup group = Communicator::NullGroup; diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.h index 62c3d2496..a1fc55eb2 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.h @@ -259,6 +259,39 @@ TYPED_TEST( DistributedNDArrayTest, comparisonOperators ) EXPECT_TRUE( u == v ); } +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forAll( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + a.forAll( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ); + + a.setValue( 0 ); + a_view.forAll( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ); +} + +TYPED_TEST( DistributedNDArrayTest, forAll ) +{ + test_helper_forAll( this->distributedNDArray ); +} + #endif // HAVE_GTEST -- GitLab From f9853a861033e5ed93337f450756eb1be5035141 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 22 Mar 2019 17:09:53 +0100 Subject: [PATCH 12/25] DistributedNDArray: added forInternal and forLocalInternal methods --- src/TNL/Containers/DistributedNDArray.h | 47 +++++++ 
src/TNL/Containers/DistributedNDArrayView.h | 47 +++++++ src/TNL/Containers/ndarray/Indexing.h | 126 ++++++++++++++++-- src/TNL/Containers/ndarray/SizesHolder.h | 16 ++- .../ndarray/DistributedNDArrayTest.h | 84 ++++++++++++ 5 files changed, 306 insertions(+), 14 deletions(-) diff --git a/src/TNL/Containers/DistributedNDArray.h b/src/TNL/Containers/DistributedNDArray.h index 5edee6353..993093654 100644 --- a/src/TNL/Containers/DistributedNDArray.h +++ b/src/TNL/Containers/DistributedNDArray.h @@ -192,6 +192,53 @@ public: dispatch( localBegins, localEnds, f ); } + // iterate over local elements which are not neighbours of *global* boundaries + template< typename Device2 = DeviceType, typename Func > + void forInternal( Func f ) const + { + // add static sizes + using Begins = __ndarray_impl::LocalBeginsHolder< SizesHolderType, 1 >; + // add dynamic sizes + Begins begins; + __ndarray_impl::SetSizesAddHelper< 1, Begins, SizesHolderType, Overlaps >::add( begins, SizesHolderType{} ); + __ndarray_impl::SetSizesMaxHelper< Begins, LocalBeginsType >::max( begins, localBegins ); + + // subtract static sizes + using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolderType, 1 >::type; + // subtract dynamic sizes + Ends ends; + __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolderType, Overlaps >::subtract( ends, globalSizes ); + __ndarray_impl::SetSizesMinHelper< Ends, SizesHolderType >::min( ends, localEnds ); + + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, ends, f ); + } + + // iterate over local elements inside the given [begins, ends) range specified by global indices + template< typename Device2 = DeviceType, typename Func, typename Begins, typename Ends > + void forInternal( Func f, const Begins& begins, const Ends& ends ) const + { + // TODO: assert "localBegins <= begins <= localEnds", "localBegins <= ends <= localEnds" + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + 
dispatch( begins, ends, f ); + } + + // iterate over local elements which are not neighbours of overlaps (if all overlaps are 0, it is equivalent to forAll) + template< typename Device2 = DeviceType, typename Func > + void forLocalInternal( Func f ) const + { + // add dynamic sizes + LocalBeginsType begins; + __ndarray_impl::SetSizesAddHelper< 1, LocalBeginsType, SizesHolderType, Overlaps >::add( begins, localBegins, false ); + + // subtract dynamic sizes + SizesHolderType ends; + __ndarray_impl::SetSizesSubtractHelper< 1, SizesHolderType, SizesHolderType, Overlaps >::subtract( ends, localEnds, false ); + + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, ends, f ); + } + // extra methods diff --git a/src/TNL/Containers/DistributedNDArrayView.h b/src/TNL/Containers/DistributedNDArrayView.h index ae791b54c..cac210899 100644 --- a/src/TNL/Containers/DistributedNDArrayView.h +++ b/src/TNL/Containers/DistributedNDArrayView.h @@ -221,6 +221,53 @@ public: dispatch( localBegins, localEnds, f ); } + // iterate over local elements which are not neighbours of *global* boundaries + template< typename Device2 = DeviceType, typename Func > + void forInternal( Func f ) const + { + // add static sizes + using Begins = __ndarray_impl::LocalBeginsHolder< SizesHolderType, 1 >; + // add dynamic sizes + Begins begins; + __ndarray_impl::SetSizesAddHelper< 1, Begins, SizesHolderType, Overlaps >::add( begins, SizesHolderType{} ); + __ndarray_impl::SetSizesMaxHelper< Begins, LocalBeginsType >::max( begins, localBegins ); + + // subtract static sizes + using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolderType, 1 >::type; + // subtract dynamic sizes + Ends ends; + __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolderType, Overlaps >::subtract( ends, globalSizes ); + __ndarray_impl::SetSizesMinHelper< Ends, SizesHolderType >::min( ends, localEnds ); + + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > 
dispatch; + dispatch( begins, ends, f ); + } + + // iterate over local elements inside the given [begins, ends) range specified by global indices + template< typename Device2 = DeviceType, typename Func, typename Begins, typename Ends > + void forInternal( Func f, const Begins& begins, const Ends& ends ) const + { + // TODO: assert "localBegins <= begins <= localEnds", "localBegins <= ends <= localEnds" + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, ends, f ); + } + + // iterate over local elements which are not neighbours of overlaps (if all overlaps are 0, it is equivalent to forAll) + template< typename Device2 = DeviceType, typename Func > + void forLocalInternal( Func f ) const + { + // add dynamic sizes + LocalBeginsType begins; + __ndarray_impl::SetSizesAddHelper< 1, LocalBeginsType, SizesHolderType, Overlaps >::add( begins, localBegins, false ); + + // subtract dynamic sizes + SizesHolderType ends; + __ndarray_impl::SetSizesSubtractHelper< 1, SizesHolderType, SizesHolderType, Overlaps >::subtract( ends, localEnds, false ); + + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, ends, f ); + } + protected: NDArrayView localView; CommunicationGroup group = Communicator::NullGroup; diff --git a/src/TNL/Containers/ndarray/Indexing.h b/src/TNL/Containers/ndarray/Indexing.h index 0b9783ea3..36a7f89ef 100644 --- a/src/TNL/Containers/ndarray/Indexing.h +++ b/src/TNL/Containers/ndarray/Indexing.h @@ -118,28 +118,138 @@ void setSizesHelper( SizesHolder& holder, template< std::size_t ConstValue, typename TargetHolder, typename SourceHolder, + typename Overlaps = make_constant_index_sequence< TargetHolder::getDimension(), 0 >, std::size_t level = TargetHolder::getDimension() - 1 > struct SetSizesSubtractHelper { static void subtract( TargetHolder& target, - const SourceHolder& source ) + const SourceHolder& source, + bool negateOverlaps = true ) { - if( source.template 
getStaticSize< level >() == 0 ) - target.template setSize< level >( source.template getSize< level >() - ConstValue ); - SetSizesSubtractHelper< ConstValue, TargetHolder, SourceHolder, level - 1 >::subtract( target, source ); + if( source.template getStaticSize< level >() == 0 ) { + if( negateOverlaps ) + target.template setSize< level >( source.template getSize< level >() - ConstValue * ! get< level >( Overlaps{} ) ); + else + target.template setSize< level >( source.template getSize< level >() - ConstValue * !! get< level >( Overlaps{} ) ); + } + SetSizesSubtractHelper< ConstValue, TargetHolder, SourceHolder, Overlaps, level - 1 >::subtract( target, source ); } }; template< std::size_t ConstValue, typename TargetHolder, - typename SourceHolder > -struct SetSizesSubtractHelper< ConstValue, TargetHolder, SourceHolder, 0 > + typename SourceHolder, + typename Overlaps > +struct SetSizesSubtractHelper< ConstValue, TargetHolder, SourceHolder, Overlaps, 0 > { static void subtract( TargetHolder& target, - const SourceHolder& source ) + const SourceHolder& source, + bool negateOverlaps = true ) + { + if( source.template getStaticSize< 0 >() == 0 ) { + if( negateOverlaps ) + target.template setSize< 0 >( source.template getSize< 0 >() - ConstValue * ! get< 0 >( Overlaps{} ) ); + else + target.template setSize< 0 >( source.template getSize< 0 >() - ConstValue * !! 
get< 0 >( Overlaps{} ) ); + } + } +}; + + +// helper for the forInternal method (DistributedNDArray) +template< std::size_t ConstValue, + typename TargetHolder, + typename SourceHolder, + typename Overlaps = make_constant_index_sequence< TargetHolder::getDimension(), 0 >, + std::size_t level = TargetHolder::getDimension() - 1 > +struct SetSizesAddHelper +{ + static void add( TargetHolder& target, + const SourceHolder& source, + bool negateOverlaps = true ) + { + if( source.template getStaticSize< level >() == 0 ) { + if( negateOverlaps ) + target.template setSize< level >( source.template getSize< level >() + ConstValue * ! get< level >( Overlaps{} ) ); + else + target.template setSize< level >( source.template getSize< level >() + ConstValue * !! get< level >( Overlaps{} ) ); + } + SetSizesAddHelper< ConstValue, TargetHolder, SourceHolder, Overlaps, level - 1 >::add( target, source ); + } +}; + +template< std::size_t ConstValue, + typename TargetHolder, + typename SourceHolder, + typename Overlaps > +struct SetSizesAddHelper< ConstValue, TargetHolder, SourceHolder, Overlaps, 0 > +{ + static void add( TargetHolder& target, + const SourceHolder& source, + bool negateOverlaps = true ) + { + if( source.template getStaticSize< 0 >() == 0 ) { + if( negateOverlaps ) + target.template setSize< 0 >( source.template getSize< 0 >() + ConstValue * ! get< 0 >( Overlaps{} ) ); + else + target.template setSize< 0 >( source.template getSize< 0 >() + ConstValue * !! 
get< 0 >( Overlaps{} ) ); + } + } +}; + + +// helper for the forInternal method (DistributedNDArray) +template< typename TargetHolder, + typename SourceHolder, + std::size_t level = TargetHolder::getDimension() - 1 > +struct SetSizesMaxHelper +{ + static void max( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< level >() == 0 ) + target.template setSize< level >( std::max( target.template getSize< level >(), source.template getSize< level >() ) ); + SetSizesMaxHelper< TargetHolder, SourceHolder, level - 1 >::max( target, source ); + } +}; + +template< typename TargetHolder, + typename SourceHolder > +struct SetSizesMaxHelper< TargetHolder, SourceHolder, 0 > +{ + static void max( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< 0 >() == 0 ) + target.template setSize< 0 >( std::max( target.template getSize< 0 >(), source.template getSize< 0 >() ) ); + } +}; + + +// helper for the forInternal method (DistributedNDArray) +template< typename TargetHolder, + typename SourceHolder, + std::size_t level = TargetHolder::getDimension() - 1 > +struct SetSizesMinHelper +{ + static void min( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< level >() == 0 ) + target.template setSize< level >( std::min( target.template getSize< level >(), source.template getSize< level >() ) ); + SetSizesMinHelper< TargetHolder, SourceHolder, level - 1 >::min( target, source ); + } +}; + +template< typename TargetHolder, + typename SourceHolder > +struct SetSizesMinHelper< TargetHolder, SourceHolder, 0 > +{ + static void min( TargetHolder& target, + const SourceHolder& source ) { if( source.template getStaticSize< 0 >() == 0 ) - target.template setSize< 0 >( source.template getSize< 0 >() - ConstValue ); + target.template setSize< 0 >( std::min( target.template getSize< 0 >(), source.template getSize< 0 >() ) ); } }; diff --git 
a/src/TNL/Containers/ndarray/SizesHolder.h b/src/TNL/Containers/ndarray/SizesHolder.h index 40e4b3143..408d6ed92 100644 --- a/src/TNL/Containers/ndarray/SizesHolder.h +++ b/src/TNL/Containers/ndarray/SizesHolder.h @@ -276,14 +276,16 @@ struct SubtractedSizesHolder< SizesHolder< Index, sizes... >, ConstValue > // wrapper for localBegins in DistributedNDArray (static sizes cannot be distributed, begins are always 0) -template< typename SizesHolder > +template< typename SizesHolder, + // overridable value is useful in the forInternal method + std::size_t ConstValue = 0 > struct LocalBeginsHolder : public SizesHolder { template< std::size_t dimension > static constexpr std::size_t getStaticSize() { static_assert( dimension < SizesHolder::getDimension(), "Invalid dimension passed to getStaticSize()." ); - return 0; + return ConstValue; } template< std::size_t level > @@ -291,18 +293,20 @@ struct LocalBeginsHolder : public SizesHolder typename SizesHolder::IndexType getSize() const { if( SizesHolder::template getStaticSize< level >() != 0 ) - return 0; + return ConstValue; return SizesHolder::template getSize< level >(); } }; template< typename Index, - std::size_t... sizes > -std::ostream& operator<<( std::ostream& str, const __ndarray_impl::LocalBeginsHolder< SizesHolder< Index, sizes... > >& holder ) + std::size_t... sizes, + std::size_t ConstValue > +std::ostream& operator<<( std::ostream& str, const __ndarray_impl::LocalBeginsHolder< SizesHolder< Index, sizes... >, ConstValue >& holder ) { str << "LocalBeginsHolder< SizesHolder< "; TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, __ndarray_impl::SizesHolderStaticSizePrinter >::execHost( str, (SizesHolder< Index, sizes... 
>) holder ); - str << holder.template getStaticSize< sizeof...(sizes) - 1 >() << " > >( "; + str << holder.template getStaticSize< sizeof...(sizes) - 1 >() << " >, "; + str << ConstValue << " >( "; TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, __ndarray_impl::SizesHolderSizePrinter >::execHost( str, holder ); str << holder.template getSize< sizeof...(sizes) - 1 >() << " )"; return str; diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.h index a1fc55eb2..f26fe1f10 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.h @@ -292,6 +292,90 @@ TYPED_TEST( DistributedNDArrayTest, forAll ) test_helper_forAll( this->distributedNDArray ); } +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forInternal( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + a.forInternal( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + { + if( gi == 0 || gi == a.template getSize< 0 >() - 1 ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + else + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + } + + a.setValue( 0 ); + a_view.forInternal( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + { + if( gi == 0 || gi == a.template getSize< 0 >() - 1 ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + else + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + } +} + +TYPED_TEST( DistributedNDArrayTest, forInternal ) +{ + 
test_helper_forInternal( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forLocalInternal( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + // equivalent to forAll because all overlaps are 0 + a.forLocalInternal( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + + a.setValue( 0 ); + // equivalent to forAll because all overlaps are 0 + a_view.forLocalInternal( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArrayTest, forLocalInternal ) +{ + test_helper_forLocalInternal( this->distributedNDArray ); +} + #endif // HAVE_GTEST -- GitLab From 82e0c5389192402ac3fc091bd8920d763b080463 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 29 Mar 2019 22:44:22 +0100 Subject: [PATCH 13/25] DistributedNDArray: added forBoundary and forLocalBoundary methods --- src/TNL/Containers/DistributedNDArray.h | 47 +++++++++++ src/TNL/Containers/DistributedNDArrayView.h | 47 +++++++++++ .../ndarray/DistributedNDArrayTest.h | 84 +++++++++++++++++++ 3 files changed, 178 insertions(+) diff --git a/src/TNL/Containers/DistributedNDArray.h b/src/TNL/Containers/DistributedNDArray.h index 993093654..d5cf49024 100644 --- a/src/TNL/Containers/DistributedNDArray.h +++ b/src/TNL/Containers/DistributedNDArray.h @@ -223,6 +223,37 @@ public: dispatch( begins, ends, f ); } + // iterate over local elements which are neighbours of 
*global* boundaries + template< typename Device2 = DeviceType, typename Func > + void forBoundary( Func f ) const + { + // add static sizes + using SkipBegins = __ndarray_impl::LocalBeginsHolder< SizesHolderType, 1 >; + // add dynamic sizes + SkipBegins skipBegins; + __ndarray_impl::SetSizesAddHelper< 1, SkipBegins, SizesHolderType, Overlaps >::add( skipBegins, SizesHolderType{} ); + __ndarray_impl::SetSizesMaxHelper< SkipBegins, LocalBeginsType >::max( skipBegins, localBegins ); + + // subtract static sizes + using SkipEnds = typename __ndarray_impl::SubtractedSizesHolder< SizesHolderType, 1 >::type; + // subtract dynamic sizes + SkipEnds skipEnds; + __ndarray_impl::SetSizesSubtractHelper< 1, SkipEnds, SizesHolderType, Overlaps >::subtract( skipEnds, globalSizes ); + __ndarray_impl::SetSizesMinHelper< SkipEnds, SizesHolderType >::min( skipEnds, localEnds ); + + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( localBegins, skipBegins, skipEnds, localEnds, f ); + } + + // iterate over local elements outside the given [skipBegins, skipEnds) range specified by global indices + template< typename Device2 = DeviceType, typename Func, typename SkipBegins, typename SkipEnds > + void forBoundary( Func f, const SkipBegins& skipBegins, const SkipEnds& skipEnds ) const + { + // TODO: assert "localBegins <= skipBegins <= localEnds", "localBegins <= skipEnds <= localEnds" + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( localBegins, skipBegins, skipEnds, localEnds, f ); + } + // iterate over local elements which are not neighbours of overlaps (if all overlaps are 0, it is equivalent to forAll) template< typename Device2 = DeviceType, typename Func > void forLocalInternal( Func f ) const @@ -239,6 +270,22 @@ public: dispatch( begins, ends, f ); } + // iterate over local elements which are neighbours of overlaps (if all overlaps are 0, it has no effect) + template< typename Device2 = 
DeviceType, typename Func > + void forLocalBoundary( Func f ) const + { + // add dynamic sizes + LocalBeginsType skipBegins; + __ndarray_impl::SetSizesAddHelper< 1, LocalBeginsType, SizesHolderType, Overlaps >::add( skipBegins, localBegins, false ); + + // subtract dynamic sizes + SizesHolderType skipEnds; + __ndarray_impl::SetSizesSubtractHelper< 1, SizesHolderType, SizesHolderType, Overlaps >::subtract( skipEnds, localEnds, false ); + + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( localBegins, skipBegins, skipEnds, localEnds, f ); + } + // extra methods diff --git a/src/TNL/Containers/DistributedNDArrayView.h b/src/TNL/Containers/DistributedNDArrayView.h index cac210899..f8150a8d8 100644 --- a/src/TNL/Containers/DistributedNDArrayView.h +++ b/src/TNL/Containers/DistributedNDArrayView.h @@ -252,6 +252,37 @@ public: dispatch( begins, ends, f ); } + // iterate over local elements which are neighbours of *global* boundaries + template< typename Device2 = DeviceType, typename Func > + void forBoundary( Func f ) const + { + // add static sizes + using SkipBegins = __ndarray_impl::LocalBeginsHolder< SizesHolderType, 1 >; + // add dynamic sizes + SkipBegins skipBegins; + __ndarray_impl::SetSizesAddHelper< 1, SkipBegins, SizesHolderType, Overlaps >::add( skipBegins, SizesHolderType{} ); + __ndarray_impl::SetSizesMaxHelper< SkipBegins, LocalBeginsType >::max( skipBegins, localBegins ); + + // subtract static sizes + using SkipEnds = typename __ndarray_impl::SubtractedSizesHolder< SizesHolderType, 1 >::type; + // subtract dynamic sizes + SkipEnds skipEnds; + __ndarray_impl::SetSizesSubtractHelper< 1, SkipEnds, SizesHolderType, Overlaps >::subtract( skipEnds, globalSizes ); + __ndarray_impl::SetSizesMinHelper< SkipEnds, SizesHolderType >::min( skipEnds, localEnds ); + + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( localBegins, skipBegins, skipEnds, localEnds, f ); + } + + // 
iterate over local elements outside the given [skipBegins, skipEnds) range specified by global indices + template< typename Device2 = DeviceType, typename Func, typename SkipBegins, typename SkipEnds > + void forBoundary( Func f, const SkipBegins& skipBegins, const SkipEnds& skipEnds ) const + { + // TODO: assert "localBegins <= skipBegins <= localEnds", "localBegins <= skipEnds <= localEnds" + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( localBegins, skipBegins, skipEnds, localEnds, f ); + } + // iterate over local elements which are not neighbours of overlaps (if all overlaps are 0, it is equivalent to forAll) template< typename Device2 = DeviceType, typename Func > void forLocalInternal( Func f ) const @@ -268,6 +299,22 @@ public: dispatch( begins, ends, f ); } + // iterate over local elements which are neighbours of overlaps (if all overlaps are 0, it has no effect) + template< typename Device2 = DeviceType, typename Func > + void forLocalBoundary( Func f ) const + { + // add dynamic sizes + LocalBeginsType skipBegins; + __ndarray_impl::SetSizesAddHelper< 1, LocalBeginsType, SizesHolderType, Overlaps >::add( skipBegins, localBegins, false ); + + // subtract dynamic sizes + SizesHolderType skipEnds; + __ndarray_impl::SetSizesSubtractHelper< 1, SizesHolderType, SizesHolderType, Overlaps >::subtract( skipEnds, localEnds, false ); + + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( localBegins, skipBegins, skipEnds, localEnds, f ); + } + protected: NDArrayView localView; CommunicationGroup group = Communicator::NullGroup; diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.h index f26fe1f10..31fd30639 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.h @@ -376,6 +376,90 @@ TYPED_TEST( DistributedNDArrayTest, 
forLocalInternal ) test_helper_forLocalInternal( this->distributedNDArray ); } +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forBoundary( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + a.forBoundary( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + { + if( gi == 0 || gi == a.template getSize< 0 >() - 1 ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + else + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + } + + a.setValue( 0 ); + a_view.forBoundary( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + { + if( gi == 0 || gi == a.template getSize< 0 >() - 1 ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + else + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + } +} + +TYPED_TEST( DistributedNDArrayTest, forBoundary ) +{ + test_helper_forBoundary( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forLocalBoundary( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + // empty set because all overlaps are 0 + a.forLocalBoundary( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( 
gi ), 0 ) + << "gi = " << gi; + + a.setValue( 0 ); + // empty set because all overlaps are 0 + a_view.forLocalBoundary( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArrayTest, forLocalBoundary ) +{ + test_helper_forLocalBoundary( this->distributedNDArray ); +} + #endif // HAVE_GTEST -- GitLab From c6221549e9e18d89134c5f15e4bc5201a5a1a460 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sat, 30 Mar 2019 09:47:16 +0100 Subject: [PATCH 14/25] NDArray refactoring: moved helpers from Indexing.h to SizesHolderHelpers.h --- src/TNL/Containers/ndarray/Indexing.h | 290 +---------------- .../Containers/ndarray/SizesHolderHelpers.h | 307 ++++++++++++++++++ 2 files changed, 308 insertions(+), 289 deletions(-) create mode 100644 src/TNL/Containers/ndarray/SizesHolderHelpers.h diff --git a/src/TNL/Containers/ndarray/Indexing.h b/src/TNL/Containers/ndarray/Indexing.h index 36a7f89ef..2aa65d8cb 100644 --- a/src/TNL/Containers/ndarray/Indexing.h +++ b/src/TNL/Containers/ndarray/Indexing.h @@ -12,300 +12,12 @@ #pragma once -#include - -#include -#include -#include - -#include +#include namespace TNL { namespace Containers { namespace __ndarray_impl { -// Dynamic storage size with alignment -template< typename SizesHolder, - typename Alignment, - typename LevelTag = IndexTag< SizesHolder::getDimension() - 1 > > -struct StorageSizeGetter -{ - static typename SizesHolder::IndexType - __cuda_callable__ - get( const SizesHolder& sizes ) - { - const auto size = Alignment::template getAlignedSize< LevelTag::value >( sizes ); - return size * StorageSizeGetter< SizesHolder, Alignment, IndexTag< LevelTag::value - 1 > >::get( sizes ); - } - - template< typename Permutation > - __cuda_callable__ - static typename SizesHolder::IndexType - getPermuted( const SizesHolder& sizes, Permutation ) - { - constexpr std::size_t idx = __ndarray_impl::get< 
LevelTag::value >( Permutation{} ); - const auto size = Alignment::template getAlignedSize< idx >( sizes ); - return size * StorageSizeGetter< SizesHolder, Alignment, IndexTag< LevelTag::value - 1 > >::get( sizes ); - } -}; - -template< typename SizesHolder, typename Alignment > -struct StorageSizeGetter< SizesHolder, Alignment, IndexTag< 0 > > -{ - static typename SizesHolder::IndexType - __cuda_callable__ - get( const SizesHolder& sizes ) - { - return Alignment::template getAlignedSize< 0 >( sizes ); - } - - template< typename Permutation > - __cuda_callable__ - static typename SizesHolder::IndexType - getPermuted( const SizesHolder& sizes, Permutation ) - { - constexpr std::size_t idx = __ndarray_impl::get< 0 >( Permutation{} ); - return Alignment::template getAlignedSize< idx >( sizes ); - } -}; - - -// Static storage size without alignment, used in StaticNDArray -template< typename SizesHolder, - typename LevelTag = IndexTag< SizesHolder::getDimension() - 1 > > -struct StaticStorageSizeGetter -{ - constexpr static std::size_t get() - { - return SizesHolder::template getStaticSize< LevelTag::value >() * - StaticStorageSizeGetter< SizesHolder, IndexTag< LevelTag::value - 1 > >::get(); - } -}; - -template< typename SizesHolder > -struct StaticStorageSizeGetter< SizesHolder, IndexTag< 0 > > -{ - constexpr static std::size_t get() - { - return SizesHolder::template getStaticSize< 0 >(); - } -}; - - -template< std::size_t level = 0, - typename SizesHolder, - typename Index, - typename... IndexTypes > -void setSizesHelper( SizesHolder& holder, - Index&& size, - IndexTypes&&... otherSizes ) -{ - holder.template setSize< level >( std::forward< Index >( size ) ); - setSizesHelper< level + 1 >( holder, std::forward< IndexTypes >( otherSizes )... 
); -} - -template< std::size_t level = 0, - typename SizesHolder, - typename Index > -void setSizesHelper( SizesHolder& holder, - Index&& size ) -{ - holder.template setSize< level >( std::forward< Index >( size ) ); -} - - -// helper for the forInternal method -template< std::size_t ConstValue, - typename TargetHolder, - typename SourceHolder, - typename Overlaps = make_constant_index_sequence< TargetHolder::getDimension(), 0 >, - std::size_t level = TargetHolder::getDimension() - 1 > -struct SetSizesSubtractHelper -{ - static void subtract( TargetHolder& target, - const SourceHolder& source, - bool negateOverlaps = true ) - { - if( source.template getStaticSize< level >() == 0 ) { - if( negateOverlaps ) - target.template setSize< level >( source.template getSize< level >() - ConstValue * ! get< level >( Overlaps{} ) ); - else - target.template setSize< level >( source.template getSize< level >() - ConstValue * !! get< level >( Overlaps{} ) ); - } - SetSizesSubtractHelper< ConstValue, TargetHolder, SourceHolder, Overlaps, level - 1 >::subtract( target, source ); - } -}; - -template< std::size_t ConstValue, - typename TargetHolder, - typename SourceHolder, - typename Overlaps > -struct SetSizesSubtractHelper< ConstValue, TargetHolder, SourceHolder, Overlaps, 0 > -{ - static void subtract( TargetHolder& target, - const SourceHolder& source, - bool negateOverlaps = true ) - { - if( source.template getStaticSize< 0 >() == 0 ) { - if( negateOverlaps ) - target.template setSize< 0 >( source.template getSize< 0 >() - ConstValue * ! get< 0 >( Overlaps{} ) ); - else - target.template setSize< 0 >( source.template getSize< 0 >() - ConstValue * !! 
get< 0 >( Overlaps{} ) ); - } - } -}; - - -// helper for the forInternal method (DistributedNDArray) -template< std::size_t ConstValue, - typename TargetHolder, - typename SourceHolder, - typename Overlaps = make_constant_index_sequence< TargetHolder::getDimension(), 0 >, - std::size_t level = TargetHolder::getDimension() - 1 > -struct SetSizesAddHelper -{ - static void add( TargetHolder& target, - const SourceHolder& source, - bool negateOverlaps = true ) - { - if( source.template getStaticSize< level >() == 0 ) { - if( negateOverlaps ) - target.template setSize< level >( source.template getSize< level >() + ConstValue * ! get< level >( Overlaps{} ) ); - else - target.template setSize< level >( source.template getSize< level >() + ConstValue * !! get< level >( Overlaps{} ) ); - } - SetSizesAddHelper< ConstValue, TargetHolder, SourceHolder, Overlaps, level - 1 >::add( target, source ); - } -}; - -template< std::size_t ConstValue, - typename TargetHolder, - typename SourceHolder, - typename Overlaps > -struct SetSizesAddHelper< ConstValue, TargetHolder, SourceHolder, Overlaps, 0 > -{ - static void add( TargetHolder& target, - const SourceHolder& source, - bool negateOverlaps = true ) - { - if( source.template getStaticSize< 0 >() == 0 ) { - if( negateOverlaps ) - target.template setSize< 0 >( source.template getSize< 0 >() + ConstValue * ! get< 0 >( Overlaps{} ) ); - else - target.template setSize< 0 >( source.template getSize< 0 >() + ConstValue * !! 
get< 0 >( Overlaps{} ) ); - } - } -}; - - -// helper for the forInternal method (DistributedNDArray) -template< typename TargetHolder, - typename SourceHolder, - std::size_t level = TargetHolder::getDimension() - 1 > -struct SetSizesMaxHelper -{ - static void max( TargetHolder& target, - const SourceHolder& source ) - { - if( source.template getStaticSize< level >() == 0 ) - target.template setSize< level >( std::max( target.template getSize< level >(), source.template getSize< level >() ) ); - SetSizesMaxHelper< TargetHolder, SourceHolder, level - 1 >::max( target, source ); - } -}; - -template< typename TargetHolder, - typename SourceHolder > -struct SetSizesMaxHelper< TargetHolder, SourceHolder, 0 > -{ - static void max( TargetHolder& target, - const SourceHolder& source ) - { - if( source.template getStaticSize< 0 >() == 0 ) - target.template setSize< 0 >( std::max( target.template getSize< 0 >(), source.template getSize< 0 >() ) ); - } -}; - - -// helper for the forInternal method (DistributedNDArray) -template< typename TargetHolder, - typename SourceHolder, - std::size_t level = TargetHolder::getDimension() - 1 > -struct SetSizesMinHelper -{ - static void min( TargetHolder& target, - const SourceHolder& source ) - { - if( source.template getStaticSize< level >() == 0 ) - target.template setSize< level >( std::min( target.template getSize< level >(), source.template getSize< level >() ) ); - SetSizesMinHelper< TargetHolder, SourceHolder, level - 1 >::min( target, source ); - } -}; - -template< typename TargetHolder, - typename SourceHolder > -struct SetSizesMinHelper< TargetHolder, SourceHolder, 0 > -{ - static void min( TargetHolder& target, - const SourceHolder& source ) - { - if( source.template getStaticSize< 0 >() == 0 ) - target.template setSize< 0 >( std::min( target.template getSize< 0 >(), source.template getSize< 0 >() ) ); - } -}; - - -// A variadic bounds-checker for indices -template< typename SizesHolder > -__cuda_callable__ -void 
assertIndicesInBounds( const SizesHolder& ) -{} - -template< typename SizesHolder, - typename Index, - typename... IndexTypes > -__cuda_callable__ -void assertIndicesInBounds( const SizesHolder& sizes, Index&& i, IndexTypes&&... indices ) -{ -#ifndef NDEBUG - // sizes.template getSize<...>() cannot be inside the assert macro, but the variables - // shouldn't be declared when compiling without assertions - constexpr std::size_t level = SizesHolder::getDimension() - sizeof...(indices) - 1; - const auto size = sizes.template getSize< level >(); - TNL_ASSERT_LT( i, size, "Input error - some index is out of bounds." ); -#endif - assertIndicesInBounds( sizes, std::forward< IndexTypes >( indices )... ); -} - - -// A variadic bounds-checker for distributed indices with overlaps -template< typename SizesHolder1, typename SizesHolder2, typename Overlaps > -__cuda_callable__ -void assertIndicesInRange( const SizesHolder1&, const SizesHolder2&, const Overlaps& ) -{} - -template< typename SizesHolder1, - typename SizesHolder2, - typename Overlaps, - typename Index, - typename... IndexTypes > -__cuda_callable__ -void assertIndicesInRange( const SizesHolder1& begins, const SizesHolder2& ends, const Overlaps& overlaps, Index&& i, IndexTypes&&... indices ) -{ - static_assert( SizesHolder1::getDimension() == SizesHolder2::getDimension(), - "Inconsistent begins and ends." ); -#ifndef NDEBUG - // sizes.template getSize<...>() cannot be inside the assert macro, but the variables - // shouldn't be declared when compiling without assertions - constexpr std::size_t level = SizesHolder1::getDimension() - sizeof...(indices) - 1; - const auto begin = begins.template getSize< level >(); - const auto end = ends.template getSize< level >(); - TNL_ASSERT_LE( begin - get( overlaps ), i, "Input error - some index is below the lower bound." ); - TNL_ASSERT_LT( i, end + get( overlaps ), "Input error - some index is above the upper bound." 
); -#endif - assertIndicesInRange( begins, ends, overlaps, std::forward< IndexTypes >( indices )... ); -} - - template< typename SizesHolder, typename Overlaps, typename Sequence > diff --git a/src/TNL/Containers/ndarray/SizesHolderHelpers.h b/src/TNL/Containers/ndarray/SizesHolderHelpers.h new file mode 100644 index 000000000..143224442 --- /dev/null +++ b/src/TNL/Containers/ndarray/SizesHolderHelpers.h @@ -0,0 +1,307 @@ +/*************************************************************************** + SizesHolderHelpers.h - description + ------------------- + begin : Dec 24, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include + +#include +#include + +namespace TNL { +namespace Containers { +namespace __ndarray_impl { + +// Dynamic storage size with alignment +template< typename SizesHolder, + typename Alignment, + typename LevelTag = IndexTag< SizesHolder::getDimension() - 1 > > +struct StorageSizeGetter +{ + static typename SizesHolder::IndexType + __cuda_callable__ + get( const SizesHolder& sizes ) + { + const auto size = Alignment::template getAlignedSize< LevelTag::value >( sizes ); + return size * StorageSizeGetter< SizesHolder, Alignment, IndexTag< LevelTag::value - 1 > >::get( sizes ); + } + + template< typename Permutation > + __cuda_callable__ + static typename SizesHolder::IndexType + getPermuted( const SizesHolder& sizes, Permutation ) + { + constexpr std::size_t idx = __ndarray_impl::get< LevelTag::value >( Permutation{} ); + const auto size = Alignment::template getAlignedSize< idx >( sizes ); + return size * StorageSizeGetter< SizesHolder, Alignment, IndexTag< LevelTag::value - 1 > >::get( sizes ); + } +}; + +template< typename SizesHolder, typename Alignment > +struct StorageSizeGetter< SizesHolder, Alignment, IndexTag< 0 
> > +{ + static typename SizesHolder::IndexType + __cuda_callable__ + get( const SizesHolder& sizes ) + { + return Alignment::template getAlignedSize< 0 >( sizes ); + } + + template< typename Permutation > + __cuda_callable__ + static typename SizesHolder::IndexType + getPermuted( const SizesHolder& sizes, Permutation ) + { + constexpr std::size_t idx = __ndarray_impl::get< 0 >( Permutation{} ); + return Alignment::template getAlignedSize< idx >( sizes ); + } +}; + + +// Static storage size without alignment, used in StaticNDArray +template< typename SizesHolder, + typename LevelTag = IndexTag< SizesHolder::getDimension() - 1 > > +struct StaticStorageSizeGetter +{ + constexpr static std::size_t get() + { + return SizesHolder::template getStaticSize< LevelTag::value >() * + StaticStorageSizeGetter< SizesHolder, IndexTag< LevelTag::value - 1 > >::get(); + } +}; + +template< typename SizesHolder > +struct StaticStorageSizeGetter< SizesHolder, IndexTag< 0 > > +{ + constexpr static std::size_t get() + { + return SizesHolder::template getStaticSize< 0 >(); + } +}; + + +template< std::size_t level = 0, + typename SizesHolder, + typename Index, + typename... IndexTypes > +void setSizesHelper( SizesHolder& holder, + Index&& size, + IndexTypes&&... otherSizes ) +{ + holder.template setSize< level >( std::forward< Index >( size ) ); + setSizesHelper< level + 1 >( holder, std::forward< IndexTypes >( otherSizes )... ); +} + +template< std::size_t level = 0, + typename SizesHolder, + typename Index > +void setSizesHelper( SizesHolder& holder, + Index&& size ) +{ + holder.template setSize< level >( std::forward< Index >( size ) ); +} + + +// A variadic bounds-checker for indices +template< typename SizesHolder > +__cuda_callable__ +void assertIndicesInBounds( const SizesHolder& ) +{} + +template< typename SizesHolder, + typename Index, + typename... IndexTypes > +__cuda_callable__ +void assertIndicesInBounds( const SizesHolder& sizes, Index&& i, IndexTypes&&... 
indices ) +{ +#ifndef NDEBUG + // sizes.template getSize<...>() cannot be inside the assert macro, but the variables + // shouldn't be declared when compiling without assertions + constexpr std::size_t level = SizesHolder::getDimension() - sizeof...(indices) - 1; + const auto size = sizes.template getSize< level >(); + TNL_ASSERT_LT( i, size, "Input error - some index is out of bounds." ); +#endif + assertIndicesInBounds( sizes, std::forward< IndexTypes >( indices )... ); +} + + +// A variadic bounds-checker for distributed indices with overlaps +template< typename SizesHolder1, typename SizesHolder2, typename Overlaps > +__cuda_callable__ +void assertIndicesInRange( const SizesHolder1&, const SizesHolder2&, const Overlaps& ) +{} + +template< typename SizesHolder1, + typename SizesHolder2, + typename Overlaps, + typename Index, + typename... IndexTypes > +__cuda_callable__ +void assertIndicesInRange( const SizesHolder1& begins, const SizesHolder2& ends, const Overlaps& overlaps, Index&& i, IndexTypes&&... indices ) +{ + static_assert( SizesHolder1::getDimension() == SizesHolder2::getDimension(), + "Inconsistent begins and ends." ); +#ifndef NDEBUG + // sizes.template getSize<...>() cannot be inside the assert macro, but the variables + // shouldn't be declared when compiling without assertions + constexpr std::size_t level = SizesHolder1::getDimension() - sizeof...(indices) - 1; + const auto begin = begins.template getSize< level >(); + const auto end = ends.template getSize< level >(); + TNL_ASSERT_LE( begin - get( overlaps ), i, "Input error - some index is below the lower bound." ); + TNL_ASSERT_LT( i, end + get( overlaps ), "Input error - some index is above the upper bound." ); +#endif + assertIndicesInRange( begins, ends, overlaps, std::forward< IndexTypes >( indices )... 
); +} + + +// helper for the forInternal method +template< std::size_t ConstValue, + typename TargetHolder, + typename SourceHolder, + typename Overlaps = make_constant_index_sequence< TargetHolder::getDimension(), 0 >, + std::size_t level = TargetHolder::getDimension() - 1 > +struct SetSizesSubtractHelper +{ + static void subtract( TargetHolder& target, + const SourceHolder& source, + bool negateOverlaps = true ) + { + if( source.template getStaticSize< level >() == 0 ) { + if( negateOverlaps ) + target.template setSize< level >( source.template getSize< level >() - ConstValue * ! get< level >( Overlaps{} ) ); + else + target.template setSize< level >( source.template getSize< level >() - ConstValue * !! get< level >( Overlaps{} ) ); + } + SetSizesSubtractHelper< ConstValue, TargetHolder, SourceHolder, Overlaps, level - 1 >::subtract( target, source ); + } +}; + +template< std::size_t ConstValue, + typename TargetHolder, + typename SourceHolder, + typename Overlaps > +struct SetSizesSubtractHelper< ConstValue, TargetHolder, SourceHolder, Overlaps, 0 > +{ + static void subtract( TargetHolder& target, + const SourceHolder& source, + bool negateOverlaps = true ) + { + if( source.template getStaticSize< 0 >() == 0 ) { + if( negateOverlaps ) + target.template setSize< 0 >( source.template getSize< 0 >() - ConstValue * ! get< 0 >( Overlaps{} ) ); + else + target.template setSize< 0 >( source.template getSize< 0 >() - ConstValue * !! 
get< 0 >( Overlaps{} ) ); + } + } +}; + + +// helper for the forInternal method (DistributedNDArray) +template< std::size_t ConstValue, + typename TargetHolder, + typename SourceHolder, + typename Overlaps = make_constant_index_sequence< TargetHolder::getDimension(), 0 >, + std::size_t level = TargetHolder::getDimension() - 1 > +struct SetSizesAddHelper +{ + static void add( TargetHolder& target, + const SourceHolder& source, + bool negateOverlaps = true ) + { + if( source.template getStaticSize< level >() == 0 ) { + if( negateOverlaps ) + target.template setSize< level >( source.template getSize< level >() + ConstValue * ! get< level >( Overlaps{} ) ); + else + target.template setSize< level >( source.template getSize< level >() + ConstValue * !! get< level >( Overlaps{} ) ); + } + SetSizesAddHelper< ConstValue, TargetHolder, SourceHolder, Overlaps, level - 1 >::add( target, source ); + } +}; + +template< std::size_t ConstValue, + typename TargetHolder, + typename SourceHolder, + typename Overlaps > +struct SetSizesAddHelper< ConstValue, TargetHolder, SourceHolder, Overlaps, 0 > +{ + static void add( TargetHolder& target, + const SourceHolder& source, + bool negateOverlaps = true ) + { + if( source.template getStaticSize< 0 >() == 0 ) { + if( negateOverlaps ) + target.template setSize< 0 >( source.template getSize< 0 >() + ConstValue * ! get< 0 >( Overlaps{} ) ); + else + target.template setSize< 0 >( source.template getSize< 0 >() + ConstValue * !! 
get< 0 >( Overlaps{} ) ); + } + } +}; + + +// helper for the forInternal method (DistributedNDArray) +template< typename TargetHolder, + typename SourceHolder, + std::size_t level = TargetHolder::getDimension() - 1 > +struct SetSizesMaxHelper +{ + static void max( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< level >() == 0 ) + target.template setSize< level >( std::max( target.template getSize< level >(), source.template getSize< level >() ) ); + SetSizesMaxHelper< TargetHolder, SourceHolder, level - 1 >::max( target, source ); + } +}; + +template< typename TargetHolder, + typename SourceHolder > +struct SetSizesMaxHelper< TargetHolder, SourceHolder, 0 > +{ + static void max( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< 0 >() == 0 ) + target.template setSize< 0 >( std::max( target.template getSize< 0 >(), source.template getSize< 0 >() ) ); + } +}; + + +// helper for the forInternal method (DistributedNDArray) +template< typename TargetHolder, + typename SourceHolder, + std::size_t level = TargetHolder::getDimension() - 1 > +struct SetSizesMinHelper +{ + static void min( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< level >() == 0 ) + target.template setSize< level >( std::min( target.template getSize< level >(), source.template getSize< level >() ) ); + SetSizesMinHelper< TargetHolder, SourceHolder, level - 1 >::min( target, source ); + } +}; + +template< typename TargetHolder, + typename SourceHolder > +struct SetSizesMinHelper< TargetHolder, SourceHolder, 0 > +{ + static void min( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< 0 >() == 0 ) + target.template setSize< 0 >( std::min( target.template getSize< 0 >(), source.template getSize< 0 >() ) ); + } +}; + +} // namespace __ndarray_impl +} // namespace Containers +} // namespace TNL -- GitLab From 
0dd215e8665eb002ebbfb1c7a30c2529e8041e6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sat, 30 Mar 2019 10:35:53 +0100 Subject: [PATCH 15/25] DistributedNDArray: added forOverlaps method --- src/TNL/Containers/DistributedNDArray.h | 32 +- src/TNL/Containers/DistributedNDArrayView.h | 32 +- src/TNL/Containers/ndarray/SizesHolder.h | 3 +- .../Containers/ndarray/SizesHolderHelpers.h | 114 ++++-- .../Containers/ndarray/CMakeLists.txt | 11 + .../DistributedNDArrayOverlapsTest.cpp | 1 + .../ndarray/DistributedNDArrayOverlapsTest.cu | 1 + .../ndarray/DistributedNDArrayOverlapsTest.h | 324 ++++++++++++++++++ .../ndarray/DistributedNDArrayTest.h | 37 ++ 9 files changed, 500 insertions(+), 55 deletions(-) create mode 100644 src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.cpp create mode 100644 src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.cu create mode 100644 src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.h diff --git a/src/TNL/Containers/DistributedNDArray.h b/src/TNL/Containers/DistributedNDArray.h index d5cf49024..339316af9 100644 --- a/src/TNL/Containers/DistributedNDArray.h +++ b/src/TNL/Containers/DistributedNDArray.h @@ -258,13 +258,13 @@ public: template< typename Device2 = DeviceType, typename Func > void forLocalInternal( Func f ) const { - // add dynamic sizes + // add overlaps to dynamic sizes LocalBeginsType begins; - __ndarray_impl::SetSizesAddHelper< 1, LocalBeginsType, SizesHolderType, Overlaps >::add( begins, localBegins, false ); + __ndarray_impl::SetSizesAddOverlapsHelper< LocalBeginsType, SizesHolderType, Overlaps >::add( begins, localBegins ); - // subtract dynamic sizes + // subtract overlaps from dynamic sizes SizesHolderType ends; - __ndarray_impl::SetSizesSubtractHelper< 1, SizesHolderType, SizesHolderType, Overlaps >::subtract( ends, localEnds, false ); + __ndarray_impl::SetSizesSubtractOverlapsHelper< SizesHolderType, SizesHolderType, Overlaps >::subtract( ends, 
localEnds ); __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; dispatch( begins, ends, f ); @@ -274,18 +274,34 @@ public: template< typename Device2 = DeviceType, typename Func > void forLocalBoundary( Func f ) const { - // add dynamic sizes + // add overlaps to dynamic sizes LocalBeginsType skipBegins; - __ndarray_impl::SetSizesAddHelper< 1, LocalBeginsType, SizesHolderType, Overlaps >::add( skipBegins, localBegins, false ); + __ndarray_impl::SetSizesAddOverlapsHelper< LocalBeginsType, SizesHolderType, Overlaps >::add( skipBegins, localBegins ); - // subtract dynamic sizes + // subtract overlaps from dynamic sizes SizesHolderType skipEnds; - __ndarray_impl::SetSizesSubtractHelper< 1, SizesHolderType, SizesHolderType, Overlaps >::subtract( skipEnds, localEnds, false ); + __ndarray_impl::SetSizesSubtractOverlapsHelper< SizesHolderType, SizesHolderType, Overlaps >::subtract( skipEnds, localEnds ); __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; dispatch( localBegins, skipBegins, skipEnds, localEnds, f ); } + // iterate over elements of overlaps (if all overlaps are 0, it has no effect) + template< typename Device2 = DeviceType, typename Func > + void forOverlaps( Func f ) const + { + // subtract overlaps from dynamic sizes + LocalBeginsType begins; + __ndarray_impl::SetSizesSubtractOverlapsHelper< LocalBeginsType, SizesHolderType, Overlaps >::subtract( begins, localBegins ); + + // add overlaps to dynamic sizes + SizesHolderType ends; + __ndarray_impl::SetSizesAddOverlapsHelper< SizesHolderType, SizesHolderType, Overlaps >::add( ends, localEnds ); + + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, localBegins, localEnds, ends, f ); + } + // extra methods diff --git a/src/TNL/Containers/DistributedNDArrayView.h b/src/TNL/Containers/DistributedNDArrayView.h index f8150a8d8..f143f5d75 100644 --- a/src/TNL/Containers/DistributedNDArrayView.h +++ 
b/src/TNL/Containers/DistributedNDArrayView.h @@ -287,13 +287,13 @@ public: template< typename Device2 = DeviceType, typename Func > void forLocalInternal( Func f ) const { - // add dynamic sizes + // add overlaps to dynamic sizes LocalBeginsType begins; - __ndarray_impl::SetSizesAddHelper< 1, LocalBeginsType, SizesHolderType, Overlaps >::add( begins, localBegins, false ); + __ndarray_impl::SetSizesAddOverlapsHelper< LocalBeginsType, SizesHolderType, Overlaps >::add( begins, localBegins ); - // subtract dynamic sizes + // subtract overlaps from dynamic sizes SizesHolderType ends; - __ndarray_impl::SetSizesSubtractHelper< 1, SizesHolderType, SizesHolderType, Overlaps >::subtract( ends, localEnds, false ); + __ndarray_impl::SetSizesSubtractOverlapsHelper< SizesHolderType, SizesHolderType, Overlaps >::subtract( ends, localEnds ); __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; dispatch( begins, ends, f ); @@ -303,18 +303,34 @@ public: template< typename Device2 = DeviceType, typename Func > void forLocalBoundary( Func f ) const { - // add dynamic sizes + // add overlaps to dynamic sizes LocalBeginsType skipBegins; - __ndarray_impl::SetSizesAddHelper< 1, LocalBeginsType, SizesHolderType, Overlaps >::add( skipBegins, localBegins, false ); + __ndarray_impl::SetSizesAddOverlapsHelper< LocalBeginsType, SizesHolderType, Overlaps >::add( skipBegins, localBegins ); - // subtract dynamic sizes + // subtract overlaps from dynamic sizes SizesHolderType skipEnds; - __ndarray_impl::SetSizesSubtractHelper< 1, SizesHolderType, SizesHolderType, Overlaps >::subtract( skipEnds, localEnds, false ); + __ndarray_impl::SetSizesSubtractOverlapsHelper< SizesHolderType, SizesHolderType, Overlaps >::subtract( skipEnds, localEnds ); __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; dispatch( localBegins, skipBegins, skipEnds, localEnds, f ); } + // iterate over elements of overlaps (if all overlaps are 0, it has no effect) + template< 
typename Device2 = DeviceType, typename Func > + void forOverlaps( Func f ) const + { + // subtract overlaps from dynamic sizes + LocalBeginsType begins; + __ndarray_impl::SetSizesSubtractOverlapsHelper< LocalBeginsType, SizesHolderType, Overlaps >::subtract( begins, localBegins ); + + // add overlaps to dynamic sizes + SizesHolderType ends; + __ndarray_impl::SetSizesAddOverlapsHelper< SizesHolderType, SizesHolderType, Overlaps >::add( ends, localEnds ); + + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, localBegins, localEnds, ends, f ); + } + protected: NDArrayView localView; CommunicationGroup group = Communicator::NullGroup; diff --git a/src/TNL/Containers/ndarray/SizesHolder.h b/src/TNL/Containers/ndarray/SizesHolder.h index 408d6ed92..ec89a7550 100644 --- a/src/TNL/Containers/ndarray/SizesHolder.h +++ b/src/TNL/Containers/ndarray/SizesHolder.h @@ -37,7 +37,7 @@ public: void setSize( LevelTag, Index newSize ) { - TNL_ASSERT( newSize == 0, ); + TNL_ASSERT_EQ( newSize, 0, "Dynamic size for a static dimension must be 0." ); } __cuda_callable__ @@ -60,7 +60,6 @@ public: void setSize( LevelTag, Index size ) { - TNL_ASSERT( size >= 0, ); this->size = size; } diff --git a/src/TNL/Containers/ndarray/SizesHolderHelpers.h b/src/TNL/Containers/ndarray/SizesHolderHelpers.h index 143224442..d98bb3bdc 100644 --- a/src/TNL/Containers/ndarray/SizesHolderHelpers.h +++ b/src/TNL/Containers/ndarray/SizesHolderHelpers.h @@ -128,7 +128,7 @@ void assertIndicesInBounds( const SizesHolder& sizes, Index&& i, IndexTypes&&... // shouldn't be declared when compiling without assertions constexpr std::size_t level = SizesHolder::getDimension() - sizeof...(indices) - 1; const auto size = sizes.template getSize< level >(); - TNL_ASSERT_LT( i, size, "Input error - some index is out of bounds." ); + TNL_ASSERT_LT( i, (Index) size, "Input error - some index is out of bounds." 
); #endif assertIndicesInBounds( sizes, std::forward< IndexTypes >( indices )... ); } @@ -156,14 +156,14 @@ void assertIndicesInRange( const SizesHolder1& begins, const SizesHolder2& ends, constexpr std::size_t level = SizesHolder1::getDimension() - sizeof...(indices) - 1; const auto begin = begins.template getSize< level >(); const auto end = ends.template getSize< level >(); - TNL_ASSERT_LE( begin - get( overlaps ), i, "Input error - some index is below the lower bound." ); - TNL_ASSERT_LT( i, end + get( overlaps ), "Input error - some index is above the upper bound." ); + TNL_ASSERT_LE( begin - (decltype(begin)) get( overlaps ), i, "Input error - some index is below the lower bound." ); + TNL_ASSERT_LT( i, end + (decltype(end)) get( overlaps ), "Input error - some index is above the upper bound." ); #endif assertIndicesInRange( begins, ends, overlaps, std::forward< IndexTypes >( indices )... ); } -// helper for the forInternal method +// helper for the forInternal and forBoundary methods (NDArray and DistributedNDArray) template< std::size_t ConstValue, typename TargetHolder, typename SourceHolder, @@ -172,15 +172,10 @@ template< std::size_t ConstValue, struct SetSizesSubtractHelper { static void subtract( TargetHolder& target, - const SourceHolder& source, - bool negateOverlaps = true ) + const SourceHolder& source ) { - if( source.template getStaticSize< level >() == 0 ) { - if( negateOverlaps ) - target.template setSize< level >( source.template getSize< level >() - ConstValue * ! get< level >( Overlaps{} ) ); - else - target.template setSize< level >( source.template getSize< level >() - ConstValue * !! get< level >( Overlaps{} ) ); - } + if( source.template getStaticSize< level >() == 0 ) + target.template setSize< level >( source.template getSize< level >() - ConstValue * ! 
get< level >( Overlaps{} ) ); SetSizesSubtractHelper< ConstValue, TargetHolder, SourceHolder, Overlaps, level - 1 >::subtract( target, source ); } }; @@ -192,20 +187,15 @@ template< std::size_t ConstValue, struct SetSizesSubtractHelper< ConstValue, TargetHolder, SourceHolder, Overlaps, 0 > { static void subtract( TargetHolder& target, - const SourceHolder& source, - bool negateOverlaps = true ) + const SourceHolder& source ) { - if( source.template getStaticSize< 0 >() == 0 ) { - if( negateOverlaps ) - target.template setSize< 0 >( source.template getSize< 0 >() - ConstValue * ! get< 0 >( Overlaps{} ) ); - else - target.template setSize< 0 >( source.template getSize< 0 >() - ConstValue * !! get< 0 >( Overlaps{} ) ); - } + if( source.template getStaticSize< 0 >() == 0 ) + target.template setSize< 0 >( source.template getSize< 0 >() - ConstValue * ! get< 0 >( Overlaps{} ) ); } }; -// helper for the forInternal method (DistributedNDArray) +// helper for the forInternal and forBoundary methods (DistributedNDArray) template< std::size_t ConstValue, typename TargetHolder, typename SourceHolder, @@ -214,15 +204,10 @@ template< std::size_t ConstValue, struct SetSizesAddHelper { static void add( TargetHolder& target, - const SourceHolder& source, - bool negateOverlaps = true ) + const SourceHolder& source ) { - if( source.template getStaticSize< level >() == 0 ) { - if( negateOverlaps ) - target.template setSize< level >( source.template getSize< level >() + ConstValue * ! get< level >( Overlaps{} ) ); - else - target.template setSize< level >( source.template getSize< level >() + ConstValue * !! get< level >( Overlaps{} ) ); - } + if( source.template getStaticSize< level >() == 0 ) + target.template setSize< level >( source.template getSize< level >() + ConstValue * ! 
get< level >( Overlaps{} ) ); SetSizesAddHelper< ConstValue, TargetHolder, SourceHolder, Overlaps, level - 1 >::add( target, source ); } }; @@ -234,15 +219,70 @@ template< std::size_t ConstValue, struct SetSizesAddHelper< ConstValue, TargetHolder, SourceHolder, Overlaps, 0 > { static void add( TargetHolder& target, - const SourceHolder& source, - bool negateOverlaps = true ) + const SourceHolder& source ) { - if( source.template getStaticSize< 0 >() == 0 ) { - if( negateOverlaps ) - target.template setSize< 0 >( source.template getSize< 0 >() + ConstValue * ! get< 0 >( Overlaps{} ) ); - else - target.template setSize< 0 >( source.template getSize< 0 >() + ConstValue * !! get< 0 >( Overlaps{} ) ); - } + if( source.template getStaticSize< 0 >() == 0 ) + target.template setSize< 0 >( source.template getSize< 0 >() + ConstValue * ! get< 0 >( Overlaps{} ) ); + } +}; + + +// helper for the forLocalInternal, forLocalBoundary and forOverlaps methods (DistributedNDArray) +template< typename TargetHolder, + typename SourceHolder, + typename Overlaps = make_constant_index_sequence< TargetHolder::getDimension(), 0 >, + std::size_t level = TargetHolder::getDimension() - 1 > +struct SetSizesSubtractOverlapsHelper +{ + static void subtract( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< level >() == 0 ) + target.template setSize< level >( source.template getSize< level >() - get< level >( Overlaps{} ) ); + SetSizesSubtractOverlapsHelper< TargetHolder, SourceHolder, Overlaps, level - 1 >::subtract( target, source ); + } +}; + +template< typename TargetHolder, + typename SourceHolder, + typename Overlaps > +struct SetSizesSubtractOverlapsHelper< TargetHolder, SourceHolder, Overlaps, 0 > +{ + static void subtract( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< 0 >() == 0 ) + target.template setSize< 0 >( source.template getSize< 0 >() - get< 0 >( Overlaps{} ) ); + } +}; + + +// helper for 
the forLocalInternal, forLocalBoundary and forOverlaps methods (DistributedNDArray) +template< typename TargetHolder, + typename SourceHolder, + typename Overlaps = make_constant_index_sequence< TargetHolder::getDimension(), 0 >, + std::size_t level = TargetHolder::getDimension() - 1 > +struct SetSizesAddOverlapsHelper +{ + static void add( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< level >() == 0 ) + target.template setSize< level >( source.template getSize< level >() + get< level >( Overlaps{} ) ); + SetSizesAddOverlapsHelper< TargetHolder, SourceHolder, Overlaps, level - 1 >::add( target, source ); + } +}; + +template< typename TargetHolder, + typename SourceHolder, + typename Overlaps > +struct SetSizesAddOverlapsHelper< TargetHolder, SourceHolder, Overlaps, 0 > +{ + static void add( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< 0 >() == 0 ) + target.template setSize< 0 >( source.template getSize< 0 >() + get< 0 >( Overlaps{} ) ); } }; diff --git a/src/UnitTests/Containers/ndarray/CMakeLists.txt b/src/UnitTests/Containers/ndarray/CMakeLists.txt index 9e7e0ff4e..d2b385eeb 100644 --- a/src/UnitTests/Containers/ndarray/CMakeLists.txt +++ b/src/UnitTests/Containers/ndarray/CMakeLists.txt @@ -30,12 +30,23 @@ if( ${BUILD_MPI} ) CUDA_ADD_EXECUTABLE( DistributedNDArrayTest DistributedNDArrayTest.cu OPTIONS ${CXX_TESTS_FLAGS} ) TARGET_LINK_LIBRARIES( DistributedNDArrayTest ${GTEST_BOTH_LIBRARIES} ) + + CUDA_ADD_EXECUTABLE( DistributedNDArrayOverlapsTest DistributedNDArrayOverlapsTest.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArrayOverlapsTest ${GTEST_BOTH_LIBRARIES} ) else() ADD_EXECUTABLE( DistributedNDArrayTest DistributedNDArrayTest.cpp ) TARGET_COMPILE_OPTIONS( DistributedNDArrayTest PRIVATE ${CXX_TESTS_FLAGS} ) TARGET_LINK_LIBRARIES( DistributedNDArrayTest ${GTEST_BOTH_LIBRARIES} ) + + ADD_EXECUTABLE( DistributedNDArrayOverlapsTest 
DistributedNDArrayOverlapsTest.cpp ) + TARGET_COMPILE_OPTIONS( DistributedNDArrayOverlapsTest PRIVATE ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArrayOverlapsTest ${GTEST_BOTH_LIBRARIES} ) endif() SET( mpi_test_parameters -np 4 -H localhost:4 "${EXECUTABLE_OUTPUT_PATH}/DistributedNDArrayTest${CMAKE_EXECUTABLE_SUFFIX}" ) ADD_TEST( NAME DistributedNDArrayTest COMMAND "mpirun" ${mpi_test_parameters}) + + SET( mpi_test_parameters -np 4 -H localhost:4 "${EXECUTABLE_OUTPUT_PATH}/DistributedNDArrayOverlapsTest${CMAKE_EXECUTABLE_SUFFIX}" ) + ADD_TEST( NAME DistributedNDArrayOverlapsTest COMMAND "mpirun" ${mpi_test_parameters}) endif() diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.cpp b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.cpp new file mode 100644 index 000000000..cb294c4fd --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.cpp @@ -0,0 +1 @@ +#include "DistributedNDArrayOverlapsTest.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.cu b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.cu new file mode 100644 index 000000000..cb294c4fd --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.cu @@ -0,0 +1 @@ +#include "DistributedNDArrayOverlapsTest.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.h new file mode 100644 index 000000000..b04fab529 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.h @@ -0,0 +1,324 @@ +/*************************************************************************** + DistributedNDArrayOverlapsTest.h - description + ------------------- + begin : Dec 27, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +#ifdef HAVE_GTEST +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace TNL; +using namespace TNL::Containers; + +/* + * Light check of DistributedNDArray. + * + * - Number of processes is not limited. + * - Global size is hardcoded as 97 to force non-uniform distribution. + * - Communication group is hardcoded as AllGroup -- it may be changed as needed. + */ +template< typename DistributedNDArray > +class DistributedNDArrayOverlapsTest +: public ::testing::Test +{ +protected: + using ValueType = typename DistributedNDArray::ValueType; + using DeviceType = typename DistributedNDArray::DeviceType; + using CommunicatorType = typename DistributedNDArray::CommunicatorType; + using IndexType = typename DistributedNDArray::IndexType; + using DistributedNDArrayType = DistributedNDArray; + + // TODO: use ndarray + using LocalArrayType = Array< ValueType, DeviceType, IndexType >; + using LocalArrayViewType = ArrayView< ValueType, DeviceType, IndexType >; + + const int globalSize = 97; // prime number to force non-uniform distribution + const int overlaps = __ndarray_impl::get< 0 >( typename DistributedNDArray::OverlapsType{} ); + + const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + + DistributedNDArrayType distributedNDArray; + + const int rank = CommunicatorType::GetRank(group); + const int nproc = CommunicatorType::GetSize(group); + + DistributedNDArrayOverlapsTest() + { + using LocalRangeType = typename DistributedNDArray::LocalRangeType; + const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); + distributedNDArray.setSizes( globalSize ); + distributedNDArray.template setDistribution< 0 >( localRange.getBegin(), localRange.getEnd(), group ); + distributedNDArray.allocate(); + + EXPECT_EQ( 
distributedNDArray.template getLocalRange< 0 >(), localRange ); + EXPECT_EQ( distributedNDArray.getCommunicationGroup(), group ); + } +}; + +// types for which DistributedNDArrayOverlapsTest is instantiated +using DistributedNDArrayTypes = ::testing::Types< + DistributedNDArray< NDArray< double, + SizesHolder< int, 0 >, + std::index_sequence< 0 >, + std::index_sequence< 0 >, + Devices::Host >, + Communicators::MpiCommunicator, + std::index_sequence< 2 > >, + DistributedNDArray< NDArray< double, + SizesHolder< int, 0 >, + std::index_sequence< 0 >, + std::index_sequence< 0 >, + Devices::Host >, + Communicators::NoDistrCommunicator, + std::index_sequence< 2 > > +#ifdef HAVE_CUDA + , + DistributedNDArray< NDArray< double, + SizesHolder< int, 0 >, + std::index_sequence< 0 >, + std::index_sequence< 0 >, + Devices::Cuda >, + Communicators::MpiCommunicator, + std::index_sequence< 2 > >, + DistributedNDArray< NDArray< double, + SizesHolder< int, 0 >, + std::index_sequence< 0 >, + std::index_sequence< 0 >, + Devices::Cuda >, + Communicators::NoDistrCommunicator, + std::index_sequence< 2 > > +#endif +>; + +TYPED_TEST_SUITE( DistributedNDArrayOverlapsTest, DistributedNDArrayTypes ); + +TYPED_TEST( DistributedNDArrayOverlapsTest, checkSumOfLocalSizes ) +{ + using CommunicatorType = typename TestFixture::CommunicatorType; + + const auto localRange = this->distributedNDArray.template getLocalRange< 0 >(); + const int localSize = localRange.getEnd() - localRange.getBegin(); + int sumOfLocalSizes = 0; + CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); + EXPECT_EQ( sumOfLocalSizes, this->globalSize ); + EXPECT_EQ( this->distributedNDArray.template getSize< 0 >(), this->globalSize ); + + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), 2 * this->overlaps + localSize ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< 
typename DistributedArray > +void test_helper_forLocalInternal( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const int overlaps = __ndarray_impl::get< 0 >( typename DistributedArray::OverlapsType{} ); + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + a.forLocalInternal( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + + a.setValue( 0 ); + a_view.forLocalInternal( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArrayOverlapsTest, forLocalInternal ) +{ + test_helper_forLocalInternal( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forLocalBoundary( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const int overlaps = __ndarray_impl::get< 0 >( typename DistributedArray::OverlapsType{} ); + const auto localRange = a.template getLocalRange< 
0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + a.forLocalBoundary( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + + a.setValue( 0 ); + a_view.forLocalBoundary( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArrayOverlapsTest, forLocalBoundary ) +{ + test_helper_forLocalBoundary( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forOverlaps( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const int overlaps = __ndarray_impl::get< 0 >( typename DistributedArray::OverlapsType{} ); + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + a.forOverlaps( setter ); + + for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 
) + << "gi = " << gi; + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + + a.setValue( 0 ); + a_view.forOverlaps( setter ); + + for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArrayOverlapsTest, forOverlaps ) +{ + test_helper_forOverlaps( this->distributedNDArray ); +} + +#endif // HAVE_GTEST + + +#if (defined(HAVE_GTEST) && defined(HAVE_MPI)) +using CommunicatorType = Communicators::MpiCommunicator; + +#include + +class MinimalistBufferedPrinter +: public ::testing::EmptyTestEventListener +{ +private: + std::stringstream sout; + +public: + // Called before a test starts. + virtual void OnTestStart(const ::testing::TestInfo& test_info) + { + sout << test_info.test_case_name() << "." << test_info.name() << " Start." << std::endl; + } + + // Called after a failed assertion or a SUCCEED() invocation. + virtual void OnTestPartResult(const ::testing::TestPartResult& test_part_result) + { + sout << (test_part_result.failed() ? 
"====Failure=== " : "===Success=== ") + << test_part_result.file_name() << " " + << test_part_result.line_number() <listeners(); + + delete listeners.Release(listeners.default_result_printer()); + listeners.Append(new MinimalistBufferedPrinter); + + Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv); + #endif + return RUN_ALL_TESTS(); +#else + throw GtestMissingError(); +#endif +} diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.h index 31fd30639..11d295170 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.h @@ -460,6 +460,43 @@ TYPED_TEST( DistributedNDArrayTest, forLocalBoundary ) test_helper_forLocalBoundary( this->distributedNDArray ); } +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forOverlaps( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + // empty set because all overlaps are 0 + a.forOverlaps( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + + a.setValue( 0 ); + // empty set because all overlaps are 0 + a_view.forOverlaps( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArrayTest, forOverlaps ) +{ + test_helper_forOverlaps( this->distributedNDArray ); +} + #endif // HAVE_GTEST -- GitLab From 3934beaf5ee8f12406916c0cc33077f5a3d37dfe Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 31 Mar 2019 22:56:47 +0200 Subject: [PATCH 16/25] Added DistributedNDArraySynchronizer --- .../DistributedNDArraySynchronizer.h | 242 ++++++++++++++++++ src/TNL/Containers/ndarray/Indexing.h | 45 ++++ .../Containers/ndarray/SynchronizerBuffers.h | 89 +++++++ .../ndarray/DistributedNDArrayOverlapsTest.h | 83 +++++- 4 files changed, 445 insertions(+), 14 deletions(-) create mode 100644 src/TNL/Containers/DistributedNDArraySynchronizer.h create mode 100644 src/TNL/Containers/ndarray/SynchronizerBuffers.h diff --git a/src/TNL/Containers/DistributedNDArraySynchronizer.h b/src/TNL/Containers/DistributedNDArraySynchronizer.h new file mode 100644 index 000000000..e6e41ba33 --- /dev/null +++ b/src/TNL/Containers/DistributedNDArraySynchronizer.h @@ -0,0 +1,242 @@ +/*************************************************************************** + DistributedNDArraySynchronizer.h - description + ------------------- + begin : Mar 30, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include + +#include + +namespace TNL { +namespace Containers { + +template< typename DistributedNDArray > +class DistributedNDArraySynchronizer +{ +public: + void synchronize( DistributedNDArray& array ) + { + auto future = synchronizeAsync( array, std::launch::deferred ); + future.wait(); + } + + // This method is not thread-safe - only the thread which created and "owns" the + // instance of this object can call this method. + // Also note that this method must not be called again until the previous + // asynchronous operation has finished. 
+ std::shared_future synchronizeAsync( DistributedNDArray& array, std::launch policy = std::launch::async ) + { + // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/ + #ifdef HAVE_CUDA + if( std::is_same< typename DistributedNDArray::DeviceType, Devices::Cuda >::value ) + cudaGetDevice(&this->gpu_id); + #endif + + // NOTE: the allocation cannot be done in the worker, otherwise CUDA would crash + // skip allocation on repeated calls - compare only sizes, not the actual data + if( array_view.getCommunicationGroup() != array.getCommunicationGroup() || + array_view.getSizes() != array.getSizes() || + array_view.getLocalBegins() != array.getLocalBegins() || + array_view.getLocalEnds() != array.getLocalEnds() ) + { + array_view.bind( array.getView() ); + + // allocate buffers + TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), AllocateHelper >::execHost( buffers, array_view ); + } + else { + // only bind to the actual data + array_view.bind( array.getView() ); + } + + auto worker = [this](){ this->worker(); }; + return std::async( policy, worker ); + } + +protected: + using DistributedNDArrayView = typename DistributedNDArray::ViewType; + using Communicator = typename DistributedNDArray::CommunicatorType; + using Buffers = __ndarray_impl::SynchronizerBuffers< DistributedNDArray >; + + DistributedNDArrayView array_view; + Buffers buffers; + int gpu_id = 0; + + void worker() + { + // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/ + #ifdef HAVE_CUDA + if( std::is_same< typename DistributedNDArray::DeviceType, Devices::Cuda >::value ) + cudaSetDevice(gpu_id); + #endif + + // fill send buffers + TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), CopyHelper >::execHost( buffers, array_view, true ); + + // issue all send and receive async operations + std::vector< typename Communicator::Request > requests; + const typename 
Communicator::CommunicationGroup group = array_view.getCommunicationGroup(); + TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), SendHelper >::execHost( buffers, requests, group ); + + // wait until send is done + Communicator::WaitAll( requests.data(), requests.size() ); + + // copy data from receive buffers + TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), CopyHelper >::execHost( buffers, array_view, false ); + } + + template< std::size_t dim > + struct AllocateHelper + { + static void exec( Buffers& buffers, const DistributedNDArrayView& array_view ) + { + auto& dim_buffers = buffers.template getDimBuffers< dim >(); + + constexpr std::size_t overlap = __ndarray_impl::get< dim >( typename DistributedNDArray::OverlapsType{} ); + // TODO +// constexpr std::size_t overlap = array_view.template getOverlap< dim >(); + if( overlap == 0 ) { + dim_buffers.reset(); + return; + } + + using LocalBegins = typename DistributedNDArray::LocalBeginsType; + using SizesHolder = typename DistributedNDArray::SizesHolderType; + const LocalBegins& localBegins = array_view.getLocalBegins(); + const SizesHolder& localEnds = array_view.getLocalEnds(); + + SizesHolder bufferSize( localEnds ); + bufferSize.template setSize< dim >( overlap ); + + dim_buffers.left_send_buffer.setSize( bufferSize ); + dim_buffers.left_recv_buffer.setSize( bufferSize ); + dim_buffers.right_send_buffer.setSize( bufferSize ); + dim_buffers.right_recv_buffer.setSize( bufferSize ); + + // TODO: check overlap offsets for 2D and 3D distributions (watch out for the corners - maybe use SetSizesSubtractOverlapsHelper?) 
+ + // offsets for left-send + dim_buffers.left_send_offsets = localBegins; + + // offsets for left-receive + dim_buffers.left_recv_offsets = localBegins; + dim_buffers.left_recv_offsets.template setSize< dim >( localBegins.template getSize< dim >() - overlap ); + + // offsets for right-send + dim_buffers.right_send_offsets = localBegins; + dim_buffers.right_send_offsets.template setSize< dim >( localEnds.template getSize< dim >() - overlap ); + + // offsets for right-receive + dim_buffers.right_recv_offsets = localBegins; + dim_buffers.right_recv_offsets.template setSize< dim >( localEnds.template getSize< dim >() ); + + // FIXME: set proper neighbor IDs !!! + const typename Communicator::CommunicationGroup group = array_view.getCommunicationGroup(); + const int rank = Communicator::GetRank(group); + const int nproc = Communicator::GetSize(group); + dim_buffers.left_neighbor = (rank + nproc - 1) % nproc; + dim_buffers.right_neighbor = (rank + 1) % nproc; + } + }; + + template< std::size_t dim > + struct CopyHelper + { + static void exec( Buffers& buffers, DistributedNDArrayView& array_view, bool to_buffer ) + { + const std::size_t overlap = __ndarray_impl::get< dim >( typename DistributedNDArray::OverlapsType{} ); + if( overlap == 0 ) + return; + + auto& dim_buffers = buffers.template getDimBuffers< dim >(); + + // TODO: specify CUDA stream for the copy, otherwise async won't work !!! 
+ CopyKernel< decltype(dim_buffers.left_send_buffer.getView()) > copy_kernel; + copy_kernel.array_view.bind( array_view ); + copy_kernel.to_buffer = to_buffer; + + if( to_buffer ) { + copy_kernel.buffer_view.bind( dim_buffers.left_send_buffer.getView() ); + copy_kernel.array_offsets = dim_buffers.left_send_offsets; + dim_buffers.left_send_buffer.forAll( copy_kernel ); + + copy_kernel.buffer_view.bind( dim_buffers.right_send_buffer.getView() ); + copy_kernel.array_offsets = dim_buffers.right_send_offsets; + dim_buffers.right_send_buffer.forAll( copy_kernel ); + } + else { + copy_kernel.buffer_view.bind( dim_buffers.left_recv_buffer.getView() ); + copy_kernel.array_offsets = dim_buffers.left_recv_offsets; + dim_buffers.left_recv_buffer.forAll( copy_kernel ); + + copy_kernel.buffer_view.bind( dim_buffers.right_recv_buffer.getView() ); + copy_kernel.array_offsets = dim_buffers.right_recv_offsets; + dim_buffers.right_recv_buffer.forAll( copy_kernel ); + } + } + }; + + template< std::size_t dim > + struct SendHelper + { + template< typename Requests, typename Group > + static void exec( Buffers& buffers, Requests& requests, Group group ) + { + const std::size_t overlap = __ndarray_impl::get< dim >( typename DistributedNDArray::OverlapsType{} ); + if( overlap == 0 ) + return; + + auto& dim_buffers = buffers.template getDimBuffers< dim >(); + + requests.push_back( Communicator::ISend( dim_buffers.left_send_buffer.getStorageArray().getData(), + dim_buffers.left_send_buffer.getStorageSize(), + dim_buffers.left_neighbor, 0, group ) ); + requests.push_back( Communicator::IRecv( dim_buffers.left_recv_buffer.getStorageArray().getData(), + dim_buffers.left_recv_buffer.getStorageSize(), + dim_buffers.left_neighbor, 1, group ) ); + requests.push_back( Communicator::ISend( dim_buffers.right_send_buffer.getStorageArray().getData(), + dim_buffers.right_send_buffer.getStorageSize(), + dim_buffers.right_neighbor, 1, group ) ); + requests.push_back( Communicator::IRecv( 
dim_buffers.right_recv_buffer.getStorageArray().getData(), + dim_buffers.right_recv_buffer.getStorageSize(), + dim_buffers.right_neighbor, 0, group ) ); + } + }; + +#ifdef __NVCC__ +public: +#endif + template< typename BufferView > + struct CopyKernel + { + using ArrayView = typename DistributedNDArray::ViewType; + using LocalBegins = typename ArrayView::LocalBeginsType; + + BufferView buffer_view; + ArrayView array_view; + LocalBegins array_offsets; + bool to_buffer; + + template< typename... Indices > + __cuda_callable__ + void operator()( Indices... indices ) + { + if( to_buffer ) + buffer_view( indices... ) = call_with_shifted_indices( array_offsets, array_view, indices... ); + else + call_with_shifted_indices( array_offsets, array_view, indices... ) = buffer_view( indices... ); + } + }; +}; + +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/ndarray/Indexing.h b/src/TNL/Containers/ndarray/Indexing.h index 2aa65d8cb..04316ffe0 100644 --- a/src/TNL/Containers/ndarray/Indexing.h +++ b/src/TNL/Containers/ndarray/Indexing.h @@ -18,6 +18,51 @@ namespace TNL { namespace Containers { namespace __ndarray_impl { +template< typename OffsetsHolder, + typename Sequence > +struct IndexShiftHelper +{}; + +template< typename OffsetsHolder, + std::size_t... N > +struct IndexShiftHelper< OffsetsHolder, std::index_sequence< N... > > +{ + template< typename Func, + typename... Indices > + __cuda_callable__ + static auto apply( const OffsetsHolder& offsets, Func&& f, Indices&&... indices ) -> decltype(auto) + { + return f( ( std::forward< Indices >( indices ) + offsets.template getSize< N >() )... ); + } + + template< typename Func, + typename... Indices > + static auto apply_host( const OffsetsHolder& offsets, Func&& f, Indices&&... indices ) -> decltype(auto) + { + return f( ( std::forward< Indices >( indices ) + offsets.template getSize< N >() )... ); + } +}; + +template< typename OffsetsHolder, + typename Func, + typename... 
Indices > +__cuda_callable__ +auto call_with_shifted_indices( const OffsetsHolder& offsets, Func&& f, Indices&&... indices ) -> decltype(auto) +{ + return IndexShiftHelper< OffsetsHolder, std::make_index_sequence< sizeof...( Indices ) > > + ::apply( offsets, std::forward< Func >( f ), std::forward< Indices >( indices )... ); +} + +template< typename OffsetsHolder, + typename Func, + typename... Indices > +auto host_call_with_unshifted_indices( const OffsetsHolder& offsets, Func&& f, Indices&&... indices ) -> decltype(auto) +{ + return IndexShiftHelper< OffsetsHolder, std::make_index_sequence< sizeof...( Indices ) > > + ::apply_host( offsets, std::forward< Func >( f ), std::forward< Indices >( indices )... ); +} + + template< typename SizesHolder, typename Overlaps, typename Sequence > diff --git a/src/TNL/Containers/ndarray/SynchronizerBuffers.h b/src/TNL/Containers/ndarray/SynchronizerBuffers.h new file mode 100644 index 000000000..5b6441a70 --- /dev/null +++ b/src/TNL/Containers/ndarray/SynchronizerBuffers.h @@ -0,0 +1,89 @@ +/*************************************************************************** + SynchronizerBuffers.h - description + ------------------- + begin : Mar 30, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include + +namespace TNL { +namespace Containers { +namespace __ndarray_impl { + +template< typename DistributedNDArray, std::size_t level > +struct SynchronizerBuffersLayer +{ + SynchronizerBuffersLayer& getDimBuffers( std::integral_constant< std::size_t, level > ) + { + return *this; + } + + using NDArrayType = NDArray< typename DistributedNDArray::ValueType, + typename DistributedNDArray::SizesHolderType, + typename DistributedNDArray::PermutationType, + typename DistributedNDArray::PermutationType, + typename DistributedNDArray::DeviceType >; + NDArrayType left_send_buffer, left_recv_buffer, right_send_buffer, right_recv_buffer; + typename DistributedNDArray::LocalBeginsType left_send_offsets, left_recv_offsets, right_send_offsets, right_recv_offsets; + + int left_neighbor = -1; + int right_neighbor = -1; + + void reset() + { + left_send_buffer.reset(); + left_recv_buffer.reset(); + right_send_buffer.reset(); + right_recv_buffer.reset(); + + left_send_offsets = left_recv_offsets = right_send_offsets = right_recv_offsets = typename DistributedNDArray::LocalBeginsType{}; + + left_neighbor = right_neighbor = -1; + } +}; + +template< typename DistributedNDArray, + typename LevelTag = std::integral_constant< std::size_t, DistributedNDArray::getDimension() > > +struct SynchronizerBuffersLayerHelper +{}; + +template< typename DistributedNDArray, std::size_t level > +struct SynchronizerBuffersLayerHelper< DistributedNDArray, std::integral_constant< std::size_t, level > > +: public SynchronizerBuffersLayerHelper< DistributedNDArray, std::integral_constant< std::size_t, level - 1 > >, + public SynchronizerBuffersLayer< DistributedNDArray, level > +{ + using SynchronizerBuffersLayerHelper< DistributedNDArray, std::integral_constant< std::size_t, level 
- 1 > >::getDimBuffers; + using SynchronizerBuffersLayer< DistributedNDArray, level >::getDimBuffers; +}; + +template< typename DistributedNDArray > +struct SynchronizerBuffersLayerHelper< DistributedNDArray, std::integral_constant< std::size_t, 0 > > +: public SynchronizerBuffersLayer< DistributedNDArray, 0 > +{ + using SynchronizerBuffersLayer< DistributedNDArray, 0 >::getDimBuffers; +}; + +template< typename DistributedNDArray > +struct SynchronizerBuffers +: public SynchronizerBuffersLayerHelper< DistributedNDArray > +{ + using SynchronizerBuffersLayerHelper< DistributedNDArray >::getDimBuffers; + + template< std::size_t level > + auto& getDimBuffers() + { + return this->getDimBuffers( std::integral_constant< std::size_t, level >{} ); + } +}; + +} // namespace __ndarray_impl +} // namespace Containers +} // namespace TNL diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.h index b04fab529..7ffee51ac 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -73,14 +74,15 @@ using DistributedNDArrayTypes = ::testing::Types< std::index_sequence< 0 >, Devices::Host >, Communicators::MpiCommunicator, - std::index_sequence< 2 > >, - DistributedNDArray< NDArray< double, - SizesHolder< int, 0 >, - std::index_sequence< 0 >, - std::index_sequence< 0 >, - Devices::Host >, - Communicators::NoDistrCommunicator, std::index_sequence< 2 > > +// TODO: does it make sense for NoDistrCommunicator? 
+// DistributedNDArray< NDArray< double, +// SizesHolder< int, 0 >, +// std::index_sequence< 0 >, +// std::index_sequence< 0 >, +// Devices::Host >, +// Communicators::NoDistrCommunicator, +// std::index_sequence< 2 > > #ifdef HAVE_CUDA , DistributedNDArray< NDArray< double, @@ -89,14 +91,15 @@ using DistributedNDArrayTypes = ::testing::Types< std::index_sequence< 0 >, Devices::Cuda >, Communicators::MpiCommunicator, - std::index_sequence< 2 > >, - DistributedNDArray< NDArray< double, - SizesHolder< int, 0 >, - std::index_sequence< 0 >, - std::index_sequence< 0 >, - Devices::Cuda >, - Communicators::NoDistrCommunicator, std::index_sequence< 2 > > +// TODO: does it make sense for NoDistrCommunicator? +// DistributedNDArray< NDArray< double, +// SizesHolder< int, 0 >, +// std::index_sequence< 0 >, +// std::index_sequence< 0 >, +// Devices::Cuda >, +// Communicators::NoDistrCommunicator, +// std::index_sequence< 2 > > #endif >; @@ -260,6 +263,58 @@ TYPED_TEST( DistributedNDArrayOverlapsTest, forOverlaps ) test_helper_forOverlaps( this->distributedNDArray ); } +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_synchronize( DistributedArray& a, const int rank, const int nproc ) +{ + using IndexType = typename DistributedArray::IndexType; + + const int overlaps = __ndarray_impl::get< 0 >( typename DistributedArray::OverlapsType{} ); + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) = rank; + }; + + a.setValue( -1 ); + a.forAll( setter ); + DistributedNDArraySynchronizer< DistributedArray > s1; + s1.synchronize( a ); + + for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) + EXPECT_EQ( a.getElement( gi ), (rank + nproc - 1) % nproc ) + << "gi = " << gi; + for( int 
gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), rank ) + << "gi = " << gi; + for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), (rank + 1) % nproc ) + << "gi = " << gi; + + a.setValue( -1 ); + a_view.forAll( setter ); + DistributedNDArraySynchronizer< decltype(a_view) > s2; + s2.synchronize( a_view ); + + for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) + EXPECT_EQ( a.getElement( gi ), (rank + nproc - 1) % nproc ) + << "gi = " << gi; + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), rank ) + << "gi = " << gi; + for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), (rank + 1) % nproc ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArrayOverlapsTest, synchronize ) +{ + test_helper_synchronize( this->distributedNDArray, this->rank, this->nproc ); +} + #endif // HAVE_GTEST -- GitLab From 298dd4212fdfda06bb86465abfdda082bab5b073 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sat, 6 Apr 2019 11:47:10 +0200 Subject: [PATCH 17/25] DistributedNDArray: added tests for semi-1D distribution --- .../Containers/ndarray/CMakeLists.txt | 50 +- .../DistributedNDArrayOverlapsTest.cpp | 1 - .../ndarray/DistributedNDArrayOverlapsTest.cu | 1 - .../DistributedNDArrayOverlaps_1D_test.cpp | 1 + .../DistributedNDArrayOverlaps_1D_test.cu | 1 + ...h => DistributedNDArrayOverlaps_1D_test.h} | 40 +- ...DistributedNDArrayOverlaps_semi1D_test.cpp | 1 + .../DistributedNDArrayOverlaps_semi1D_test.cu | 1 + .../DistributedNDArrayOverlaps_semi1D_test.h | 411 +++++++++++++ .../ndarray/DistributedNDArrayTest.cpp | 1 - .../ndarray/DistributedNDArrayTest.cu | 1 - .../ndarray/DistributedNDArray_1D_test.cpp | 1 + .../ndarray/DistributedNDArray_1D_test.cu | 1 + ...rayTest.h => DistributedNDArray_1D_test.h} | 38 +- 
.../DistributedNDArray_semi1D_test.cpp | 1 + .../ndarray/DistributedNDArray_semi1D_test.cu | 1 + .../ndarray/DistributedNDArray_semi1D_test.h | 543 ++++++++++++++++++ 17 files changed, 1033 insertions(+), 61 deletions(-) delete mode 100644 src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.cpp delete mode 100644 src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.cu create mode 100644 src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.cpp create mode 100644 src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.cu rename src/UnitTests/Containers/ndarray/{DistributedNDArrayOverlapsTest.h => DistributedNDArrayOverlaps_1D_test.h} (92%) create mode 100644 src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.cpp create mode 100644 src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.cu create mode 100644 src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h delete mode 100644 src/UnitTests/Containers/ndarray/DistributedNDArrayTest.cpp delete mode 100644 src/UnitTests/Containers/ndarray/DistributedNDArrayTest.cu create mode 100644 src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.cpp create mode 100644 src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.cu rename src/UnitTests/Containers/ndarray/{DistributedNDArrayTest.h => DistributedNDArray_1D_test.h} (94%) create mode 100644 src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.cpp create mode 100644 src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.cu create mode 100644 src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h diff --git a/src/UnitTests/Containers/ndarray/CMakeLists.txt b/src/UnitTests/Containers/ndarray/CMakeLists.txt index d2b385eeb..990012865 100644 --- a/src/UnitTests/Containers/ndarray/CMakeLists.txt +++ b/src/UnitTests/Containers/ndarray/CMakeLists.txt @@ -27,26 +27,48 @@ endif() if( ${BUILD_MPI} ) if( BUILD_CUDA ) - 
CUDA_ADD_EXECUTABLE( DistributedNDArrayTest DistributedNDArrayTest.cu + CUDA_ADD_EXECUTABLE( DistributedNDArray_1D_test DistributedNDArray_1D_test.cu OPTIONS ${CXX_TESTS_FLAGS} ) - TARGET_LINK_LIBRARIES( DistributedNDArrayTest ${GTEST_BOTH_LIBRARIES} ) + TARGET_LINK_LIBRARIES( DistributedNDArray_1D_test ${GTEST_BOTH_LIBRARIES} ) - CUDA_ADD_EXECUTABLE( DistributedNDArrayOverlapsTest DistributedNDArrayOverlapsTest.cu + CUDA_ADD_EXECUTABLE( DistributedNDArray_semi1D_test DistributedNDArray_semi1D_test.cu OPTIONS ${CXX_TESTS_FLAGS} ) - TARGET_LINK_LIBRARIES( DistributedNDArrayOverlapsTest ${GTEST_BOTH_LIBRARIES} ) + TARGET_LINK_LIBRARIES( DistributedNDArray_semi1D_test ${GTEST_BOTH_LIBRARIES} ) + + CUDA_ADD_EXECUTABLE( DistributedNDArrayOverlaps_1D_test DistributedNDArrayOverlaps_1D_test.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArrayOverlaps_1D_test ${GTEST_BOTH_LIBRARIES} ) + + CUDA_ADD_EXECUTABLE( DistributedNDArrayOverlaps_semi1D_test DistributedNDArrayOverlaps_semi1D_test.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArrayOverlaps_semi1D_test ${GTEST_BOTH_LIBRARIES} ) else() - ADD_EXECUTABLE( DistributedNDArrayTest DistributedNDArrayTest.cpp ) - TARGET_COMPILE_OPTIONS( DistributedNDArrayTest PRIVATE ${CXX_TESTS_FLAGS} ) - TARGET_LINK_LIBRARIES( DistributedNDArrayTest ${GTEST_BOTH_LIBRARIES} ) + ADD_EXECUTABLE( DistributedNDArray_1D_test DistributedNDArray_1D_test.cpp ) + TARGET_COMPILE_OPTIONS( DistributedNDArray_1D_test PRIVATE ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArray_1D_test ${GTEST_BOTH_LIBRARIES} ) - ADD_EXECUTABLE( DistributedNDArrayOverlapsTest DistributedNDArrayOverlapsTest.cpp ) - TARGET_COMPILE_OPTIONS( DistributedNDArrayOverlapsTest PRIVATE ${CXX_TESTS_FLAGS} ) - TARGET_LINK_LIBRARIES( DistributedNDArrayOverlapsTest ${GTEST_BOTH_LIBRARIES} ) + ADD_EXECUTABLE( DistributedNDArray_semi1D_test DistributedNDArray_semi1D_test.cpp ) + TARGET_COMPILE_OPTIONS( 
DistributedNDArray_semi1D_test PRIVATE ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArray_semi1D_test ${GTEST_BOTH_LIBRARIES} ) + + ADD_EXECUTABLE( DistributedNDArrayOverlaps_1D_test DistributedNDArrayOverlaps_1D_test.cpp ) + TARGET_COMPILE_OPTIONS( DistributedNDArrayOverlaps_1D_test PRIVATE ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArrayOverlaps_1D_test ${GTEST_BOTH_LIBRARIES} ) + + ADD_EXECUTABLE( DistributedNDArrayOverlaps_semi1D_test DistributedNDArrayOverlaps_semi1D_test.cpp ) + TARGET_COMPILE_OPTIONS( DistributedNDArrayOverlaps_semi1D_test PRIVATE ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArrayOverlaps_semi1D_test ${GTEST_BOTH_LIBRARIES} ) endif() - SET( mpi_test_parameters -np 4 -H localhost:4 "${EXECUTABLE_OUTPUT_PATH}/DistributedNDArrayTest${CMAKE_EXECUTABLE_SUFFIX}" ) - ADD_TEST( NAME DistributedNDArrayTest COMMAND "mpirun" ${mpi_test_parameters}) + SET( mpi_test_parameters -np 4 -H localhost:4 "${EXECUTABLE_OUTPUT_PATH}/DistributedNDArray_1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) + ADD_TEST( NAME DistributedNDArray_1D_test COMMAND "mpirun" ${mpi_test_parameters}) + + SET( mpi_test_parameters -np 4 -H localhost:4 "${EXECUTABLE_OUTPUT_PATH}/DistributedNDArray_semi1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) + ADD_TEST( NAME DistributedNDArray_semi1D_test COMMAND "mpirun" ${mpi_test_parameters}) + + SET( mpi_test_parameters -np 4 -H localhost:4 "${EXECUTABLE_OUTPUT_PATH}/DistributedNDArrayOverlaps_1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) + ADD_TEST( NAME DistributedNDArrayOverlaps_1D_test COMMAND "mpirun" ${mpi_test_parameters}) - SET( mpi_test_parameters -np 4 -H localhost:4 "${EXECUTABLE_OUTPUT_PATH}/DistributedNDArrayOverlapsTest${CMAKE_EXECUTABLE_SUFFIX}" ) - ADD_TEST( NAME DistributedNDArrayOverlapsTest COMMAND "mpirun" ${mpi_test_parameters}) + SET( mpi_test_parameters -np 4 -H localhost:4 "${EXECUTABLE_OUTPUT_PATH}/DistributedNDArrayOverlaps_semi1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) + ADD_TEST( NAME 
DistributedNDArrayOverlaps_semi1D_test COMMAND "mpirun" ${mpi_test_parameters}) endif() diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.cpp b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.cpp deleted file mode 100644 index cb294c4fd..000000000 --- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.cpp +++ /dev/null @@ -1 +0,0 @@ -#include "DistributedNDArrayOverlapsTest.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.cu b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.cu deleted file mode 100644 index cb294c4fd..000000000 --- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.cu +++ /dev/null @@ -1 +0,0 @@ -#include "DistributedNDArrayOverlapsTest.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.cpp b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.cpp new file mode 100644 index 000000000..b0aa8e8e2 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.cpp @@ -0,0 +1 @@ +#include "DistributedNDArrayOverlaps_1D_test.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.cu b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.cu new file mode 100644 index 000000000..b0aa8e8e2 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.cu @@ -0,0 +1 @@ +#include "DistributedNDArrayOverlaps_1D_test.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h similarity index 92% rename from src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.h rename to src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h index 7ffee51ac..eea9d84f9 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlapsTest.h +++ 
b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h @@ -1,5 +1,5 @@ /*************************************************************************** - DistributedNDArrayOverlapsTest.h - description + DistributedNDArrayOverlaps_1D_test.h - description ------------------- begin : Dec 27, 2018 copyright : (C) 2018 by Tomas Oberhuber et al. @@ -29,7 +29,7 @@ using namespace TNL::Containers; * - Communication group is hardcoded as AllGroup -- it may be changed as needed. */ template< typename DistributedNDArray > -class DistributedNDArrayOverlapsTest +class DistributedNDArrayOverlaps_1D_test : public ::testing::Test { protected: @@ -53,7 +53,7 @@ protected: const int rank = CommunicatorType::GetRank(group); const int nproc = CommunicatorType::GetSize(group); - DistributedNDArrayOverlapsTest() + DistributedNDArrayOverlaps_1D_test() { using LocalRangeType = typename DistributedNDArray::LocalRangeType; const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); @@ -66,7 +66,7 @@ protected: } }; -// types for which DistributedNDArrayOverlapsTest is instantiated +// types for which DistributedNDArrayOverlaps_1D_test is instantiated using DistributedNDArrayTypes = ::testing::Types< DistributedNDArray< NDArray< double, SizesHolder< int, 0 >, @@ -103,9 +103,9 @@ using DistributedNDArrayTypes = ::testing::Types< #endif >; -TYPED_TEST_SUITE( DistributedNDArrayOverlapsTest, DistributedNDArrayTypes ); +TYPED_TEST_SUITE( DistributedNDArrayOverlaps_1D_test, DistributedNDArrayTypes ); -TYPED_TEST( DistributedNDArrayOverlapsTest, checkSumOfLocalSizes ) +TYPED_TEST( DistributedNDArrayOverlaps_1D_test, checkSumOfLocalSizes ) { using CommunicatorType = typename TestFixture::CommunicatorType; @@ -162,7 +162,7 @@ void test_helper_forLocalInternal( DistributedArray& a ) << "gi = " << gi; } -TYPED_TEST( DistributedNDArrayOverlapsTest, forLocalInternal ) +TYPED_TEST( DistributedNDArrayOverlaps_1D_test, forLocalInternal ) { 
test_helper_forLocalInternal( this->distributedNDArray ); } @@ -210,7 +210,7 @@ void test_helper_forLocalBoundary( DistributedArray& a ) << "gi = " << gi; } -TYPED_TEST( DistributedNDArrayOverlapsTest, forLocalBoundary ) +TYPED_TEST( DistributedNDArrayOverlaps_1D_test, forLocalBoundary ) { test_helper_forLocalBoundary( this->distributedNDArray ); } @@ -258,7 +258,7 @@ void test_helper_forOverlaps( DistributedArray& a ) << "gi = " << gi; } -TYPED_TEST( DistributedNDArrayOverlapsTest, forOverlaps ) +TYPED_TEST( DistributedNDArrayOverlaps_1D_test, forOverlaps ) { test_helper_forOverlaps( this->distributedNDArray ); } @@ -276,7 +276,7 @@ void test_helper_synchronize( DistributedArray& a, const int rank, const int npr auto setter = [=] __cuda_callable__ ( IndexType i ) mutable { - a_view( i ) = rank; + a_view( i ) = i; }; a.setValue( -1 ); @@ -285,14 +285,11 @@ void test_helper_synchronize( DistributedArray& a, const int rank, const int npr s1.synchronize( a ); for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) - EXPECT_EQ( a.getElement( gi ), (rank + nproc - 1) % nproc ) - << "gi = " << gi; + EXPECT_EQ( a.getElement( gi ), gi + ((rank == 0) ? 97 : 0) ); for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) - EXPECT_EQ( a.getElement( gi ), rank ) - << "gi = " << gi; + EXPECT_EQ( a.getElement( gi ), gi ); for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) - EXPECT_EQ( a.getElement( gi ), (rank + 1) % nproc ) - << "gi = " << gi; + EXPECT_EQ( a.getElement( gi ), gi - ((rank == nproc-1) ? 97 : 0) ); a.setValue( -1 ); a_view.forAll( setter ); @@ -300,17 +297,14 @@ void test_helper_synchronize( DistributedArray& a, const int rank, const int npr s2.synchronize( a_view ); for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) - EXPECT_EQ( a.getElement( gi ), (rank + nproc - 1) % nproc ) - << "gi = " << gi; + EXPECT_EQ( a.getElement( gi ), gi + ((rank == 0) ? 
97 : 0) ); for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) - EXPECT_EQ( a.getElement( gi ), rank ) - << "gi = " << gi; + EXPECT_EQ( a.getElement( gi ), gi ); for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) - EXPECT_EQ( a.getElement( gi ), (rank + 1) % nproc ) - << "gi = " << gi; + EXPECT_EQ( a.getElement( gi ), gi - ((rank == nproc-1) ? 97 : 0) ); } -TYPED_TEST( DistributedNDArrayOverlapsTest, synchronize ) +TYPED_TEST( DistributedNDArrayOverlaps_1D_test, synchronize ) { test_helper_synchronize( this->distributedNDArray, this->rank, this->nproc ); } diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.cpp b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.cpp new file mode 100644 index 000000000..cf74a71d1 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.cpp @@ -0,0 +1 @@ +#include "DistributedNDArrayOverlaps_semi1D_test.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.cu b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.cu new file mode 100644 index 000000000..cf74a71d1 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.cu @@ -0,0 +1 @@ +#include "DistributedNDArrayOverlaps_semi1D_test.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h new file mode 100644 index 000000000..a019b3139 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h @@ -0,0 +1,411 @@ +/*************************************************************************** + DistributedNDArrayOverlaps_semi1D_test.h - description + ------------------- + begin : Dec 9, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +#ifdef HAVE_GTEST +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace TNL; +using namespace TNL::Containers; + +/* + * Light check of DistributedNDArray. + * + * - Number of processes is not limited. + * - Global size is hardcoded as 97 to force non-uniform distribution. + * - Communication group is hardcoded as AllGroup -- it may be changed as needed. + */ +template< typename DistributedNDArray > +class DistributedNDArrayOverlaps_semi1D_test +: public ::testing::Test +{ +protected: + using ValueType = typename DistributedNDArray::ValueType; + using DeviceType = typename DistributedNDArray::DeviceType; + using CommunicatorType = typename DistributedNDArray::CommunicatorType; + using IndexType = typename DistributedNDArray::IndexType; + using DistributedNDArrayType = DistributedNDArray; + + // TODO: use ndarray + using LocalArrayType = Array< ValueType, DeviceType, IndexType >; + using LocalArrayViewType = ArrayView< ValueType, DeviceType, IndexType >; + + const int globalSize = 97; // prime number to force non-uniform distribution + const int overlaps = __ndarray_impl::get< 1 >( typename DistributedNDArray::OverlapsType{} ); + + const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + + DistributedNDArrayType distributedNDArray; + + const int rank = CommunicatorType::GetRank(group); + const int nproc = CommunicatorType::GetSize(group); + + DistributedNDArrayOverlaps_semi1D_test() + { + using LocalRangeType = typename DistributedNDArray::LocalRangeType; + const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); + distributedNDArray.setSizes( 0, globalSize, globalSize / 2 ); + distributedNDArray.template setDistribution< 1 >( localRange.getBegin(), localRange.getEnd(), group ); + 
distributedNDArray.allocate(); + + EXPECT_EQ( distributedNDArray.template getLocalRange< 1 >(), localRange ); + EXPECT_EQ( distributedNDArray.getCommunicationGroup(), group ); + } +}; + +// types for which DistributedNDArrayOverlaps_semi1D_test is instantiated +using DistributedNDArrayTypes = ::testing::Types< + DistributedNDArray< NDArray< double, + SizesHolder< int, 9, 0, 0 >, // Q, X, Y + std::index_sequence< 0, 1, 2 >, // permutation - should not matter + std::index_sequence< 0, 1, 2 >, // permutation - should not matter + Devices::Host >, + Communicators::MpiCommunicator, + std::index_sequence< 0, 2, 0 > > +#ifdef HAVE_CUDA + , + DistributedNDArray< NDArray< double, + SizesHolder< int, 9, 0, 0 >, // Q, X, Y + std::index_sequence< 0, 1, 2 >, // permutation - should not matter + std::index_sequence< 0, 1, 2 >, // permutation - should not matter + Devices::Cuda >, + Communicators::MpiCommunicator, + std::index_sequence< 0, 2, 0 > > +#endif +>; + +TYPED_TEST_SUITE( DistributedNDArrayOverlaps_semi1D_test, DistributedNDArrayTypes ); + +TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, checkSumOfLocalSizes ) +{ + using CommunicatorType = typename TestFixture::CommunicatorType; + + const auto localRange = this->distributedNDArray.template getLocalRange< 1 >(); + const int localSize = localRange.getEnd() - localRange.getBegin(); + int sumOfLocalSizes = 0; + CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); + EXPECT_EQ( sumOfLocalSizes, this->globalSize ); + EXPECT_EQ( this->distributedNDArray.template getSize< 1 >(), this->globalSize ); + + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), 9 * (2 * this->overlaps + localSize) * (this->globalSize / 2) ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forLocalInternal( DistributedArray& a ) +{ + using 
IndexType = typename DistributedArray::IndexType; + + const int overlaps = __ndarray_impl::get< 1 >( typename DistributedArray::OverlapsType{} ); + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + a.forLocalInternal( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + + a.setValue( 0 ); + a_view.forLocalInternal( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, forLocalInternal ) +{ + test_helper_forLocalInternal( 
this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forLocalBoundary( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const int overlaps = __ndarray_impl::get< 1 >( typename DistributedArray::OverlapsType{} ); + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + a.forLocalBoundary( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + + a.setValue( 0 ); + a_view.forLocalBoundary( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi 
= localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, forLocalBoundary ) +{ + test_helper_forLocalBoundary( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forOverlaps( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const int overlaps = __ndarray_impl::get< 1 >( typename DistributedArray::OverlapsType{} ); + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + a.forOverlaps( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + + a.setValue( 0 ); + a_view.forOverlaps( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + for( int q = 0; q < 9; 
q++ )
+   for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ )
+   for( int j = 0; j < a.template getSize< 2 >(); j++ )
+      EXPECT_EQ( a.getElement( q, gi, j ), 0 )
+            << "gi = " << gi;
+   for( int q = 0; q < 9; q++ )
+   for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ )
+   for( int j = 0; j < a.template getSize< 2 >(); j++ )
+      EXPECT_EQ( a.getElement( q, gi, j ), 1 )
+            << "gi = " << gi;
+}
+
+TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, forOverlaps )
+{
+   test_helper_forOverlaps( this->distributedNDArray );
+}
+
+// separate function because nvcc does not allow __cuda_callable__ lambdas inside
+// private or protected methods (which are created by TYPED_TEST macro)
+template< typename DistributedArray >
+void test_helper_synchronize( DistributedArray& a, const int rank, const int nproc )
+{
+   using IndexType = typename DistributedArray::IndexType;
+
+   const int overlaps = __ndarray_impl::get< 1 >( typename DistributedArray::OverlapsType{} );
+   const auto localRange = a.template getLocalRange< 1 >();
+   auto a_view = a.getView();
+
+   auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable
+   {
+      // the array is 3D, so all three indices must be passed; the stored value
+      // is the distributed-axis index i, matching the EXPECT_EQ checks below
+      a_view( q, i, j ) = i;
+   };
+
+   a.setValue( -1 );
+   a.forAll( setter );
+   DistributedNDArraySynchronizer< DistributedArray > s1;
+   s1.synchronize( a );
+
+   for( int q = 0; q < 9; q++ )
+   for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ )
+   for( int j = 0; j < a.template getSize< 2 >(); j++ )
+      EXPECT_EQ( a.getElement( q, gi, j ), gi + ((rank == 0) ?
97 : 0) ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), gi ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), gi - ((rank == nproc-1) ? 97 : 0) ) + << "gi = " << gi; + + a.setValue( -1 ); + a_view.forAll( setter ); + DistributedNDArraySynchronizer< decltype(a_view) > s2; + s2.synchronize( a_view ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), gi + ((rank == 0) ? 97 : 0) ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), gi ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), gi - ((rank == nproc-1) ? 97 : 0) ) + << "gi = " << gi; +} + +//TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, synchronize ) +//{ +// test_helper_synchronize( this->distributedNDArray, this->rank, this->nproc ); +//} + +#endif // HAVE_GTEST + + +#if (defined(HAVE_GTEST) && defined(HAVE_MPI)) +using CommunicatorType = Communicators::MpiCommunicator; + +#include + +class MinimalistBufferedPrinter +: public ::testing::EmptyTestEventListener +{ +private: + std::stringstream sout; + +public: + // Called before a test starts. + virtual void OnTestStart(const ::testing::TestInfo& test_info) + { + sout << test_info.test_case_name() << "." 
<< test_info.name() << " Start." << std::endl; + } + + // Called after a failed assertion or a SUCCEED() invocation. + virtual void OnTestPartResult(const ::testing::TestPartResult& test_part_result) + { + sout << (test_part_result.failed() ? "====Failure=== " : "===Success=== ") + << test_part_result.file_name() << " " + << test_part_result.line_number() <listeners(); + + delete listeners.Release(listeners.default_result_printer()); + listeners.Append(new MinimalistBufferedPrinter); + + Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv); + #endif + return RUN_ALL_TESTS(); +#else + throw GtestMissingError(); +#endif +} diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.cpp b/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.cpp deleted file mode 100644 index d526d56d0..000000000 --- a/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.cpp +++ /dev/null @@ -1 +0,0 @@ -#include "DistributedNDArrayTest.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.cu b/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.cu deleted file mode 100644 index d526d56d0..000000000 --- a/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.cu +++ /dev/null @@ -1 +0,0 @@ -#include "DistributedNDArrayTest.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.cpp b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.cpp new file mode 100644 index 000000000..f234d7711 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.cpp @@ -0,0 +1 @@ +#include "DistributedNDArray_1D_test.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.cu b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.cu new file mode 100644 index 000000000..f234d7711 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.cu @@ -0,0 +1 @@ +#include "DistributedNDArray_1D_test.h" diff --git 
a/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h similarity index 94% rename from src/UnitTests/Containers/ndarray/DistributedNDArrayTest.h rename to src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h index 11d295170..dec9cb821 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArrayTest.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h @@ -1,5 +1,5 @@ /*************************************************************************** - DistributedNDArrayTest.h - description + DistributedNDArray_1D_test.h - description ------------------- begin : Dec 27, 2018 copyright : (C) 2018 by Tomas Oberhuber et al. @@ -28,7 +28,7 @@ using namespace TNL::Containers; * - Communication group is hardcoded as AllGroup -- it may be changed as needed. */ template< typename DistributedNDArray > -class DistributedNDArrayTest +class DistributedNDArray_1D_test : public ::testing::Test { protected: @@ -51,7 +51,7 @@ protected: const int rank = CommunicatorType::GetRank(group); const int nproc = CommunicatorType::GetSize(group); - DistributedNDArrayTest() + DistributedNDArray_1D_test() { using LocalRangeType = typename DistributedNDArray::LocalRangeType; const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); @@ -64,7 +64,7 @@ protected: } }; -// types for which DistributedNDArrayTest is instantiated +// types for which DistributedNDArray_1D_test is instantiated using DistributedNDArrayTypes = ::testing::Types< DistributedNDArray< NDArray< double, SizesHolder< int, 0 >, @@ -95,9 +95,9 @@ using DistributedNDArrayTypes = ::testing::Types< #endif >; -TYPED_TEST_SUITE( DistributedNDArrayTest, DistributedNDArrayTypes ); +TYPED_TEST_SUITE( DistributedNDArray_1D_test, DistributedNDArrayTypes ); -TYPED_TEST( DistributedNDArrayTest, checkSumOfLocalSizes ) +TYPED_TEST( DistributedNDArray_1D_test, checkSumOfLocalSizes ) { using 
CommunicatorType = typename TestFixture::CommunicatorType; @@ -109,7 +109,7 @@ TYPED_TEST( DistributedNDArrayTest, checkSumOfLocalSizes ) EXPECT_EQ( this->distributedNDArray.template getSize< 0 >(), this->globalSize ); } -TYPED_TEST( DistributedNDArrayTest, setLike ) +TYPED_TEST( DistributedNDArray_1D_test, setLike ) { using DistributedNDArrayType = typename TestFixture::DistributedNDArrayType; @@ -121,7 +121,7 @@ TYPED_TEST( DistributedNDArrayTest, setLike ) EXPECT_EQ( copy.getLocalStorageSize(), localRange.getEnd() - localRange.getBegin() ); } -TYPED_TEST( DistributedNDArrayTest, reset ) +TYPED_TEST( DistributedNDArray_1D_test, reset ) { const auto localRange = this->distributedNDArray.template getLocalRange< 0 >(); EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), localRange.getEnd() - localRange.getBegin() ); @@ -129,8 +129,6 @@ TYPED_TEST( DistributedNDArrayTest, reset ) EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), 0 ); } -// TODO: swap - // separate function because nvcc does not allow __cuda_callable__ lambdas inside // private or protected methods (which are created by TYPED_TEST macro) template< typename DistributedArray, typename BufferView > @@ -148,7 +146,7 @@ void test_helper_setValue( DistributedArray& array, BufferView& buffer_view ) ParallelFor< DeviceType >::exec( localRange.getBegin(), localRange.getEnd(), kernel ); } -TYPED_TEST( DistributedNDArrayTest, setValue ) +TYPED_TEST( DistributedNDArray_1D_test, setValue ) { using LocalArrayType = typename TestFixture::LocalArrayType; using LocalArrayViewType = typename TestFixture::LocalArrayViewType; @@ -165,7 +163,7 @@ TYPED_TEST( DistributedNDArrayTest, setValue ) EXPECT_EQ( buffer, expected ); } -TYPED_TEST( DistributedNDArrayTest, elementwiseAccess ) +TYPED_TEST( DistributedNDArray_1D_test, elementwiseAccess ) { // using ArrayViewType = typename TestFixture::ArrayViewType; using IndexType = typename TestFixture::IndexType; @@ -198,7 +196,7 @@ TYPED_TEST( 
DistributedNDArrayTest, elementwiseAccess ) } } -TYPED_TEST( DistributedNDArrayTest, copyAssignment ) +TYPED_TEST( DistributedNDArray_1D_test, copyAssignment ) { using DistributedNDArrayType = typename TestFixture::DistributedNDArrayType; @@ -232,7 +230,7 @@ void test_helper_comparisonOperators( DistributedArray& u, DistributedArray& v, ParallelFor< DeviceType >::exec( localRange.getBegin(), localRange.getEnd(), kernel ); } -TYPED_TEST( DistributedNDArrayTest, comparisonOperators ) +TYPED_TEST( DistributedNDArray_1D_test, comparisonOperators ) { using DistributedNDArrayType = typename TestFixture::DistributedNDArrayType; @@ -287,7 +285,7 @@ void test_helper_forAll( DistributedArray& a ) EXPECT_EQ( a.getElement( gi ), 1 ); } -TYPED_TEST( DistributedNDArrayTest, forAll ) +TYPED_TEST( DistributedNDArray_1D_test, forAll ) { test_helper_forAll( this->distributedNDArray ); } @@ -334,7 +332,7 @@ void test_helper_forInternal( DistributedArray& a ) } } -TYPED_TEST( DistributedNDArrayTest, forInternal ) +TYPED_TEST( DistributedNDArray_1D_test, forInternal ) { test_helper_forInternal( this->distributedNDArray ); } @@ -371,7 +369,7 @@ void test_helper_forLocalInternal( DistributedArray& a ) << "gi = " << gi; } -TYPED_TEST( DistributedNDArrayTest, forLocalInternal ) +TYPED_TEST( DistributedNDArray_1D_test, forLocalInternal ) { test_helper_forLocalInternal( this->distributedNDArray ); } @@ -418,7 +416,7 @@ void test_helper_forBoundary( DistributedArray& a ) } } -TYPED_TEST( DistributedNDArrayTest, forBoundary ) +TYPED_TEST( DistributedNDArray_1D_test, forBoundary ) { test_helper_forBoundary( this->distributedNDArray ); } @@ -455,7 +453,7 @@ void test_helper_forLocalBoundary( DistributedArray& a ) << "gi = " << gi; } -TYPED_TEST( DistributedNDArrayTest, forLocalBoundary ) +TYPED_TEST( DistributedNDArray_1D_test, forLocalBoundary ) { test_helper_forLocalBoundary( this->distributedNDArray ); } @@ -492,7 +490,7 @@ void test_helper_forOverlaps( DistributedArray& a ) << "gi = " << gi; 
} -TYPED_TEST( DistributedNDArrayTest, forOverlaps ) +TYPED_TEST( DistributedNDArray_1D_test, forOverlaps ) { test_helper_forOverlaps( this->distributedNDArray ); } diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.cpp b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.cpp new file mode 100644 index 000000000..720641ed9 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.cpp @@ -0,0 +1 @@ +#include "DistributedNDArray_semi1D_test.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.cu b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.cu new file mode 100644 index 000000000..720641ed9 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.cu @@ -0,0 +1 @@ +#include "DistributedNDArray_semi1D_test.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h new file mode 100644 index 000000000..500dbea3d --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h @@ -0,0 +1,543 @@ +/*************************************************************************** + DistributedNDArray_semi1D_test.h - description + ------------------- + begin : Dec 27, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +#ifdef HAVE_GTEST +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace TNL; +using namespace TNL::Containers; + +/* + * Light check of DistributedNDArray. + * + * - Number of processes is not limited. + * - Global size is hardcoded as 97 to force non-uniform distribution. + * - Communication group is hardcoded as AllGroup -- it may be changed as needed. 
+ */ +template< typename DistributedNDArray > +class DistributedNDArray_semi1D_test +: public ::testing::Test +{ +protected: + using ValueType = typename DistributedNDArray::ValueType; + using DeviceType = typename DistributedNDArray::DeviceType; + using CommunicatorType = typename DistributedNDArray::CommunicatorType; + using IndexType = typename DistributedNDArray::IndexType; + using DistributedNDArrayType = DistributedNDArray; + + // TODO: use ndarray + using LocalArrayType = Array< ValueType, DeviceType, IndexType >; + using LocalArrayViewType = ArrayView< ValueType, DeviceType, IndexType >; + + const int globalSize = 97; // prime number to force non-uniform distribution + + const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + + DistributedNDArrayType distributedNDArray; + + const int rank = CommunicatorType::GetRank(group); + const int nproc = CommunicatorType::GetSize(group); + + DistributedNDArray_semi1D_test() + { + using LocalRangeType = typename DistributedNDArray::LocalRangeType; + const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); + distributedNDArray.setSizes( 0, globalSize, globalSize / 2 ); + distributedNDArray.template setDistribution< 1 >( localRange.getBegin(), localRange.getEnd(), group ); + distributedNDArray.allocate(); + + EXPECT_EQ( distributedNDArray.template getLocalRange< 1 >(), localRange ); + EXPECT_EQ( distributedNDArray.getCommunicationGroup(), group ); + } +}; + +// types for which DistributedNDArray_semi1D_test is instantiated +using DistributedNDArrayTypes = ::testing::Types< + DistributedNDArray< NDArray< double, + SizesHolder< int, 9, 0, 0 >, // Q, X, Y, Z + std::index_sequence< 0, 1, 2 >, // permutation - should not matter + std::index_sequence< 0, 1, 2 >, // permutation - should not matter + Devices::Host >, + Communicators::MpiCommunicator > +#ifdef HAVE_CUDA + , + DistributedNDArray< NDArray< double, + SizesHolder< int, 9, 0, 0 >, 
// Q, X, Y, Z + std::index_sequence< 0, 1, 2 >, // permutation - should not matter + std::index_sequence< 0, 1, 2 >, // permutation - should not matter + Devices::Cuda >, + Communicators::NoDistrCommunicator > +#endif +>; + +TYPED_TEST_SUITE( DistributedNDArray_semi1D_test, DistributedNDArrayTypes ); + +TYPED_TEST( DistributedNDArray_semi1D_test, checkSumOfLocalSizes ) +{ + using CommunicatorType = typename TestFixture::CommunicatorType; + + const auto localRange = this->distributedNDArray.template getLocalRange< 1 >(); + const int localSize = localRange.getEnd() - localRange.getBegin(); + int sumOfLocalSizes = 0; + CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); + EXPECT_EQ( sumOfLocalSizes, this->globalSize ); + EXPECT_EQ( this->distributedNDArray.template getSize< 1 >(), this->globalSize ); +} + +TYPED_TEST( DistributedNDArray_semi1D_test, setLike ) +{ + using DistributedNDArrayType = typename TestFixture::DistributedNDArrayType; + + const auto localRange = this->distributedNDArray.template getLocalRange< 1 >(); + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), 9 * (localRange.getEnd() - localRange.getBegin()) * (this->globalSize / 2) ); + DistributedNDArrayType copy; + EXPECT_EQ( copy.getLocalStorageSize(), 0 ); + copy.setLike( this->distributedNDArray ); + EXPECT_EQ( copy.getLocalStorageSize(), 9 * (localRange.getEnd() - localRange.getBegin()) * (this->globalSize / 2) ); +} + +TYPED_TEST( DistributedNDArray_semi1D_test, reset ) +{ + const auto localRange = this->distributedNDArray.template getLocalRange< 1 >(); + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), 9 * (localRange.getEnd() - localRange.getBegin()) * (this->globalSize / 2) ); + this->distributedNDArray.reset(); + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), 0 ); +} + +TYPED_TEST( DistributedNDArray_semi1D_test, elementwiseAccess ) +{ +// using ArrayViewType = typename TestFixture::ArrayViewType; + using IndexType = typename 
TestFixture::IndexType; + + this->distributedNDArray.setValue( 0 ); +// ArrayViewType localArrayView = this->distributedNDArray.getLocalArrayView(); + const auto localRange = this->distributedNDArray.template getLocalRange< 1 >(); + + // check initial value + for( int q = 0; q < 9; q++ ) + for( IndexType gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < this->distributedNDArray.template getSize< 2 >(); j++ ) { +// EXPECT_EQ( localArrayView.getElement( i ), 0 ); + EXPECT_EQ( this->distributedNDArray.getElement( q, gi, j ), 0 ); + } + + // use operator() + if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value ) { + for( int q = 0; q < 9; q++ ) + for( IndexType gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < this->distributedNDArray.template getSize< 2 >(); j++ ) { + this->distributedNDArray( q, gi, j ) = gi + 1; + } + + // check set value + for( int q = 0; q < 9; q++ ) + for( IndexType gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < this->distributedNDArray.template getSize< 2 >(); j++ ) { + EXPECT_EQ( this->distributedNDArray.getElement( q, gi, j ), gi + 1 ); + EXPECT_EQ( this->distributedNDArray( q, gi, j ), gi + 1 ); + } + } +} + +TYPED_TEST( DistributedNDArray_semi1D_test, copyAssignment ) +{ + using DistributedNDArrayType = typename TestFixture::DistributedNDArrayType; + + this->distributedNDArray.setValue( 1 ); + DistributedNDArrayType copy; + copy = this->distributedNDArray; + // no binding, but deep copy +// EXPECT_NE( copy.getLocalArrayView().getData(), this->distributedNDArray.getLocalArrayView().getData() ); +// EXPECT_EQ( copy.getLocalArrayView(), this->distributedNDArray.getLocalArrayView() ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_comparisonOperators( 
DistributedArray& u, DistributedArray& v, DistributedArray& w ) +{ + using DeviceType = typename DistributedArray::DeviceType; + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = u.template getLocalRange< 1 >(); + auto u_view = u.getView(); + auto v_view = v.getView(); + auto w_view = w.getView(); + + auto kernel = [=] __cuda_callable__ ( IndexType q, IndexType gi, IndexType j ) mutable + { + u_view( q, gi, j ) = gi; + v_view( q, gi, j ) = gi; + w_view( q, gi, j ) = 2 * gi; + }; + ParallelFor3D< DeviceType >::exec( (IndexType) 0, localRange.getBegin(), (IndexType) 0, + 9, localRange.getEnd(), u.template getSize< 2 >(), + kernel ); +} + +TYPED_TEST( DistributedNDArray_semi1D_test, comparisonOperators ) +{ + using DistributedNDArrayType = typename TestFixture::DistributedNDArrayType; + + DistributedNDArrayType& u = this->distributedNDArray; + DistributedNDArrayType v, w; + v.setLike( u ); + w.setLike( u ); + + test_helper_comparisonOperators( u, v, w ); + + EXPECT_TRUE( u == u ); + EXPECT_TRUE( u == v ); + EXPECT_TRUE( v == u ); + EXPECT_FALSE( u != v ); + EXPECT_FALSE( v != u ); + EXPECT_TRUE( u != w ); + EXPECT_TRUE( w != u ); + EXPECT_FALSE( u == w ); + EXPECT_FALSE( w == u ); + + v.reset(); + EXPECT_FALSE( u == v ); + u.reset(); + EXPECT_TRUE( u == v ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forAll( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + a.forAll( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; 
j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ); + + a.setValue( 0 ); + a_view.forAll( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ); +} + +TYPED_TEST( DistributedNDArray_semi1D_test, forAll ) +{ + test_helper_forAll( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forInternal( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + a.forInternal( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + { + if( q == 0 || q == 8 || + gi == 0 || gi == a.template getSize< 1 >() - 1 || + j == 0 || j == a.template getSize< 2 >() - 1 ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + else + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + } + + a.setValue( 0 ); + a_view.forInternal( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + { + if( q == 0 || q == 8 || + gi == 0 || gi == a.template getSize< 1 >() - 1 || + j == 0 || j == a.template getSize< 2 >() - 1 ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + else + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + } +} + +TYPED_TEST( DistributedNDArray_semi1D_test, forInternal ) +{ + 
test_helper_forInternal( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forLocalInternal( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + // equivalent to forAll because all overlaps are 0 + a.forLocalInternal( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ); + + a.setValue( 0 ); + // equivalent to forAll because all overlaps are 0 + a_view.forLocalInternal( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ); +} + +TYPED_TEST( DistributedNDArray_semi1D_test, forLocalInternal ) +{ + test_helper_forLocalInternal( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forBoundary( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + a.forBoundary( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < 
localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + { + if( q == 0 || q == 8 || + gi == 0 || gi == a.template getSize< 1 >() - 1 || + j == 0 || j == a.template getSize< 2 >() - 1 ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + else + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + } + + a.setValue( 0 ); + a_view.forBoundary( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + { + if( q == 0 || q == 8 || + gi == 0 || gi == a.template getSize< 1 >() - 1 || + j == 0 || j == a.template getSize< 2 >() - 1 ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + else + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + } +} + +TYPED_TEST( DistributedNDArray_semi1D_test, forBoundary ) +{ + test_helper_forBoundary( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forLocalBoundary( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + // empty set because all overlaps are 0 + a.forLocalBoundary( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ); + + a.setValue( 0 ); + // empty set because all overlaps are 0 + a_view.forLocalBoundary( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( 
int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ); +} + +TYPED_TEST( DistributedNDArray_semi1D_test, forLocalBoundary ) +{ + test_helper_forLocalBoundary( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forOverlaps( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + // empty set because all overlaps are 0 + a.forOverlaps( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ); + + a.setValue( 0 ); + // empty set because all overlaps are 0 + a_view.forOverlaps( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ); +} + +TYPED_TEST( DistributedNDArray_semi1D_test, forOverlaps ) +{ + test_helper_forOverlaps( this->distributedNDArray ); +} + +#endif // HAVE_GTEST + + +#if (defined(HAVE_GTEST) && defined(HAVE_MPI)) +using CommunicatorType = Communicators::MpiCommunicator; + +#include + +class MinimalistBufferedPrinter +: public ::testing::EmptyTestEventListener +{ +private: + std::stringstream sout; + +public: + // Called before a test starts. + virtual void OnTestStart(const ::testing::TestInfo& test_info) + { + sout << test_info.test_case_name() << "." << test_info.name() << " Start." 
<< std::endl; + } + + // Called after a failed assertion or a SUCCEED() invocation. + virtual void OnTestPartResult(const ::testing::TestPartResult& test_part_result) + { + sout << (test_part_result.failed() ? "====Failure=== " : "===Success=== ") + << test_part_result.file_name() << " " + << test_part_result.line_number() <listeners(); + + delete listeners.Release(listeners.default_result_printer()); + listeners.Append(new MinimalistBufferedPrinter); + + Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv); + #endif + return RUN_ALL_TESTS(); +#else + throw GtestMissingError(); +#endif +} -- GitLab From e885aa29086ec7b7213b56df2302d8cffed01154 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sat, 6 Apr 2019 11:47:42 +0200 Subject: [PATCH 18/25] Fixed setSize method in LocalBeginsHolder --- src/TNL/Containers/ndarray/SizesHolder.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/TNL/Containers/ndarray/SizesHolder.h b/src/TNL/Containers/ndarray/SizesHolder.h index ec89a7550..e81bc3e60 100644 --- a/src/TNL/Containers/ndarray/SizesHolder.h +++ b/src/TNL/Containers/ndarray/SizesHolder.h @@ -295,6 +295,15 @@ struct LocalBeginsHolder : public SizesHolder return ConstValue; return SizesHolder::template getSize< level >(); } + + template< std::size_t level > + void setSize( typename SizesHolder::IndexType newSize ) + { + if( SizesHolder::template getStaticSize< level >() == 0 ) + SizesHolder::template setSize< level >( newSize ); + else + TNL_ASSERT_EQ( newSize, ConstValue, "Dynamic size for a static dimension must be equal to the specified ConstValue." 
); + } }; template< typename Index, -- GitLab From 560aba8043e592e21ec4b67db89898bdcf02d271 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Tue, 9 Apr 2019 20:07:41 +0200 Subject: [PATCH 19/25] Copied getSubarrayView method from NDArrayView to NDArray --- src/TNL/Containers/NDArray.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/TNL/Containers/NDArray.h b/src/TNL/Containers/NDArray.h index 89315fc6f..7313e4726 100644 --- a/src/TNL/Containers/NDArray.h +++ b/src/TNL/Containers/NDArray.h @@ -173,6 +173,32 @@ public: return ConstViewType( array.getData(), sizes ); } + template< std::size_t... Dimensions, typename... IndexTypes > + __cuda_callable__ + auto getSubarrayView( IndexTypes&&... indices ) + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + static_assert( 0 < sizeof...(Dimensions) && sizeof...(Dimensions) <= getDimension(), "got wrong number of dimensions" ); + static_assert( __ndarray_impl::all_elements_in_range( 0, Permutation::size(), {Dimensions...} ), + "invalid dimensions" ); +// FIXME: nvcc chokes on the variadic brace-initialization +#ifndef __NVCC__ + static_assert( __ndarray_impl::is_increasing_sequence( {Dimensions...} ), + "specifying permuted dimensions is not supported" ); +#endif + + using Getter = __ndarray_impl::SubarrayGetter< Base, Permutation, Dimensions... >; + using Subpermutation = typename Getter::Subpermutation; + auto& begin = operator()( std::forward< IndexTypes >( indices )... ); + auto subarray_sizes = Getter::filterSizes( sizes, std::forward< IndexTypes >( indices )... ); + auto strides = Getter::getStrides( sizes, std::forward< IndexTypes >( indices )... ); + static_assert( Subpermutation::size() == sizeof...(Dimensions), "Bug - wrong subpermutation length." ); + static_assert( decltype(subarray_sizes)::getDimension() == sizeof...(Dimensions), "Bug - wrong dimension of the new sizes." 
); + static_assert( decltype(strides)::getDimension() == sizeof...(Dimensions), "Bug - wrong dimension of the strides." ); + using SubarrayView = NDArrayView< ValueType, Device, decltype(subarray_sizes), Subpermutation, Base, decltype(strides) >; + return SubarrayView{ &begin, subarray_sizes, strides }; + } + template< typename Device2 = DeviceType, typename Func > void forAll( Func f ) const { -- GitLab From fd4d842900adbc8e400263fefc562dbcb7fcc806 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Tue, 9 Apr 2019 20:07:03 +0200 Subject: [PATCH 20/25] Added templated assignment operators for NDArray and NDArrayView It works for any value, device and index types, but the permutations of both arrays must be the same and both arrays have to be contiguous. --- .../Containers/Algorithms/ArrayOperations.h | 37 + .../Algorithms/ArrayOperationsStatic.hpp | 82 + src/TNL/Containers/NDArray.h | 75 +- src/TNL/Containers/NDArrayView.h | 88 +- .../Containers/ndarray/BoundaryExecutors.h | 90 +- src/TNL/Containers/ndarray/Executors.h | 68 +- src/TNL/Containers/ndarray/SizesHolder.h | 4 + .../Containers/ndarray/SizesHolderHelpers.h | 60 + src/TNL/Containers/ndarray/Subarrays.h | 20 + .../Containers/ndarray/CMakeLists.txt | 15 +- .../Containers/ndarray/NDArrayTest.cpp | 1196 +-------------- .../Containers/ndarray/NDArrayTest.cu | 1 + .../Containers/ndarray/NDArrayTest.h | 1339 +++++++++++++++++ 13 files changed, 1783 insertions(+), 1292 deletions(-) create mode 100644 src/TNL/Containers/Algorithms/ArrayOperationsStatic.hpp create mode 100644 src/UnitTests/Containers/ndarray/NDArrayTest.cu create mode 100644 src/UnitTests/Containers/ndarray/NDArrayTest.h diff --git a/src/TNL/Containers/Algorithms/ArrayOperations.h b/src/TNL/Containers/Algorithms/ArrayOperations.h index 7977b6b72..ca62f5b7e 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperations.h +++ b/src/TNL/Containers/Algorithms/ArrayOperations.h @@ -22,6 +22,42 @@ template< typename DestinationDevice, 
typename SourceDevice = DestinationDevice > struct ArrayOperations; +// TODO: establish the concept of a "void device" for static computations in the whole TNL +template<> +struct ArrayOperations< void > +{ + template< typename Element > + __cuda_callable__ + static void setElement( Element* data, + const Element& value ); + + template< typename Element > + __cuda_callable__ + static Element getElement( const Element* data ); + + template< typename Element, typename Index > + __cuda_callable__ + static void set( Element* data, + const Element& value, + const Index size ); + + template< typename DestinationElement, + typename SourceElement, + typename Index > + __cuda_callable__ + static void copy( DestinationElement* destination, + const SourceElement* source, + const Index size ); + + template< typename Element1, + typename Element2, + typename Index > + __cuda_callable__ + static bool compare( const Element1* destination, + const Element2* source, + const Index size ); +}; + template<> struct ArrayOperations< Devices::Host > { @@ -251,6 +287,7 @@ struct ArrayOperations< Devices::Host, Devices::MIC > } // namespace Containers } // namespace TNL +#include #include #include #include diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsStatic.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsStatic.hpp new file mode 100644 index 000000000..8115d25f4 --- /dev/null +++ b/src/TNL/Containers/Algorithms/ArrayOperationsStatic.hpp @@ -0,0 +1,82 @@ +/*************************************************************************** + ArrayOperationsStatic_impl.h - description + ------------------- + begin : Apr 8, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include + +namespace TNL { +namespace Containers { +namespace Algorithms { + +template< typename Element > +__cuda_callable__ +void +ArrayOperations< void >:: +setElement( Element* data, + const Element& value ) +{ + *data = value; +} + +template< typename Element > +__cuda_callable__ +Element +ArrayOperations< void >:: +getElement( const Element* data ) +{ + return *data; +} + +template< typename Element, typename Index > +__cuda_callable__ +void +ArrayOperations< void >:: +set( Element* data, + const Element& value, + const Index size ) +{ + for( Index i = 0; i < size; i ++ ) + data[ i ] = value; +} + +template< typename DestinationElement, + typename SourceElement, + typename Index > +__cuda_callable__ +void +ArrayOperations< void >:: +copy( DestinationElement* destination, + const SourceElement* source, + const Index size ) +{ + for( Index i = 0; i < size; i ++ ) + destination[ i ] = source[ i ]; +} + +template< typename Element1, + typename Element2, + typename Index > +__cuda_callable__ +bool +ArrayOperations< void >:: +compare( const Element1* destination, + const Element2* source, + const Index size ) +{ + for( Index i = 0; i < size; i++ ) + if( ! 
( destination[ i ] == source[ i ] ) ) + return false; + return true; +} + +} // namespace Algorithms +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/NDArray.h b/src/TNL/Containers/NDArray.h index 7313e4726..63ec5638d 100644 --- a/src/TNL/Containers/NDArray.h +++ b/src/TNL/Containers/NDArray.h @@ -53,6 +53,12 @@ public: static_assert( Permutation::size() == SizesHolder::getDimension(), "invalid permutation" ); + // for compatibility with NDArrayView (which inherits from StrideBase) + static constexpr bool isContiguous() + { + return true; + } + // all methods from NDArrayView NDArrayStorage() = default; @@ -70,6 +76,21 @@ public: NDArrayStorage( NDArrayStorage&& ) = default; NDArrayStorage& operator=( NDArrayStorage&& ) = default; + // Templated copy-assignment + template< typename OtherArray > + NDArrayStorage& operator=( const OtherArray& other ) + { + static_assert( std::is_same< PermutationType, typename OtherArray::PermutationType >::value, + "Arrays must have the same permutation of indices." 
); + // update sizes + __ndarray_impl::SetSizesCopyHelper< SizesHolderType, typename OtherArray::SizesHolderType >::copy( sizes, other.getSizes() ); + // (re)allocate storage if necessary + array.setSize( getStorageSize() ); + // copy data + getView() = other.getConstView(); + return *this; + } + bool operator==( const NDArrayStorage& other ) const { // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray @@ -82,6 +103,14 @@ public: return sizes != other.sizes || array != other.array; } + // accessor to the underlying data + // (should not be used for accessing the elements, intended only for the implementation + // of operator= and functions like cudaHostRegister) + std::add_const_t< ValueType >* getData() const + { + return array.getData(); + } + static constexpr std::size_t getDimension() { return SizesHolder::getDimension(); @@ -330,7 +359,18 @@ class NDArray PermutationHost, PermutationCuda >::type, __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > > -{}; +{ + using Base = NDArrayStorage< Array< Value, Device, Index >, + SizesHolder, + typename std::conditional< std::is_same< Device, Devices::Host >::value, + PermutationHost, + PermutationCuda >::type, + __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > >; + +public: + // inherit all assignment operators + using Base::operator=; +}; template< typename Value, typename SizesHolder, @@ -343,8 +383,17 @@ class StaticNDArray __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > >, void > { + using Base = NDArrayStorage< StaticArray< __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get(), Value >, + SizesHolder, + Permutation, + __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > >, + void >; static_assert( __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get() > 0, "All dimensions of a static array must to be positive." 
); + +public: + // inherit all assignment operators + using Base::operator=; }; template< typename Value, @@ -356,7 +405,14 @@ class StaticMatrix SizesHolder< std::size_t, Rows, Columns >, Permutation > { + using Base = StaticNDArray< Value, + SizesHolder< std::size_t, Rows, Columns >, + Permutation >; + public: + // inherit all assignment operators + using Base::operator=; + static constexpr std::size_t getRows() { return Rows; @@ -388,7 +444,22 @@ class SlicedNDArray SliceInfoHost, SliceInfoCuda >::type > > -{}; +{ + using Base = NDArrayStorage< Array< Value, Device, Index >, + SizesHolder, + typename std::conditional< std::is_same< Device, Devices::Host >::value, + PermutationHost, + PermutationCuda >::type, + __ndarray_impl::SlicedNDArrayBase< + typename std::conditional< std::is_same< Device, Devices::Host >::value, + SliceInfoHost, + SliceInfoCuda >::type > + >; + +public: + // inherit all assignment operators + using Base::operator=; +}; } // namespace Containers } // namespace TNL diff --git a/src/TNL/Containers/NDArrayView.h b/src/TNL/Containers/NDArrayView.h index e367f0a06..7883d30c1 100644 --- a/src/TNL/Containers/NDArrayView.h +++ b/src/TNL/Containers/NDArrayView.h @@ -18,6 +18,7 @@ #include #include #include +#include namespace TNL { namespace Containers { @@ -71,7 +72,24 @@ public: { TNL_ASSERT_EQ( sizes, other.sizes, "The sizes of the array views must be equal, views are not resizable." ); if( getStorageSize() > 0 ) - ArrayOpsHelper< Device >::copy( array, other.array, getStorageSize() ); + Algorithms::ArrayOperations< DeviceType >::copy( array, other.array, getStorageSize() ); + return *this; + } + + // Templated copy-assignment + template< typename OtherView > + NDArrayView& operator=( const OtherView& other ) + { + static_assert( std::is_same< PermutationType, typename OtherView::PermutationType >::value, + "Arrays must have the same permutation of indices." 
); + static_assert( NDArrayView::isContiguous() && OtherView::isContiguous(), + "Non-contiguous array views cannot be assigned." ); + TNL_ASSERT_TRUE( __ndarray_impl::sizesWeakCompare( getSizes(), other.getSizes() ), + "The sizes of the array views must be equal, views are not resizable." ); + if( getStorageSize() > 0 ) { + TNL_ASSERT_TRUE( array, "Attempted to assign to an empty view." ); + Algorithms::ArrayOperations< DeviceType, typename OtherView::DeviceType >::copy( array, other.getData(), getStorageSize() ); + } return *this; } @@ -101,7 +119,7 @@ public: if( sizes != other.sizes ) return false; // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray - return ArrayOpsHelper< Device, Device >::compare( array, other.array, getStorageSize() ); + return Algorithms::ArrayOperations< Device, Device >::compare( array, other.array, getStorageSize() ); } __cuda_callable__ @@ -110,7 +128,7 @@ public: if( sizes != other.sizes ) return true; // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray - return ! ArrayOpsHelper< Device, Device >::compare( array, other.array, getStorageSize() ); + return ! 
Algorithms::ArrayOperations< Device, Device >::compare( array, other.array, getStorageSize() ); } static constexpr std::size_t getDimension() @@ -118,6 +136,14 @@ public: return SizesHolder::getDimension(); } + // accessor to the underlying data + // (should not be used for accessing the elements, intended only for the implementation + // of operator= and functions like cudaHostRegister) + std::add_const_t< ValueType >* getData() const + { + return array; + } + const SizesHolderType& getSizes() const { return sizes; @@ -285,62 +311,6 @@ public: protected: Value* array = nullptr; SizesHolder sizes; - - // TODO: establish the concept of a "void device" for static computations in the whole TNL - - template< typename DestinationDevice, typename SourceDevice = DestinationDevice, typename _unused = void > - struct ArrayOpsHelper - { - template< typename DestinationValue, - typename SourceValue, - typename Index > - static void copy( DestinationValue* destination, - const SourceValue* source, - const Index size ) - { - Algorithms::ArrayOperations< DestinationDevice, SourceDevice >::copy( destination, source, size ); - } - - template< typename Value1, - typename Value2, - typename Index > - static bool compare( const Value1* destination, - const Value2* source, - const Index size ) - { - return Algorithms::ArrayOperations< DestinationDevice, SourceDevice >::compare( destination, source, size ); - } - }; - - template< typename _unused > - struct ArrayOpsHelper< void, void, _unused > - { - template< typename DestinationValue, - typename SourceValue, - typename Index > - __cuda_callable__ - static void copy( DestinationValue* destination, - const SourceValue* source, - const Index size ) - { - for( Index i = 0; i < size; i ++ ) - destination[ i ] = source[ i ]; - } - - template< typename Value1, - typename Value2, - typename Index > - __cuda_callable__ - static bool compare( const Value1* destination, - const Value2* source, - const Index size ) - { - for( Index i = 0; i < 
size; i++ ) - if( ! ( destination[ i ] == source[ i ] ) ) - return false; - return true; - } - }; }; } // namespace Containers diff --git a/src/TNL/Containers/ndarray/BoundaryExecutors.h b/src/TNL/Containers/ndarray/BoundaryExecutors.h index 08970b46a..e4cd93705 100644 --- a/src/TNL/Containers/ndarray/BoundaryExecutors.h +++ b/src/TNL/Containers/ndarray/BoundaryExecutors.h @@ -204,12 +204,13 @@ struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 3 > > static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); - using Index = typename Ends::IndexType; - - auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) - { - call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); - }; + // nvcc does not like nested __cuda_callable__ and normal lambdas... +// using Index = typename Ends::IndexType; +// auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) +// { +// call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); +// }; + Kernel< Device > kernel; const auto begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); const auto begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); @@ -224,13 +225,35 @@ struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 3 > > const auto end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); const auto end2 = ends.template getSize< get< 2 >( Permutation{} ) >(); - ParallelFor3D< Device >::exec( begin2, begin1, begin0, skipBegin2, end1, end0, kernel ); - ParallelFor3D< Device >::exec( skipEnd2, begin1, begin0, end2, end1, end0, kernel ); - ParallelFor3D< Device >::exec( skipBegin2, begin1, begin0, skipEnd2, skipBegin1, end0, kernel ); - ParallelFor3D< Device >::exec( skipBegin2, skipEnd1, begin0, skipEnd2, end1, end0, kernel ); - ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, begin0, skipEnd2, skipEnd1, skipBegin0, kernel ); - ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, skipEnd0, skipEnd2, skipEnd1, 
end0, kernel ); + ParallelFor3D< Device >::exec( begin2, begin1, begin0, skipBegin2, end1, end0, kernel, f ); + ParallelFor3D< Device >::exec( skipEnd2, begin1, begin0, end2, end1, end0, kernel, f ); + ParallelFor3D< Device >::exec( skipBegin2, begin1, begin0, skipEnd2, skipBegin1, end0, kernel, f ); + ParallelFor3D< Device >::exec( skipBegin2, skipEnd1, begin0, skipEnd2, end1, end0, kernel, f ); + ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, begin0, skipEnd2, skipEnd1, skipBegin0, kernel, f ); + ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, skipEnd0, skipEnd2, skipEnd1, end0, kernel, f ); } + + template< typename __Device, typename = void > + struct Kernel + { + template< typename Index, typename Func > + void operator()( Index i2, Index i1, Index i0, Func f ) + { + call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); + }; + }; + + // dummy specialization to avoid a shitpile of nvcc warnings + template< typename __unused > + struct Kernel< Devices::Cuda, __unused > + { + template< typename Index, typename Func > + __cuda_callable__ + void operator()( Index i2, Index i1, Index i0, Func f ) + { + call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); + }; + }; }; template< typename Permutation, @@ -251,12 +274,13 @@ struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 2 > > static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); - using Index = typename Ends::IndexType; - - auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 ) - { - call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); - }; + // nvcc does not like nested __cuda_callable__ and normal lambdas... 
+// using Index = typename Ends::IndexType; +// auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 ) +// { +// call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); +// }; + Kernel< Device > kernel; const auto begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); const auto begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); @@ -267,11 +291,33 @@ struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 2 > > const auto end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); const auto end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); - ParallelFor2D< Device >::exec( begin1, begin0, skipBegin1, end0, kernel ); - ParallelFor2D< Device >::exec( skipEnd1, begin0, end1, end0, kernel ); - ParallelFor2D< Device >::exec( skipBegin1, begin0, skipEnd1, skipBegin0, kernel ); - ParallelFor2D< Device >::exec( skipBegin1, skipEnd0, skipEnd1, end0, kernel ); + ParallelFor2D< Device >::exec( begin1, begin0, skipBegin1, end0, kernel, f ); + ParallelFor2D< Device >::exec( skipEnd1, begin0, end1, end0, kernel, f ); + ParallelFor2D< Device >::exec( skipBegin1, begin0, skipEnd1, skipBegin0, kernel, f ); + ParallelFor2D< Device >::exec( skipBegin1, skipEnd0, skipEnd1, end0, kernel, f ); } + + template< typename __Device, typename = void > + struct Kernel + { + template< typename Index, typename Func > + void operator()( Index i1, Index i0, Func f ) + { + call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); + }; + }; + + // dummy specialization to avoid a shitpile of nvcc warnings + template< typename __unused > + struct Kernel< Devices::Cuda, __unused > + { + template< typename Index, typename Func > + __cuda_callable__ + void operator()( Index i1, Index i0, Func f ) + { + call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); + }; + }; }; template< typename Permutation, diff --git a/src/TNL/Containers/ndarray/Executors.h b/src/TNL/Containers/ndarray/Executors.h index ba37fe345..d09b6ec23 100644 --- 
a/src/TNL/Containers/ndarray/Executors.h +++ b/src/TNL/Containers/ndarray/Executors.h @@ -201,10 +201,12 @@ struct ParallelExecutor< Permutation, Device, IndexTag< 3 > > using Index = typename Ends::IndexType; - auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) - { - call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); - }; + // nvcc does not like nested __cuda_callable__ and normal lambdas... +// auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) +// { +// call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); +// }; + Kernel< Device > kernel; const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); @@ -212,8 +214,30 @@ struct ParallelExecutor< Permutation, Device, IndexTag< 3 > > const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >(); - ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); + ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel, f ); } + + template< typename __Device, typename = void > + struct Kernel + { + template< typename Index, typename Func > + void operator()( Index i2, Index i1, Index i0, Func f ) + { + call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); + }; + }; + + // dummy specialization to avoid a shitpile of nvcc warnings + template< typename __unused > + struct Kernel< Devices::Cuda, __unused > + { + template< typename Index, typename Func > + __cuda_callable__ + void operator()( Index i2, Index i1, Index i0, Func f ) + { + call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); + }; + }; }; template< typename Permutation, @@ -230,17 +254,41 @@ struct ParallelExecutor< Permutation, Device, IndexTag< 2 > > using Index = typename Ends::IndexType; - 
auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 ) - { - call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); - }; + // nvcc does not like nested __cuda_callable__ and normal lambdas... +// auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 ) +// { +// call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); +// }; + Kernel< Device > kernel; const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); - ParallelFor2D< Device >::exec( begin1, begin0, end1, end0, kernel ); + ParallelFor2D< Device >::exec( begin1, begin0, end1, end0, kernel, f ); } + + template< typename __Device, typename = void > + struct Kernel + { + template< typename Index, typename Func > + void operator()( Index i1, Index i0, Func f ) + { + call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); + }; + }; + + // dummy specialization to avoid a shitpile of nvcc warnings + template< typename __unused > + struct Kernel< Devices::Cuda, __unused > + { + template< typename Index, typename Func > + __cuda_callable__ + void operator()( Index i1, Index i0, Func f ) + { + call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); + }; + }; }; template< typename Permutation, diff --git a/src/TNL/Containers/ndarray/SizesHolder.h b/src/TNL/Containers/ndarray/SizesHolder.h index e81bc3e60..7763b141f 100644 --- a/src/TNL/Containers/ndarray/SizesHolder.h +++ b/src/TNL/Containers/ndarray/SizesHolder.h @@ -35,6 +35,7 @@ public: return size; } + __cuda_callable__ void setSize( LevelTag, Index newSize ) { TNL_ASSERT_EQ( newSize, 0, "Dynamic size for a static dimension must be 0." 
); @@ -58,6 +59,7 @@ public: return size; } + __cuda_callable__ void setSize( LevelTag, Index size ) { this->size = size; @@ -179,6 +181,7 @@ public: } template< std::size_t level > + __cuda_callable__ void setSize( Index size ) { static_assert( level < sizeof...(sizes), "Invalid level passed to setSize()." ); @@ -297,6 +300,7 @@ struct LocalBeginsHolder : public SizesHolder } template< std::size_t level > + __cuda_callable__ void setSize( typename SizesHolder::IndexType newSize ) { if( SizesHolder::template getStaticSize< level >() == 0 ) diff --git a/src/TNL/Containers/ndarray/SizesHolderHelpers.h b/src/TNL/Containers/ndarray/SizesHolderHelpers.h index d98bb3bdc..a18f7c6eb 100644 --- a/src/TNL/Containers/ndarray/SizesHolderHelpers.h +++ b/src/TNL/Containers/ndarray/SizesHolderHelpers.h @@ -15,6 +15,7 @@ #include #include +#include #include namespace TNL { @@ -163,6 +164,65 @@ void assertIndicesInRange( const SizesHolder1& begins, const SizesHolder2& ends, } +// helper for the assignment operator in NDArray +template< typename TargetHolder, + typename SourceHolder, + std::size_t level = TargetHolder::getDimension() - 1 > +struct SetSizesCopyHelper +{ + static void copy( TargetHolder& target, + const SourceHolder& source ) + { + if( target.template getStaticSize< level >() == 0 ) { + target.template setSize< level >( source.template getSize< level >() ); + SetSizesCopyHelper< TargetHolder, SourceHolder, level - 1 >::copy( target, source ); + } + else if( target.template getStaticSize< level >() != source.template getSize< level >() ) + throw std::logic_error( "Cannot copy sizes due to inconsistent underlying types (static sizes don't match)." 
); + } +}; + +template< typename TargetHolder, + typename SourceHolder > +struct SetSizesCopyHelper< TargetHolder, SourceHolder, 0 > +{ + static void copy( TargetHolder& target, + const SourceHolder& source ) + { + if( target.template getStaticSize< 0 >() == 0 ) + target.template setSize< 0 >( source.template getSize< 0 >() ); + else if( target.template getStaticSize< 0 >() != source.template getSize< 0 >() ) + throw std::logic_error( "Cannot copy sizes due to inconsistent underlying types (static sizes don't match)." ); + } +}; + + +template< std::size_t level > +struct WeakCompareHelper +{ + template< typename SizesHolder1, + typename SizesHolder2 > + __cuda_callable__ + static void exec( const SizesHolder1& sizes1, const SizesHolder2& sizes2, bool& result ) + { + result &= sizes1.template getSize< level >() == sizes2.template getSize< level >(); + } +}; + +// helper for the assignment operator in NDArrayView +template< typename SizesHolder1, + typename SizesHolder2 > +__cuda_callable__ +bool sizesWeakCompare( const SizesHolder1& sizes1, const SizesHolder2& sizes2 ) +{ + static_assert( SizesHolder1::getDimension() == SizesHolder2::getDimension(), + "Cannot compare sizes of different dimensions." 
); + bool result = true; + TemplateStaticFor< std::size_t, 0, SizesHolder1::getDimension(), WeakCompareHelper >::exec( sizes1, sizes2, result ); + return result; +} + + // helper for the forInternal and forBoundary methods (NDArray and DistributedNDArray) template< std::size_t ConstValue, typename TargetHolder, diff --git a/src/TNL/Containers/ndarray/Subarrays.h b/src/TNL/Containers/ndarray/Subarrays.h index 5fc9554e4..5668ef594 100644 --- a/src/TNL/Containers/ndarray/Subarrays.h +++ b/src/TNL/Containers/ndarray/Subarrays.h @@ -74,6 +74,7 @@ private: { template< typename NewSizes, typename OldSizes > + __cuda_callable__ static void setSizes( NewSizes& newSizes, const OldSizes& oldSizes ) { @@ -88,6 +89,7 @@ private: { template< typename NewSizes, typename OldSizes > + __cuda_callable__ static void setSizes( NewSizes& newSizes, const OldSizes& oldSizes ) { @@ -129,6 +131,7 @@ public: ); template< typename... IndexTypes > + __cuda_callable__ static Sizes filterSizes( const SizesHolder< Index, sizes... >& oldSizes, IndexTypes&&... indices ) { Sizes newSizes; @@ -153,7 +156,13 @@ public: template< typename Index, std::size_t Dimension > struct DummyStrideBase { + static constexpr bool isContiguous() + { + return true; + } + template< std::size_t level > + __cuda_callable__ constexpr Index getStride( Index i = 0 ) const { return 1; @@ -170,6 +179,12 @@ class StridesHolder public: using BaseType::getDimension; + static constexpr bool isContiguous() + { + // a priori not contiguous (otherwise DummyStrideBase would be used) + return false; + } + template< std::size_t level > static constexpr std::size_t getStaticStride( Index i = 0 ) { @@ -184,6 +199,7 @@ public: } template< std::size_t level > + __cuda_callable__ void setStride( Index size ) { BaseType::template setSize< level >( size ); @@ -276,6 +292,7 @@ class SubarrayGetter< NDArrayBase< SliceInfo >, Permutation, Dimensions... 
> struct StrideSetterHelper { template< typename StridesHolder, typename SizesHolder > + __cuda_callable__ static void setStrides( StridesHolder& strides, const SizesHolder& sizes ) { static constexpr std::size_t dim = get_from_pack< level >( Dimensions... ); @@ -289,6 +306,7 @@ class SubarrayGetter< NDArrayBase< SliceInfo >, Permutation, Dimensions... > struct StrideSetterHelper< sizeof...(Dimensions) - 1, _unused > { template< typename StridesHolder, typename SizesHolder > + __cuda_callable__ static void setStrides( StridesHolder& strides, const SizesHolder& sizes ) { static constexpr std::size_t level = sizeof...(Dimensions) - 1; @@ -302,6 +320,7 @@ public: using Subpermutation = typename SubpermutationGetter< std::index_sequence< Dimensions... >, Permutation >::Subpermutation; template< typename SizesHolder, typename... IndexTypes > + __cuda_callable__ static auto filterSizes( const SizesHolder& sizes, IndexTypes&&... indices ) { using Filter = SizesFilter< std::index_sequence< Dimensions... >, SizesHolder >; @@ -309,6 +328,7 @@ public: } template< typename SizesHolder, typename... IndexTypes > + __cuda_callable__ static auto getStrides( const SizesHolder& sizes, IndexTypes&&... 
indices ) { using Strides = StridesHolder< typename SizesHolder::IndexType, diff --git a/src/UnitTests/Containers/ndarray/CMakeLists.txt b/src/UnitTests/Containers/ndarray/CMakeLists.txt index 990012865..2f87d64a9 100644 --- a/src/UnitTests/Containers/ndarray/CMakeLists.txt +++ b/src/UnitTests/Containers/ndarray/CMakeLists.txt @@ -1,7 +1,14 @@ -add_executable( NDArrayTest NDArrayTest.cpp ) -target_compile_options( NDArrayTest PRIVATE ${CXX_TESTS_FLAGS} ) -target_link_libraries( NDArrayTest ${GTEST_BOTH_LIBRARIES} ) -add_test( NDArrayTest ${EXECUTABLE_OUTPUT_PATH}/NDArrayTest${CMAKE_EXECUTABLE_SUFFIX} ) +if( BUILD_CUDA ) + cuda_add_executable( NDArrayTest NDArrayTest.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + target_link_libraries( NDArrayTest ${GTEST_BOTH_LIBRARIES} ) + add_test( NDArrayTest ${EXECUTABLE_OUTPUT_PATH}/NDArrayTest${CMAKE_EXECUTABLE_SUFFIX} ) +else() + add_executable( NDArrayTest NDArrayTest.cpp ) + target_compile_options( NDArrayTest PRIVATE ${CXX_TESTS_FLAGS} ) + target_link_libraries( NDArrayTest ${GTEST_BOTH_LIBRARIES} ) + add_test( NDArrayTest ${EXECUTABLE_OUTPUT_PATH}/NDArrayTest${CMAKE_EXECUTABLE_SUFFIX} ) +endif() add_executable( NDSubarrayTest NDSubarrayTest.cpp ) target_compile_options( NDSubarrayTest PRIVATE ${CXX_TESTS_FLAGS} ) diff --git a/src/UnitTests/Containers/ndarray/NDArrayTest.cpp b/src/UnitTests/Containers/ndarray/NDArrayTest.cpp index 5790a80cf..d0e922218 100644 --- a/src/UnitTests/Containers/ndarray/NDArrayTest.cpp +++ b/src/UnitTests/Containers/ndarray/NDArrayTest.cpp @@ -1,1195 +1 @@ -#include "gtest/gtest.h" - -#include - -using namespace TNL::Containers; -using std::index_sequence; - -template< typename Array > -void expect_identity( const Array& a ) -{ - Array identity; - identity.setLike( a ); - int last = 0; - for( int i = 0; i < identity.getSize(); i++ ) { - // skip negative/invalid entries due to alignment - if( a[ i ] < 0 ) - identity[ i ] = a[ i ]; - else - identity[ i ] = last++; - } - EXPECT_EQ( a, identity ); -} - -TEST( 
NDArrayTest, setLike ) -{ - int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2; - NDArray< int, - SizesHolder< int, 0, 0, 0, 0, 0, 0 >, - index_sequence< 5, 3, 4, 2, 0, 1 > > a; - a.setSizes( I, J, K, L, M, N ); - - decltype(a) b; - EXPECT_EQ( b.template getSize< 0 >(), 0 ); - EXPECT_EQ( b.template getSize< 1 >(), 0 ); - EXPECT_EQ( b.template getSize< 2 >(), 0 ); - EXPECT_EQ( b.template getSize< 3 >(), 0 ); - EXPECT_EQ( b.template getSize< 4 >(), 0 ); - EXPECT_EQ( b.template getSize< 5 >(), 0 ); - b.setLike( a ); - EXPECT_EQ( b.template getSize< 0 >(), I ); - EXPECT_EQ( b.template getSize< 1 >(), J ); - EXPECT_EQ( b.template getSize< 2 >(), K ); - EXPECT_EQ( b.template getSize< 3 >(), L ); - EXPECT_EQ( b.template getSize< 4 >(), M ); - EXPECT_EQ( b.template getSize< 5 >(), N ); -} - -TEST( NDArrayTest, reset ) -{ - int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2; - NDArray< int, - SizesHolder< int, 0, 0, 0, 0, 0, 0 >, - index_sequence< 5, 3, 4, 2, 0, 1 > > a; - a.setSizes( I, J, K, L, M, N ); - EXPECT_EQ( a.template getSize< 0 >(), I ); - EXPECT_EQ( a.template getSize< 1 >(), J ); - EXPECT_EQ( a.template getSize< 2 >(), K ); - EXPECT_EQ( a.template getSize< 3 >(), L ); - EXPECT_EQ( a.template getSize< 4 >(), M ); - EXPECT_EQ( a.template getSize< 5 >(), N ); - - a.reset(); - EXPECT_EQ( a.template getSize< 0 >(), 0 ); - EXPECT_EQ( a.template getSize< 1 >(), 0 ); - EXPECT_EQ( a.template getSize< 2 >(), 0 ); - EXPECT_EQ( a.template getSize< 3 >(), 0 ); - EXPECT_EQ( a.template getSize< 4 >(), 0 ); - EXPECT_EQ( a.template getSize< 5 >(), 0 ); -} - -TEST( NDArrayTest, Static_1D ) -{ - constexpr int I = 3; - NDArray< int, SizesHolder< int, I > > a; - a.setSizes( 0 ); - - int v = 0; - for( int i = 0; i < I; i++ ) { - a( i ) = v++; - EXPECT_EQ( a[ i ], a( i ) ); - } - - expect_identity( a.getStorageArray() ); -} - -TEST( NDArrayTest, Static_2D_Identity ) -{ - constexpr int I = 3, J = 5; - NDArray< int, SizesHolder< int, I, J > > a; - a.setSizes( 0, 0 ); - - int v = 0; - for( int i = 
0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - a( i, j ) = v++; - - expect_identity( a.getStorageArray() ); -} - -TEST( NDArrayTest, Static_2D_Permuted ) -{ - constexpr int I = 3, J = 5; - NDArray< int, - SizesHolder< int, I, J >, - index_sequence< 1, 0 > > a; - a.setSizes( 0, 0 ); - - int v = 0; - for( int j = 0; j < J; j++ ) - for( int i = 0; i < I; i++ ) - a( i, j ) = v++; - - expect_identity( a.getStorageArray() ); -} - -TEST( NDArrayTest, Dynamic_6D ) -{ - int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2; - NDArray< int, - SizesHolder< int, 0, 0, 0, 0, 0, 0 >, - index_sequence< 5, 3, 4, 2, 0, 1 > > a; - a.setSizes( I, J, K, L, M, N ); - - // initialize entries invalid due to alignment to -1 - a.getStorageArray().setValue( -1 ); - - int v = 0; - for( int n = 0; n < N; n++ ) - for( int l = 0; l < L; l++ ) - for( int m = 0; m < M; m++ ) - for( int k = 0; k < K; k++ ) - for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - a( i, j, k, l, m, n ) = v++; - - expect_identity( a.getStorageArray() ); -} - -TEST( NDArrayTest, CopySemantics ) -{ - constexpr int I = 3, J = 4; - NDArray< int, SizesHolder< int, I, J > > a, b, c; - a.setSizes( 0, 0 ); - - int v = 0; - for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - a( i, j ) = v++; - - expect_identity( a.getStorageArray() ); - - b = a; - EXPECT_EQ( a, b ); - - auto a_view = a.getView(); - auto b_view = b.getView(); - EXPECT_EQ( a_view, b_view ); - EXPECT_EQ( a_view.getView(), b_view ); - EXPECT_EQ( a_view.getConstView(), b_view.getConstView() ); - EXPECT_EQ( a.getConstView(), b.getConstView() ); - EXPECT_EQ( a.getConstView(), b_view.getConstView() ); - - c.setSizes( 0, 0 ); - auto c_view = c.getView(); - c_view = b_view; - EXPECT_EQ( a_view, c_view ); - EXPECT_EQ( a_view.getView(), c_view ); - EXPECT_EQ( a_view.getConstView(), c_view.getConstView() ); - EXPECT_EQ( a.getConstView(), c.getConstView() ); - EXPECT_EQ( a.getConstView(), c_view.getConstView() ); -} - -TEST( NDArrayTest, SizesHolderPrinter ) -{ - 
SizesHolder< int, 0, 1, 2 > holder; - holder.setSize< 0 >( 3 ); - - std::stringstream str; - str << holder; - EXPECT_EQ( str.str(), "SizesHolder< 0, 1, 2 >( 3, 1, 2 )" ); -} - -TEST( NDArrayTest, forAll_dynamic_1D ) -{ - int I = 2; - NDArray< int, - SizesHolder< int, 0 >, - index_sequence< 0 > > a; - a.setSizes( I ); - a.setValue( 0 ); - - auto setter = [&] ( int i ) - { - a( i ) += 1; - }; - - a.forAll( setter ); - - for( int i = 0; i < I; i++ ) - EXPECT_EQ( a( i ), 1 ); -} - -TEST( NDArrayTest, forAll_dynamic_2D ) -{ - int I = 2, J = 3; - NDArray< int, - SizesHolder< int, 0, 0 >, - index_sequence< 1, 0 > > a; - a.setSizes( I, J ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j ) - { - a( i, j ) += 1; - }; - - a.forAll( setter ); - - for( int j = 0; j < J; j++ ) - for( int i = 0; i < I; i++ ) - EXPECT_EQ( a( i, j ), 1 ); -} - -TEST( NDArrayTest, forAll_dynamic_3D ) -{ - int I = 2, J = 3, K = 4; - NDArray< int, - SizesHolder< int, 0, 0, 0 >, - index_sequence< 2, 0, 1 > > a; - a.setSizes( I, J, K ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j, int k ) - { - a( i, j, k ) += 1; - }; - - a.forAll( setter ); - - for( int k = 0; k < K; k++ ) - for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - EXPECT_EQ( a( i, j, k ), 1 ); -} - -TEST( NDArrayTest, forAll_dynamic_4D ) -{ - int I = 2, J = 3, K = 4, L = 5; - NDArray< int, - SizesHolder< int, 0, 0, 0, 0 >, - index_sequence< 3, 2, 0, 1 > > a; - a.setSizes( I, J, K, L ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j, int k, int l ) - { - a( i, j, k, l ) += 1; - }; - - a.forAll( setter ); - - for( int l = 0; l < L; l++ ) - for( int k = 0; k < K; k++ ) - for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - EXPECT_EQ( a( i, j, k, l ), 1 ); -} - -TEST( NDArrayTest, forAll_dynamic_5D ) -{ - int I = 2, J = 3, K = 4, L = 5, M = 6; - NDArray< int, - SizesHolder< int, 0, 0, 0, 0, 0 >, - index_sequence< 3, 4, 2, 0, 1 > > a; - a.setSizes( I, J, K, L, M ); - a.setValue( 0 ); - - auto 
setter = [&] ( int i, int j, int k, int l, int m ) - { - a( i, j, k, l, m ) += 1; - }; - - a.forAll( setter ); - - for( int l = 0; l < L; l++ ) - for( int m = 0; m < M; m++ ) - for( int k = 0; k < K; k++ ) - for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - EXPECT_EQ( a( i, j, k, l, m ), 1 ); -} - -TEST( NDArrayTest, forAll_dynamic_6D ) -{ - int I = 2, J = 3, K = 4, L = 5, M = 6, N = 7; - NDArray< int, - SizesHolder< int, 0, 0, 0, 0, 0, 0 >, - index_sequence< 5, 3, 4, 2, 0, 1 > > a; - a.setSizes( I, J, K, L, M, N ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j, int k, int l, int m, int n ) - { - a( i, j, k, l, m, n ) += 1; - }; - - a.forAll( setter ); - - for( int n = 0; n < N; n++ ) - for( int l = 0; l < L; l++ ) - for( int m = 0; m < M; m++ ) - for( int k = 0; k < K; k++ ) - for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - EXPECT_EQ( a( i, j, k, l, m, n ), 1 ); -} - -TEST( NDArrayTest, forAll_static_1D ) -{ - constexpr int I = 3; - StaticNDArray< int, SizesHolder< int, I > > a; -// a.setSizes( 0 ); - a.setValue( 0 ); - - auto setter = [&] ( int i ) - { - a( i ) += 1; - }; - - a.forAll( setter ); - - for( int i = 0; i < I; i++ ) - EXPECT_EQ( a( i ), 1 ); -} - -TEST( NDArrayTest, forAll_static_2D ) -{ - constexpr int I = 3, J = 4; - StaticNDArray< int, SizesHolder< int, I, J > > a; -// a.setSizes( 0, 0 ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j ) - { - a( i, j ) += 1; - }; - - a.forAll( setter ); - - for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - EXPECT_EQ( a( i, j ), 1 ); -} - -TEST( NDArrayTest, forAll_static_3D ) -{ - constexpr int I = 3, J = 4, K = 5; - StaticNDArray< int, SizesHolder< int, I, J, K > > a; -// a.setSizes( 0, 0, 0 ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j, int k ) - { - a( i, j, k ) += 1; - }; - - a.forAll( setter ); - - for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - for( int k = 0; k < K; k++ ) - EXPECT_EQ( a( i, j, k ), 1 ); -} - -TEST( 
NDArrayTest, forAll_static_4D ) -{ - constexpr int I = 3, J = 4, K = 5, L = 6; - StaticNDArray< int, SizesHolder< int, I, J, K, L > > a; -// a.setSizes( 0, 0, 0, 0 ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j, int k, int l ) - { - a( i, j, k, l ) += 1; - }; - - a.forAll( setter ); - - for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - for( int k = 0; k < K; k++ ) - for( int l = 0; l < L; l++ ) - EXPECT_EQ( a( i, j, k, l ), 1 ); -} - -TEST( NDArrayTest, forAll_static_5D ) -{ - constexpr int I = 3, J = 4, K = 5, L = 6, M = 7; - StaticNDArray< int, SizesHolder< int, I, J, K, L, M > > a; -// a.setSizes( 0, 0, 0, 0, 0 ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j, int k, int l, int m ) - { - a( i, j, k, l, m ) += 1; - }; - - a.forAll( setter ); - - for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - for( int k = 0; k < K; k++ ) - for( int l = 0; l < L; l++ ) - for( int m = 0; m < M; m++ ) - EXPECT_EQ( a( i, j, k, l, m ), 1 ); -} - -TEST( NDArrayTest, forAll_static_6D ) -{ - constexpr int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; - StaticNDArray< int, SizesHolder< int, I, J, K, L, M, N > > a; -// a.setSizes( 0, 0, 0, 0, 0, 0 ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j, int k, int l, int m, int n ) - { - a( i, j, k, l, m, n ) += 1; - }; - - a.forAll( setter ); - - for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - for( int k = 0; k < K; k++ ) - for( int l = 0; l < L; l++ ) - for( int m = 0; m < M; m++ ) - for( int n = 0; n < N; n++ ) - EXPECT_EQ( a( i, j, k, l, m, n ), 1 ); -} - -TEST( NDArrayTest, forInternal_dynamic_1D ) -{ - int I = 3; - NDArray< int, - SizesHolder< int, 0 >, - index_sequence< 0 > > a; - a.setSizes( I ); - a.setValue( 0 ); - - auto setter = [&] ( int i ) - { - a( i ) += 1; - }; - - a.forInternal( setter ); - - for( int i = 0; i < I; i++ ) - { - if( i == 0 || i == I - 1 ) - EXPECT_EQ( a( i ), 0 ) - << "i = " << i; - else - EXPECT_EQ( a( i ), 1 ) - << "i = " << i; - } -} - -TEST( 
NDArrayTest, forInternal_dynamic_2D ) -{ - int I = 3, J = 4; - NDArray< int, - SizesHolder< int, 0, 0 >, - index_sequence< 1, 0 > > a; - a.setSizes( I, J ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j ) - { - a( i, j ) += 1; - }; - - a.forInternal( setter ); - - for( int j = 0; j < J; j++ ) - for( int i = 0; i < I; i++ ) - { - if( i == 0 || i == I - 1 || - j == 0 || j == J - 1 ) - EXPECT_EQ( a( i, j ), 0 ) - << "i = " << i << ", j = " << j; - else - EXPECT_EQ( a( i, j ), 1 ) - << "i = " << i << ", j = " << j; - } -} - -TEST( NDArrayTest, forInternal_dynamic_3D ) -{ - int I = 3, J = 4, K = 5; - NDArray< int, - SizesHolder< int, 0, 0, 0 >, - index_sequence< 2, 0, 1 > > a; - a.setSizes( I, J, K ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j, int k ) - { - a( i, j, k ) += 1; - }; - - a.forInternal( setter ); - - for( int k = 0; k < K; k++ ) - for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - { - if( i == 0 || i == I - 1 || - j == 0 || j == J - 1 || - k == 0 || k == K - 1 ) - EXPECT_EQ( a( i, j, k ), 0 ) - << "i = " << i << ", j = " << j << ", k = " << k; - else - EXPECT_EQ( a( i, j, k ), 1 ) - << "i = " << i << ", j = " << j << ", k = " << k; - } -} - -TEST( NDArrayTest, forInternal_dynamic_4D ) -{ - int I = 3, J = 4, K = 5, L = 6; - NDArray< int, - SizesHolder< int, 0, 0, 0, 0 >, - index_sequence< 3, 2, 0, 1 > > a; - a.setSizes( I, J, K, L ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j, int k, int l ) - { - a( i, j, k, l ) += 1; - }; - - a.forInternal( setter ); - - for( int l = 0; l < L; l++ ) - for( int k = 0; k < K; k++ ) - for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - { - if( i == 0 || i == I - 1 || - j == 0 || j == J - 1 || - k == 0 || k == K - 1 || - l == 0 || l == L - 1 ) - EXPECT_EQ( a( i, j, k, l ), 0 ) - << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; - else - EXPECT_EQ( a( i, j, k, l ), 1 ) - << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; - } -} - -TEST( 
NDArrayTest, forInternal_dynamic_5D ) -{ - int I = 3, J = 4, K = 5, L = 6, M = 7; - NDArray< int, - SizesHolder< int, 0, 0, 0, 0, 0 >, - index_sequence< 3, 4, 2, 0, 1 > > a; - a.setSizes( I, J, K, L, M ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j, int k, int l, int m ) - { - a( i, j, k, l, m ) += 1; - }; - - a.forInternal( setter ); - - for( int l = 0; l < L; l++ ) - for( int m = 0; m < M; m++ ) - for( int k = 0; k < K; k++ ) - for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - { - if( i == 0 || i == I - 1 || - j == 0 || j == J - 1 || - k == 0 || k == K - 1 || - l == 0 || l == L - 1 || - m == 0 || m == M - 1 ) - EXPECT_EQ( a( i, j, k, l, m ), 0 ) - << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; - else - EXPECT_EQ( a( i, j, k, l, m ), 1 ) - << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; - } -} - -TEST( NDArrayTest, forInternal_dynamic_6D ) -{ - int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; - NDArray< int, - SizesHolder< int, 0, 0, 0, 0, 0, 0 >, - index_sequence< 5, 3, 4, 2, 0, 1 > > a; - a.setSizes( I, J, K, L, M, N ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j, int k, int l, int m, int n ) - { - a( i, j, k, l, m, n ) += 1; - }; - - a.forInternal( setter ); - - for( int n = 0; n < N; n++ ) - for( int l = 0; l < L; l++ ) - for( int m = 0; m < M; m++ ) - for( int k = 0; k < K; k++ ) - for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - { - if( i == 0 || i == I - 1 || - j == 0 || j == J - 1 || - k == 0 || k == K - 1 || - l == 0 || l == L - 1 || - m == 0 || m == M - 1 || - n == 0 || n == N - 1 ) - EXPECT_EQ( a( i, j, k, l, m, n ), 0 ) - << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; - else - EXPECT_EQ( a( i, j, k, l, m, n ), 1 ) - << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; - } -} - -TEST( NDArrayTest, forInternal_static_1D ) -{ - constexpr 
int I = 3; - StaticNDArray< int, SizesHolder< int, I > > a; -// a.setSizes( 0 ); - a.setValue( 0 ); - - auto setter = [&] ( int i ) - { - a( i ) += 1; - }; - - a.forInternal( setter ); - - for( int i = 0; i < I; i++ ) - { - if( i == 0 || i == I - 1 ) - EXPECT_EQ( a( i ), 0 ) - << "i = " << i; - else - EXPECT_EQ( a( i ), 1 ) - << "i = " << i; - } -} - -TEST( NDArrayTest, forInternal_static_2D ) -{ - constexpr int I = 3, J = 4; - StaticNDArray< int, SizesHolder< int, I, J > > a; -// a.setSizes( 0, 0 ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j ) - { - a( i, j ) += 1; - }; - - a.forInternal( setter ); - - for( int j = 0; j < J; j++ ) - for( int i = 0; i < I; i++ ) - { - if( i == 0 || i == I - 1 || - j == 0 || j == J - 1 ) - EXPECT_EQ( a( i, j ), 0 ) - << "i = " << i << ", j = " << j; - else - EXPECT_EQ( a( i, j ), 1 ) - << "i = " << i << ", j = " << j; - } -} - -TEST( NDArrayTest, forInternal_static_3D ) -{ - constexpr int I = 3, J = 4, K = 5; - StaticNDArray< int, SizesHolder< int, I, J, K > > a; -// a.setSizes( 0, 0, 0 ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j, int k ) - { - a( i, j, k ) += 1; - }; - - a.forInternal( setter ); - - for( int k = 0; k < K; k++ ) - for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - { - if( i == 0 || i == I - 1 || - j == 0 || j == J - 1 || - k == 0 || k == K - 1 ) - EXPECT_EQ( a( i, j, k ), 0 ) - << "i = " << i << ", j = " << j << ", k = " << k; - else - EXPECT_EQ( a( i, j, k ), 1 ) - << "i = " << i << ", j = " << j << ", k = " << k; - } -} - -TEST( NDArrayTest, forInternal_static_4D ) -{ - constexpr int I = 3, J = 4, K = 5, L = 6; - StaticNDArray< int, SizesHolder< int, I, J, K, L > > a; -// a.setSizes( 0, 0, 0, 0 ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j, int k, int l ) - { - a( i, j, k, l ) += 1; - }; - - a.forInternal( setter ); - - for( int l = 0; l < L; l++ ) - for( int k = 0; k < K; k++ ) - for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - { - if( i == 0 || i 
== I - 1 || - j == 0 || j == J - 1 || - k == 0 || k == K - 1 || - l == 0 || l == L - 1 ) - EXPECT_EQ( a( i, j, k, l ), 0 ) - << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; - else - EXPECT_EQ( a( i, j, k, l ), 1 ) - << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; - } -} - -TEST( NDArrayTest, forInternal_static_5D ) -{ - constexpr int I = 3, J = 4, K = 5, L = 6, M = 7; - StaticNDArray< int, SizesHolder< int, I, J, K, L, M > > a; -// a.setSizes( 0, 0, 0, 0, 0 ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j, int k, int l, int m ) - { - a( i, j, k, l, m ) += 1; - }; - - a.forInternal( setter ); - - for( int l = 0; l < L; l++ ) - for( int m = 0; m < M; m++ ) - for( int k = 0; k < K; k++ ) - for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - { - if( i == 0 || i == I - 1 || - j == 0 || j == J - 1 || - k == 0 || k == K - 1 || - l == 0 || l == L - 1 || - m == 0 || m == M - 1 ) - EXPECT_EQ( a( i, j, k, l, m ), 0 ) - << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; - else - EXPECT_EQ( a( i, j, k, l, m ), 1 ) - << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; - } -} - -TEST( NDArrayTest, forInternal_static_6D ) -{ - constexpr int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; - StaticNDArray< int, SizesHolder< int, I, J, K, L, M, N > > a; -// a.setSizes( 0, 0, 0, 0, 0, 0 ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j, int k, int l, int m, int n ) - { - a( i, j, k, l, m, n ) += 1; - }; - - a.forInternal( setter ); - - for( int n = 0; n < N; n++ ) - for( int l = 0; l < L; l++ ) - for( int m = 0; m < M; m++ ) - for( int k = 0; k < K; k++ ) - for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - { - if( i == 0 || i == I - 1 || - j == 0 || j == J - 1 || - k == 0 || k == K - 1 || - l == 0 || l == L - 1 || - m == 0 || m == M - 1 || - n == 0 || n == N - 1 ) - EXPECT_EQ( a( i, j, k, l, m, n ), 0 ) - << "i = " << i << ", j = " << j << ", k = " 
<< k << ", l = " << l << ", m = " << m << ", n = " << n; - else - EXPECT_EQ( a( i, j, k, l, m, n ), 1 ) - << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; - } -} - -TEST( NDArrayTest, forBoundary_dynamic_1D ) -{ - int I = 3; - NDArray< int, - SizesHolder< int, 0 >, - index_sequence< 0 > > a; - a.setSizes( I ); - a.setValue( 0 ); - - auto setter = [&] ( int i ) - { - a( i ) += 1; - }; - - a.forBoundary( setter ); - - for( int i = 0; i < I; i++ ) - { - if( i == 0 || i == I - 1 ) - EXPECT_EQ( a( i ), 1 ) - << "i = " << i; - else - EXPECT_EQ( a( i ), 0 ) - << "i = " << i; - } -} - -TEST( NDArrayTest, forBoundary_dynamic_2D ) -{ - int I = 3, J = 4; - NDArray< int, - SizesHolder< int, 0, 0 >, - index_sequence< 1, 0 > > a; - a.setSizes( I, J ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j ) - { - a( i, j ) += 1; - }; - - a.forBoundary( setter ); - - for( int j = 0; j < J; j++ ) - for( int i = 0; i < I; i++ ) - { - if( i == 0 || i == I - 1 || - j == 0 || j == J - 1 ) - EXPECT_EQ( a( i, j ), 1 ) - << "i = " << i << ", j = " << j; - else - EXPECT_EQ( a( i, j ), 0 ) - << "i = " << i << ", j = " << j; - } -} - -TEST( NDArrayTest, forBoundary_dynamic_3D ) -{ - int I = 3, J = 4, K = 5; - NDArray< int, - SizesHolder< int, 0, 0, 0 >, - index_sequence< 2, 0, 1 > > a; - a.setSizes( I, J, K ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j, int k ) - { - a( i, j, k ) += 1; - }; - - a.forBoundary( setter ); - - for( int k = 0; k < K; k++ ) - for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - { - if( i == 0 || i == I - 1 || - j == 0 || j == J - 1 || - k == 0 || k == K - 1 ) - EXPECT_EQ( a( i, j, k ), 1 ) - << "i = " << i << ", j = " << j << ", k = " << k; - else - EXPECT_EQ( a( i, j, k ), 0 ) - << "i = " << i << ", j = " << j << ", k = " << k; - } -} - -// TODO: implement general ParallelBoundaryExecutor -//TEST( NDArrayTest, forBoundary_dynamic_4D ) -//{ -// int I = 3, J = 4, K = 5, L = 6; -// NDArray< 
int, -// SizesHolder< int, 0, 0, 0, 0 >, -// index_sequence< 3, 2, 0, 1 > > a; -// a.setSizes( I, J, K, L ); -// a.setValue( 0 ); -// -// auto setter = [&] ( int i, int j, int k, int l ) -// { -// a( i, j, k, l ) += 1; -// }; -// -// a.forBoundary( setter ); -// -// for( int l = 0; l < L; l++ ) -// for( int k = 0; k < K; k++ ) -// for( int i = 0; i < I; i++ ) -// for( int j = 0; j < J; j++ ) -// { -// if( i == 0 || i == I - 1 || -// j == 0 || j == J - 1 || -// k == 0 || k == K - 1 || -// l == 0 || l == L - 1 ) -// EXPECT_EQ( a( i, j, k, l ), 1 ) -// << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; -// else -// EXPECT_EQ( a( i, j, k, l ), 0 ) -// << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; -// } -//} -// -//TEST( NDArrayTest, forBoundary_dynamic_5D ) -//{ -// int I = 3, J = 4, K = 5, L = 6, M = 7; -// NDArray< int, -// SizesHolder< int, 0, 0, 0, 0, 0 >, -// index_sequence< 3, 4, 2, 0, 1 > > a; -// a.setSizes( I, J, K, L, M ); -// a.setValue( 0 ); -// -// auto setter = [&] ( int i, int j, int k, int l, int m ) -// { -// a( i, j, k, l, m ) += 1; -// }; -// -// a.forBoundary( setter ); -// -// for( int l = 0; l < L; l++ ) -// for( int m = 0; m < M; m++ ) -// for( int k = 0; k < K; k++ ) -// for( int i = 0; i < I; i++ ) -// for( int j = 0; j < J; j++ ) -// { -// if( i == 0 || i == I - 1 || -// j == 0 || j == J - 1 || -// k == 0 || k == K - 1 || -// l == 0 || l == L - 1 || -// m == 0 || m == M - 1 ) -// EXPECT_EQ( a( i, j, k, l, m ), 1 ) -// << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; -// else -// EXPECT_EQ( a( i, j, k, l, m ), 0 ) -// << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; -// } -//} -// -//TEST( NDArrayTest, forBoundary_dynamic_6D ) -//{ -// int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; -// NDArray< int, -// SizesHolder< int, 0, 0, 0, 0, 0, 0 >, -// index_sequence< 5, 3, 4, 2, 0, 1 > > a; -// a.setSizes( I, J, K, L, M, N ); -// a.setValue( 0 ); 
-// -// auto setter = [&] ( int i, int j, int k, int l, int m, int n ) -// { -// a( i, j, k, l, m, n ) += 1; -// }; -// -// a.forBoundary( setter ); -// -// for( int n = 0; n < N; n++ ) -// for( int l = 0; l < L; l++ ) -// for( int m = 0; m < M; m++ ) -// for( int k = 0; k < K; k++ ) -// for( int i = 0; i < I; i++ ) -// for( int j = 0; j < J; j++ ) -// { -// if( i == 0 || i == I - 1 || -// j == 0 || j == J - 1 || -// k == 0 || k == K - 1 || -// l == 0 || l == L - 1 || -// m == 0 || m == M - 1 || -// n == 0 || n == N - 1 ) -// EXPECT_EQ( a( i, j, k, l, m, n ), 1 ) -// << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; -// else -// EXPECT_EQ( a( i, j, k, l, m, n ), 0 ) -// << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; -// } -//} - -TEST( NDArrayTest, forBoundary_static_1D ) -{ - constexpr int I = 3; - StaticNDArray< int, SizesHolder< int, I > > a; -// a.setSizes( 0 ); - a.setValue( 0 ); - - auto setter = [&] ( int i ) - { - a( i ) += 1; - }; - - a.forBoundary( setter ); - - for( int i = 0; i < I; i++ ) - { - if( i == 0 || i == I - 1 ) - EXPECT_EQ( a( i ), 1 ) - << "i = " << i; - else - EXPECT_EQ( a( i ), 0 ) - << "i = " << i; - } -} - -TEST( NDArrayTest, forBoundary_static_2D ) -{ - constexpr int I = 3, J = 4; - StaticNDArray< int, SizesHolder< int, I, J > > a; -// a.setSizes( 0, 0 ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j ) - { - a( i, j ) += 1; - }; - - a.forBoundary( setter ); - - for( int j = 0; j < J; j++ ) - for( int i = 0; i < I; i++ ) - { - if( i == 0 || i == I - 1 || - j == 0 || j == J - 1 ) - EXPECT_EQ( a( i, j ), 1 ) - << "i = " << i << ", j = " << j; - else - EXPECT_EQ( a( i, j ), 0 ) - << "i = " << i << ", j = " << j; - } -} - -TEST( NDArrayTest, forBoundary_static_3D ) -{ - constexpr int I = 3, J = 4, K = 5; - StaticNDArray< int, SizesHolder< int, I, J, K > > a; -// a.setSizes( 0, 0, 0 ); - a.setValue( 0 ); - - auto setter = [&] ( 
int i, int j, int k ) - { - a( i, j, k ) += 1; - }; - - a.forBoundary( setter ); - - for( int k = 0; k < K; k++ ) - for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - { - if( i == 0 || i == I - 1 || - j == 0 || j == J - 1 || - k == 0 || k == K - 1 ) - EXPECT_EQ( a( i, j, k ), 1 ) - << "i = " << i << ", j = " << j << ", k = " << k; - else - EXPECT_EQ( a( i, j, k ), 0 ) - << "i = " << i << ", j = " << j << ", k = " << k; - } -} - -TEST( NDArrayTest, forBoundary_static_4D ) -{ - constexpr int I = 3, J = 4, K = 5, L = 6; - StaticNDArray< int, SizesHolder< int, I, J, K, L > > a; -// a.setSizes( 0, 0, 0, 0 ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j, int k, int l ) - { - a( i, j, k, l ) += 1; - }; - - a.forBoundary( setter ); - - for( int l = 0; l < L; l++ ) - for( int k = 0; k < K; k++ ) - for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - { - if( i == 0 || i == I - 1 || - j == 0 || j == J - 1 || - k == 0 || k == K - 1 || - l == 0 || l == L - 1 ) - EXPECT_EQ( a( i, j, k, l ), 1 ) - << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; - else - EXPECT_EQ( a( i, j, k, l ), 0 ) - << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; - } -} - -TEST( NDArrayTest, forBoundary_static_5D ) -{ - constexpr int I = 3, J = 4, K = 5, L = 6, M = 7; - StaticNDArray< int, SizesHolder< int, I, J, K, L, M > > a; -// a.setSizes( 0, 0, 0, 0, 0 ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j, int k, int l, int m ) - { - a( i, j, k, l, m ) += 1; - }; - - a.forBoundary( setter ); - - for( int l = 0; l < L; l++ ) - for( int m = 0; m < M; m++ ) - for( int k = 0; k < K; k++ ) - for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - { - if( i == 0 || i == I - 1 || - j == 0 || j == J - 1 || - k == 0 || k == K - 1 || - l == 0 || l == L - 1 || - m == 0 || m == M - 1 ) - EXPECT_EQ( a( i, j, k, l, m ), 1 ) - << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; - else - EXPECT_EQ( a( i, j, k, 
l, m ), 0 ) - << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; - } -} - -TEST( NDArrayTest, forBoundary_static_6D ) -{ - constexpr int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; - StaticNDArray< int, SizesHolder< int, I, J, K, L, M, N > > a; -// a.setSizes( 0, 0, 0, 0, 0, 0 ); - a.setValue( 0 ); - - auto setter = [&] ( int i, int j, int k, int l, int m, int n ) - { - a( i, j, k, l, m, n ) += 1; - }; - - a.forBoundary( setter ); - - for( int n = 0; n < N; n++ ) - for( int l = 0; l < L; l++ ) - for( int m = 0; m < M; m++ ) - for( int k = 0; k < K; k++ ) - for( int i = 0; i < I; i++ ) - for( int j = 0; j < J; j++ ) - { - if( i == 0 || i == I - 1 || - j == 0 || j == J - 1 || - k == 0 || k == K - 1 || - l == 0 || l == L - 1 || - m == 0 || m == M - 1 || - n == 0 || n == N - 1 ) - EXPECT_EQ( a( i, j, k, l, m, n ), 1 ) - << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; - else - EXPECT_EQ( a( i, j, k, l, m, n ), 0 ) - << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; - } -} - -//#include "GtestMissingError.h" -int main( int argc, char* argv[] ) -{ -//#ifdef HAVE_GTEST - ::testing::InitGoogleTest( &argc, argv ); - return RUN_ALL_TESTS(); -//#else -// throw GtestMissingError(); -//#endif -} +#include "NDArrayTest.h" diff --git a/src/UnitTests/Containers/ndarray/NDArrayTest.cu b/src/UnitTests/Containers/ndarray/NDArrayTest.cu new file mode 100644 index 000000000..d0e922218 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/NDArrayTest.cu @@ -0,0 +1 @@ +#include "NDArrayTest.h" diff --git a/src/UnitTests/Containers/ndarray/NDArrayTest.h b/src/UnitTests/Containers/ndarray/NDArrayTest.h new file mode 100644 index 000000000..cfea0015c --- /dev/null +++ b/src/UnitTests/Containers/ndarray/NDArrayTest.h @@ -0,0 +1,1339 @@ +#include "gtest/gtest.h" + +#include + +using namespace TNL::Containers; +using std::index_sequence; + +template< typename 
Array > +void expect_identity( const Array& a ) +{ + Array identity; + identity.setLike( a ); + int last = 0; + for( int i = 0; i < identity.getSize(); i++ ) { + // skip negative/invalid entries due to alignment + if( a[ i ] < 0 ) + identity[ i ] = a[ i ]; + else + identity[ i ] = last++; + } + EXPECT_EQ( a, identity ); +} + +TEST( NDArrayTest, setLike ) +{ + int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + + decltype(a) b; + EXPECT_EQ( b.template getSize< 0 >(), 0 ); + EXPECT_EQ( b.template getSize< 1 >(), 0 ); + EXPECT_EQ( b.template getSize< 2 >(), 0 ); + EXPECT_EQ( b.template getSize< 3 >(), 0 ); + EXPECT_EQ( b.template getSize< 4 >(), 0 ); + EXPECT_EQ( b.template getSize< 5 >(), 0 ); + b.setLike( a ); + EXPECT_EQ( b.template getSize< 0 >(), I ); + EXPECT_EQ( b.template getSize< 1 >(), J ); + EXPECT_EQ( b.template getSize< 2 >(), K ); + EXPECT_EQ( b.template getSize< 3 >(), L ); + EXPECT_EQ( b.template getSize< 4 >(), M ); + EXPECT_EQ( b.template getSize< 5 >(), N ); +} + +TEST( NDArrayTest, reset ) +{ + int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + EXPECT_EQ( a.template getSize< 0 >(), I ); + EXPECT_EQ( a.template getSize< 1 >(), J ); + EXPECT_EQ( a.template getSize< 2 >(), K ); + EXPECT_EQ( a.template getSize< 3 >(), L ); + EXPECT_EQ( a.template getSize< 4 >(), M ); + EXPECT_EQ( a.template getSize< 5 >(), N ); + + a.reset(); + EXPECT_EQ( a.template getSize< 0 >(), 0 ); + EXPECT_EQ( a.template getSize< 1 >(), 0 ); + EXPECT_EQ( a.template getSize< 2 >(), 0 ); + EXPECT_EQ( a.template getSize< 3 >(), 0 ); + EXPECT_EQ( a.template getSize< 4 >(), 0 ); + EXPECT_EQ( a.template getSize< 5 >(), 0 ); +} + +TEST( NDArrayTest, Static_1D ) +{ + constexpr int I = 3; + NDArray< int, SizesHolder< int, I > > a; + 
a.setSizes( 0 ); + + int v = 0; + for( int i = 0; i < I; i++ ) { + a( i ) = v++; + EXPECT_EQ( a[ i ], a( i ) ); + } + + expect_identity( a.getStorageArray() ); +} + +TEST( NDArrayTest, Static_2D_Identity ) +{ + constexpr int I = 3, J = 5; + NDArray< int, SizesHolder< int, I, J > > a; + a.setSizes( 0, 0 ); + + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( NDArrayTest, Static_2D_Permuted ) +{ + constexpr int I = 3, J = 5; + NDArray< int, + SizesHolder< int, I, J >, + index_sequence< 1, 0 > > a; + a.setSizes( 0, 0 ); + + int v = 0; + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( NDArrayTest, Dynamic_6D ) +{ + int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + + // initialize entries invalid due to alignment to -1 + a.getStorageArray().setValue( -1 ); + + int v = 0; + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j, k, l, m, n ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( NDArrayTest, CopySemantics ) +{ + constexpr int I = 3, J = 4; + NDArray< int, SizesHolder< int, 0, 0 > > a; + a.setSizes( I, J ); + + auto a_view = a.getView(); + + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); + + // assignment with zero sizes + NDArray< int, SizesHolder< int, 0, 0 > > b; + b = a; + auto b_view = b.getView(); + EXPECT_EQ( a, b ); + EXPECT_EQ( a_view, b_view ); + EXPECT_EQ( a_view.getView(), b_view ); + EXPECT_EQ( a_view.getConstView(), b_view.getConstView() ); + EXPECT_EQ( a.getConstView(), b.getConstView() ); + 
EXPECT_EQ( a.getConstView(), b_view.getConstView() ); + + // assignment between views + NDArray< int, SizesHolder< int, 0, 0 > > c; + c.setSizes( I, J ); + auto c_view = c.getView(); + c_view = a_view; + EXPECT_EQ( a, c ); + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() ); + + // move-assignment between views should do a deep copy + b_view = a.getView(); + EXPECT_EQ( a_view, b_view ); + EXPECT_EQ( a, b ); + EXPECT_NE( &b_view( 0, 0 ), &a_view( 0, 0 ) ); + + // assignment of view to array + c.setValue( 0 ); + c = a_view; + EXPECT_EQ( a, c ); + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() ); + + // assignment of array to view + c.setValue( 0 ); + c_view = a; + EXPECT_EQ( a, c ); + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() ); + + // assignment with different ValueType + NDArray< double, SizesHolder< int, 0, 0 > > d; + d = a; + expect_identity( d.getStorageArray() ); + + // assignment with different SizesHolder + NDArray< double, SizesHolder< int, I, J > > e; + e = a; + expect_identity( e.getStorageArray() ); + + // assignment with different IndexType + NDArray< double, SizesHolder< short int, 0, 0 > > f; + f = a; + expect_identity( f.getStorageArray() ); + + // assignment with different Permutation + // TODO +} + +#ifdef HAVE_CUDA +TEST( NDArrayTest, CopySemanticsCrossDevice ) +{ + constexpr int I = 3, J = 4; + NDArray< int, SizesHolder< int, 0, 0 > > a; + NDArray< int, SizesHolder< int, 0, 0 >, + 
std::index_sequence< 0, 1 >, + std::index_sequence< 0, 1 >, + TNL::Devices::Cuda > da; + a.setSizes( I, J ); + da.setSizes( I, J ); + + auto a_view = a.getView(); + auto da_view = da.getView(); + + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); + + // copy to the device, simple check + da = a; + EXPECT_EQ( da.getStorageArray(), a.getStorageArray() ); + + // assignment with zero sizes + NDArray< int, SizesHolder< int, 0, 0 > > b; + b = da; + auto b_view = b.getView(); + EXPECT_EQ( a, b ); + EXPECT_EQ( a_view, b_view ); + EXPECT_EQ( a_view.getView(), b_view ); + EXPECT_EQ( a_view.getConstView(), b_view.getConstView() ); + EXPECT_EQ( a.getConstView(), b.getConstView() ); + EXPECT_EQ( a.getConstView(), b_view.getConstView() ); + + // assignment between views + NDArray< int, SizesHolder< int, 0, 0 > > c; + c.setSizes( I, J ); + auto c_view = c.getView(); + c_view = da_view; + EXPECT_EQ( a, c ); + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() ); + + // move-assignment between views should do a deep copy + b_view = da.getView(); + EXPECT_EQ( a_view, b_view ); + EXPECT_EQ( a, b ); + EXPECT_NE( &b_view( 0, 0 ), &a_view( 0, 0 ) ); + + // assignment of view to array + c.setValue( 0 ); + c = da_view; + EXPECT_EQ( a, c ); + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() ); + + // assignment of array to view + c.setValue( 0 ); + c_view = da; + EXPECT_EQ( a, c ); + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( 
a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() ); + + // assignment with different ValueType + NDArray< double, SizesHolder< int, 0, 0 > > d; + d = da; + expect_identity( d.getStorageArray() ); + + // assignment with different SizesHolder + NDArray< double, SizesHolder< int, I, J > > e; + e = da; + expect_identity( e.getStorageArray() ); + + // assignment with different IndexType + NDArray< double, SizesHolder< short int, 0, 0 > > f; + f = da; + expect_identity( f.getStorageArray() ); + + // assignment with different Permutation + // TODO +} +#endif + +TEST( NDArrayTest, SizesHolderPrinter ) +{ + SizesHolder< int, 0, 1, 2 > holder; + holder.setSize< 0 >( 3 ); + + std::stringstream str; + str << holder; + EXPECT_EQ( str.str(), "SizesHolder< 0, 1, 2 >( 3, 1, 2 )" ); +} + +TEST( NDArrayTest, forAll_dynamic_1D ) +{ + int I = 2; + NDArray< int, + SizesHolder< int, 0 >, + index_sequence< 0 > > a; + a.setSizes( I ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + EXPECT_EQ( a( i ), 1 ); +} + +TEST( NDArrayTest, forAll_dynamic_2D ) +{ + int I = 2, J = 3; + NDArray< int, + SizesHolder< int, 0, 0 >, + index_sequence< 1, 0 > > a; + a.setSizes( I, J ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j ) + { + a( i, j ) += 1; + }; + + a.forAll( setter ); + + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + EXPECT_EQ( a( i, j ), 1 ); +} + +TEST( NDArrayTest, forAll_dynamic_3D ) +{ + int I = 2, J = 3, K = 4; + NDArray< int, + SizesHolder< int, 0, 0, 0 >, + index_sequence< 2, 0, 1 > > a; + a.setSizes( I, J, K ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forAll( setter ); + + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + EXPECT_EQ( a( i, j, k ), 1 ); +} + +TEST( NDArrayTest, forAll_dynamic_4D ) +{ + int I = 2, J = 3, K = 4, L = 5; 
+ NDArray< int, + SizesHolder< int, 0, 0, 0, 0 >, + index_sequence< 3, 2, 0, 1 > > a; + a.setSizes( I, J, K, L ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l ) + { + a( i, j, k, l ) += 1; + }; + + a.forAll( setter ); + + for( int l = 0; l < L; l++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + EXPECT_EQ( a( i, j, k, l ), 1 ); +} + +TEST( NDArrayTest, forAll_dynamic_5D ) +{ + int I = 2, J = 3, K = 4, L = 5, M = 6; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0 >, + index_sequence< 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m ) + { + a( i, j, k, l, m ) += 1; + }; + + a.forAll( setter ); + + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + EXPECT_EQ( a( i, j, k, l, m ), 1 ); +} + +TEST( NDArrayTest, forAll_dynamic_6D ) +{ + int I = 2, J = 3, K = 4, L = 5, M = 6, N = 7; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m, int n ) + { + a( i, j, k, l, m, n ) += 1; + }; + + a.forAll( setter ); + + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + EXPECT_EQ( a( i, j, k, l, m, n ), 1 ); +} + +TEST( NDArrayTest, forAll_static_1D ) +{ + constexpr int I = 3; + StaticNDArray< int, SizesHolder< int, I > > a; +// a.setSizes( 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + EXPECT_EQ( a( i ), 1 ); +} + +TEST( NDArrayTest, forAll_static_2D ) +{ + constexpr int I = 3, J = 4; + StaticNDArray< int, SizesHolder< int, I, J > > a; +// a.setSizes( 
0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j ) + { + a( i, j ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + EXPECT_EQ( a( i, j ), 1 ); +} + +TEST( NDArrayTest, forAll_static_3D ) +{ + constexpr int I = 3, J = 4, K = 5; + StaticNDArray< int, SizesHolder< int, I, J, K > > a; +// a.setSizes( 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + for( int k = 0; k < K; k++ ) + EXPECT_EQ( a( i, j, k ), 1 ); +} + +TEST( NDArrayTest, forAll_static_4D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6; + StaticNDArray< int, SizesHolder< int, I, J, K, L > > a; +// a.setSizes( 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l ) + { + a( i, j, k, l ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + for( int k = 0; k < K; k++ ) + for( int l = 0; l < L; l++ ) + EXPECT_EQ( a( i, j, k, l ), 1 ); +} + +TEST( NDArrayTest, forAll_static_5D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 7; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M > > a; +// a.setSizes( 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m ) + { + a( i, j, k, l, m ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + for( int k = 0; k < K; k++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + EXPECT_EQ( a( i, j, k, l, m ), 1 ); +} + +TEST( NDArrayTest, forAll_static_6D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M, N > > a; +// a.setSizes( 0, 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m, int n ) + { + a( i, j, k, l, m, n ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; 
i++ ) + for( int j = 0; j < J; j++ ) + for( int k = 0; k < K; k++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int n = 0; n < N; n++ ) + EXPECT_EQ( a( i, j, k, l, m, n ), 1 ); +} + +TEST( NDArrayTest, forInternal_dynamic_1D ) +{ + int I = 3; + NDArray< int, + SizesHolder< int, 0 >, + index_sequence< 0 > > a; + a.setSizes( I ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forInternal( setter ); + + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 ) + EXPECT_EQ( a( i ), 0 ) + << "i = " << i; + else + EXPECT_EQ( a( i ), 1 ) + << "i = " << i; + } +} + +TEST( NDArrayTest, forInternal_dynamic_2D ) +{ + int I = 3, J = 4; + NDArray< int, + SizesHolder< int, 0, 0 >, + index_sequence< 1, 0 > > a; + a.setSizes( I, J ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j ) + { + a( i, j ) += 1; + }; + + a.forInternal( setter ); + + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 ) + EXPECT_EQ( a( i, j ), 0 ) + << "i = " << i << ", j = " << j; + else + EXPECT_EQ( a( i, j ), 1 ) + << "i = " << i << ", j = " << j; + } +} + +TEST( NDArrayTest, forInternal_dynamic_3D ) +{ + int I = 3, J = 4, K = 5; + NDArray< int, + SizesHolder< int, 0, 0, 0 >, + index_sequence< 2, 0, 1 > > a; + a.setSizes( I, J, K ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forInternal( setter ); + + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 ) + EXPECT_EQ( a( i, j, k ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k; + else + EXPECT_EQ( a( i, j, k ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k; + } +} + +TEST( NDArrayTest, forInternal_dynamic_4D ) +{ + int I = 3, J = 4, K = 5, L = 6; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0 >, + index_sequence< 3, 2, 0, 1 > 
> a; + a.setSizes( I, J, K, L ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l ) + { + a( i, j, k, l ) += 1; + }; + + a.forInternal( setter ); + + for( int l = 0; l < L; l++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 ) + EXPECT_EQ( a( i, j, k, l ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + else + EXPECT_EQ( a( i, j, k, l ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + } +} + +TEST( NDArrayTest, forInternal_dynamic_5D ) +{ + int I = 3, J = 4, K = 5, L = 6, M = 7; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0 >, + index_sequence< 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m ) + { + a( i, j, k, l, m ) += 1; + }; + + a.forInternal( setter ); + + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 ) + EXPECT_EQ( a( i, j, k, l, m ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + else + EXPECT_EQ( a( i, j, k, l, m ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + } +} + +TEST( NDArrayTest, forInternal_dynamic_6D ) +{ + int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m, int n ) + { + a( i, j, k, l, m, n ) += 1; + }; + + a.forInternal( setter ); + + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 
0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 || + n == 0 || n == N - 1 ) + EXPECT_EQ( a( i, j, k, l, m, n ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + else + EXPECT_EQ( a( i, j, k, l, m, n ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + } +} + +TEST( NDArrayTest, forInternal_static_1D ) +{ + constexpr int I = 3; + StaticNDArray< int, SizesHolder< int, I > > a; +// a.setSizes( 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forInternal( setter ); + + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 ) + EXPECT_EQ( a( i ), 0 ) + << "i = " << i; + else + EXPECT_EQ( a( i ), 1 ) + << "i = " << i; + } +} + +TEST( NDArrayTest, forInternal_static_2D ) +{ + constexpr int I = 3, J = 4; + StaticNDArray< int, SizesHolder< int, I, J > > a; +// a.setSizes( 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j ) + { + a( i, j ) += 1; + }; + + a.forInternal( setter ); + + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 ) + EXPECT_EQ( a( i, j ), 0 ) + << "i = " << i << ", j = " << j; + else + EXPECT_EQ( a( i, j ), 1 ) + << "i = " << i << ", j = " << j; + } +} + +TEST( NDArrayTest, forInternal_static_3D ) +{ + constexpr int I = 3, J = 4, K = 5; + StaticNDArray< int, SizesHolder< int, I, J, K > > a; +// a.setSizes( 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forInternal( setter ); + + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 ) + 
EXPECT_EQ( a( i, j, k ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k; + else + EXPECT_EQ( a( i, j, k ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k; + } +} + +TEST( NDArrayTest, forInternal_static_4D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6; + StaticNDArray< int, SizesHolder< int, I, J, K, L > > a; +// a.setSizes( 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l ) + { + a( i, j, k, l ) += 1; + }; + + a.forInternal( setter ); + + for( int l = 0; l < L; l++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 ) + EXPECT_EQ( a( i, j, k, l ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + else + EXPECT_EQ( a( i, j, k, l ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + } +} + +TEST( NDArrayTest, forInternal_static_5D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 7; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M > > a; +// a.setSizes( 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m ) + { + a( i, j, k, l, m ) += 1; + }; + + a.forInternal( setter ); + + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 ) + EXPECT_EQ( a( i, j, k, l, m ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + else + EXPECT_EQ( a( i, j, k, l, m ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + } +} + +TEST( NDArrayTest, forInternal_static_6D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; + StaticNDArray< int, SizesHolder< int, I, J, K, 
L, M, N > > a; +// a.setSizes( 0, 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m, int n ) + { + a( i, j, k, l, m, n ) += 1; + }; + + a.forInternal( setter ); + + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 || + n == 0 || n == N - 1 ) + EXPECT_EQ( a( i, j, k, l, m, n ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + else + EXPECT_EQ( a( i, j, k, l, m, n ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + } +} + +TEST( NDArrayTest, forBoundary_dynamic_1D ) +{ + int I = 3; + NDArray< int, + SizesHolder< int, 0 >, + index_sequence< 0 > > a; + a.setSizes( I ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forBoundary( setter ); + + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 ) + EXPECT_EQ( a( i ), 1 ) + << "i = " << i; + else + EXPECT_EQ( a( i ), 0 ) + << "i = " << i; + } +} + +TEST( NDArrayTest, forBoundary_dynamic_2D ) +{ + int I = 3, J = 4; + NDArray< int, + SizesHolder< int, 0, 0 >, + index_sequence< 1, 0 > > a; + a.setSizes( I, J ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j ) + { + a( i, j ) += 1; + }; + + a.forBoundary( setter ); + + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 ) + EXPECT_EQ( a( i, j ), 1 ) + << "i = " << i << ", j = " << j; + else + EXPECT_EQ( a( i, j ), 0 ) + << "i = " << i << ", j = " << j; + } +} + +TEST( NDArrayTest, forBoundary_dynamic_3D ) +{ + int I = 3, J = 4, K = 5; + NDArray< int, + SizesHolder< int, 0, 0, 0 >, + index_sequence< 2, 0, 1 > > a; + a.setSizes( I, J, K 
); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forBoundary( setter ); + + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 ) + EXPECT_EQ( a( i, j, k ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k; + else + EXPECT_EQ( a( i, j, k ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k; + } +} + +// TODO: implement general ParallelBoundaryExecutor +//TEST( NDArrayTest, forBoundary_dynamic_4D ) +//{ +// int I = 3, J = 4, K = 5, L = 6; +// NDArray< int, +// SizesHolder< int, 0, 0, 0, 0 >, +// index_sequence< 3, 2, 0, 1 > > a; +// a.setSizes( I, J, K, L ); +// a.setValue( 0 ); +// +// auto setter = [&] ( int i, int j, int k, int l ) +// { +// a( i, j, k, l ) += 1; +// }; +// +// a.forBoundary( setter ); +// +// for( int l = 0; l < L; l++ ) +// for( int k = 0; k < K; k++ ) +// for( int i = 0; i < I; i++ ) +// for( int j = 0; j < J; j++ ) +// { +// if( i == 0 || i == I - 1 || +// j == 0 || j == J - 1 || +// k == 0 || k == K - 1 || +// l == 0 || l == L - 1 ) +// EXPECT_EQ( a( i, j, k, l ), 1 ) +// << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; +// else +// EXPECT_EQ( a( i, j, k, l ), 0 ) +// << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; +// } +//} +// +//TEST( NDArrayTest, forBoundary_dynamic_5D ) +//{ +// int I = 3, J = 4, K = 5, L = 6, M = 7; +// NDArray< int, +// SizesHolder< int, 0, 0, 0, 0, 0 >, +// index_sequence< 3, 4, 2, 0, 1 > > a; +// a.setSizes( I, J, K, L, M ); +// a.setValue( 0 ); +// +// auto setter = [&] ( int i, int j, int k, int l, int m ) +// { +// a( i, j, k, l, m ) += 1; +// }; +// +// a.forBoundary( setter ); +// +// for( int l = 0; l < L; l++ ) +// for( int m = 0; m < M; m++ ) +// for( int k = 0; k < K; k++ ) +// for( int i = 0; i < I; i++ ) +// for( int j = 0; j < J; j++ ) +// { +// if( i == 0 || i == I - 1 || +// j == 
0 || j == J - 1 || +// k == 0 || k == K - 1 || +// l == 0 || l == L - 1 || +// m == 0 || m == M - 1 ) +// EXPECT_EQ( a( i, j, k, l, m ), 1 ) +// << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; +// else +// EXPECT_EQ( a( i, j, k, l, m ), 0 ) +// << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; +// } +//} +// +//TEST( NDArrayTest, forBoundary_dynamic_6D ) +//{ +// int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; +// NDArray< int, +// SizesHolder< int, 0, 0, 0, 0, 0, 0 >, +// index_sequence< 5, 3, 4, 2, 0, 1 > > a; +// a.setSizes( I, J, K, L, M, N ); +// a.setValue( 0 ); +// +// auto setter = [&] ( int i, int j, int k, int l, int m, int n ) +// { +// a( i, j, k, l, m, n ) += 1; +// }; +// +// a.forBoundary( setter ); +// +// for( int n = 0; n < N; n++ ) +// for( int l = 0; l < L; l++ ) +// for( int m = 0; m < M; m++ ) +// for( int k = 0; k < K; k++ ) +// for( int i = 0; i < I; i++ ) +// for( int j = 0; j < J; j++ ) +// { +// if( i == 0 || i == I - 1 || +// j == 0 || j == J - 1 || +// k == 0 || k == K - 1 || +// l == 0 || l == L - 1 || +// m == 0 || m == M - 1 || +// n == 0 || n == N - 1 ) +// EXPECT_EQ( a( i, j, k, l, m, n ), 1 ) +// << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; +// else +// EXPECT_EQ( a( i, j, k, l, m, n ), 0 ) +// << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; +// } +//} + +TEST( NDArrayTest, forBoundary_static_1D ) +{ + constexpr int I = 3; + StaticNDArray< int, SizesHolder< int, I > > a; +// a.setSizes( 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forBoundary( setter ); + + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 ) + EXPECT_EQ( a( i ), 1 ) + << "i = " << i; + else + EXPECT_EQ( a( i ), 0 ) + << "i = " << i; + } +} + +TEST( NDArrayTest, forBoundary_static_2D ) +{ + constexpr int I = 3, J = 4; + StaticNDArray< int, 
SizesHolder< int, I, J > > a; +// a.setSizes( 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j ) + { + a( i, j ) += 1; + }; + + a.forBoundary( setter ); + + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 ) + EXPECT_EQ( a( i, j ), 1 ) + << "i = " << i << ", j = " << j; + else + EXPECT_EQ( a( i, j ), 0 ) + << "i = " << i << ", j = " << j; + } +} + +TEST( NDArrayTest, forBoundary_static_3D ) +{ + constexpr int I = 3, J = 4, K = 5; + StaticNDArray< int, SizesHolder< int, I, J, K > > a; +// a.setSizes( 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forBoundary( setter ); + + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 ) + EXPECT_EQ( a( i, j, k ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k; + else + EXPECT_EQ( a( i, j, k ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k; + } +} + +TEST( NDArrayTest, forBoundary_static_4D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6; + StaticNDArray< int, SizesHolder< int, I, J, K, L > > a; +// a.setSizes( 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l ) + { + a( i, j, k, l ) += 1; + }; + + a.forBoundary( setter ); + + for( int l = 0; l < L; l++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 ) + EXPECT_EQ( a( i, j, k, l ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + else + EXPECT_EQ( a( i, j, k, l ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + } +} + +TEST( NDArrayTest, forBoundary_static_5D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 7; + StaticNDArray< int, SizesHolder< int, I, 
J, K, L, M > > a; +// a.setSizes( 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m ) + { + a( i, j, k, l, m ) += 1; + }; + + a.forBoundary( setter ); + + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 ) + EXPECT_EQ( a( i, j, k, l, m ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + else + EXPECT_EQ( a( i, j, k, l, m ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + } +} + +TEST( NDArrayTest, forBoundary_static_6D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M, N > > a; +// a.setSizes( 0, 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m, int n ) + { + a( i, j, k, l, m, n ) += 1; + }; + + a.forBoundary( setter ); + + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 || + n == 0 || n == N - 1 ) + EXPECT_EQ( a( i, j, k, l, m, n ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + else + EXPECT_EQ( a( i, j, k, l, m, n ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + } +} + +//#include "GtestMissingError.h" +int main( int argc, char* argv[] ) +{ +//#ifdef HAVE_GTEST + ::testing::InitGoogleTest( &argc, argv ); + return RUN_ALL_TESTS(); +//#else +// throw GtestMissingError(); +//#endif +} -- GitLab From 
de8f1eb8588e1f5aada3f28238fbbb0b95816555 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 14 Apr 2019 11:25:17 +0200 Subject: [PATCH 21/25] Split NDArrayIndexer from NDArrayStorage and NDArrayView --- src/TNL/Containers/NDArray.h | 146 +++++++++++-------------- src/TNL/Containers/NDArrayIndexer.h | 94 ++++++++++++++++ src/TNL/Containers/NDArrayView.h | 159 ++++++++++++---------------- 3 files changed, 224 insertions(+), 175 deletions(-) create mode 100644 src/TNL/Containers/NDArrayIndexer.h diff --git a/src/TNL/Containers/NDArray.h b/src/TNL/Containers/NDArray.h index 63ec5638d..5c71a83cc 100644 --- a/src/TNL/Containers/NDArray.h +++ b/src/TNL/Containers/NDArray.h @@ -40,6 +40,7 @@ template< typename Array, typename Base, typename Device = typename Array::DeviceType > class NDArrayStorage + : public NDArrayIndexer< SizesHolder, Permutation, Base > { public: using StorageArray = Array; @@ -48,17 +49,12 @@ public: using IndexType = typename Array::IndexType; using SizesHolderType = SizesHolder; using PermutationType = Permutation; + using IndexerType = NDArrayIndexer< SizesHolder, Permutation, Base >; using ViewType = NDArrayView< ValueType, DeviceType, SizesHolder, Permutation, Base >; using ConstViewType = NDArrayView< std::add_const_t< ValueType >, DeviceType, SizesHolder, Permutation, Base >; static_assert( Permutation::size() == SizesHolder::getDimension(), "invalid permutation" ); - // for compatibility with NDArrayView (which inherits from StrideBase) - static constexpr bool isContiguous() - { - return true; - } - // all methods from NDArrayView NDArrayStorage() = default; @@ -83,7 +79,7 @@ public: static_assert( std::is_same< PermutationType, typename OtherArray::PermutationType >::value, "Arrays must have the same permutation of indices." 
); // update sizes - __ndarray_impl::SetSizesCopyHelper< SizesHolderType, typename OtherArray::SizesHolderType >::copy( sizes, other.getSizes() ); + __ndarray_impl::SetSizesCopyHelper< SizesHolderType, typename OtherArray::SizesHolderType >::copy( getSizes(), other.getSizes() ); // (re)allocate storage if necessary array.setSize( getStorageSize() ); // copy data @@ -94,57 +90,77 @@ public: bool operator==( const NDArrayStorage& other ) const { // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray - return sizes == other.sizes && array == other.array; + return getSizes() == other.getSizes() && array == other.array; } bool operator!=( const NDArrayStorage& other ) const { // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray - return sizes != other.sizes || array != other.array; + return getSizes() != other.getSizes() || array != other.array; } - // accessor to the underlying data - // (should not be used for accessing the elements, intended only for the implementation - // of operator= and functions like cudaHostRegister) - std::add_const_t< ValueType >* getData() const + __cuda_callable__ + ValueType* getData() { return array.getData(); } - static constexpr std::size_t getDimension() + __cuda_callable__ + std::add_const_t< ValueType >* getData() const { - return SizesHolder::getDimension(); + return array.getData(); } - const SizesHolderType& getSizes() const + // methods from the base class + using IndexerType::getDimension; + using IndexerType::getSizes; + using IndexerType::getSize; + using IndexerType::getStride; + using IndexerType::getStorageSize; + using IndexerType::getStorageIndex; + + __cuda_callable__ + const IndexerType& getIndexer() const { - return sizes; + return *this; } - template< std::size_t level > __cuda_callable__ - IndexType getSize() const + ViewType getView() { - return sizes.template getSize< level >(); + return ViewType( array.getData(), getSizes() ); } - // returns the 
product of the aligned sizes __cuda_callable__ - IndexType getStorageSize() const + ConstViewType getConstView() const { - using Alignment = typename Base::template Alignment< Permutation >; - return __ndarray_impl::StorageSizeGetter< SizesHolder, Alignment >::get( sizes ); + return ConstViewType( array.getData(), getSizes() ); } - template< typename... IndexTypes > + template< std::size_t... Dimensions, typename... IndexTypes > __cuda_callable__ - IndexType - getStorageIndex( IndexTypes&&... indices ) const + auto getSubarrayView( IndexTypes&&... indices ) { static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); - return Base::template getStorageIndex< Permutation >( sizes, - StrideBase{}, - std::forward< IndexTypes >( indices )... ); + static_assert( 0 < sizeof...(Dimensions) && sizeof...(Dimensions) <= getDimension(), "got wrong number of dimensions" ); + static_assert( __ndarray_impl::all_elements_in_range( 0, Permutation::size(), {Dimensions...} ), + "invalid dimensions" ); +// FIXME: nvcc chokes on the variadic brace-initialization +#ifndef __NVCC__ + static_assert( __ndarray_impl::is_increasing_sequence( {Dimensions...} ), + "specifying permuted dimensions is not supported" ); +#endif + + using Getter = __ndarray_impl::SubarrayGetter< Base, Permutation, Dimensions... >; + using Subpermutation = typename Getter::Subpermutation; + auto& begin = operator()( std::forward< IndexTypes >( indices )... ); + auto subarray_sizes = Getter::filterSizes( getSizes(), std::forward< IndexTypes >( indices )... ); + auto strides = Getter::getStrides( getSizes(), std::forward< IndexTypes >( indices )... ); + static_assert( Subpermutation::size() == sizeof...(Dimensions), "Bug - wrong subpermutation length." ); + static_assert( decltype(subarray_sizes)::getDimension() == sizeof...(Dimensions), "Bug - wrong dimension of the new sizes." 
); + static_assert( decltype(strides)::getDimension() == sizeof...(Dimensions), "Bug - wrong dimension of the strides." ); + using SubarrayView = NDArrayView< ValueType, Device, decltype(subarray_sizes), Subpermutation, Base, decltype(strides) >; + return SubarrayView{ &begin, subarray_sizes, strides }; } template< typename... IndexTypes > @@ -153,7 +169,7 @@ public: operator()( IndexTypes&&... indices ) { static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); - __ndarray_impl::assertIndicesInBounds( sizes, std::forward< IndexTypes >( indices )... ); + __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexTypes >( indices )... ); TNL_ASSERT_LT( getStorageIndex( std::forward< IndexTypes >( indices )... ), getStorageSize(), "storage index out of bounds - either input error or a bug in the indexer" ); return array[ getStorageIndex( std::forward< IndexTypes >( indices )... ) ]; @@ -165,7 +181,7 @@ public: operator()( IndexTypes&&... indices ) const { static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); - __ndarray_impl::assertIndicesInBounds( sizes, std::forward< IndexTypes >( indices )... ); + __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexTypes >( indices )... ); TNL_ASSERT_LT( getStorageIndex( std::forward< IndexTypes >( indices )... ), getStorageSize(), "storage index out of bounds - either input error or a bug in the indexer" ); return array[ getStorageIndex( std::forward< IndexTypes >( indices )... 
) ]; @@ -177,7 +193,7 @@ public: operator[]( IndexType index ) { static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); - __ndarray_impl::assertIndicesInBounds( sizes, std::forward< IndexType >( index ) ); + __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexType >( index ) ); return array[ index ]; } @@ -186,54 +202,16 @@ public: operator[]( IndexType index ) const { static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); - __ndarray_impl::assertIndicesInBounds( sizes, std::forward< IndexType >( index ) ); + __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexType >( index ) ); return array[ index ]; } - __cuda_callable__ - ViewType getView() - { - return ViewType( array.getData(), sizes ); - } - - __cuda_callable__ - ConstViewType getConstView() const - { - return ConstViewType( array.getData(), sizes ); - } - - template< std::size_t... Dimensions, typename... IndexTypes > - __cuda_callable__ - auto getSubarrayView( IndexTypes&&... indices ) - { - static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); - static_assert( 0 < sizeof...(Dimensions) && sizeof...(Dimensions) <= getDimension(), "got wrong number of dimensions" ); - static_assert( __ndarray_impl::all_elements_in_range( 0, Permutation::size(), {Dimensions...} ), - "invalid dimensions" ); -// FIXME: nvcc chokes on the variadic brace-initialization -#ifndef __NVCC__ - static_assert( __ndarray_impl::is_increasing_sequence( {Dimensions...} ), - "specifying permuted dimensions is not supported" ); -#endif - - using Getter = __ndarray_impl::SubarrayGetter< Base, Permutation, Dimensions... >; - using Subpermutation = typename Getter::Subpermutation; - auto& begin = operator()( std::forward< IndexTypes >( indices )... ); - auto subarray_sizes = Getter::filterSizes( sizes, std::forward< IndexTypes >( indices )... 
); - auto strides = Getter::getStrides( sizes, std::forward< IndexTypes >( indices )... ); - static_assert( Subpermutation::size() == sizeof...(Dimensions), "Bug - wrong subpermutation length." ); - static_assert( decltype(subarray_sizes)::getDimension() == sizeof...(Dimensions), "Bug - wrong dimension of the new sizes." ); - static_assert( decltype(strides)::getDimension() == sizeof...(Dimensions), "Bug - wrong dimension of the strides." ); - using SubarrayView = NDArrayView< ValueType, Device, decltype(subarray_sizes), Subpermutation, Base, decltype(strides) >; - return SubarrayView{ &begin, subarray_sizes, strides }; - } - template< typename Device2 = DeviceType, typename Func > void forAll( Func f ) const { __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >; - dispatch( Begins{}, sizes, f ); + dispatch( Begins{}, getSizes(), f ); } template< typename Device2 = DeviceType, typename Func > @@ -245,7 +223,7 @@ public: using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type; // subtract dynamic sizes Ends ends; - __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolder >::subtract( ends, sizes ); + __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolder >::subtract( ends, getSizes() ); dispatch( Begins{}, ends, f ); } @@ -266,10 +244,10 @@ public: using SkipEnds = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type; // subtract dynamic sizes SkipEnds skipEnds; - __ndarray_impl::SetSizesSubtractHelper< 1, SkipEnds, SizesHolder >::subtract( skipEnds, sizes ); + __ndarray_impl::SetSizesSubtractHelper< 1, SkipEnds, SizesHolder >::subtract( skipEnds, getSizes() ); __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; - dispatch( Begins{}, SkipBegins{}, skipEnds, sizes, f ); + dispatch( Begins{}, SkipBegins{}, skipEnds, getSizes(), f ); } template< typename Device2 = DeviceType, typename Func, 
typename SkipBegins, typename SkipEnds > @@ -278,7 +256,7 @@ public: // TODO: assert "skipBegins <= sizes", "skipEnds <= sizes" using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >; __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; - dispatch( Begins{}, skipBegins, skipEnds, sizes, f ); + dispatch( Begins{}, skipBegins, skipEnds, getSizes(), f ); } @@ -287,7 +265,7 @@ public: // TODO: rename to setSizes and make sure that overloading with the following method works void setSize( const SizesHolderType& sizes ) { - this->sizes = sizes; + getSizes() = sizes; array.setSize( getStorageSize() ); } @@ -295,19 +273,19 @@ public: void setSizes( IndexTypes&&... sizes ) { static_assert( sizeof...( sizes ) == getDimension(), "got wrong number of sizes" ); - __ndarray_impl::setSizesHelper( this->sizes, std::forward< IndexTypes >( sizes )... ); + __ndarray_impl::setSizesHelper( getSizes(), std::forward< IndexTypes >( sizes )... ); array.setSize( getStorageSize() ); } void setLike( const NDArrayStorage& other ) { - this->sizes = other.getSizes(); + getSizes() = other.getSizes(); array.setSize( getStorageSize() ); } void reset() { - this->sizes = SizesHolder{}; + getSizes() = SizesHolder{}; TNL_ASSERT_EQ( getStorageSize(), 0, "Failed to reset the sizes." ); array.reset(); } @@ -318,7 +296,7 @@ public: getElement( IndexTypes&&... indices ) const { static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); - __ndarray_impl::assertIndicesInBounds( sizes, std::forward< IndexTypes >( indices )... ); + __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexTypes >( indices )... ); TNL_ASSERT_LT( getStorageIndex( std::forward< IndexTypes >( indices )... ), getStorageSize(), "storage index out of bounds - either input error or a bug in the indexer" ); return array.getElement( getStorageIndex( std::forward< IndexTypes >( indices )... 
) ); @@ -341,9 +319,7 @@ public: protected: StorageArray array; - SizesHolder sizes; - - using StrideBase = __ndarray_impl::DummyStrideBase< typename SizesHolder::IndexType, SizesHolder::getDimension() >; + IndexerType indexer; }; template< typename Value, diff --git a/src/TNL/Containers/NDArrayIndexer.h b/src/TNL/Containers/NDArrayIndexer.h new file mode 100644 index 000000000..08389e9bd --- /dev/null +++ b/src/TNL/Containers/NDArrayIndexer.h @@ -0,0 +1,94 @@ +/*************************************************************************** + NDArrayIndexer.h - description + ------------------- + begin : Apr 14, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include // StorageSizeGetter +#include // DummyStrideBase + +namespace TNL { +namespace Containers { + +template< typename SizesHolder, + typename Permutation, + typename Base, + typename StridesHolder = __ndarray_impl::DummyStrideBase< typename SizesHolder::IndexType, SizesHolder::getDimension() > > +class NDArrayIndexer + : public StridesHolder +{ +public: + using IndexType = typename SizesHolder::IndexType; + using SizesHolderType = SizesHolder; + using PermutationType = Permutation; + + __cuda_callable__ + NDArrayIndexer() = default; + + // explicit initialization by sizes and strides + __cuda_callable__ + NDArrayIndexer( SizesHolder sizes, StridesHolder strides ) + : StridesHolder(strides), sizes(sizes) {} + + static constexpr std::size_t getDimension() + { + return SizesHolder::getDimension(); + } + + __cuda_callable__ + const SizesHolderType& getSizes() const + { + return sizes; + } + + template< std::size_t level > + __cuda_callable__ + IndexType getSize() const + { + return sizes.template getSize< level >(); + } + + // method template from base class + using 
StridesHolder::getStride; + + // returns the product of the aligned sizes + __cuda_callable__ + IndexType getStorageSize() const + { + using Alignment = typename Base::template Alignment< Permutation >; + return __ndarray_impl::StorageSizeGetter< SizesHolder, Alignment >::get( sizes ); + } + + template< typename... IndexTypes > + __cuda_callable__ + IndexType + getStorageIndex( IndexTypes&&... indices ) const + { + static_assert( sizeof...( indices ) == SizesHolder::getDimension(), "got wrong number of indices" ); + return Base::template getStorageIndex< Permutation >( sizes, + static_cast< const StridesHolder& >( *this ), + std::forward< IndexTypes >( indices )... ); + } + +protected: + // non-const reference accessor cannot be public - only subclasses like NDArrayStorage may modify the sizes + __cuda_callable__ + SizesHolderType& getSizes() + { + return sizes; + } + + SizesHolder sizes; +}; + +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/NDArrayView.h b/src/TNL/Containers/NDArrayView.h index 7883d30c1..5eb01e198 100644 --- a/src/TNL/Containers/NDArrayView.h +++ b/src/TNL/Containers/NDArrayView.h @@ -12,7 +12,7 @@ #pragma once -#include +#include #include #include #include @@ -28,9 +28,9 @@ template< typename Value, typename SizesHolder, typename Permutation, typename Base, - typename StrideBase = __ndarray_impl::DummyStrideBase< typename SizesHolder::IndexType, SizesHolder::getDimension() > > + typename StridesHolder = __ndarray_impl::DummyStrideBase< typename SizesHolder::IndexType, SizesHolder::getDimension() > > class NDArrayView - : public StrideBase + : public NDArrayIndexer< SizesHolder, Permutation, Base, StridesHolder > { public: using ValueType = Value; @@ -38,22 +38,24 @@ public: using IndexType = typename SizesHolder::IndexType; using SizesHolderType = SizesHolder; using PermutationType = Permutation; - using ViewType = NDArrayView< ValueType, Device, SizesHolder, Permutation, Base, StrideBase >; - using ConstViewType = 
NDArrayView< std::add_const_t< ValueType >, Device, SizesHolder, Permutation, Base, StrideBase >; + using IndexerType = NDArrayIndexer< SizesHolder, Permutation, Base, StridesHolder >; + using ViewType = NDArrayView< Value, Device, SizesHolder, Permutation, Base, StridesHolder >; + using ConstViewType = NDArrayView< std::add_const_t< Value >, Device, SizesHolder, Permutation, Base, StridesHolder >; static_assert( Permutation::size() == SizesHolder::getDimension(), "invalid permutation" ); __cuda_callable__ NDArrayView() = default; - // explicit initialization by raw data pointer and sizes + // explicit initialization by raw data pointer and sizes and strides __cuda_callable__ - NDArrayView( Value* data, SizesHolder sizes ) : array(data), sizes(sizes) {} + NDArrayView( Value* data, SizesHolder sizes, StridesHolder strides = StridesHolder{} ) + : IndexerType(sizes, strides), array(data) {} - // explicit initialization by raw data pointer and sizes and strides + // explicit initialization by raw data pointer and indexer __cuda_callable__ - NDArrayView( Value* data, SizesHolder sizes, StrideBase strides ) - : StrideBase(strides), array(data), sizes(sizes) {} + NDArrayView( Value* data, IndexerType indexer ) + : IndexerType(indexer), array(data) {} // Copy-constructor does shallow copy, so views can be passed-by-value into // CUDA kernels and they can be captured-by-value in __cuda_callable__ @@ -70,7 +72,7 @@ public: __cuda_callable__ NDArrayView& operator=( const NDArrayView& other ) { - TNL_ASSERT_EQ( sizes, other.sizes, "The sizes of the array views must be equal, views are not resizable." ); + TNL_ASSERT_EQ( getSizes(), other.getSizes(), "The sizes of the array views must be equal, views are not resizable." 
); if( getStorageSize() > 0 ) Algorithms::ArrayOperations< DeviceType >::copy( array, other.array, getStorageSize() ); return *this; @@ -100,23 +102,21 @@ public: __cuda_callable__ void bind( NDArrayView view ) { + IndexerType::operator=( view ); array = view.array; - sizes = view.sizes; - StrideBase::operator=( view ); } __cuda_callable__ void reset() { + IndexerType::operator=( IndexerType{} ); array = nullptr; - sizes = SizesHolder{}; - StrideBase::operator=( StrideBase{} ); } __cuda_callable__ bool operator==( const NDArrayView& other ) const { - if( sizes != other.sizes ) + if( getSizes() != other.getSizes() ) return false; // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray return Algorithms::ArrayOperations< Device, Device >::compare( array, other.array, getStorageSize() ); @@ -125,57 +125,74 @@ public: __cuda_callable__ bool operator!=( const NDArrayView& other ) const { - if( sizes != other.sizes ) + if( getSizes() != other.getSizes() ) return true; // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray return ! 
Algorithms::ArrayOperations< Device, Device >::compare( array, other.array, getStorageSize() ); } - static constexpr std::size_t getDimension() + __cuda_callable__ + ValueType* getData() { - return SizesHolder::getDimension(); + return array; } - // accessor to the underlying data - // (should not be used for accessing the elements, intended only for the implementation - // of operator= and functions like cudaHostRegister) + __cuda_callable__ std::add_const_t< ValueType >* getData() const { return array; } - const SizesHolderType& getSizes() const + // methods from the base class + using IndexerType::getDimension; + using IndexerType::getSizes; + using IndexerType::getSize; + using IndexerType::getStride; + using IndexerType::getStorageSize; + using IndexerType::getStorageIndex; + + __cuda_callable__ + const IndexerType& getIndexer() const { - return sizes; + return *this; } - template< std::size_t level > __cuda_callable__ - IndexType getSize() const + ViewType getView() { - return sizes.template getSize< level >(); + return ViewType( *this ); } - // method template from base class - using StrideBase::getStride; - - // returns the product of the aligned sizes __cuda_callable__ - IndexType getStorageSize() const + ConstViewType getConstView() const { - using Alignment = typename Base::template Alignment< Permutation >; - return __ndarray_impl::StorageSizeGetter< SizesHolder, Alignment >::get( sizes ); + return ConstViewType( array, getSizes(), static_cast< const StridesHolder& >( *this ) ); } - template< typename... IndexTypes > + template< std::size_t... Dimensions, typename... IndexTypes > __cuda_callable__ - IndexType - getStorageIndex( IndexTypes&&... indices ) const + auto getSubarrayView( IndexTypes&&... indices ) { static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); - return Base::template getStorageIndex< Permutation >( sizes, - static_cast< const StrideBase& >( *this ), - std::forward< IndexTypes >( indices )... 
); + static_assert( 0 < sizeof...(Dimensions) && sizeof...(Dimensions) <= getDimension(), "got wrong number of dimensions" ); + static_assert( __ndarray_impl::all_elements_in_range( 0, Permutation::size(), {Dimensions...} ), + "invalid dimensions" ); +// FIXME: nvcc chokes on the variadic brace-initialization +#ifndef __NVCC__ + static_assert( __ndarray_impl::is_increasing_sequence( {Dimensions...} ), + "specifying permuted dimensions is not supported" ); +#endif + + using Getter = __ndarray_impl::SubarrayGetter< Base, Permutation, Dimensions... >; + using Subpermutation = typename Getter::Subpermutation; + auto& begin = operator()( std::forward< IndexTypes >( indices )... ); + auto subarray_sizes = Getter::filterSizes( getSizes(), std::forward< IndexTypes >( indices )... ); + auto strides = Getter::getStrides( getSizes(), std::forward< IndexTypes >( indices )... ); + static_assert( Subpermutation::size() == sizeof...(Dimensions), "Bug - wrong subpermutation length." ); + static_assert( decltype(subarray_sizes)::getDimension() == sizeof...(Dimensions), "Bug - wrong dimension of the new sizes." ); + static_assert( decltype(strides)::getDimension() == sizeof...(Dimensions), "Bug - wrong dimension of the strides." ); + using SubarrayView = NDArrayView< ValueType, Device, decltype(subarray_sizes), Subpermutation, Base, decltype(strides) >; + return SubarrayView{ &begin, subarray_sizes, strides }; } template< typename... IndexTypes > @@ -184,7 +201,7 @@ public: operator()( IndexTypes&&... indices ) { static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); - __ndarray_impl::assertIndicesInBounds( sizes, std::forward< IndexTypes >( indices )... ); + __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexTypes >( indices )... ); return array[ getStorageIndex( std::forward< IndexTypes >( indices )... ) ]; } @@ -194,7 +211,7 @@ public: operator()( IndexTypes&&... 
indices ) const { static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); - __ndarray_impl::assertIndicesInBounds( sizes, std::forward< IndexTypes >( indices )... ); + __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexTypes >( indices )... ); return array[ getStorageIndex( std::forward< IndexTypes >( indices )... ) ]; } @@ -204,7 +221,7 @@ public: operator[]( IndexType&& index ) { static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); - __ndarray_impl::assertIndicesInBounds( sizes, std::forward< IndexType >( index ) ); + __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexType >( index ) ); return array[ index ]; } @@ -213,54 +230,16 @@ public: operator[]( IndexType index ) const { static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); - __ndarray_impl::assertIndicesInBounds( sizes, std::forward< IndexType >( index ) ); + __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexType >( index ) ); return array[ index ]; } - __cuda_callable__ - ViewType getView() - { - return ViewType( *this ); - } - - __cuda_callable__ - ConstViewType getConstView() const - { - return ConstViewType( array, sizes ); - } - - template< std::size_t... Dimensions, typename... IndexTypes > - __cuda_callable__ - auto getSubarrayView( IndexTypes&&... 
indices ) - { - static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); - static_assert( 0 < sizeof...(Dimensions) && sizeof...(Dimensions) <= getDimension(), "got wrong number of dimensions" ); - static_assert( __ndarray_impl::all_elements_in_range( 0, Permutation::size(), {Dimensions...} ), - "invalid dimensions" ); -// FIXME: nvcc chokes on the variadic brace-initialization -#ifndef __NVCC__ - static_assert( __ndarray_impl::is_increasing_sequence( {Dimensions...} ), - "specifying permuted dimensions is not supported" ); -#endif - - using Getter = __ndarray_impl::SubarrayGetter< Base, Permutation, Dimensions... >; - using Subpermutation = typename Getter::Subpermutation; - auto& begin = operator()( std::forward< IndexTypes >( indices )... ); - auto subarray_sizes = Getter::filterSizes( sizes, std::forward< IndexTypes >( indices )... ); - auto strides = Getter::getStrides( sizes, std::forward< IndexTypes >( indices )... ); - static_assert( Subpermutation::size() == sizeof...(Dimensions), "Bug - wrong subpermutation length." ); - static_assert( decltype(subarray_sizes)::getDimension() == sizeof...(Dimensions), "Bug - wrong dimension of the new sizes." ); - static_assert( decltype(strides)::getDimension() == sizeof...(Dimensions), "Bug - wrong dimension of the strides." 
); - using SubarrayView = NDArrayView< ValueType, Device, decltype(subarray_sizes), Subpermutation, Base, decltype(strides) >; - return SubarrayView{ &begin, subarray_sizes, strides }; - } - template< typename Device2 = DeviceType, typename Func > void forAll( Func f ) const { __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >; - dispatch( Begins{}, sizes, f ); + dispatch( Begins{}, getSizes(), f ); } template< typename Device2 = DeviceType, typename Func > @@ -272,14 +251,14 @@ public: using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type; // subtract dynamic sizes Ends ends; - __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolder >::subtract( ends, sizes ); + __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolder >::subtract( ends, getSizes() ); dispatch( Begins{}, ends, f ); } template< typename Device2 = DeviceType, typename Func, typename Begins, typename Ends > void forInternal( Func f, const Begins& begins, const Ends& ends ) const { - // TODO: assert "begins <= sizes", "ends <= sizes" + // TODO: assert "begins <= getSizes()", "ends <= getSizes()" __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; dispatch( begins, ends, f ); } @@ -293,24 +272,24 @@ public: using SkipEnds = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type; // subtract dynamic sizes SkipEnds skipEnds; - __ndarray_impl::SetSizesSubtractHelper< 1, SkipEnds, SizesHolder >::subtract( skipEnds, sizes ); + __ndarray_impl::SetSizesSubtractHelper< 1, SkipEnds, SizesHolder >::subtract( skipEnds, getSizes() ); __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; - dispatch( Begins{}, SkipBegins{}, skipEnds, sizes, f ); + dispatch( Begins{}, SkipBegins{}, skipEnds, getSizes(), f ); } template< typename Device2 = DeviceType, typename Func, typename SkipBegins, typename SkipEnds > void forBoundary( 
Func f, const SkipBegins& skipBegins, const SkipEnds& skipEnds ) const { - // TODO: assert "skipBegins <= sizes", "skipEnds <= sizes" + // TODO: assert "skipBegins <= getSizes()", "skipEnds <= getSizes()" using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >; __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; - dispatch( Begins{}, skipBegins, skipEnds, sizes, f ); + dispatch( Begins{}, skipBegins, skipEnds, getSizes(), f ); } protected: Value* array = nullptr; - SizesHolder sizes; + IndexerType indexer; }; } // namespace Containers -- GitLab From 55ded6ad41214e278edae4a3d61c6e20f0e9d10b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 19 Apr 2019 12:05:21 +0200 Subject: [PATCH 22/25] Removed conditional per-device Permutation and SliceInfo setting from NDArray and SlicedNDArray --- .../NDArray/tnl-benchmark-ndarray-boundary.h | 11 ------ .../NDArray/tnl-benchmark-ndarray.h | 11 ------ src/TNL/Containers/NDArray.h | 37 +++++-------------- .../Containers/ndarray/SynchronizerBuffers.h | 1 - .../DistributedNDArrayOverlaps_1D_test.h | 4 -- .../DistributedNDArrayOverlaps_semi1D_test.h | 2 - .../ndarray/DistributedNDArray_1D_test.h | 4 -- .../ndarray/DistributedNDArray_semi1D_test.h | 2 - .../Containers/ndarray/NDArrayTest.h | 1 - .../ndarray/StaticNDArrayCudaTest.cu | 2 - 10 files changed, 9 insertions(+), 66 deletions(-) diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h index e47149d84..a30a25352 100644 --- a/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h +++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h @@ -85,7 +85,6 @@ void benchmark_1D( Benchmark& benchmark, index_type size = 500000000 ) NDArray< value_type, SizesHolder< index_type, 0 >, std::make_index_sequence< 1 >, - std::make_index_sequence< 1 >, Device > a, b; a.setSizes( size ); b.setSizes( size ); @@ -113,7 +112,6 @@ void benchmark_2D( 
Benchmark& benchmark, index_type size = 22333 ) NDArray< value_type, SizesHolder< index_type, 0, 0 >, std::make_index_sequence< 2 >, - std::make_index_sequence< 2 >, Device > a, b; a.setSizes( size, size ); b.setSizes( size, size ); @@ -141,7 +139,6 @@ void benchmark_3D( Benchmark& benchmark, index_type size = 800 ) NDArray< value_type, SizesHolder< index_type, 0, 0, 0 >, std::make_index_sequence< 3 >, - std::make_index_sequence< 3 >, Device > a, b; a.setSizes( size, size, size ); b.setSizes( size, size, size ); @@ -170,7 +167,6 @@ void benchmark_3D( Benchmark& benchmark, index_type size = 800 ) // NDArray< value_type, // SizesHolder< index_type, 0, 0, 0, 0 >, // std::make_index_sequence< 4 >, -// std::make_index_sequence< 4 >, // Device > a, b; // a.setSizes( size, size, size, size ); // b.setSizes( size, size, size, size ); @@ -198,7 +194,6 @@ void benchmark_3D( Benchmark& benchmark, index_type size = 800 ) // NDArray< value_type, // SizesHolder< index_type, 0, 0, 0, 0, 0 >, // std::make_index_sequence< 5 >, -// std::make_index_sequence< 5 >, // Device > a, b; // a.setSizes( size, size, size, size, size ); // b.setSizes( size, size, size, size, size ); @@ -226,7 +221,6 @@ void benchmark_3D( Benchmark& benchmark, index_type size = 800 ) // NDArray< value_type, // SizesHolder< index_type, 0, 0, 0, 0, 0, 0 >, // std::make_index_sequence< 6 >, -// std::make_index_sequence< 6 >, // Device > a, b; // a.setSizes( size, size, size, size, size, size ); // b.setSizes( size, size, size, size, size, size ); @@ -255,7 +249,6 @@ void benchmark_2D_perm( Benchmark& benchmark, index_type size = 22333 ) NDArray< value_type, SizesHolder< index_type, 0, 0 >, std::index_sequence< 1, 0 >, - std::index_sequence< 1, 0 >, Device > a, b; a.setSizes( size, size ); b.setSizes( size, size ); @@ -283,7 +276,6 @@ void benchmark_3D_perm( Benchmark& benchmark, index_type size = 800 ) NDArray< value_type, SizesHolder< index_type, 0, 0, 0 >, std::index_sequence< 2, 1, 0 >, - std::index_sequence< 
2, 1, 0 >, Device > a, b; a.setSizes( size, size, size ); b.setSizes( size, size, size ); @@ -312,7 +304,6 @@ void benchmark_3D_perm( Benchmark& benchmark, index_type size = 800 ) // NDArray< value_type, // SizesHolder< index_type, 0, 0, 0, 0 >, // std::index_sequence< 3, 2, 1, 0 >, -// std::index_sequence< 3, 2, 1, 0 >, // Device > a, b; // a.setSizes( size, size, size, size ); // b.setSizes( size, size, size, size ); @@ -340,7 +331,6 @@ void benchmark_3D_perm( Benchmark& benchmark, index_type size = 800 ) // NDArray< value_type, // SizesHolder< index_type, 0, 0, 0, 0, 0 >, // std::index_sequence< 4, 3, 2, 1, 0 >, -// std::index_sequence< 4, 3, 2, 1, 0 >, // Device > a, b; // a.setSizes( size, size, size, size, size ); // b.setSizes( size, size, size, size, size ); @@ -368,7 +358,6 @@ void benchmark_3D_perm( Benchmark& benchmark, index_type size = 800 ) // NDArray< value_type, // SizesHolder< index_type, 0, 0, 0, 0, 0, 0 >, // std::index_sequence< 5, 4, 3, 2, 1, 0 >, -// std::index_sequence< 5, 4, 3, 2, 1, 0 >, // Device > a, b; // a.setSizes( size, size, size, size, size, size ); // b.setSizes( size, size, size, size, size, size ); diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h b/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h index ab7c4fa8c..0de53ea88 100644 --- a/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h +++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h @@ -117,7 +117,6 @@ void benchmark_1D( Benchmark& benchmark, index_type size = 500000000 ) NDArray< value_type, SizesHolder< index_type, 0 >, std::make_index_sequence< 1 >, - std::make_index_sequence< 1 >, Device > a, b; a.setSizes( size ); b.setSizes( size ); @@ -141,7 +140,6 @@ void benchmark_2D( Benchmark& benchmark, index_type size = 22333 ) NDArray< value_type, SizesHolder< index_type, 0, 0 >, std::make_index_sequence< 2 >, - std::make_index_sequence< 2 >, Device > a, b; a.setSizes( size, size ); b.setSizes( size, size ); @@ -165,7 +163,6 @@ void benchmark_3D( Benchmark& benchmark, 
index_type size = 800 ) NDArray< value_type, SizesHolder< index_type, 0, 0, 0 >, std::make_index_sequence< 3 >, - std::make_index_sequence< 3 >, Device > a, b; a.setSizes( size, size, size ); b.setSizes( size, size, size ); @@ -189,7 +186,6 @@ void benchmark_4D( Benchmark& benchmark, index_type size = 150 ) NDArray< value_type, SizesHolder< index_type, 0, 0, 0, 0 >, std::make_index_sequence< 4 >, - std::make_index_sequence< 4 >, Device > a, b; a.setSizes( size, size, size, size ); b.setSizes( size, size, size, size ); @@ -213,7 +209,6 @@ void benchmark_5D( Benchmark& benchmark, index_type size = 56 ) NDArray< value_type, SizesHolder< index_type, 0, 0, 0, 0, 0 >, std::make_index_sequence< 5 >, - std::make_index_sequence< 5 >, Device > a, b; a.setSizes( size, size, size, size, size ); b.setSizes( size, size, size, size, size ); @@ -237,7 +232,6 @@ void benchmark_6D( Benchmark& benchmark, index_type size = 28 ) NDArray< value_type, SizesHolder< index_type, 0, 0, 0, 0, 0, 0 >, std::make_index_sequence< 6 >, - std::make_index_sequence< 6 >, Device > a, b; a.setSizes( size, size, size, size, size, size ); b.setSizes( size, size, size, size, size, size ); @@ -262,7 +256,6 @@ void benchmark_2D_perm( Benchmark& benchmark, index_type size = 22333 ) NDArray< value_type, SizesHolder< index_type, 0, 0 >, std::index_sequence< 1, 0 >, - std::index_sequence< 1, 0 >, Device > a, b; a.setSizes( size, size ); b.setSizes( size, size ); @@ -286,7 +279,6 @@ void benchmark_3D_perm( Benchmark& benchmark, index_type size = 800 ) NDArray< value_type, SizesHolder< index_type, 0, 0, 0 >, std::index_sequence< 2, 1, 0 >, - std::index_sequence< 2, 1, 0 >, Device > a, b; a.setSizes( size, size, size ); b.setSizes( size, size, size ); @@ -310,7 +302,6 @@ void benchmark_4D_perm( Benchmark& benchmark, index_type size = 150 ) NDArray< value_type, SizesHolder< index_type, 0, 0, 0, 0 >, std::index_sequence< 3, 2, 1, 0 >, - std::index_sequence< 3, 2, 1, 0 >, Device > a, b; a.setSizes( size, size, size, 
size ); b.setSizes( size, size, size, size ); @@ -334,7 +325,6 @@ void benchmark_5D_perm( Benchmark& benchmark, index_type size = 56 ) NDArray< value_type, SizesHolder< index_type, 0, 0, 0, 0, 0 >, std::index_sequence< 4, 3, 2, 1, 0 >, - std::index_sequence< 4, 3, 2, 1, 0 >, Device > a, b; a.setSizes( size, size, size, size, size ); b.setSizes( size, size, size, size, size ); @@ -358,7 +348,6 @@ void benchmark_6D_perm( Benchmark& benchmark, index_type size = 28 ) NDArray< value_type, SizesHolder< index_type, 0, 0, 0, 0, 0, 0 >, std::index_sequence< 5, 4, 3, 2, 1, 0 >, - std::index_sequence< 5, 4, 3, 2, 1, 0 >, Device > a, b; a.setSizes( size, size, size, size, size, size ); b.setSizes( size, size, size, size, size, size ); diff --git a/src/TNL/Containers/NDArray.h b/src/TNL/Containers/NDArray.h index 5c71a83cc..8472f4d71 100644 --- a/src/TNL/Containers/NDArray.h +++ b/src/TNL/Containers/NDArray.h @@ -324,23 +324,18 @@ protected: template< typename Value, typename SizesHolder, - typename PermutationHost = std::make_index_sequence< SizesHolder::getDimension() >, // identity by default - typename PermutationCuda = std::make_index_sequence< SizesHolder::getDimension() >, // identity by default + typename Permutation = std::make_index_sequence< SizesHolder::getDimension() >, // identity by default typename Device = Devices::Host, typename Index = typename SizesHolder::IndexType > class NDArray : public NDArrayStorage< Array< Value, Device, Index >, SizesHolder, - typename std::conditional< std::is_same< Device, Devices::Host >::value, - PermutationHost, - PermutationCuda >::type, + Permutation, __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > > { using Base = NDArrayStorage< Array< Value, Device, Index >, SizesHolder, - typename std::conditional< std::is_same< Device, Devices::Host >::value, - PermutationHost, - PermutationCuda >::type, + Permutation, __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > >; public: @@ -403,34 +398,20 @@ public: template< typename Value, 
typename SizesHolder, - typename PermutationHost = std::make_index_sequence< SizesHolder::getDimension() >, // identity by default - typename SliceInfoHost = SliceInfo<>, // no slicing by default - typename PermutationCuda = std::make_index_sequence< SizesHolder::getDimension() >, // identity by default - typename SliceInfoCuda = SliceInfo<>, // no slicing by default + typename Permutation = std::make_index_sequence< SizesHolder::getDimension() >, // identity by default + typename SliceInfo = SliceInfo<>, // no slicing by default typename Device = Devices::Host, typename Index = typename SizesHolder::IndexType > class SlicedNDArray : public NDArrayStorage< Array< Value, Device, Index >, SizesHolder, - typename std::conditional< std::is_same< Device, Devices::Host >::value, - PermutationHost, - PermutationCuda >::type, - __ndarray_impl::SlicedNDArrayBase< - typename std::conditional< std::is_same< Device, Devices::Host >::value, - SliceInfoHost, - SliceInfoCuda >::type > - > + Permutation, + __ndarray_impl::SlicedNDArrayBase< SliceInfo > > { using Base = NDArrayStorage< Array< Value, Device, Index >, SizesHolder, - typename std::conditional< std::is_same< Device, Devices::Host >::value, - PermutationHost, - PermutationCuda >::type, - __ndarray_impl::SlicedNDArrayBase< - typename std::conditional< std::is_same< Device, Devices::Host >::value, - SliceInfoHost, - SliceInfoCuda >::type > - >; + Permutation, + __ndarray_impl::SlicedNDArrayBase< SliceInfo > >; public: // inherit all assignment operators diff --git a/src/TNL/Containers/ndarray/SynchronizerBuffers.h b/src/TNL/Containers/ndarray/SynchronizerBuffers.h index 5b6441a70..d54fddfd7 100644 --- a/src/TNL/Containers/ndarray/SynchronizerBuffers.h +++ b/src/TNL/Containers/ndarray/SynchronizerBuffers.h @@ -29,7 +29,6 @@ struct SynchronizerBuffersLayer using NDArrayType = NDArray< typename DistributedNDArray::ValueType, typename DistributedNDArray::SizesHolderType, typename DistributedNDArray::PermutationType, - 
typename DistributedNDArray::PermutationType, typename DistributedNDArray::DeviceType >; NDArrayType left_send_buffer, left_recv_buffer, right_send_buffer, right_recv_buffer; typename DistributedNDArray::LocalBeginsType left_send_offsets, left_recv_offsets, right_send_offsets, right_recv_offsets; diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h index eea9d84f9..a7609ee74 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h @@ -71,7 +71,6 @@ using DistributedNDArrayTypes = ::testing::Types< DistributedNDArray< NDArray< double, SizesHolder< int, 0 >, std::index_sequence< 0 >, - std::index_sequence< 0 >, Devices::Host >, Communicators::MpiCommunicator, std::index_sequence< 2 > > @@ -79,7 +78,6 @@ using DistributedNDArrayTypes = ::testing::Types< // DistributedNDArray< NDArray< double, // SizesHolder< int, 0 >, // std::index_sequence< 0 >, -// std::index_sequence< 0 >, // Devices::Host >, // Communicators::NoDistrCommunicator, // std::index_sequence< 2 > > @@ -88,7 +86,6 @@ using DistributedNDArrayTypes = ::testing::Types< DistributedNDArray< NDArray< double, SizesHolder< int, 0 >, std::index_sequence< 0 >, - std::index_sequence< 0 >, Devices::Cuda >, Communicators::MpiCommunicator, std::index_sequence< 2 > > @@ -96,7 +93,6 @@ using DistributedNDArrayTypes = ::testing::Types< // DistributedNDArray< NDArray< double, // SizesHolder< int, 0 >, // std::index_sequence< 0 >, -// std::index_sequence< 0 >, // Devices::Cuda >, // Communicators::NoDistrCommunicator, // std::index_sequence< 2 > > diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h index a019b3139..a7f28ead5 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h +++ 
b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h @@ -71,7 +71,6 @@ using DistributedNDArrayTypes = ::testing::Types< DistributedNDArray< NDArray< double, SizesHolder< int, 9, 0, 0 >, // Q, X, Y std::index_sequence< 0, 1, 2 >, // permutation - should not matter - std::index_sequence< 0, 1, 2 >, // permutation - should not matter Devices::Host >, Communicators::MpiCommunicator, std::index_sequence< 0, 2, 0 > > @@ -80,7 +79,6 @@ using DistributedNDArrayTypes = ::testing::Types< DistributedNDArray< NDArray< double, SizesHolder< int, 9, 0, 0 >, // Q, X, Y std::index_sequence< 0, 1, 2 >, // permutation - should not matter - std::index_sequence< 0, 1, 2 >, // permutation - should not matter Devices::Cuda >, Communicators::MpiCommunicator, std::index_sequence< 0, 2, 0 > > diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h index dec9cb821..3dda2d1b4 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h @@ -69,13 +69,11 @@ using DistributedNDArrayTypes = ::testing::Types< DistributedNDArray< NDArray< double, SizesHolder< int, 0 >, std::index_sequence< 0 >, - std::index_sequence< 0 >, Devices::Host >, Communicators::MpiCommunicator >, DistributedNDArray< NDArray< double, SizesHolder< int, 0 >, std::index_sequence< 0 >, - std::index_sequence< 0 >, Devices::Host >, Communicators::NoDistrCommunicator > #ifdef HAVE_CUDA @@ -83,13 +81,11 @@ using DistributedNDArrayTypes = ::testing::Types< DistributedNDArray< NDArray< double, SizesHolder< int, 0 >, std::index_sequence< 0 >, - std::index_sequence< 0 >, Devices::Cuda >, Communicators::MpiCommunicator >, DistributedNDArray< NDArray< double, SizesHolder< int, 0 >, std::index_sequence< 0 >, - std::index_sequence< 0 >, Devices::Cuda >, Communicators::NoDistrCommunicator > #endif diff --git 
a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h index 500dbea3d..33390a33c 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h @@ -69,7 +69,6 @@ using DistributedNDArrayTypes = ::testing::Types< DistributedNDArray< NDArray< double, SizesHolder< int, 9, 0, 0 >, // Q, X, Y, Z std::index_sequence< 0, 1, 2 >, // permutation - should not matter - std::index_sequence< 0, 1, 2 >, // permutation - should not matter Devices::Host >, Communicators::MpiCommunicator > #ifdef HAVE_CUDA @@ -77,7 +76,6 @@ using DistributedNDArrayTypes = ::testing::Types< DistributedNDArray< NDArray< double, SizesHolder< int, 9, 0, 0 >, // Q, X, Y, Z std::index_sequence< 0, 1, 2 >, // permutation - should not matter - std::index_sequence< 0, 1, 2 >, // permutation - should not matter Devices::Cuda >, Communicators::NoDistrCommunicator > #endif diff --git a/src/UnitTests/Containers/ndarray/NDArrayTest.h b/src/UnitTests/Containers/ndarray/NDArrayTest.h index cfea0015c..1e5d9a30c 100644 --- a/src/UnitTests/Containers/ndarray/NDArrayTest.h +++ b/src/UnitTests/Containers/ndarray/NDArrayTest.h @@ -225,7 +225,6 @@ TEST( NDArrayTest, CopySemanticsCrossDevice ) constexpr int I = 3, J = 4; NDArray< int, SizesHolder< int, 0, 0 > > a; NDArray< int, SizesHolder< int, 0, 0 >, - std::index_sequence< 0, 1 >, std::index_sequence< 0, 1 >, TNL::Devices::Cuda > da; a.setSizes( I, J ); diff --git a/src/UnitTests/Containers/ndarray/StaticNDArrayCudaTest.cu b/src/UnitTests/Containers/ndarray/StaticNDArrayCudaTest.cu index 71ff572be..0a0a83dd8 100644 --- a/src/UnitTests/Containers/ndarray/StaticNDArrayCudaTest.cu +++ b/src/UnitTests/Containers/ndarray/StaticNDArrayCudaTest.cu @@ -25,7 +25,6 @@ void __test_SetThroughView() NDArray< int, SizesHolder< int, I, J >, std::make_index_sequence< 2 >, - std::make_index_sequence< 2 >, 
TNL::Devices::Cuda > a; a.setSizes( 0, 0 ); ViewType a_view( a.getStorageArray().getData(), SizesHolder< int, I, J >{} ); @@ -54,7 +53,6 @@ void __test_CopyFromArray() NDArray< int, SizesHolder< int, I, J >, std::make_index_sequence< 2 >, - std::make_index_sequence< 2 >, TNL::Devices::Cuda > a; a.setSizes( 0, 0 ); ViewType a_view( a.getStorageArray().getData(), SizesHolder< int, I, J >{} ); -- GitLab From daebbce149fb6afc66cf6d4beec8108169a0b8d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 14 Aug 2019 11:34:43 +0200 Subject: [PATCH 23/25] Fixed StaticFor for loops with a large iterations count --- src/TNL/StaticFor.h | 48 +++++++++++++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/src/TNL/StaticFor.h b/src/TNL/StaticFor.h index 1539e05aa..0fa3bc0ef 100644 --- a/src/TNL/StaticFor.h +++ b/src/TNL/StaticFor.h @@ -14,25 +14,47 @@ namespace TNL { +// Manual unrolling does not make sense for loops with a large iterations +// count. For a very large iterations count it would trigger the compiler's +// limit on recursive template instantiation. Also note that the compiler +// will (at least partially) unroll loops with static bounds anyway. +template< int Begin, int End, bool unrolled = (End - Begin <= 8) > +struct StaticFor; + template< int Begin, int End > -struct StaticFor +struct StaticFor< Begin, End, true > { - template< typename Function, typename... Args > - __cuda_callable__ - static void exec( const Function& f, Args... args ) - { - static_assert( Begin < End, "Wrong index interval for StaticFor. Being must be lower than end." ); - f( Begin, args... ); - StaticFor< Begin + 1, End >::exec( f, args... ); - }; + static_assert( Begin < End, "Wrong index interval for StaticFor. Begin must be less than end." ); + + template< typename Function, typename... Args > + __cuda_callable__ + static void exec( const Function& f, Args... args ) + { + f( Begin, args... 
); + StaticFor< Begin + 1, End >::exec( f, args... ); + } }; template< int End > -struct StaticFor< End, End > +struct StaticFor< End, End, true > { - template< typename Function, typename... Args > - __cuda_callable__ - static void exec( const Function& f, Args... args ){}; + template< typename Function, typename... Args > + __cuda_callable__ + static void exec( const Function& f, Args... args ) {} +}; + +template< int Begin, int End > +struct StaticFor< Begin, End, false > +{ + static_assert( Begin <= End, "Wrong index interval for StaticFor. Begin must be less than or equal to end." ); + + template< typename Function, typename... Args > + __cuda_callable__ + static void exec( const Function& f, Args... args ) + { + for( int i = Begin; i < End; i++ ) + f( i, args... ); + } }; } //namespace TNL -- GitLab From e4e780e1e4e6ef619501bee5ad3a383348c8b2c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 14 Aug 2019 13:39:13 +0200 Subject: [PATCH 24/25] Added overlaps to NDArrayIndexer This is necessary for local indexing of DistributedNDArray. 
--- src/TNL/Containers/NDArrayIndexer.h | 28 ++++++++++--- src/TNL/Containers/ndarray/Indexing.h | 39 ++++++++++++------- .../Containers/ndarray/SizesHolderHelpers.h | 23 +++++++---- 3 files changed, 62 insertions(+), 28 deletions(-) diff --git a/src/TNL/Containers/NDArrayIndexer.h b/src/TNL/Containers/NDArrayIndexer.h index 08389e9bd..e3f068e0c 100644 --- a/src/TNL/Containers/NDArrayIndexer.h +++ b/src/TNL/Containers/NDArrayIndexer.h @@ -22,14 +22,25 @@ namespace Containers { template< typename SizesHolder, typename Permutation, typename Base, - typename StridesHolder = __ndarray_impl::DummyStrideBase< typename SizesHolder::IndexType, SizesHolder::getDimension() > > + typename StridesHolder = __ndarray_impl::DummyStrideBase< typename SizesHolder::IndexType, SizesHolder::getDimension() >, + typename Overlaps = __ndarray_impl::make_constant_index_sequence< SizesHolder::getDimension(), 0 > > class NDArrayIndexer : public StridesHolder { public: using IndexType = typename SizesHolder::IndexType; + using NDBaseType = Base; using SizesHolderType = SizesHolder; + using StridesHolderType = StridesHolder; using PermutationType = Permutation; + using OverlapsType = Overlaps; + + static_assert( StridesHolder::getDimension() == SizesHolder::getDimension(), + "Dimension of strides does not match the dimension of sizes." ); + static_assert( Permutation::size() == SizesHolder::getDimension(), + "Dimension of permutation does not match the dimension of sizes." ); + static_assert( Overlaps::size() == SizesHolder::getDimension(), + "Dimension of overlaps does not match the dimension of sizes." 
); __cuda_callable__ NDArrayIndexer() = default; @@ -60,12 +71,18 @@ public: // method template from base class using StridesHolder::getStride; + template< std::size_t level > + static constexpr std::size_t getOverlap() + { + return __ndarray_impl::get< level >( Overlaps{} ); + } + // returns the product of the aligned sizes __cuda_callable__ IndexType getStorageSize() const { using Alignment = typename Base::template Alignment< Permutation >; - return __ndarray_impl::StorageSizeGetter< SizesHolder, Alignment >::get( sizes ); + return __ndarray_impl::StorageSizeGetter< SizesHolder, Alignment, Overlaps >::get( sizes ); } template< typename... IndexTypes > @@ -74,9 +91,10 @@ public: getStorageIndex( IndexTypes&&... indices ) const { static_assert( sizeof...( indices ) == SizesHolder::getDimension(), "got wrong number of indices" ); - return Base::template getStorageIndex< Permutation >( sizes, - static_cast< const StridesHolder& >( *this ), - std::forward< IndexTypes >( indices )... ); + return Base::template getStorageIndex< Permutation, Overlaps > + ( sizes, + static_cast< const StridesHolder& >( *this ), + std::forward< IndexTypes >( indices )... 
); } protected: diff --git a/src/TNL/Containers/ndarray/Indexing.h b/src/TNL/Containers/ndarray/Indexing.h index 04316ffe0..a1b83ae51 100644 --- a/src/TNL/Containers/ndarray/Indexing.h +++ b/src/TNL/Containers/ndarray/Indexing.h @@ -113,6 +113,7 @@ auto host_call_with_unshifted_indices( const SizesHolder& begins, Func&& f, Indi template< typename Permutation, + typename Overlaps, typename Alignment, typename SliceInfo, std::size_t level = Permutation::size() - 1, @@ -121,10 +122,11 @@ struct SlicedIndexer {}; template< typename Permutation, + typename Overlaps, typename Alignment, typename SliceInfo, std::size_t level > -struct SlicedIndexer< Permutation, Alignment, SliceInfo, level, false > +struct SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo, level, false > { template< typename SizesHolder, typename StridesHolder, typename... Indices > __cuda_callable__ @@ -134,17 +136,19 @@ struct SlicedIndexer< Permutation, Alignment, SliceInfo, level, false > Indices&&... indices ) { static constexpr std::size_t idx = get< level >( Permutation{} ); + static constexpr std::size_t overlap = __ndarray_impl::get< idx >( Overlaps{} ); const auto alpha = get_from_pack< idx >( std::forward< Indices >( indices )... ); - const auto previous = SlicedIndexer< Permutation, Alignment, SliceInfo, level - 1 >::getIndex( sizes, strides, std::forward< Indices >( indices )... ); - return strides.template getStride< idx >( alpha ) * ( alpha + Alignment::template getAlignedSize< idx >( sizes ) * previous ); + const auto previous = SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo, level - 1 >::getIndex( sizes, strides, std::forward< Indices >( indices )... 
); + return strides.template getStride< idx >( alpha ) * ( alpha + overlap + Alignment::template getAlignedSize< idx >( sizes ) * previous ); } }; template< typename Permutation, + typename Overlaps, typename Alignment, typename SliceInfo, std::size_t level > -struct SlicedIndexer< Permutation, Alignment, SliceInfo, level, true > +struct SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo, level, true > { template< typename SizesHolder, typename StridesHolder, typename... Indices > __cuda_callable__ @@ -157,20 +161,22 @@ struct SlicedIndexer< Permutation, Alignment, SliceInfo, level, true > "Invalid SliceInfo: static dimension cannot be sliced." ); static constexpr std::size_t idx = get< level >( Permutation{} ); + static constexpr std::size_t overlap = __ndarray_impl::get< idx >( Overlaps{} ); const auto alpha = get_from_pack< idx >( std::forward< Indices >( indices )... ); static constexpr std::size_t S = SliceInfo::getSliceSize( idx ); // TODO: check the calculation with strides return strides.template getStride< idx >( alpha ) * - ( S * (alpha / S) * StorageSizeGetter< SizesHolder, Alignment, IndexTag< level - 1 > >::getPermuted( sizes, Permutation{} ) + - alpha % S ) + - S * SlicedIndexer< Permutation, Alignment, SliceInfo, level - 1 >::getIndex( sizes, strides, std::forward< Indices >( indices )... ); + ( S * ((alpha + overlap) / S) * StorageSizeGetter< SizesHolder, Alignment, Overlaps, IndexTag< level - 1 > >::getPermuted( sizes, Permutation{} ) + + (alpha + overlap) % S ) + + S * SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo, level - 1 >::getIndex( sizes, strides, std::forward< Indices >( indices )... ); } }; template< typename Permutation, + typename Overlaps, typename Alignment, typename SliceInfo > -struct SlicedIndexer< Permutation, Alignment, SliceInfo, 0, false > +struct SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo, 0, false > { template< typename SizesHolder, typename StridesHolder, typename... 
Indices > __cuda_callable__ @@ -180,15 +186,17 @@ struct SlicedIndexer< Permutation, Alignment, SliceInfo, 0, false > Indices&&... indices ) { static constexpr std::size_t idx = get< 0 >( Permutation{} ); + static constexpr std::size_t overlap = __ndarray_impl::get< idx >( Overlaps{} ); const auto alpha = get_from_pack< idx >( std::forward< Indices >( indices )... ); - return strides.template getStride< idx >( alpha ) * alpha; + return strides.template getStride< idx >( alpha ) * (alpha + overlap); } }; template< typename Permutation, + typename Overlaps, typename Alignment, typename SliceInfo > -struct SlicedIndexer< Permutation, Alignment, SliceInfo, 0, true > +struct SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo, 0, true > { template< typename SizesHolder, typename StridesHolder, typename... Indices > __cuda_callable__ @@ -198,8 +206,9 @@ struct SlicedIndexer< Permutation, Alignment, SliceInfo, 0, true > Indices&&... indices ) { static constexpr std::size_t idx = get< 0 >( Permutation{} ); + static constexpr std::size_t overlap = __ndarray_impl::get< idx >( Overlaps{} ); const auto alpha = get_from_pack< idx >( std::forward< Indices >( indices )... ); - return strides.template getStride< idx >( alpha ) * alpha; + return strides.template getStride< idx >( alpha ) * (alpha + overlap); } }; @@ -227,14 +236,14 @@ struct NDArrayBase } }; - template< typename Permutation, typename SizesHolder, typename StridesHolder, typename... Indices > + template< typename Permutation, typename Overlaps, typename SizesHolder, typename StridesHolder, typename... Indices > __cuda_callable__ typename SizesHolder::IndexType static getStorageIndex( const SizesHolder& sizes, const StridesHolder& strides, Indices&&... 
indices ) { static_assert( check_slice_size( SizesHolder::getDimension(), 0 ), "BUG - invalid SliceInfo type passed to NDArrayBase" ); using Alignment = Alignment< Permutation >; - return SlicedIndexer< Permutation, Alignment, SliceInfo >::getIndex( sizes, strides, std::forward< Indices >( indices )... ); + return SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo >::getIndex( sizes, strides, std::forward< Indices >( indices )... ); } private: @@ -271,13 +280,13 @@ struct SlicedNDArrayBase } }; - template< typename Permutation, typename SizesHolder, typename StridesHolder, typename... Indices > + template< typename Permutation, typename Overlaps, typename SizesHolder, typename StridesHolder, typename... Indices > __cuda_callable__ static typename SizesHolder::IndexType getStorageIndex( const SizesHolder& sizes, const StridesHolder& strides, Indices&&... indices ) { using Alignment = Alignment< Permutation >; - return SlicedIndexer< Permutation, Alignment, SliceInfo >::getIndex( sizes, strides, std::forward< Indices >( indices )... ); + return SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo >::getIndex( sizes, strides, std::forward< Indices >( indices )... 
); } }; diff --git a/src/TNL/Containers/ndarray/SizesHolderHelpers.h b/src/TNL/Containers/ndarray/SizesHolderHelpers.h index a18f7c6eb..2e92ed43d 100644 --- a/src/TNL/Containers/ndarray/SizesHolderHelpers.h +++ b/src/TNL/Containers/ndarray/SizesHolderHelpers.h @@ -25,6 +25,7 @@ namespace __ndarray_impl { // Dynamic storage size with alignment template< typename SizesHolder, typename Alignment, + typename Overlaps, typename LevelTag = IndexTag< SizesHolder::getDimension() - 1 > > struct StorageSizeGetter { @@ -32,8 +33,10 @@ struct StorageSizeGetter __cuda_callable__ get( const SizesHolder& sizes ) { + static constexpr std::size_t overlap = __ndarray_impl::get< LevelTag::value >( Overlaps{} ); const auto size = Alignment::template getAlignedSize< LevelTag::value >( sizes ); - return size * StorageSizeGetter< SizesHolder, Alignment, IndexTag< LevelTag::value - 1 > >::get( sizes ); + return ( size + 2 * overlap ) + * StorageSizeGetter< SizesHolder, Alignment, Overlaps, IndexTag< LevelTag::value - 1 > >::get( sizes ); } template< typename Permutation > @@ -41,20 +44,23 @@ struct StorageSizeGetter static typename SizesHolder::IndexType getPermuted( const SizesHolder& sizes, Permutation ) { - constexpr std::size_t idx = __ndarray_impl::get< LevelTag::value >( Permutation{} ); + static constexpr std::size_t idx = __ndarray_impl::get< LevelTag::value >( Permutation{} ); + static constexpr std::size_t overlap = __ndarray_impl::get< idx >( Overlaps{} ); const auto size = Alignment::template getAlignedSize< idx >( sizes ); - return size * StorageSizeGetter< SizesHolder, Alignment, IndexTag< LevelTag::value - 1 > >::get( sizes ); + return ( size + 2 * overlap ) + * StorageSizeGetter< SizesHolder, Alignment, Overlaps, IndexTag< LevelTag::value - 1 > >::get( sizes ); } }; -template< typename SizesHolder, typename Alignment > -struct StorageSizeGetter< SizesHolder, Alignment, IndexTag< 0 > > +template< typename SizesHolder, typename Alignment, typename Overlaps > +struct 
StorageSizeGetter< SizesHolder, Alignment, Overlaps, IndexTag< 0 > > { static typename SizesHolder::IndexType __cuda_callable__ get( const SizesHolder& sizes ) { - return Alignment::template getAlignedSize< 0 >( sizes ); + static constexpr std::size_t overlap = __ndarray_impl::get< 0 >( Overlaps{} ); + return Alignment::template getAlignedSize< 0 >( sizes ) + 2 * overlap; } template< typename Permutation > @@ -62,8 +68,9 @@ struct StorageSizeGetter< SizesHolder, Alignment, IndexTag< 0 > > static typename SizesHolder::IndexType getPermuted( const SizesHolder& sizes, Permutation ) { - constexpr std::size_t idx = __ndarray_impl::get< 0 >( Permutation{} ); - return Alignment::template getAlignedSize< idx >( sizes ); + static constexpr std::size_t idx = __ndarray_impl::get< 0 >( Permutation{} ); + static constexpr std::size_t overlap = __ndarray_impl::get< idx >( Overlaps{} ); + return Alignment::template getAlignedSize< idx >( sizes ) + 2 * overlap; } }; -- GitLab From 056343233cbc10fa37ea7dc3f603231a793b6cdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 14 Aug 2019 13:43:52 +0200 Subject: [PATCH 25/25] DistributedNDArray: added methods getLocalIndexer, getLocalView, getConstLocalView, getStorageIndex and getData --- src/TNL/Containers/DistributedNDArray.h | 58 ++++++++++++++++ src/TNL/Containers/DistributedNDArrayView.h | 75 ++++++++++++++++++++- src/TNL/Containers/NDArrayView.h | 17 ++++- src/TNL/Containers/ndarray/SizesHolder.h | 44 ++++++++++++ src/TNL/Containers/ndarray/Subarrays.h | 5 ++ 5 files changed, 197 insertions(+), 2 deletions(-) diff --git a/src/TNL/Containers/DistributedNDArray.h b/src/TNL/Containers/DistributedNDArray.h index 339316af9..4b123d114 100644 --- a/src/TNL/Containers/DistributedNDArray.h +++ b/src/TNL/Containers/DistributedNDArray.h @@ -36,9 +36,12 @@ public: using LocalBeginsType = __ndarray_impl::LocalBeginsHolder< typename NDArray::SizesHolderType >; using LocalRangeType = Subrange< IndexType >; using 
OverlapsType = Overlaps; + using LocalIndexerType = NDArrayIndexer< SizesHolderType, PermutationType, typename NDArray::NDBaseType, typename NDArray::StridesHolderType, Overlaps >; using ViewType = DistributedNDArrayView< typename NDArray::ViewType, Communicator, Overlaps >; using ConstViewType = DistributedNDArrayView< typename NDArray::ConstViewType, Communicator, Overlaps >; + using LocalViewType = typename NDArray::ViewType; + using ConstLocalViewType = typename NDArray::ConstViewType; static_assert( Overlaps::size() == NDArray::getDimension(), "invalid overlaps" ); @@ -59,6 +62,18 @@ public: DistributedNDArray( DistributedNDArray&& ) = default; DistributedNDArray& operator=( DistributedNDArray&& ) = default; + // Templated copy-assignment + template< typename OtherArray > + DistributedNDArray& operator=( const OtherArray& other ) + { + globalSizes = other.getSizes(); + localBegins = other.getLocalBegins(); + localEnds = other.getLocalEnds(); + group = other.getCommunicationGroup(); + localArray = other.getConstLocalView(); + return *this; + } + static constexpr std::size_t getDimension() { return NDArray::getDimension(); @@ -111,6 +126,49 @@ public: return localArray.getStorageSize(); } + LocalIndexerType getLocalIndexer() const + { + return LocalIndexerType( localEnds - localBegins, typename NDArray::StridesHolderType{} ); + } + + LocalViewType getLocalView() + { + return localArray.getView(); + } + + ConstLocalViewType getConstLocalView() const + { + return localArray.getConstView(); + } + + // returns the *local* storage index for given *global* indices + template< typename... IndexTypes > + __cuda_callable__ + IndexType + getStorageIndex( IndexTypes&&... indices ) const + { + static_assert( sizeof...( indices ) == SizesHolderType::getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); + auto getStorageIndex = [this]( auto&&... 
indices ) + { + return this->localArray.getStorageIndex( std::forward< decltype(indices) >( indices )... ); + }; + return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, getStorageIndex, std::forward< IndexTypes >( indices )... ); + } + + __cuda_callable__ + ValueType* getData() + { + return localArray.getData(); + } + + __cuda_callable__ + std::add_const_t< ValueType >* getData() const + { + return localArray.getData(); + } + + template< typename... IndexTypes > __cuda_callable__ ValueType& diff --git a/src/TNL/Containers/DistributedNDArrayView.h b/src/TNL/Containers/DistributedNDArrayView.h index f143f5d75..102985e9c 100644 --- a/src/TNL/Containers/DistributedNDArrayView.h +++ b/src/TNL/Containers/DistributedNDArrayView.h @@ -35,9 +35,12 @@ public: using LocalBeginsType = __ndarray_impl::LocalBeginsHolder< typename NDArrayView::SizesHolderType >; using LocalRangeType = Subrange< IndexType >; using OverlapsType = Overlaps; + using LocalIndexerType = NDArrayIndexer< SizesHolderType, PermutationType, typename NDArrayView::NDBaseType, typename NDArrayView::StridesHolderType, Overlaps >; using ViewType = DistributedNDArrayView< NDArrayView, Communicator, Overlaps >; using ConstViewType = DistributedNDArrayView< typename NDArrayView::ConstViewType, Communicator, Overlaps >; + using LocalViewType = NDArrayView; + using ConstLocalViewType = typename NDArrayView::ConstViewType; static_assert( Overlaps::size() == NDArrayView::getDimension(), "invalid overlaps" ); @@ -67,7 +70,19 @@ public: // There is no move-assignment operator, so expressions like `a = b.getView()` // are resolved as copy-assignment. 
- // method for rebinding (reinitialization) + // Templated copy-assignment + template< typename OtherArray > + DistributedNDArrayView& operator=( const OtherArray& other ) + { + globalSizes = other.getSizes(); + localBegins = other.getLocalBegins(); + localEnds = other.getLocalEnds(); + group = other.getCommunicationGroup(); + localView = other.getConstLocalView(); + return *this; + } + + // methods for rebinding (reinitialization) __cuda_callable__ void bind( DistributedNDArrayView view ) { @@ -78,6 +93,21 @@ public: localEnds = view.localEnds; } + // binds to the given raw pointer and changes the indexer + __cuda_callable__ + void bind( ValueType* data, LocalIndexerType indexer ) + { + localView.bind( data, indexer ); + localView.bind( data ); + } + + // binds to the given raw pointer and preserves the current indexer + __cuda_callable__ + void bind( ValueType* data ) + { + localView.bind( data ); + } + __cuda_callable__ void reset() { @@ -140,6 +170,49 @@ public: return localView.getStorageSize(); } + LocalIndexerType getLocalIndexer() const + { + return LocalIndexerType( localEnds - localBegins, typename NDArrayView::StridesHolderType{} ); + } + + LocalViewType getLocalView() + { + return localView; + } + + ConstLocalViewType getConstLocalView() const + { + return localView.getConstView(); + } + + // returns the *local* storage index for given *global* indices + template< typename... IndexTypes > + __cuda_callable__ + IndexType + getStorageIndex( IndexTypes&&... indices ) const + { + static_assert( sizeof...( indices ) == SizesHolderType::getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); + auto getStorageIndex = [this]( auto&&... indices ) + { + return this->localView.getStorageIndex( std::forward< decltype(indices) >( indices )... 
); + }; + return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, getStorageIndex, std::forward< IndexTypes >( indices )... ); + } + + __cuda_callable__ + ValueType* getData() + { + return localView.getData(); + } + + __cuda_callable__ + std::add_const_t< ValueType >* getData() const + { + return localView.getData(); + } + + template< typename... IndexTypes > __cuda_callable__ ValueType& diff --git a/src/TNL/Containers/NDArrayView.h b/src/TNL/Containers/NDArrayView.h index 5eb01e198..54a020a64 100644 --- a/src/TNL/Containers/NDArrayView.h +++ b/src/TNL/Containers/NDArrayView.h @@ -98,7 +98,7 @@ public: // There is no move-assignment operator, so expressions like `a = b.getView()` // are resolved as copy-assignment. - // method for rebinding (reinitialization) + // methods for rebinding (reinitialization) __cuda_callable__ void bind( NDArrayView view ) { @@ -106,6 +106,21 @@ public: array = view.array; } + // binds to the given raw pointer and changes the indexer + __cuda_callable__ + void bind( Value* data, IndexerType indexer ) + { + IndexerType::operator=( indexer ); + array = data; + } + + // binds to the given raw pointer and preserves the current indexer + __cuda_callable__ + void bind( Value* data ) + { + array = data; + } + __cuda_callable__ void reset() { diff --git a/src/TNL/Containers/ndarray/SizesHolder.h b/src/TNL/Containers/ndarray/SizesHolder.h index 7763b141f..0b6e1f83d 100644 --- a/src/TNL/Containers/ndarray/SizesHolder.h +++ b/src/TNL/Containers/ndarray/SizesHolder.h @@ -144,6 +144,28 @@ struct SizesHolderSizePrinter } }; +template< std::size_t level > +struct SizesHolerOperatorPlusHelper +{ + template< typename Result, typename LHS, typename RHS > + static void exec( Result& result, const LHS& lhs, const RHS& rhs ) + { + if( result.template getStaticSize< level >() == 0 ) + result.template setSize< level >( lhs.template getSize< level >() + rhs.template getSize< level >() ); + } +}; + +template< std::size_t 
level > +struct SizesHolerOperatorMinusHelper +{ + template< typename Result, typename LHS, typename RHS > + static void exec( Result& result, const LHS& lhs, const RHS& rhs ) + { + if( result.template getStaticSize< level >() == 0 ) + result.template setSize< level >( lhs.template getSize< level >() - rhs.template getSize< level >() ); + } +}; + } // namespace __ndarray_impl @@ -202,6 +224,28 @@ public: } }; +template< typename Index, + std::size_t... sizes, + typename OtherHolder > +SizesHolder< Index, sizes... > +operator+( const SizesHolder< Index, sizes... >& lhs, const OtherHolder& rhs ) +{ + SizesHolder< Index, sizes... > result; + TemplateStaticFor< std::size_t, 0, sizeof...(sizes), __ndarray_impl::SizesHolerOperatorPlusHelper >::execHost( result, lhs, rhs ); + return result; +} + +template< typename Index, + std::size_t... sizes, + typename OtherHolder > +SizesHolder< Index, sizes... > +operator-( const SizesHolder< Index, sizes... >& lhs, const OtherHolder& rhs ) +{ + SizesHolder< Index, sizes... > result; + TemplateStaticFor< std::size_t, 0, sizeof...(sizes), __ndarray_impl::SizesHolerOperatorMinusHelper >::execHost( result, lhs, rhs ); + return result; +} + template< typename Index, std::size_t dimension, diff --git a/src/TNL/Containers/ndarray/Subarrays.h b/src/TNL/Containers/ndarray/Subarrays.h index 5668ef594..d50a30ea1 100644 --- a/src/TNL/Containers/ndarray/Subarrays.h +++ b/src/TNL/Containers/ndarray/Subarrays.h @@ -156,6 +156,11 @@ public: template< typename Index, std::size_t Dimension > struct DummyStrideBase { + static constexpr std::size_t getDimension() + { + return Dimension; + } + static constexpr bool isContiguous() { return true; -- GitLab