diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt index b4639d60ade55aba454b79c27c01e04b151980b1..6f3185329c8dab36b4c07bdb949b33a45595607a 100644 --- a/src/Benchmarks/CMakeLists.txt +++ b/src/Benchmarks/CMakeLists.txt @@ -1,5 +1,6 @@ add_subdirectory( HeatEquation ) add_subdirectory( BLAS ) +add_subdirectory( NDArray ) add_subdirectory( SpMV ) add_subdirectory( DistSpMV ) add_subdirectory( LinearSolvers ) diff --git a/src/Benchmarks/NDArray/CMakeLists.txt b/src/Benchmarks/NDArray/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6291c5f32c5633a70afb646a2ebcfe0e9d70303 --- /dev/null +++ b/src/Benchmarks/NDArray/CMakeLists.txt @@ -0,0 +1,17 @@ +add_executable( tnl-benchmark-ndarray tnl-benchmark-ndarray.cpp ) +target_compile_options( tnl-benchmark-ndarray PRIVATE ${CXX_TESTS_FLAGS} ) +install( TARGETS tnl-benchmark-ndarray RUNTIME DESTINATION bin ) + +add_executable( tnl-benchmark-ndarray-boundary tnl-benchmark-ndarray-boundary.cpp ) +target_compile_options( tnl-benchmark-ndarray-boundary PRIVATE ${CXX_TESTS_FLAGS} ) +install( TARGETS tnl-benchmark-ndarray-boundary RUNTIME DESTINATION bin ) + +if( BUILD_CUDA ) + cuda_add_executable( tnl-benchmark-ndarray-cuda tnl-benchmark-ndarray-cuda.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + install( TARGETS tnl-benchmark-ndarray-cuda RUNTIME DESTINATION bin ) + + cuda_add_executable( tnl-benchmark-ndarray-boundary-cuda tnl-benchmark-ndarray-boundary-cuda.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + install( TARGETS tnl-benchmark-ndarray-boundary-cuda RUNTIME DESTINATION bin ) +endif() diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary-cuda.cu b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary-cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..b5a2622a4b335a054844dda8c46d76344e370fec --- /dev/null +++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary-cuda.cu @@ -0,0 +1 @@ +#include "tnl-benchmark-ndarray-boundary.h" diff --git 
a/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.cpp b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b5a2622a4b335a054844dda8c46d76344e370fec --- /dev/null +++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.cpp @@ -0,0 +1 @@ +#include "tnl-benchmark-ndarray-boundary.h" diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h new file mode 100644 index 0000000000000000000000000000000000000000..a30a25352438c4dba25f525466faa147d59dfe8a --- /dev/null +++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h @@ -0,0 +1,466 @@ +/*************************************************************************** + tnl-benchmark-ndarray-boundary.h - description + ------------------- + begin : Feb 9, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include +#include + +#include + +#include "../Benchmarks.h" + +using namespace TNL; +using namespace TNL::Benchmarks; +using namespace TNL::Containers; +using std::index_sequence; + +using value_type = float; +//using index_type = std::size_t; +using index_type = unsigned; + +template< typename Array > +void expect_eq_chunked( Array& a, Array& b ) +{ + // TODO: use something like EXPECT_EQ + TNL_ASSERT_EQ( a.getSize(), b.getSize(), "array sizes don't match" ); + if( a.getSize() != b.getSize() ) + return; + + using IndexType = typename Array::IndexType; + + const IndexType chunk_size = 4096; + for( IndexType c = 0; c < (IndexType) roundUpDivision( a.getSize(), chunk_size ); c++ ) { + const typename Array::IndexType this_chunk_size = TNL::min( chunk_size, a.getSize() - c * chunk_size ); + Array a_chunk( &a[ c * chunk_size ], 
this_chunk_size ); + Array b_chunk( &b[ c * chunk_size ], this_chunk_size ); + // TODO: use something like EXPECT_EQ + TNL_ASSERT_EQ( a_chunk, b_chunk, "chunks are not equal" ); + } +} + +template< typename Array > +void expect_eq( Array& a, Array& b ) +{ + if( std::is_same< typename Array::DeviceType, TNL::Devices::Cuda >::value ) { + typename Array::HostType a_host, b_host; + a_host = a; + b_host = b; + expect_eq_chunked( a_host, b_host ); + } + else { + expect_eq_chunked( a, b ); + } +} + +template< typename Device > +const char* performer() +{ + if( std::is_same< Device, Devices::Host >::value ) + return "CPU"; + else if( std::is_same< Device, Devices::Cuda >::value ) + return "GPU"; + else + return "unknown"; +} + +void reset() {} + +// NOTE: having the sizes as function parameters keeps the compiler from treating them +// as "compile-time constants" and thus e.g. optimizing the 1D iterations with memcpy + +template< typename Device > +void benchmark_1D( Benchmark& benchmark, index_type size = 500000000 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0 >, + std::make_index_sequence< 1 >, + Device > a, b; + a.setSizes( size ); + b.setSizes( size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto a_view = a.getView(); + auto b_view = b.getView(); + + auto f = [&]() { + a.forBoundary( [=] __cuda_callable__ ( index_type i ) mutable { a_view( i ) = b_view( i ); } ); + a.forInternal( [=] __cuda_callable__ ( index_type i ) mutable { a_view( i ) = b_view( i ); } ); + }; + + const double datasetSize = 2 * size * sizeof(value_type) / oneGB; + benchmark.setOperation( "1D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_2D( Benchmark& benchmark, index_type size = 22333 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0 >, + std::make_index_sequence< 2 >, + Device > a, b; + 
a.setSizes( size, size ); + b.setSizes( size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto a_view = a.getView(); + auto b_view = b.getView(); + + auto f = [&]() { + a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j ) mutable { a_view( i, j ) = b_view( i, j ); } ); + a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j ) mutable { a_view( i, j ) = b_view( i, j ); } ); + }; + + const double datasetSize = 2 * std::pow( size, 2 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "2D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_3D( Benchmark& benchmark, index_type size = 800 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0 >, + std::make_index_sequence< 3 >, + Device > a, b; + a.setSizes( size, size, size ); + b.setSizes( size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto a_view = a.getView(); + auto b_view = b.getView(); + + auto f = [&]() { + a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k ) mutable { a_view( i, j, k ) = b_view( i, j, k ); } ); + a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k ) mutable { a_view( i, j, k ) = b_view( i, j, k ); } ); + }; + + const double datasetSize = 2 * std::pow( size, 3 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "3D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +// TODO: implement general ParallelBoundaryExecutor +//template< typename Device > +//void benchmark_4D( Benchmark& benchmark, index_type size = 150 ) +//{ +// NDArray< value_type, +// SizesHolder< index_type, 0, 0, 0, 0 >, +// std::make_index_sequence< 4 >, +// Device > a, b; +// a.setSizes( size, size, size, 
size ); +// b.setSizes( size, size, size, size ); +// a.getStorageArray().setValue( -1 ); +// b.getStorageArray().setValue( 1 ); +// +// auto a_view = a.getView(); +// auto b_view = b.getView(); +// +// auto f = [&]() { +// a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l ) mutable { a_view( i, j, k, l ) = b_view( i, j, k, l ); } ); +// a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l ) mutable { a_view( i, j, k, l ) = b_view( i, j, k, l ); } ); +// }; +// +// const double datasetSize = 2 * std::pow( size, 4 ) * sizeof(value_type) / oneGB; +// benchmark.setOperation( "4D", datasetSize ); +// benchmark.time< Device >( reset, performer< Device >(), f ); +// +// expect_eq( a.getStorageArray(), b.getStorageArray() ); +//} +// +//template< typename Device > +//void benchmark_5D( Benchmark& benchmark, index_type size = 56 ) +//{ +// NDArray< value_type, +// SizesHolder< index_type, 0, 0, 0, 0, 0 >, +// std::make_index_sequence< 5 >, +// Device > a, b; +// a.setSizes( size, size, size, size, size ); +// b.setSizes( size, size, size, size, size ); +// a.getStorageArray().setValue( -1 ); +// b.getStorageArray().setValue( 1 ); +// +// auto a_view = a.getView(); +// auto b_view = b.getView(); +// +// auto f = [&]() { +// a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m ) mutable { a_view( i, j, k, l, m ) = b_view( i, j, k, l, m ); } ); +// a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m ) mutable { a_view( i, j, k, l, m ) = b_view( i, j, k, l, m ); } ); +// }; +// +// const double datasetSize = 2 * std::pow( size, 5 ) * sizeof(value_type) / oneGB; +// benchmark.setOperation( "5D", datasetSize ); +// benchmark.time< Device >( reset, performer< Device >(), f ); +// +// expect_eq( a.getStorageArray(), b.getStorageArray() ); +//} +// +//template< typename Device > +//void 
benchmark_6D( Benchmark& benchmark, index_type size = 28 ) +//{ +// NDArray< value_type, +// SizesHolder< index_type, 0, 0, 0, 0, 0, 0 >, +// std::make_index_sequence< 6 >, +// Device > a, b; +// a.setSizes( size, size, size, size, size, size ); +// b.setSizes( size, size, size, size, size, size ); +// a.getStorageArray().setValue( -1 ); +// b.getStorageArray().setValue( 1 ); +// +// auto a_view = a.getView(); +// auto b_view = b.getView(); +// +// auto f = [&]() { +// a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m, index_type n ) mutable { a_view( i, j, k, l, m, n ) = b_view( i, j, k, l, m, n ); } ); +// a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m, index_type n ) mutable { a_view( i, j, k, l, m, n ) = b_view( i, j, k, l, m, n ); } ); +// }; +// +// const double datasetSize = 2 * std::pow( size, 6 ) * sizeof(value_type) / oneGB; +// benchmark.setOperation( "6D", datasetSize ); +// benchmark.time< Device >( reset, performer< Device >(), f ); +// +// expect_eq( a.getStorageArray(), b.getStorageArray() ); +//} + + +template< typename Device > +void benchmark_2D_perm( Benchmark& benchmark, index_type size = 22333 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0 >, + std::index_sequence< 1, 0 >, + Device > a, b; + a.setSizes( size, size ); + b.setSizes( size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto a_view = a.getView(); + auto b_view = b.getView(); + + auto f = [&]() { + a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j ) mutable { a_view( i, j ) = b_view( i, j ); } ); + a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j ) mutable { a_view( i, j ) = b_view( i, j ); } ); + }; + + const double datasetSize = 2 * std::pow( size, 2 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "2D permuted", datasetSize ); + benchmark.time< Device >( reset, 
performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_3D_perm( Benchmark& benchmark, index_type size = 800 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0 >, + std::index_sequence< 2, 1, 0 >, + Device > a, b; + a.setSizes( size, size, size ); + b.setSizes( size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto a_view = a.getView(); + auto b_view = b.getView(); + + auto f = [&]() { + a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k ) mutable { a_view( i, j, k ) = b_view( i, j, k ); } ); + a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k ) mutable { a_view( i, j, k ) = b_view( i, j, k ); } ); + }; + + const double datasetSize = 2 * std::pow( size, 3 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "3D permuted", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +// TODO: implement general ParallelBoundaryExecutor +//template< typename Device > +//void benchmark_4D_perm( Benchmark& benchmark, index_type size = 150 ) +//{ +// NDArray< value_type, +// SizesHolder< index_type, 0, 0, 0, 0 >, +// std::index_sequence< 3, 2, 1, 0 >, +// Device > a, b; +// a.setSizes( size, size, size, size ); +// b.setSizes( size, size, size, size ); +// a.getStorageArray().setValue( -1 ); +// b.getStorageArray().setValue( 1 ); +// +// auto a_view = a.getView(); +// auto b_view = b.getView(); +// +// auto f = [&]() { +// a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l ) mutable { a_view( i, j, k, l ) = b_view( i, j, k, l ); } ); +// a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l ) mutable { a_view( i, j, k, l ) = b_view( i, j, k, l ); } ); +// }; +// +// const double datasetSize = 2 * std::pow( 
size, 4 ) * sizeof(value_type) / oneGB; +// benchmark.setOperation( "4D permuted", datasetSize ); +// benchmark.time< Device >( reset, performer< Device >(), f ); +// +// expect_eq( a.getStorageArray(), b.getStorageArray() ); +//} +// +//template< typename Device > +//void benchmark_5D_perm( Benchmark& benchmark, index_type size = 56 ) +//{ +// NDArray< value_type, +// SizesHolder< index_type, 0, 0, 0, 0, 0 >, +// std::index_sequence< 4, 3, 2, 1, 0 >, +// Device > a, b; +// a.setSizes( size, size, size, size, size ); +// b.setSizes( size, size, size, size, size ); +// a.getStorageArray().setValue( -1 ); +// b.getStorageArray().setValue( 1 ); +// +// auto a_view = a.getView(); +// auto b_view = b.getView(); +// +// auto f = [&]() { +// a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m ) mutable { a_view( i, j, k, l, m ) = b_view( i, j, k, l, m ); } ); +// a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m ) mutable { a_view( i, j, k, l, m ) = b_view( i, j, k, l, m ); } ); +// }; +// +// const double datasetSize = 2 * std::pow( size, 5 ) * sizeof(value_type) / oneGB; +// benchmark.setOperation( "5D permuted", datasetSize ); +// benchmark.time< Device >( reset, performer< Device >(), f ); +// +// expect_eq( a.getStorageArray(), b.getStorageArray() ); +//} +// +//template< typename Device > +//void benchmark_6D_perm( Benchmark& benchmark, index_type size = 28 ) +//{ +// NDArray< value_type, +// SizesHolder< index_type, 0, 0, 0, 0, 0, 0 >, +// std::index_sequence< 5, 4, 3, 2, 1, 0 >, +// Device > a, b; +// a.setSizes( size, size, size, size, size, size ); +// b.setSizes( size, size, size, size, size, size ); +// a.getStorageArray().setValue( -1 ); +// b.getStorageArray().setValue( 1 ); +// +// auto a_view = a.getView(); +// auto b_view = b.getView(); +// +// auto f = [&]() { +// a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type 
k, index_type l, index_type m, index_type n ) mutable { a_view( i, j, k, l, m, n ) = b_view( i, j, k, l, m, n ); } ); +// a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m, index_type n ) mutable { a_view( i, j, k, l, m, n ) = b_view( i, j, k, l, m, n ); } ); +// }; +// +// const double datasetSize = 2 * std::pow( size, 6 ) * sizeof(value_type) / oneGB; +// benchmark.setOperation( "6D permuted", datasetSize ); +// benchmark.time< Device >( reset, performer< Device >(), f ); +// +// expect_eq( a.getStorageArray(), b.getStorageArray() ); +//} + +template< typename Device > +void run_benchmarks( Benchmark& benchmark ) +{ + benchmark_1D< Device >( benchmark ); + benchmark_2D< Device >( benchmark ); + benchmark_3D< Device >( benchmark ); +// benchmark_4D< Device >( benchmark ); +// benchmark_5D< Device >( benchmark ); +// benchmark_6D< Device >( benchmark ); + benchmark_2D_perm< Device >( benchmark ); + benchmark_3D_perm< Device >( benchmark ); +// benchmark_4D_perm< Device >( benchmark ); +// benchmark_5D_perm< Device >( benchmark ); +// benchmark_6D_perm< Device >( benchmark ); +} + +void setupConfig( Config::ConfigDescription & config ) +{ + config.addDelimiter( "Benchmark settings:" ); + config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-ndarray-boundary.log"); + config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); + config.addEntryEnum( "append" ); + config.addEntryEnum( "overwrite" ); + config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); + config.addEntry< int >( "verbose", "Verbose mode.", 1 ); + config.addEntry< String >( "devices", "Run benchmarks on these devices.", "all" ); + config.addEntryEnum( "all" ); + config.addEntryEnum( "host" ); + #ifdef HAVE_CUDA + config.addEntryEnum( "cuda" ); + #endif + + config.addDelimiter( "Device settings:" ); + Devices::Host::configSetup( config ); + 
Devices::Cuda::configSetup( config ); +} + +int main( int argc, char* argv[] ) +{ + Config::ParameterContainer parameters; + Config::ConfigDescription conf_desc; + + setupConfig( conf_desc ); + + if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) { + conf_desc.printUsage( argv[ 0 ] ); + return EXIT_FAILURE; + } + + if( ! Devices::Host::setup( parameters ) || + ! Devices::Cuda::setup( parameters ) ) + return EXIT_FAILURE; + + const String & logFileName = parameters.getParameter< String >( "log-file" ); + const String & outputMode = parameters.getParameter< String >( "output-mode" ); + const int loops = parameters.getParameter< int >( "loops" ); + const int verbose = parameters.getParameter< int >( "verbose" ); + + // open log file + auto mode = std::ios::out; + if( outputMode == "append" ) + mode |= std::ios::app; + std::ofstream logFile( logFileName.getString(), mode ); + + // init benchmark and common metadata + Benchmark benchmark( loops, verbose ); + + // prepare global metadata + Benchmark::MetadataMap metadata = getHardwareMetadata(); + + const String devices = parameters.getParameter< String >( "devices" ); + if( devices == "all" || devices == "host" ) + run_benchmarks< Devices::Host >( benchmark ); +#ifdef HAVE_CUDA + if( devices == "all" || devices == "cuda" ) + run_benchmarks< Devices::Cuda >( benchmark ); +#endif + + if( ! benchmark.save( logFile ) ) { + std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." 
<< std::endl; + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray-cuda.cu b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..ccbac3b3841d1122989c658b1d181cd23c80e3ef --- /dev/null +++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-cuda.cu @@ -0,0 +1 @@ +#include "tnl-benchmark-ndarray.h" diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray.cpp b/src/Benchmarks/NDArray/tnl-benchmark-ndarray.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ccbac3b3841d1122989c658b1d181cd23c80e3ef --- /dev/null +++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray.cpp @@ -0,0 +1 @@ +#include "tnl-benchmark-ndarray.h" diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h b/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h new file mode 100644 index 0000000000000000000000000000000000000000..0de53ea8815033654194cc9e2eb6f3eaf6356356 --- /dev/null +++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h @@ -0,0 +1,453 @@ +/*************************************************************************** + tnl-benchmark-ndarray.h - description + ------------------- + begin : Dec 24, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include +#include + +#include +#include + +#include "../Benchmarks.h" + +using namespace TNL; +using namespace TNL::Benchmarks; +using namespace TNL::Containers; +using std::index_sequence; + +using value_type = float; +//using index_type = std::size_t; +using index_type = unsigned; + +template< typename Array > +void expect_eq_chunked( Array& a, Array& b ) +{ + // TODO: use something like EXPECT_EQ + TNL_ASSERT_EQ( a.getSize(), b.getSize(), "array sizes don't match" ); + if( a.getSize() != b.getSize() ) + return; + + using IndexType = typename Array::IndexType; + + const IndexType chunk_size = 4096; + for( IndexType c = 0; c < (IndexType) roundUpDivision( a.getSize(), chunk_size ); c++ ) { + const typename Array::IndexType this_chunk_size = TNL::min( chunk_size, a.getSize() - c * chunk_size ); + Array a_chunk( &a[ c * chunk_size ], this_chunk_size ); + Array b_chunk( &b[ c * chunk_size ], this_chunk_size ); + // TODO: use something like EXPECT_EQ + TNL_ASSERT_EQ( a_chunk, b_chunk, "chunks are not equal" ); + } +} + +template< typename Array > +void expect_eq( Array& a, Array& b ) +{ + if( std::is_same< typename Array::DeviceType, TNL::Devices::Cuda >::value ) { + typename Array::HostType a_host, b_host; + a_host = a; + b_host = b; + expect_eq_chunked( a_host, b_host ); + } + else { + expect_eq_chunked( a, b ); + } +} + +template< typename Device > +const char* performer() +{ + if( std::is_same< Device, Devices::Host >::value ) + return "CPU"; + else if( std::is_same< Device, Devices::Cuda >::value ) + return "GPU"; + else + return "unknown"; +} + +void reset() {} + +// NOTE: having the sizes as function parameters keeps the compiler from treating them +// as "compile-time constants" and thus e.g. 
optimizing the 1D iterations with memcpy + +template< typename Device > +void benchmark_array( Benchmark& benchmark, index_type size = 500000000 ) +{ + Array< value_type, Device > a, b; + a.setSize( size ); + b.setSize( size ); + a.setValue( -1 ); + b.setValue( 1 ); + + auto kernel = [] __cuda_callable__ + ( int i, + value_type* a, + const value_type* b ) + { + a[ i ] = b[ i ]; + }; + + auto f = [&]() { + TNL::ParallelFor< Device >::exec( 0, (int) size, kernel, a.getData(), b.getData() ); + }; + + // warm-up for all benchmarks + f(); + + const double datasetSize = 2 * size * sizeof(value_type) / oneGB; + benchmark.setOperation( "array", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a, b ); +} + +template< typename Device > +void benchmark_1D( Benchmark& benchmark, index_type size = 500000000 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0 >, + std::make_index_sequence< 1 >, + Device > a, b; + a.setSizes( size ); + b.setSizes( size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * size * sizeof(value_type) / oneGB; + benchmark.setOperation( "1D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_2D( Benchmark& benchmark, index_type size = 22333 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0 >, + std::make_index_sequence< 2 >, + Device > a, b; + a.setSizes( size, size ); + b.setSizes( size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 2 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "2D", datasetSize ); + 
benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_3D( Benchmark& benchmark, index_type size = 800 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0 >, + std::make_index_sequence< 3 >, + Device > a, b; + a.setSizes( size, size, size ); + b.setSizes( size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 3 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "3D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_4D( Benchmark& benchmark, index_type size = 150 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0, 0 >, + std::make_index_sequence< 4 >, + Device > a, b; + a.setSizes( size, size, size, size ); + b.setSizes( size, size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 4 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "4D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_5D( Benchmark& benchmark, index_type size = 56 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0, 0, 0 >, + std::make_index_sequence< 5 >, + Device > a, b; + a.setSizes( size, size, size, size, size ); + b.setSizes( size, size, size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ 
(value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 5 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "5D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_6D( Benchmark& benchmark, index_type size = 28 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0, 0, 0, 0 >, + std::make_index_sequence< 6 >, + Device > a, b; + a.setSizes( size, size, size, size, size, size ); + b.setSizes( size, size, size, size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 6 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "6D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + + +template< typename Device > +void benchmark_2D_perm( Benchmark& benchmark, index_type size = 22333 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0 >, + std::index_sequence< 1, 0 >, + Device > a, b; + a.setSizes( size, size ); + b.setSizes( size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 2 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "2D permuted", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_3D_perm( Benchmark& benchmark, index_type size = 800 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0 >, + std::index_sequence< 2, 1, 0 >, + Device > a, b; + a.setSizes( size, 
size, size ); + b.setSizes( size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 3 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "3D permuted", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_4D_perm( Benchmark& benchmark, index_type size = 150 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0, 0 >, + std::index_sequence< 3, 2, 1, 0 >, + Device > a, b; + a.setSizes( size, size, size, size ); + b.setSizes( size, size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 4 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "4D permuted", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_5D_perm( Benchmark& benchmark, index_type size = 56 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0, 0, 0 >, + std::index_sequence< 4, 3, 2, 1, 0 >, + Device > a, b; + a.setSizes( size, size, size, size, size ); + b.setSizes( size, size, size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 5 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "5D permuted", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< 
typename Device > +void benchmark_6D_perm( Benchmark& benchmark, index_type size = 28 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0, 0, 0, 0 >, + std::index_sequence< 5, 4, 3, 2, 1, 0 >, + Device > a, b; + a.setSizes( size, size, size, size, size, size ); + b.setSizes( size, size, size, size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 6 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "6D permuted", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void run_benchmarks( Benchmark& benchmark ) +{ + benchmark_array< Device >( benchmark ); + benchmark_1D< Device >( benchmark ); + benchmark_2D< Device >( benchmark ); + benchmark_3D< Device >( benchmark ); + benchmark_4D< Device >( benchmark ); + benchmark_5D< Device >( benchmark ); + benchmark_6D< Device >( benchmark ); + benchmark_2D_perm< Device >( benchmark ); + benchmark_3D_perm< Device >( benchmark ); + benchmark_4D_perm< Device >( benchmark ); + benchmark_5D_perm< Device >( benchmark ); + benchmark_6D_perm< Device >( benchmark ); +} + +void setupConfig( Config::ConfigDescription & config ) +{ + config.addDelimiter( "Benchmark settings:" ); + config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-ndarray.log"); + config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); + config.addEntryEnum( "append" ); + config.addEntryEnum( "overwrite" ); + config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); + config.addEntry< int >( "verbose", "Verbose mode.", 1 ); + config.addEntry< String >( "devices", "Run benchmarks on these devices.", "all" ); + config.addEntryEnum( "all" ); + config.addEntryEnum( "host" 
); + #ifdef HAVE_CUDA + config.addEntryEnum( "cuda" ); + #endif + + config.addDelimiter( "Device settings:" ); + Devices::Host::configSetup( config ); + Devices::Cuda::configSetup( config ); +} + +int main( int argc, char* argv[] ) +{ + Config::ParameterContainer parameters; + Config::ConfigDescription conf_desc; + + setupConfig( conf_desc ); + + if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) { + conf_desc.printUsage( argv[ 0 ] ); + return EXIT_FAILURE; + } + + if( ! Devices::Host::setup( parameters ) || + ! Devices::Cuda::setup( parameters ) ) + return EXIT_FAILURE; + + const String & logFileName = parameters.getParameter< String >( "log-file" ); + const String & outputMode = parameters.getParameter< String >( "output-mode" ); + const int loops = parameters.getParameter< int >( "loops" ); + const int verbose = parameters.getParameter< int >( "verbose" ); + + // open log file + auto mode = std::ios::out; + if( outputMode == "append" ) + mode |= std::ios::app; + std::ofstream logFile( logFileName.getString(), mode ); + + // init benchmark and common metadata + Benchmark benchmark( loops, verbose ); + + // prepare global metadata + Benchmark::MetadataMap metadata = getHardwareMetadata(); + + const String devices = parameters.getParameter< String >( "devices" ); + if( devices == "all" || devices == "host" ) + run_benchmarks< Devices::Host >( benchmark ); +#ifdef HAVE_CUDA + if( devices == "all" || devices == "cuda" ) + run_benchmarks< Devices::Cuda >( benchmark ); +#endif + + if( ! benchmark.save( logFile ) ) { + std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." 
<< std::endl; + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} diff --git a/src/TNL/Containers/Algorithms/ArrayOperations.h b/src/TNL/Containers/Algorithms/ArrayOperations.h index 7977b6b728f1b9827b569d2520ef322d2f49b432..ca62f5b7ea45254298cb02d0ac909ee2242e72f2 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperations.h +++ b/src/TNL/Containers/Algorithms/ArrayOperations.h @@ -22,6 +22,42 @@ template< typename DestinationDevice, typename SourceDevice = DestinationDevice > struct ArrayOperations; +// TODO: establish the concept of a "void device" for static computations in the whole TNL +template<> +struct ArrayOperations< void > +{ + template< typename Element > + __cuda_callable__ + static void setElement( Element* data, + const Element& value ); + + template< typename Element > + __cuda_callable__ + static Element getElement( const Element* data ); + + template< typename Element, typename Index > + __cuda_callable__ + static void set( Element* data, + const Element& value, + const Index size ); + + template< typename DestinationElement, + typename SourceElement, + typename Index > + __cuda_callable__ + static void copy( DestinationElement* destination, + const SourceElement* source, + const Index size ); + + template< typename Element1, + typename Element2, + typename Index > + __cuda_callable__ + static bool compare( const Element1* destination, + const Element2* source, + const Index size ); +}; + template<> struct ArrayOperations< Devices::Host > { @@ -251,6 +287,7 @@ struct ArrayOperations< Devices::Host, Devices::MIC > } // namespace Containers } // namespace TNL +#include #include #include #include diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsStatic.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsStatic.hpp new file mode 100644 index 0000000000000000000000000000000000000000..8115d25f4c3431e68fb40bf8b18406a4b176ca33 --- /dev/null +++ b/src/TNL/Containers/Algorithms/ArrayOperationsStatic.hpp @@ -0,0 +1,82 @@ 
+/*************************************************************************** + ArrayOperationsStatic_impl.h - description + ------------------- + begin : Apr 8, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include + +namespace TNL { +namespace Containers { +namespace Algorithms { + +template< typename Element > +__cuda_callable__ +void +ArrayOperations< void >:: +setElement( Element* data, + const Element& value ) +{ + *data = value; +} + +template< typename Element > +__cuda_callable__ +Element +ArrayOperations< void >:: +getElement( const Element* data ) +{ + return *data; +} + +template< typename Element, typename Index > +__cuda_callable__ +void +ArrayOperations< void >:: +set( Element* data, + const Element& value, + const Index size ) +{ + for( Index i = 0; i < size; i ++ ) + data[ i ] = value; +} + +template< typename DestinationElement, + typename SourceElement, + typename Index > +__cuda_callable__ +void +ArrayOperations< void >:: +copy( DestinationElement* destination, + const SourceElement* source, + const Index size ) +{ + for( Index i = 0; i < size; i ++ ) + destination[ i ] = source[ i ]; +} + +template< typename Element1, + typename Element2, + typename Index > +__cuda_callable__ +bool +ArrayOperations< void >:: +compare( const Element1* destination, + const Element2* source, + const Index size ) +{ + for( Index i = 0; i < size; i++ ) + if( ! 
( destination[ i ] == source[ i ] ) ) + return false; + return true; +} + +} // namespace Algorithms +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/DistributedNDArray.h b/src/TNL/Containers/DistributedNDArray.h new file mode 100644 index 0000000000000000000000000000000000000000..4b123d114faa37e9022d7b5caab6f9c7124c2263 --- /dev/null +++ b/src/TNL/Containers/DistributedNDArray.h @@ -0,0 +1,467 @@ +/*************************************************************************** + DistributedNDArray.h - description + ------------------- + begin : Dec 27, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include +#include +#include + +namespace TNL { +namespace Containers { + +template< typename NDArray, + typename Communicator = Communicators::MpiCommunicator, + typename Overlaps = __ndarray_impl::make_constant_index_sequence< NDArray::getDimension(), 0 > > +class DistributedNDArray +{ + using CommunicationGroup = typename Communicator::CommunicationGroup; +public: + using ValueType = typename NDArray::ValueType; + using DeviceType = typename NDArray::DeviceType; + using IndexType = typename NDArray::IndexType; + using SizesHolderType = typename NDArray::SizesHolderType; + using PermutationType = typename NDArray::PermutationType; + using CommunicatorType = Communicator; + using LocalBeginsType = __ndarray_impl::LocalBeginsHolder< typename NDArray::SizesHolderType >; + using LocalRangeType = Subrange< IndexType >; + using OverlapsType = Overlaps; + using LocalIndexerType = NDArrayIndexer< SizesHolderType, PermutationType, typename NDArray::NDBaseType, typename NDArray::StridesHolderType, Overlaps >; + + using ViewType = DistributedNDArrayView< typename NDArray::ViewType, Communicator, Overlaps >; + using 
ConstViewType = DistributedNDArrayView< typename NDArray::ConstViewType, Communicator, Overlaps >; + using LocalViewType = typename NDArray::ViewType; + using ConstLocalViewType = typename NDArray::ConstViewType; + + static_assert( Overlaps::size() == NDArray::getDimension(), "invalid overlaps" ); + + // all methods from NDArrayView + + DistributedNDArray() = default; + + // The copy-constructor of TNL::Containers::Array makes shallow copy so our + // copy-constructor cannot be default. Actually, we most likely don't need + // it anyway, so let's just delete it. + DistributedNDArray( const DistributedNDArray& ) = delete; + + // Standard copy-semantics with deep copy, just like regular 1D array. + // Mismatched sizes cause reallocations. + DistributedNDArray& operator=( const DistributedNDArray& other ) = default; + + // default move-semantics + DistributedNDArray( DistributedNDArray&& ) = default; + DistributedNDArray& operator=( DistributedNDArray&& ) = default; + + // Templated copy-assignment + template< typename OtherArray > + DistributedNDArray& operator=( const OtherArray& other ) + { + globalSizes = other.getSizes(); + localBegins = other.getLocalBegins(); + localEnds = other.getLocalEnds(); + group = other.getCommunicationGroup(); + localArray = other.getConstLocalView(); + return *this; + } + + static constexpr std::size_t getDimension() + { + return NDArray::getDimension(); + } + + __cuda_callable__ + CommunicationGroup getCommunicationGroup() const + { + return group; + } + + // Returns the *global* sizes + __cuda_callable__ + const SizesHolderType& getSizes() const + { + return globalSizes; + } + + // Returns the *global* size + template< std::size_t level > + __cuda_callable__ + IndexType getSize() const + { + return globalSizes.template getSize< level >(); + } + + __cuda_callable__ + LocalBeginsType getLocalBegins() const + { + return localBegins; + } + + __cuda_callable__ + SizesHolderType getLocalEnds() const + { + return localEnds; + } + + 
template< std::size_t level > + __cuda_callable__ + LocalRangeType getLocalRange() const + { + return LocalRangeType( localBegins.template getSize< level >(), localEnds.template getSize< level >() ); + } + + // returns the local storage size + __cuda_callable__ + IndexType getLocalStorageSize() const + { + return localArray.getStorageSize(); + } + + LocalIndexerType getLocalIndexer() const + { + return LocalIndexerType( localEnds - localBegins, typename NDArray::StridesHolderType{} ); + } + + LocalViewType getLocalView() + { + return localArray.getView(); + } + + ConstLocalViewType getConstLocalView() const + { + return localArray.getConstView(); + } + + // returns the *local* storage index for given *global* indices + template< typename... IndexTypes > + __cuda_callable__ + IndexType + getStorageIndex( IndexTypes&&... indices ) const + { + static_assert( sizeof...( indices ) == SizesHolderType::getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); + auto getStorageIndex = [this]( auto&&... indices ) + { + return this->localArray.getStorageIndex( std::forward< decltype(indices) >( indices )... ); + }; + return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, getStorageIndex, std::forward< IndexTypes >( indices )... ); + } + + __cuda_callable__ + ValueType* getData() + { + return localArray.getData(); + } + + __cuda_callable__ + std::add_const_t< ValueType >* getData() const + { + return localArray.getData(); + } + + + template< typename... IndexTypes > + __cuda_callable__ + ValueType& + operator()( IndexTypes&&... indices ) + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... 
); + return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, localArray, std::forward< IndexTypes >( indices )... ); + } + + template< typename... IndexTypes > + __cuda_callable__ + const ValueType& + operator()( IndexTypes&&... indices ) const + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); + return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, localArray, std::forward< IndexTypes >( indices )... ); + } + + // bracket operator for 1D arrays + __cuda_callable__ + ValueType& + operator[]( IndexType index ) + { + static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexType >( index ) ); + return localArray[ __ndarray_impl::get<0>( Overlaps{} ) + index - localBegins.template getSize< 0 >() ]; + } + + __cuda_callable__ + const ValueType& + operator[]( IndexType index ) const + { + static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexType >( index ) ); + return localArray[ __ndarray_impl::get<0>( Overlaps{} ) + index - localBegins.template getSize< 0 >() ]; + } + + __cuda_callable__ + ViewType getView() + { + return ViewType( localArray.getView(), globalSizes, localBegins, localEnds, group ); + } + + __cuda_callable__ + ConstViewType getConstView() const + { + return ConstViewType( localArray.getConstView(), globalSizes, localBegins, localEnds, group ); + } + + // TODO: overlaps should be skipped, otherwise it works only after synchronization + bool operator==( const DistributedNDArray& other ) const + { + // we can't run allreduce if the communication 
groups are different + if( group != other.getCommunicationGroup() ) + return false; + const bool localResult = + globalSizes == other.globalSizes && + localBegins == other.localBegins && + localEnds == other.localEnds && + localArray == other.localArray; + bool result = true; + if( group != CommunicatorType::NullGroup ) + CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group ); + return result; + } + + bool operator!=( const DistributedNDArray& other ) const + { + return ! (*this == other); + } + + // iterate over all local elements + template< typename Device2 = DeviceType, typename Func > + void forAll( Func f ) const + { + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( localBegins, localEnds, f ); + } + + // iterate over local elements which are not neighbours of *global* boundaries + template< typename Device2 = DeviceType, typename Func > + void forInternal( Func f ) const + { + // add static sizes + using Begins = __ndarray_impl::LocalBeginsHolder< SizesHolderType, 1 >; + // add dynamic sizes + Begins begins; + __ndarray_impl::SetSizesAddHelper< 1, Begins, SizesHolderType, Overlaps >::add( begins, SizesHolderType{} ); + __ndarray_impl::SetSizesMaxHelper< Begins, LocalBeginsType >::max( begins, localBegins ); + + // subtract static sizes + using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolderType, 1 >::type; + // subtract dynamic sizes + Ends ends; + __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolderType, Overlaps >::subtract( ends, globalSizes ); + __ndarray_impl::SetSizesMinHelper< Ends, SizesHolderType >::min( ends, localEnds ); + + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, ends, f ); + } + + // iterate over local elements inside the given [begins, ends) range specified by global indices + template< typename Device2 = DeviceType, typename Func, typename Begins, typename Ends > + void forInternal( Func f, const Begins& 
begins, const Ends& ends ) const + { + // TODO: assert "localBegins <= begins <= localEnds", "localBegins <= ends <= localEnds" + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, ends, f ); + } + + // iterate over local elements which are neighbours of *global* boundaries + template< typename Device2 = DeviceType, typename Func > + void forBoundary( Func f ) const + { + // add static sizes + using SkipBegins = __ndarray_impl::LocalBeginsHolder< SizesHolderType, 1 >; + // add dynamic sizes + SkipBegins skipBegins; + __ndarray_impl::SetSizesAddHelper< 1, SkipBegins, SizesHolderType, Overlaps >::add( skipBegins, SizesHolderType{} ); + __ndarray_impl::SetSizesMaxHelper< SkipBegins, LocalBeginsType >::max( skipBegins, localBegins ); + + // subtract static sizes + using SkipEnds = typename __ndarray_impl::SubtractedSizesHolder< SizesHolderType, 1 >::type; + // subtract dynamic sizes + SkipEnds skipEnds; + __ndarray_impl::SetSizesSubtractHelper< 1, SkipEnds, SizesHolderType, Overlaps >::subtract( skipEnds, globalSizes ); + __ndarray_impl::SetSizesMinHelper< SkipEnds, SizesHolderType >::min( skipEnds, localEnds ); + + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( localBegins, skipBegins, skipEnds, localEnds, f ); + } + + // iterate over local elements outside the given [skipBegins, skipEnds) range specified by global indices + template< typename Device2 = DeviceType, typename Func, typename SkipBegins, typename SkipEnds > + void forBoundary( Func f, const SkipBegins& skipBegins, const SkipEnds& skipEnds ) const + { + // TODO: assert "localBegins <= skipBegins <= localEnds", "localBegins <= skipEnds <= localEnds" + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( localBegins, skipBegins, skipEnds, localEnds, f ); + } + + // iterate over local elements which are not neighbours of overlaps (if all overlaps are 0, it is equivalent to forAll) + 
template< typename Device2 = DeviceType, typename Func > + void forLocalInternal( Func f ) const + { + // add overlaps to dynamic sizes + LocalBeginsType begins; + __ndarray_impl::SetSizesAddOverlapsHelper< LocalBeginsType, SizesHolderType, Overlaps >::add( begins, localBegins ); + + // subtract overlaps from dynamic sizes + SizesHolderType ends; + __ndarray_impl::SetSizesSubtractOverlapsHelper< SizesHolderType, SizesHolderType, Overlaps >::subtract( ends, localEnds ); + + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, ends, f ); + } + + // iterate over local elements which are neighbours of overlaps (if all overlaps are 0, it has no effect) + template< typename Device2 = DeviceType, typename Func > + void forLocalBoundary( Func f ) const + { + // add overlaps to dynamic sizes + LocalBeginsType skipBegins; + __ndarray_impl::SetSizesAddOverlapsHelper< LocalBeginsType, SizesHolderType, Overlaps >::add( skipBegins, localBegins ); + + // subtract overlaps from dynamic sizes + SizesHolderType skipEnds; + __ndarray_impl::SetSizesSubtractOverlapsHelper< SizesHolderType, SizesHolderType, Overlaps >::subtract( skipEnds, localEnds ); + + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( localBegins, skipBegins, skipEnds, localEnds, f ); + } + + // iterate over elements of overlaps (if all overlaps are 0, it has no effect) + template< typename Device2 = DeviceType, typename Func > + void forOverlaps( Func f ) const + { + // subtract overlaps from dynamic sizes + LocalBeginsType begins; + __ndarray_impl::SetSizesSubtractOverlapsHelper< LocalBeginsType, SizesHolderType, Overlaps >::subtract( begins, localBegins ); + + // add overlaps to dynamic sizes + SizesHolderType ends; + __ndarray_impl::SetSizesAddOverlapsHelper< SizesHolderType, SizesHolderType, Overlaps >::add( ends, localEnds ); + + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, 
localBegins, localEnds, ends, f ); + } + + + // extra methods + + // Sets the *global* size, but does not allocate storage + template< typename... IndexTypes > + void setSizes( IndexTypes&&... sizes ) + { + static_assert( sizeof...( sizes ) == getDimension(), "got wrong number of sizes" ); + __ndarray_impl::setSizesHelper( globalSizes, std::forward< IndexTypes >( sizes )... ); + // initialize localBegins and localEnds + localBegins = LocalBeginsType{}; + localEnds = globalSizes; + } + + template< std::size_t level > + void setDistribution( IndexType begin, IndexType end, CommunicationGroup group = Communicator::AllGroup ) + { + static_assert( SizesHolderType::template getStaticSize< level >() == 0, "NDArray cannot be distributed in static dimensions." ); + TNL_ASSERT_GE( begin, 0, "begin must be non-negative" ); + TNL_ASSERT_LE( end, globalSizes.template getSize< level >(), "end must not be greater than global size" ); + TNL_ASSERT_LT( begin, end, "begin must be lesser than end" ); + localBegins.template setSize< level >( begin ); + localEnds.template setSize< level >( end ); + TNL_ASSERT( this->group == Communicator::NullGroup || this->group == group, + std::cerr << "different groups cannot be combined for different dimensions" ); + this->group = group; + } + + // Computes the distributed storage size and allocates the local array + void allocate() + { + SizesHolderType localSizes; + TemplateStaticFor< std::size_t, 0, SizesHolderType::getDimension(), LocalSizesSetter >::execHost( localSizes, globalSizes, localBegins, localEnds ); + localArray.setSize( localSizes ); + } + + void setLike( const DistributedNDArray& other ) + { + localArray.setLike( other.localArray ); + group = other.getCommunicationGroup(); + globalSizes = other.getSizes(); + localBegins = other.localBegins; + localEnds = other.localEnds; + } + + void reset() + { + localArray.reset(); + group = CommunicatorType::NullGroup; + globalSizes = SizesHolderType{}; + localBegins = LocalBeginsType{}; + 
localEnds = SizesHolderType{}; + } + + // "safe" accessor - will do slow copy from device + template< typename... IndexTypes > + ValueType + getElement( IndexTypes&&... indices ) const + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); + auto getElement = [this]( auto&&... indices ) + { + return this->localArray.getElement( std::forward< decltype(indices) >( indices )... ); + }; + return __ndarray_impl::host_call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, getElement, std::forward< IndexTypes >( indices )... ); + } + + void setValue( ValueType value ) + { + localArray.setValue( value ); + } + +protected: + NDArray localArray; + CommunicationGroup group = Communicator::NullGroup; + SizesHolderType globalSizes; + // static sizes should have different type: localBegin is always 0, localEnd is always the full size + LocalBeginsType localBegins; + SizesHolderType localEnds; + +private: + template< std::size_t level > + struct LocalSizesSetter + { + template< typename SizesHolder, typename LocalBegins > + static void exec( SizesHolder& localSizes, const SizesHolder& globalSizes, const LocalBegins& localBegins, const SizesHolder& localEnds ) + { + if( SizesHolder::template getStaticSize< level >() != 0 ) + return; + + const auto begin = localBegins.template getSize< level >(); + const auto end = localEnds.template getSize< level >(); + if( begin == end ) + localSizes.template setSize< level >( globalSizes.template getSize< level >() ); + else { + TNL_ASSERT_GE( end - begin, (decltype(end)) __ndarray_impl::get( Overlaps{} ), "local size is less than the size of overlaps" ); + localSizes.template setSize< level >( end - begin + 2 * __ndarray_impl::get( Overlaps{} ) ); + } + } + }; +}; + +} // namespace Containers +} // namespace TNL diff --git 
a/src/TNL/Containers/DistributedNDArraySynchronizer.h b/src/TNL/Containers/DistributedNDArraySynchronizer.h new file mode 100644 index 0000000000000000000000000000000000000000..e6e41ba3338010779a1b110af90e198cdae617aa --- /dev/null +++ b/src/TNL/Containers/DistributedNDArraySynchronizer.h @@ -0,0 +1,242 @@ +/*************************************************************************** + DistributedNDArraySynchronizer.h - description + ------------------- + begin : Mar 30, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include + +#include + +namespace TNL { +namespace Containers { + +template< typename DistributedNDArray > +class DistributedNDArraySynchronizer +{ +public: + void synchronize( DistributedNDArray& array ) + { + auto future = synchronizeAsync( array, std::launch::deferred ); + future.wait(); + } + + // This method is not thread-safe - only the thread which created and "owns" the + // instance of this object can call this method. + // Also note that this method must not be called again until the previous + // asynchronous operation has finished. 
+ std::shared_future synchronizeAsync( DistributedNDArray& array, std::launch policy = std::launch::async ) + { + // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/ + #ifdef HAVE_CUDA + if( std::is_same< typename DistributedNDArray::DeviceType, Devices::Cuda >::value ) + cudaGetDevice(&this->gpu_id); + #endif + + // NOTE: the allocation cannot be done in the worker, otherwise CUDA would crash + // skip allocation on repeated calls - compare only sizes, not the actual data + if( array_view.getCommunicationGroup() != array.getCommunicationGroup() || + array_view.getSizes() != array.getSizes() || + array_view.getLocalBegins() != array.getLocalBegins() || + array_view.getLocalEnds() != array.getLocalEnds() ) + { + array_view.bind( array.getView() ); + + // allocate buffers + TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), AllocateHelper >::execHost( buffers, array_view ); + } + else { + // only bind to the actual data + array_view.bind( array.getView() ); + } + + auto worker = [this](){ this->worker(); }; + return std::async( policy, worker ); + } + +protected: + using DistributedNDArrayView = typename DistributedNDArray::ViewType; + using Communicator = typename DistributedNDArray::CommunicatorType; + using Buffers = __ndarray_impl::SynchronizerBuffers< DistributedNDArray >; + + DistributedNDArrayView array_view; + Buffers buffers; + int gpu_id = 0; + + void worker() + { + // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/ + #ifdef HAVE_CUDA + if( std::is_same< typename DistributedNDArray::DeviceType, Devices::Cuda >::value ) + cudaSetDevice(gpu_id); + #endif + + // fill send buffers + TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), CopyHelper >::execHost( buffers, array_view, true ); + + // issue all send and receive async operations + std::vector< typename Communicator::Request > requests; + const typename 
Communicator::CommunicationGroup group = array_view.getCommunicationGroup(); + TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), SendHelper >::execHost( buffers, requests, group ); + + // wait until send is done + Communicator::WaitAll( requests.data(), requests.size() ); + + // copy data from receive buffers + TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), CopyHelper >::execHost( buffers, array_view, false ); + } + + template< std::size_t dim > + struct AllocateHelper + { + static void exec( Buffers& buffers, const DistributedNDArrayView& array_view ) + { + auto& dim_buffers = buffers.template getDimBuffers< dim >(); + + constexpr std::size_t overlap = __ndarray_impl::get< dim >( typename DistributedNDArray::OverlapsType{} ); + // TODO +// constexpr std::size_t overlap = array_view.template getOverlap< dim >(); + if( overlap == 0 ) { + dim_buffers.reset(); + return; + } + + using LocalBegins = typename DistributedNDArray::LocalBeginsType; + using SizesHolder = typename DistributedNDArray::SizesHolderType; + const LocalBegins& localBegins = array_view.getLocalBegins(); + const SizesHolder& localEnds = array_view.getLocalEnds(); + + SizesHolder bufferSize( localEnds ); + bufferSize.template setSize< dim >( overlap ); + + dim_buffers.left_send_buffer.setSize( bufferSize ); + dim_buffers.left_recv_buffer.setSize( bufferSize ); + dim_buffers.right_send_buffer.setSize( bufferSize ); + dim_buffers.right_recv_buffer.setSize( bufferSize ); + + // TODO: check overlap offsets for 2D and 3D distributions (watch out for the corners - maybe use SetSizesSubtractOverlapsHelper?) 
+ + // offsets for left-send + dim_buffers.left_send_offsets = localBegins; + + // offsets for left-receive + dim_buffers.left_recv_offsets = localBegins; + dim_buffers.left_recv_offsets.template setSize< dim >( localBegins.template getSize< dim >() - overlap ); + + // offsets for right-send + dim_buffers.right_send_offsets = localBegins; + dim_buffers.right_send_offsets.template setSize< dim >( localEnds.template getSize< dim >() - overlap ); + + // offsets for right-receive + dim_buffers.right_recv_offsets = localBegins; + dim_buffers.right_recv_offsets.template setSize< dim >( localEnds.template getSize< dim >() ); + + // FIXME: set proper neighbor IDs !!! + const typename Communicator::CommunicationGroup group = array_view.getCommunicationGroup(); + const int rank = Communicator::GetRank(group); + const int nproc = Communicator::GetSize(group); + dim_buffers.left_neighbor = (rank + nproc - 1) % nproc; + dim_buffers.right_neighbor = (rank + 1) % nproc; + } + }; + + template< std::size_t dim > + struct CopyHelper + { + static void exec( Buffers& buffers, DistributedNDArrayView& array_view, bool to_buffer ) + { + const std::size_t overlap = __ndarray_impl::get< dim >( typename DistributedNDArray::OverlapsType{} ); + if( overlap == 0 ) + return; + + auto& dim_buffers = buffers.template getDimBuffers< dim >(); + + // TODO: specify CUDA stream for the copy, otherwise async won't work !!! 
+ CopyKernel< decltype(dim_buffers.left_send_buffer.getView()) > copy_kernel; + copy_kernel.array_view.bind( array_view ); + copy_kernel.to_buffer = to_buffer; + + if( to_buffer ) { + copy_kernel.buffer_view.bind( dim_buffers.left_send_buffer.getView() ); + copy_kernel.array_offsets = dim_buffers.left_send_offsets; + dim_buffers.left_send_buffer.forAll( copy_kernel ); + + copy_kernel.buffer_view.bind( dim_buffers.right_send_buffer.getView() ); + copy_kernel.array_offsets = dim_buffers.right_send_offsets; + dim_buffers.right_send_buffer.forAll( copy_kernel ); + } + else { + copy_kernel.buffer_view.bind( dim_buffers.left_recv_buffer.getView() ); + copy_kernel.array_offsets = dim_buffers.left_recv_offsets; + dim_buffers.left_recv_buffer.forAll( copy_kernel ); + + copy_kernel.buffer_view.bind( dim_buffers.right_recv_buffer.getView() ); + copy_kernel.array_offsets = dim_buffers.right_recv_offsets; + dim_buffers.right_recv_buffer.forAll( copy_kernel ); + } + } + }; + + template< std::size_t dim > + struct SendHelper + { + template< typename Requests, typename Group > + static void exec( Buffers& buffers, Requests& requests, Group group ) + { + const std::size_t overlap = __ndarray_impl::get< dim >( typename DistributedNDArray::OverlapsType{} ); + if( overlap == 0 ) + return; + + auto& dim_buffers = buffers.template getDimBuffers< dim >(); + + requests.push_back( Communicator::ISend( dim_buffers.left_send_buffer.getStorageArray().getData(), + dim_buffers.left_send_buffer.getStorageSize(), + dim_buffers.left_neighbor, 0, group ) ); + requests.push_back( Communicator::IRecv( dim_buffers.left_recv_buffer.getStorageArray().getData(), + dim_buffers.left_recv_buffer.getStorageSize(), + dim_buffers.left_neighbor, 1, group ) ); + requests.push_back( Communicator::ISend( dim_buffers.right_send_buffer.getStorageArray().getData(), + dim_buffers.right_send_buffer.getStorageSize(), + dim_buffers.right_neighbor, 1, group ) ); + requests.push_back( Communicator::IRecv( 
dim_buffers.right_recv_buffer.getStorageArray().getData(), + dim_buffers.right_recv_buffer.getStorageSize(), + dim_buffers.right_neighbor, 0, group ) ); + } + }; + +#ifdef __NVCC__ +public: +#endif + template< typename BufferView > + struct CopyKernel + { + using ArrayView = typename DistributedNDArray::ViewType; + using LocalBegins = typename ArrayView::LocalBeginsType; + + BufferView buffer_view; + ArrayView array_view; + LocalBegins array_offsets; + bool to_buffer; + + template< typename... Indices > + __cuda_callable__ + void operator()( Indices... indices ) + { + if( to_buffer ) + buffer_view( indices... ) = call_with_shifted_indices( array_offsets, array_view, indices... ); + else + call_with_shifted_indices( array_offsets, array_view, indices... ) = buffer_view( indices... ); + } + }; +}; + +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/DistributedNDArrayView.h b/src/TNL/Containers/DistributedNDArrayView.h new file mode 100644 index 0000000000000000000000000000000000000000..102985e9c15e4ff0d058dc79c04ff14b7ae2194b --- /dev/null +++ b/src/TNL/Containers/DistributedNDArrayView.h @@ -0,0 +1,417 @@ +/*************************************************************************** + DistributedNDArrayView.h - description + ------------------- + begin : Dec 27, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include +#include + +namespace TNL { +namespace Containers { + +template< typename NDArrayView, + typename Communicator = Communicators::MpiCommunicator, + typename Overlaps = __ndarray_impl::make_constant_index_sequence< NDArrayView::getDimension(), 0 > > +class DistributedNDArrayView +{ + using CommunicationGroup = typename Communicator::CommunicationGroup; +public: + using ValueType = typename NDArrayView::ValueType; + using DeviceType = typename NDArrayView::DeviceType; + using IndexType = typename NDArrayView::IndexType; + using SizesHolderType = typename NDArrayView::SizesHolderType; + using PermutationType = typename NDArrayView::PermutationType; + using CommunicatorType = Communicator; + using LocalBeginsType = __ndarray_impl::LocalBeginsHolder< typename NDArrayView::SizesHolderType >; + using LocalRangeType = Subrange< IndexType >; + using OverlapsType = Overlaps; + using LocalIndexerType = NDArrayIndexer< SizesHolderType, PermutationType, typename NDArrayView::NDBaseType, typename NDArrayView::StridesHolderType, Overlaps >; + + using ViewType = DistributedNDArrayView< NDArrayView, Communicator, Overlaps >; + using ConstViewType = DistributedNDArrayView< typename NDArrayView::ConstViewType, Communicator, Overlaps >; + using LocalViewType = NDArrayView; + using ConstLocalViewType = typename NDArrayView::ConstViewType; + + static_assert( Overlaps::size() == NDArrayView::getDimension(), "invalid overlaps" ); + + __cuda_callable__ + DistributedNDArrayView() = default; + + // explicit initialization by local array view, global sizes and local begins and ends + __cuda_callable__ + DistributedNDArrayView( NDArrayView localView, SizesHolderType globalSizes, LocalBeginsType localBegins, SizesHolderType localEnds, CommunicationGroup 
group ) + : localView(localView), group(group), globalSizes(globalSizes), localBegins(localBegins), localEnds(localEnds) {} + + // Copy-constructor does shallow copy, so views can be passed-by-value into + // CUDA kernels and they can be captured-by-value in __cuda_callable__ + // lambda functions. + __cuda_callable__ + DistributedNDArrayView( const DistributedNDArrayView& ) = default; + + // default move-constructor + __cuda_callable__ + DistributedNDArrayView( DistributedNDArrayView&& ) = default; + + // Copy-assignment does deep copy, just like regular array, but the sizes + // must match (i.e. copy-assignment cannot resize). + __cuda_callable__ + DistributedNDArrayView& operator=( const DistributedNDArrayView& other ) = default; + + // There is no move-assignment operator, so expressions like `a = b.getView()` + // are resolved as copy-assignment. + + // Templated copy-assignment + template< typename OtherArray > + DistributedNDArrayView& operator=( const OtherArray& other ) + { + globalSizes = other.getSizes(); + localBegins = other.getLocalBegins(); + localEnds = other.getLocalEnds(); + group = other.getCommunicationGroup(); + localView = other.getConstLocalView(); + return *this; + } + + // methods for rebinding (reinitialization) + __cuda_callable__ + void bind( DistributedNDArrayView view ) + { + localView.bind( view.localView ); + group = view.group; + globalSizes = view.globalSizes; + localBegins = view.localBegins; + localEnds = view.localEnds; + } + + // binds to the given raw pointer and changes the indexer + __cuda_callable__ + void bind( ValueType* data, LocalIndexerType indexer ) + { + localView.bind( data, indexer ); + localView.bind( data ); + } + + // binds to the given raw pointer and preserves the current indexer + __cuda_callable__ + void bind( ValueType* data ) + { + localView.bind( data ); + } + + __cuda_callable__ + void reset() + { + localView.reset(); + group = CommunicatorType::NullGroup; + globalSizes = SizesHolderType{}; + localBegins 
= LocalBeginsType{}; + localEnds = SizesHolderType{}; + } + + static constexpr std::size_t getDimension() + { + return NDArrayView::getDimension(); + } + + __cuda_callable__ + CommunicationGroup getCommunicationGroup() const + { + return group; + } + + // Returns the *global* sizes + __cuda_callable__ + const SizesHolderType& getSizes() const + { + return globalSizes; + } + + // Returns the *global* size + template< std::size_t level > + __cuda_callable__ + IndexType getSize() const + { + return globalSizes.template getSize< level >(); + } + + __cuda_callable__ + LocalBeginsType getLocalBegins() const + { + return localBegins; + } + + __cuda_callable__ + SizesHolderType getLocalEnds() const + { + return localEnds; + } + + template< std::size_t level > + __cuda_callable__ + LocalRangeType getLocalRange() const + { + return LocalRangeType( localBegins.template getSize< level >(), localEnds.template getSize< level >() ); + } + + // returns the local storage size + __cuda_callable__ + IndexType getLocalStorageSize() const + { + return localView.getStorageSize(); + } + + LocalIndexerType getLocalIndexer() const + { + return LocalIndexerType( localEnds - localBegins, typename NDArrayView::StridesHolderType{} ); + } + + LocalViewType getLocalView() + { + return localView; + } + + ConstLocalViewType getConstLocalView() const + { + return localView.getConstView(); + } + + // returns the *local* storage index for given *global* indices + template< typename... IndexTypes > + __cuda_callable__ + IndexType + getStorageIndex( IndexTypes&&... indices ) const + { + static_assert( sizeof...( indices ) == SizesHolderType::getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); + auto getStorageIndex = [this]( auto&&... indices ) + { + return this->localView.getStorageIndex( std::forward< decltype(indices) >( indices )... 
); + }; + return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, getStorageIndex, std::forward< IndexTypes >( indices )... ); + } + + __cuda_callable__ + ValueType* getData() + { + return localView.getData(); + } + + __cuda_callable__ + std::add_const_t< ValueType >* getData() const + { + return localView.getData(); + } + + + template< typename... IndexTypes > + __cuda_callable__ + ValueType& + operator()( IndexTypes&&... indices ) + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); + return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, localView, std::forward< IndexTypes >( indices )... ); + } + + template< typename... IndexTypes > + __cuda_callable__ + const ValueType& + operator()( IndexTypes&&... indices ) const + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); + return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, localView, std::forward< IndexTypes >( indices )... 
); + } + + // bracket operator for 1D arrays + __cuda_callable__ + ValueType& + operator[]( IndexType index ) + { + static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexType >( index ) ); + return localView[ __ndarray_impl::get<0>( Overlaps{} ) + index - localBegins.template getSize< 0 >() ]; + } + + __cuda_callable__ + const ValueType& + operator[]( IndexType index ) const + { + static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexType >( index ) ); + return localView[ __ndarray_impl::get<0>( Overlaps{} ) + index - localBegins.template getSize< 0 >() ]; + } + + __cuda_callable__ + ViewType getView() + { + return ViewType( *this ); + } + + __cuda_callable__ + ConstViewType getConstView() const + { + return ConstViewType( localView, globalSizes, localBegins, localEnds, group ); + } + + // TODO: overlaps should be skipped, otherwise it works only after synchronization + bool operator==( const DistributedNDArrayView& other ) const + { + // we can't run allreduce if the communication groups are different + if( group != other.getCommunicationGroup() ) + return false; + const bool localResult = + globalSizes == other.globalSizes && + localBegins == other.localBegins && + localEnds == other.localEnds && + localView == other.localView; + bool result = true; + if( group != CommunicatorType::NullGroup ) + CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group ); + return result; + } + + bool operator!=( const DistributedNDArrayView& other ) const + { + return ! 
(*this == other); + } + + // iterate over all local elements + template< typename Device2 = DeviceType, typename Func > + void forAll( Func f ) const + { + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( localBegins, localEnds, f ); + } + + // iterate over local elements which are not neighbours of *global* boundaries + template< typename Device2 = DeviceType, typename Func > + void forInternal( Func f ) const + { + // add static sizes + using Begins = __ndarray_impl::LocalBeginsHolder< SizesHolderType, 1 >; + // add dynamic sizes + Begins begins; + __ndarray_impl::SetSizesAddHelper< 1, Begins, SizesHolderType, Overlaps >::add( begins, SizesHolderType{} ); + __ndarray_impl::SetSizesMaxHelper< Begins, LocalBeginsType >::max( begins, localBegins ); + + // subtract static sizes + using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolderType, 1 >::type; + // subtract dynamic sizes + Ends ends; + __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolderType, Overlaps >::subtract( ends, globalSizes ); + __ndarray_impl::SetSizesMinHelper< Ends, SizesHolderType >::min( ends, localEnds ); + + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, ends, f ); + } + + // iterate over local elements inside the given [begins, ends) range specified by global indices + template< typename Device2 = DeviceType, typename Func, typename Begins, typename Ends > + void forInternal( Func f, const Begins& begins, const Ends& ends ) const + { + // TODO: assert "localBegins <= begins <= localEnds", "localBegins <= ends <= localEnds" + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, ends, f ); + } + + // iterate over local elements which are neighbours of *global* boundaries + template< typename Device2 = DeviceType, typename Func > + void forBoundary( Func f ) const + { + // add static sizes + using SkipBegins = __ndarray_impl::LocalBeginsHolder< 
SizesHolderType, 1 >; + // add dynamic sizes + SkipBegins skipBegins; + __ndarray_impl::SetSizesAddHelper< 1, SkipBegins, SizesHolderType, Overlaps >::add( skipBegins, SizesHolderType{} ); + __ndarray_impl::SetSizesMaxHelper< SkipBegins, LocalBeginsType >::max( skipBegins, localBegins ); + + // subtract static sizes + using SkipEnds = typename __ndarray_impl::SubtractedSizesHolder< SizesHolderType, 1 >::type; + // subtract dynamic sizes + SkipEnds skipEnds; + __ndarray_impl::SetSizesSubtractHelper< 1, SkipEnds, SizesHolderType, Overlaps >::subtract( skipEnds, globalSizes ); + __ndarray_impl::SetSizesMinHelper< SkipEnds, SizesHolderType >::min( skipEnds, localEnds ); + + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( localBegins, skipBegins, skipEnds, localEnds, f ); + } + + // iterate over local elements outside the given [skipBegins, skipEnds) range specified by global indices + template< typename Device2 = DeviceType, typename Func, typename SkipBegins, typename SkipEnds > + void forBoundary( Func f, const SkipBegins& skipBegins, const SkipEnds& skipEnds ) const + { + // TODO: assert "localBegins <= skipBegins <= localEnds", "localBegins <= skipEnds <= localEnds" + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( localBegins, skipBegins, skipEnds, localEnds, f ); + } + + // iterate over local elements which are not neighbours of overlaps (if all overlaps are 0, it is equivalent to forAll) + template< typename Device2 = DeviceType, typename Func > + void forLocalInternal( Func f ) const + { + // add overlaps to dynamic sizes + LocalBeginsType begins; + __ndarray_impl::SetSizesAddOverlapsHelper< LocalBeginsType, SizesHolderType, Overlaps >::add( begins, localBegins ); + + // subtract overlaps from dynamic sizes + SizesHolderType ends; + __ndarray_impl::SetSizesSubtractOverlapsHelper< SizesHolderType, SizesHolderType, Overlaps >::subtract( ends, localEnds ); + + 
__ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, ends, f ); + } + + // iterate over local elements which are neighbours of overlaps (if all overlaps are 0, it has no effect) + template< typename Device2 = DeviceType, typename Func > + void forLocalBoundary( Func f ) const + { + // add overlaps to dynamic sizes + LocalBeginsType skipBegins; + __ndarray_impl::SetSizesAddOverlapsHelper< LocalBeginsType, SizesHolderType, Overlaps >::add( skipBegins, localBegins ); + + // subtract overlaps from dynamic sizes + SizesHolderType skipEnds; + __ndarray_impl::SetSizesSubtractOverlapsHelper< SizesHolderType, SizesHolderType, Overlaps >::subtract( skipEnds, localEnds ); + + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( localBegins, skipBegins, skipEnds, localEnds, f ); + } + + // iterate over elements of overlaps (if all overlaps are 0, it has no effect) + template< typename Device2 = DeviceType, typename Func > + void forOverlaps( Func f ) const + { + // subtract overlaps from dynamic sizes + LocalBeginsType begins; + __ndarray_impl::SetSizesSubtractOverlapsHelper< LocalBeginsType, SizesHolderType, Overlaps >::subtract( begins, localBegins ); + + // add overlaps to dynamic sizes + SizesHolderType ends; + __ndarray_impl::SetSizesAddOverlapsHelper< SizesHolderType, SizesHolderType, Overlaps >::add( ends, localEnds ); + + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, localBegins, localEnds, ends, f ); + } + +protected: + NDArrayView localView; + CommunicationGroup group = Communicator::NullGroup; + SizesHolderType globalSizes; + // static sizes should have different type: localBegin is always 0, localEnd is always the full size + LocalBeginsType localBegins; + SizesHolderType localEnds; +}; + +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/NDArray.h b/src/TNL/Containers/NDArray.h new file mode 100644 index 
0000000000000000000000000000000000000000..8472f4d7151b9896a20c3f20af5d302286969022 --- /dev/null +++ b/src/TNL/Containers/NDArray.h @@ -0,0 +1,422 @@ +/*************************************************************************** + NDArray.h - description + ------------------- + begin : Dec 24, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include + +#include + +namespace TNL { +namespace Containers { + +template< std::size_t slicedDimension = 0, + std::size_t sliceSize = 0 > +struct SliceInfo +{ + // sliceSize == 0 means no slicing + static constexpr std::size_t getSliceSize( std::size_t dimension ) + { + return (dimension == slicedDimension) ? sliceSize : 0; + } +}; + + + + +template< typename Array, + typename SizesHolder, + typename Permutation, + typename Base, + typename Device = typename Array::DeviceType > +class NDArrayStorage + : public NDArrayIndexer< SizesHolder, Permutation, Base > +{ +public: + using StorageArray = Array; + using ValueType = typename Array::ValueType; + using DeviceType = Device; + using IndexType = typename Array::IndexType; + using SizesHolderType = SizesHolder; + using PermutationType = Permutation; + using IndexerType = NDArrayIndexer< SizesHolder, Permutation, Base >; + using ViewType = NDArrayView< ValueType, DeviceType, SizesHolder, Permutation, Base >; + using ConstViewType = NDArrayView< std::add_const_t< ValueType >, DeviceType, SizesHolder, Permutation, Base >; + + static_assert( Permutation::size() == SizesHolder::getDimension(), "invalid permutation" ); + + // all methods from NDArrayView + + NDArrayStorage() = default; + + // The copy-constructor of TNL::Containers::Array makes shallow copy so our + // copy-constructor cannot be default. 
Actually, we most likely don't need + // it anyway, so let's just delete it. + NDArrayStorage( const NDArrayStorage& ) = delete; + + // Standard copy-semantics with deep copy, just like regular 1D array. + // Mismatched sizes cause reallocations. + NDArrayStorage& operator=( const NDArrayStorage& other ) = default; + + // default move-semantics + NDArrayStorage( NDArrayStorage&& ) = default; + NDArrayStorage& operator=( NDArrayStorage&& ) = default; + + // Templated copy-assignment + template< typename OtherArray > + NDArrayStorage& operator=( const OtherArray& other ) + { + static_assert( std::is_same< PermutationType, typename OtherArray::PermutationType >::value, + "Arrays must have the same permutation of indices." ); + // update sizes + __ndarray_impl::SetSizesCopyHelper< SizesHolderType, typename OtherArray::SizesHolderType >::copy( getSizes(), other.getSizes() ); + // (re)allocate storage if necessary + array.setSize( getStorageSize() ); + // copy data + getView() = other.getConstView(); + return *this; + } + + bool operator==( const NDArrayStorage& other ) const + { + // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray + return getSizes() == other.getSizes() && array == other.array; + } + + bool operator!=( const NDArrayStorage& other ) const + { + // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray + return getSizes() != other.getSizes() || array != other.array; + } + + __cuda_callable__ + ValueType* getData() + { + return array.getData(); + } + + __cuda_callable__ + std::add_const_t< ValueType >* getData() const + { + return array.getData(); + } + + // methods from the base class + using IndexerType::getDimension; + using IndexerType::getSizes; + using IndexerType::getSize; + using IndexerType::getStride; + using IndexerType::getStorageSize; + using IndexerType::getStorageIndex; + + __cuda_callable__ + const IndexerType& getIndexer() const + { + return *this; + } + + __cuda_callable__ + 
ViewType getView() + { + return ViewType( array.getData(), getSizes() ); + } + + __cuda_callable__ + ConstViewType getConstView() const + { + return ConstViewType( array.getData(), getSizes() ); + } + + template< std::size_t... Dimensions, typename... IndexTypes > + __cuda_callable__ + auto getSubarrayView( IndexTypes&&... indices ) + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + static_assert( 0 < sizeof...(Dimensions) && sizeof...(Dimensions) <= getDimension(), "got wrong number of dimensions" ); + static_assert( __ndarray_impl::all_elements_in_range( 0, Permutation::size(), {Dimensions...} ), + "invalid dimensions" ); +// FIXME: nvcc chokes on the variadic brace-initialization +#ifndef __NVCC__ + static_assert( __ndarray_impl::is_increasing_sequence( {Dimensions...} ), + "specifying permuted dimensions is not supported" ); +#endif + + using Getter = __ndarray_impl::SubarrayGetter< Base, Permutation, Dimensions... >; + using Subpermutation = typename Getter::Subpermutation; + auto& begin = operator()( std::forward< IndexTypes >( indices )... ); + auto subarray_sizes = Getter::filterSizes( getSizes(), std::forward< IndexTypes >( indices )... ); + auto strides = Getter::getStrides( getSizes(), std::forward< IndexTypes >( indices )... ); + static_assert( Subpermutation::size() == sizeof...(Dimensions), "Bug - wrong subpermutation length." ); + static_assert( decltype(subarray_sizes)::getDimension() == sizeof...(Dimensions), "Bug - wrong dimension of the new sizes." ); + static_assert( decltype(strides)::getDimension() == sizeof...(Dimensions), "Bug - wrong dimension of the strides." ); + using SubarrayView = NDArrayView< ValueType, Device, decltype(subarray_sizes), Subpermutation, Base, decltype(strides) >; + return SubarrayView{ &begin, subarray_sizes, strides }; + } + + template< typename... IndexTypes > + __cuda_callable__ + ValueType& + operator()( IndexTypes&&... 
indices ) + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexTypes >( indices )... ); + TNL_ASSERT_LT( getStorageIndex( std::forward< IndexTypes >( indices )... ), getStorageSize(), + "storage index out of bounds - either input error or a bug in the indexer" ); + return array[ getStorageIndex( std::forward< IndexTypes >( indices )... ) ]; + } + + template< typename... IndexTypes > + __cuda_callable__ + const ValueType& + operator()( IndexTypes&&... indices ) const + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexTypes >( indices )... ); + TNL_ASSERT_LT( getStorageIndex( std::forward< IndexTypes >( indices )... ), getStorageSize(), + "storage index out of bounds - either input error or a bug in the indexer" ); + return array[ getStorageIndex( std::forward< IndexTypes >( indices )... 
) ]; + } + + // bracket operator for 1D arrays + __cuda_callable__ + ValueType& + operator[]( IndexType index ) + { + static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); + __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexType >( index ) ); + return array[ index ]; + } + + __cuda_callable__ + const ValueType& + operator[]( IndexType index ) const + { + static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); + __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexType >( index ) ); + return array[ index ]; + } + + template< typename Device2 = DeviceType, typename Func > + void forAll( Func f ) const + { + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >; + dispatch( Begins{}, getSizes(), f ); + } + + template< typename Device2 = DeviceType, typename Func > + void forInternal( Func f ) const + { + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 1 >; + // subtract static sizes + using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type; + // subtract dynamic sizes + Ends ends; + __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolder >::subtract( ends, getSizes() ); + dispatch( Begins{}, ends, f ); + } + + template< typename Device2 = DeviceType, typename Func, typename Begins, typename Ends > + void forInternal( Func f, const Begins& begins, const Ends& ends ) const + { + // TODO: assert "begins <= sizes", "ends <= sizes" + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, ends, f ); + } + + template< typename Device2 = DeviceType, typename Func > + void forBoundary( Func f ) const + { + using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >; + using SkipBegins = 
ConstStaticSizesHolder< IndexType, getDimension(), 1 >; + // subtract static sizes + using SkipEnds = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type; + // subtract dynamic sizes + SkipEnds skipEnds; + __ndarray_impl::SetSizesSubtractHelper< 1, SkipEnds, SizesHolder >::subtract( skipEnds, getSizes() ); + + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( Begins{}, SkipBegins{}, skipEnds, getSizes(), f ); + } + + template< typename Device2 = DeviceType, typename Func, typename SkipBegins, typename SkipEnds > + void forBoundary( Func f, const SkipBegins& skipBegins, const SkipEnds& skipEnds ) const + { + // TODO: assert "skipBegins <= sizes", "skipEnds <= sizes" + using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >; + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( Begins{}, skipBegins, skipEnds, getSizes(), f ); + } + + + // extra methods + + // TODO: rename to setSizes and make sure that overloading with the following method works + void setSize( const SizesHolderType& sizes ) + { + getSizes() = sizes; + array.setSize( getStorageSize() ); + } + + template< typename... IndexTypes > + void setSizes( IndexTypes&&... sizes ) + { + static_assert( sizeof...( sizes ) == getDimension(), "got wrong number of sizes" ); + __ndarray_impl::setSizesHelper( getSizes(), std::forward< IndexTypes >( sizes )... ); + array.setSize( getStorageSize() ); + } + + void setLike( const NDArrayStorage& other ) + { + getSizes() = other.getSizes(); + array.setSize( getStorageSize() ); + } + + void reset() + { + getSizes() = SizesHolder{}; + TNL_ASSERT_EQ( getStorageSize(), 0, "Failed to reset the sizes." ); + array.reset(); + } + + // "safe" accessor - will do slow copy from device + template< typename... IndexTypes > + ValueType + getElement( IndexTypes&&... 
indices ) const + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexTypes >( indices )... ); + TNL_ASSERT_LT( getStorageIndex( std::forward< IndexTypes >( indices )... ), getStorageSize(), + "storage index out of bounds - either input error or a bug in the indexer" ); + return array.getElement( getStorageIndex( std::forward< IndexTypes >( indices )... ) ); + } + + const StorageArray& getStorageArray() const + { + return array; + } + + StorageArray& getStorageArray() + { + return array; + } + + void setValue( ValueType value ) + { + array.setValue( value ); + } + +protected: + StorageArray array; + IndexerType indexer; +}; + +template< typename Value, + typename SizesHolder, + typename Permutation = std::make_index_sequence< SizesHolder::getDimension() >, // identity by default + typename Device = Devices::Host, + typename Index = typename SizesHolder::IndexType > +class NDArray +: public NDArrayStorage< Array< Value, Device, Index >, + SizesHolder, + Permutation, + __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > > +{ + using Base = NDArrayStorage< Array< Value, Device, Index >, + SizesHolder, + Permutation, + __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > >; + +public: + // inherit all assignment operators + using Base::operator=; +}; + +template< typename Value, + typename SizesHolder, + typename Permutation = std::make_index_sequence< SizesHolder::getDimension() >, // identity by default + typename Index = typename SizesHolder::IndexType > +class StaticNDArray +: public NDArrayStorage< StaticArray< __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get(), Value >, + SizesHolder, + Permutation, + __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > >, + void > +{ + using Base = NDArrayStorage< StaticArray< __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get(), Value >, + SizesHolder, + Permutation, + __ndarray_impl::NDArrayBase< SliceInfo< 0, 
0 > >, + void >; + static_assert( __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get() > 0, + "All dimensions of a static array must to be positive." ); + +public: + // inherit all assignment operators + using Base::operator=; +}; + +template< typename Value, + std::size_t Rows, + std::size_t Columns, + typename Permutation = std::index_sequence< 0, 1 > > // identity by default +class StaticMatrix +: public StaticNDArray< Value, + SizesHolder< std::size_t, Rows, Columns >, + Permutation > +{ + using Base = StaticNDArray< Value, + SizesHolder< std::size_t, Rows, Columns >, + Permutation >; + +public: + // inherit all assignment operators + using Base::operator=; + + static constexpr std::size_t getRows() + { + return Rows; + } + + __cuda_callable__ + static constexpr std::size_t getColumns() + { + return Columns; + } +}; + +template< typename Value, + typename SizesHolder, + typename Permutation = std::make_index_sequence< SizesHolder::getDimension() >, // identity by default + typename SliceInfo = SliceInfo<>, // no slicing by default + typename Device = Devices::Host, + typename Index = typename SizesHolder::IndexType > +class SlicedNDArray +: public NDArrayStorage< Array< Value, Device, Index >, + SizesHolder, + Permutation, + __ndarray_impl::SlicedNDArrayBase< SliceInfo > > +{ + using Base = NDArrayStorage< Array< Value, Device, Index >, + SizesHolder, + Permutation, + __ndarray_impl::SlicedNDArrayBase< SliceInfo > >; + +public: + // inherit all assignment operators + using Base::operator=; +}; + +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/NDArrayIndexer.h b/src/TNL/Containers/NDArrayIndexer.h new file mode 100644 index 0000000000000000000000000000000000000000..e3f068e0cacb774db90122c25617e8531ae52787 --- /dev/null +++ b/src/TNL/Containers/NDArrayIndexer.h @@ -0,0 +1,112 @@ +/*************************************************************************** + NDArrayIndexer.h - description + ------------------- + begin : Apr 
14, 2019
 copyright            : (C) 2019 by Tomas Oberhuber et al.
 email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Jakub Klinkovsky

#pragma once

// NOTE(review): the angle-bracket targets of these #include directives were
// stripped when this diff was flattened to plain text; the trailing comments
// indicate they provided StorageSizeGetter and DummyStrideBase — restore the
// original header paths from the repository before compiling.
#include
#include // StorageSizeGetter
#include // DummyStrideBase

namespace TNL {
namespace Containers {

// Translates a multi-dimensional index into a flat storage offset.
// The dynamic sizes are stored by value; the strides are inherited from
// StridesHolder (the default DummyStrideBase is presumably empty, so a
// contiguous indexer adds no per-stride storage — TODO confirm in the
// StridesHolder header). The Base policy class implements the actual
// offset computation; Permutation fixes the storage order of the axes and
// Overlaps adds ghost regions per axis (default: none).
template< typename SizesHolder,
          typename Permutation,
          typename Base,
          typename StridesHolder = __ndarray_impl::DummyStrideBase< typename SizesHolder::IndexType, SizesHolder::getDimension() >,
          typename Overlaps = __ndarray_impl::make_constant_index_sequence< SizesHolder::getDimension(), 0 > >
class NDArrayIndexer
: public StridesHolder
{
public:
   using IndexType = typename SizesHolder::IndexType;
   using NDBaseType = Base;
   using SizesHolderType = SizesHolder;
   using StridesHolderType = StridesHolder;
   using PermutationType = Permutation;
   using OverlapsType = Overlaps;

   // All per-dimension metadata must agree on the rank.
   static_assert( StridesHolder::getDimension() == SizesHolder::getDimension(),
                  "Dimension of strides does not match the dimension of sizes." );
   static_assert( Permutation::size() == SizesHolder::getDimension(),
                  "Dimension of permutation does not match the dimension of sizes." );
   static_assert( Overlaps::size() == SizesHolder::getDimension(),
                  "Dimension of overlaps does not match the dimension of sizes." );

   __cuda_callable__
   NDArrayIndexer() = default;

   // explicit initialization by sizes and strides
   __cuda_callable__
   NDArrayIndexer( SizesHolder sizes, StridesHolder strides )
   : StridesHolder(strides), sizes(sizes) {}

   // Rank of the array (number of dimensions).
   static constexpr std::size_t getDimension()
   {
      return SizesHolder::getDimension();
   }

   __cuda_callable__
   const SizesHolderType& getSizes() const
   {
      return sizes;
   }

   // Size of one dimension, selected at compile time.
   template< std::size_t level >
   __cuda_callable__
   IndexType getSize() const
   {
      return sizes.template getSize< level >();
   }

   // method template from base class
   using StridesHolder::getStride;

   // Compile-time overlap (ghost-region width) of the given dimension.
   template< std::size_t level >
   static constexpr std::size_t getOverlap()
   {
      return __ndarray_impl::get< level >( Overlaps{} );
   }

   // returns the product of the aligned sizes (i.e. the number of elements
   // that must be allocated, which may exceed the product of the logical
   // sizes due to alignment and overlaps)
   __cuda_callable__
   IndexType getStorageSize() const
   {
      using Alignment = typename Base::template Alignment< Permutation >;
      return __ndarray_impl::StorageSizeGetter< SizesHolder, Alignment, Overlaps >::get( sizes );
   }

   // Flat offset of the element at the given multi-index; the heavy lifting
   // (permutation, overlaps, strides) is delegated to the Base policy class.
   template< typename... IndexTypes >
   __cuda_callable__
   IndexType
   getStorageIndex( IndexTypes&&... indices ) const
   {
      static_assert( sizeof...( indices ) == SizesHolder::getDimension(), "got wrong number of indices" );
      return Base::template getStorageIndex< Permutation, Overlaps >
             ( sizes,
               static_cast< const StridesHolder& >( *this ),
               std::forward< IndexTypes >( indices )... );
   }

protected:
   // non-const reference accessor cannot be public - only subclasses like NDArrayStorage may modify the sizes
   __cuda_callable__
   SizesHolderType& getSizes()
   {
      return sizes;
   }

   SizesHolder sizes;
};

} // namespace Containers
} // namespace TNL
diff --git a/src/TNL/Containers/NDArrayView.h b/src/TNL/Containers/NDArrayView.h new file mode 100644 index 0000000000000000000000000000000000000000..54a020a64e5b5130400c6e954ae92840cb0fe1df --- /dev/null +++ b/src/TNL/Containers/NDArrayView.h
/***************************************************************************
 NDArrayView.h  -  description
 -------------------
 begin                : Dec 24, 2018
 copyright            : (C) 2018 by Tomas Oberhuber et al.
 email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Jakub Klinkovsky

#pragma once

// NOTE(review): include targets were lost in the text mangling — restore the
// original header paths from the repository before compiling.
#include
#include
#include
#include
#include
#include
#include

namespace TNL {
namespace Containers {

// Non-owning view of an N-dimensional array: a raw data pointer plus an
// inherited NDArrayIndexer (sizes + strides). Views have shallow copy
// semantics for construction (so they can be passed by value into CUDA
// kernels) and deep copy semantics for assignment.
template< typename Value,
          typename Device,
          typename SizesHolder,
          typename Permutation,
          typename Base,
          typename StridesHolder = __ndarray_impl::DummyStrideBase< typename SizesHolder::IndexType, SizesHolder::getDimension() > >
class NDArrayView
: public NDArrayIndexer< SizesHolder, Permutation, Base, StridesHolder >
{
public:
   using ValueType = Value;
   using DeviceType = Device;
   using IndexType = typename SizesHolder::IndexType;
   using SizesHolderType = SizesHolder;
   using PermutationType = Permutation;
   using IndexerType = NDArrayIndexer< SizesHolder, Permutation, Base, StridesHolder >;
   using ViewType = NDArrayView< Value, Device, SizesHolder, Permutation, Base, StridesHolder >;
   using ConstViewType = NDArrayView< std::add_const_t< Value >, Device, SizesHolder, Permutation, Base, StridesHolder >;

   static_assert( Permutation::size() == SizesHolder::getDimension(), "invalid
permutation" );

   __cuda_callable__
   NDArrayView() = default;

   // explicit initialization by raw data pointer and sizes and strides
   __cuda_callable__
   NDArrayView( Value* data, SizesHolder sizes, StridesHolder strides = StridesHolder{} )
   : IndexerType(sizes, strides), array(data) {}

   // explicit initialization by raw data pointer and indexer
   __cuda_callable__
   NDArrayView( Value* data, IndexerType indexer )
   : IndexerType(indexer), array(data) {}

   // Copy-constructor does shallow copy, so views can be passed-by-value into
   // CUDA kernels and they can be captured-by-value in __cuda_callable__
   // lambda functions.
   __cuda_callable__
   NDArrayView( const NDArrayView& ) = default;

   // default move-constructor
   __cuda_callable__
   NDArrayView( NDArrayView&& ) = default;

   // Copy-assignment does deep copy, just like regular array, but the sizes
   // must match (i.e. copy-assignment cannot resize).
   __cuda_callable__
   NDArrayView& operator=( const NDArrayView& other )
   {
      TNL_ASSERT_EQ( getSizes(), other.getSizes(), "The sizes of the array views must be equal, views are not resizable." );
      if( getStorageSize() > 0 )
         Algorithms::ArrayOperations< DeviceType >::copy( array, other.array, getStorageSize() );
      return *this;
   }

   // Templated copy-assignment: deep copy from a view that may live on a
   // different device (the cross-device transfer is delegated to
   // ArrayOperations< dest, src >). Only contiguous views with the same
   // permutation can be assigned this way.
   template< typename OtherView >
   NDArrayView& operator=( const OtherView& other )
   {
      static_assert( std::is_same< PermutationType, typename OtherView::PermutationType >::value,
                     "Arrays must have the same permutation of indices." );
      static_assert( NDArrayView::isContiguous() && OtherView::isContiguous(),
                     "Non-contiguous array views cannot be assigned." );
      TNL_ASSERT_TRUE( __ndarray_impl::sizesWeakCompare( getSizes(), other.getSizes() ),
                       "The sizes of the array views must be equal, views are not resizable." );
      if( getStorageSize() > 0 ) {
         TNL_ASSERT_TRUE( array, "Attempted to assign to an empty view." );
         Algorithms::ArrayOperations< DeviceType, typename OtherView::DeviceType >::copy( array, other.getData(), getStorageSize() );
      }
      return *this;
   }

   // There is no move-assignment operator, so expressions like `a = b.getView()`
   // are resolved as copy-assignment.

   // methods for rebinding (reinitialization)
   __cuda_callable__
   void bind( NDArrayView view )
   {
      IndexerType::operator=( view );
      array = view.array;
   }

   // binds to the given raw pointer and changes the indexer
   __cuda_callable__
   void bind( Value* data, IndexerType indexer )
   {
      IndexerType::operator=( indexer );
      array = data;
   }

   // binds to the given raw pointer and preserves the current indexer
   __cuda_callable__
   void bind( Value* data )
   {
      array = data;
   }

   // Detaches the view: null data pointer, default-constructed indexer.
   __cuda_callable__
   void reset()
   {
      IndexerType::operator=( IndexerType{} );
      array = nullptr;
   }

   // Deep element-wise comparison (sizes first, then storage contents).
   __cuda_callable__
   bool operator==( const NDArrayView& other ) const
   {
      if( getSizes() != other.getSizes() )
         return false;
      // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray
      return Algorithms::ArrayOperations< Device, Device >::compare( array, other.array, getStorageSize() );
   }

   __cuda_callable__
   bool operator!=( const NDArrayView& other ) const
   {
      if( getSizes() != other.getSizes() )
         return true;
      // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray
      return ! Algorithms::ArrayOperations< Device, Device >::compare( array, other.array, getStorageSize() );
   }

   __cuda_callable__
   ValueType* getData()
   {
      return array;
   }

   __cuda_callable__
   std::add_const_t< ValueType >* getData() const
   {
      return array;
   }

   // methods from the base class
   using IndexerType::getDimension;
   using IndexerType::getSizes;
   using IndexerType::getSize;
   using IndexerType::getStride;
   using IndexerType::getStorageSize;
   using IndexerType::getStorageIndex;

   // The view *is* its indexer (inheritance), so just upcast.
   __cuda_callable__
   const IndexerType& getIndexer() const
   {
      return *this;
   }

   __cuda_callable__
   ViewType getView()
   {
      return ViewType( *this );
   }

   __cuda_callable__
   ConstViewType getConstView() const
   {
      return ConstViewType( array, getSizes(), static_cast< const StridesHolder& >( *this ) );
   }

   // Returns a lower-dimensional, possibly strided view of the subarray
   // spanned by the template-selected Dimensions and anchored at the given
   // indices. The Dimensions must be given in increasing order (checked
   // below, except under nvcc).
   template< std::size_t... Dimensions, typename... IndexTypes >
   __cuda_callable__
   auto getSubarrayView( IndexTypes&&... indices )
   {
      static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" );
      static_assert( 0 < sizeof...(Dimensions) && sizeof...(Dimensions) <= getDimension(), "got wrong number of dimensions" );
      static_assert( __ndarray_impl::all_elements_in_range( 0, Permutation::size(), {Dimensions...} ),
                     "invalid dimensions" );
// FIXME: nvcc chokes on the variadic brace-initialization
#ifndef __NVCC__
      static_assert( __ndarray_impl::is_increasing_sequence( {Dimensions...} ),
                     "specifying permuted dimensions is not supported" );
#endif

      using Getter = __ndarray_impl::SubarrayGetter< Base, Permutation, Dimensions... >;
      using Subpermutation = typename Getter::Subpermutation;
      // address of the anchor element becomes the new view's base pointer
      auto& begin = operator()( std::forward< IndexTypes >( indices )... );
      auto subarray_sizes = Getter::filterSizes( getSizes(), std::forward< IndexTypes >( indices )... );
      auto strides = Getter::getStrides( getSizes(), std::forward< IndexTypes >( indices )... );
      static_assert( Subpermutation::size() == sizeof...(Dimensions), "Bug - wrong subpermutation length." );
      static_assert( decltype(subarray_sizes)::getDimension() == sizeof...(Dimensions), "Bug - wrong dimension of the new sizes." );
      static_assert( decltype(strides)::getDimension() == sizeof...(Dimensions), "Bug - wrong dimension of the strides." );
      using SubarrayView = NDArrayView< ValueType, Device, decltype(subarray_sizes), Subpermutation, Base, decltype(strides) >;
      return SubarrayView{ &begin, subarray_sizes, strides };
   }

   // Element access; indices are given in the logical (unpermuted) order.
   template< typename... IndexTypes >
   __cuda_callable__
   ValueType&
   operator()( IndexTypes&&... indices )
   {
      static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" );
      __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexTypes >( indices )... );
      return array[ getStorageIndex( std::forward< IndexTypes >( indices )... ) ];
   }

   template< typename... IndexTypes >
   __cuda_callable__
   const ValueType&
   operator()( IndexTypes&&... indices ) const
   {
      static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" );
      __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexTypes >( indices )... );
      return array[ getStorageIndex( std::forward< IndexTypes >( indices )...
) ]; + } + + // bracket operator for 1D arrays + __cuda_callable__ + ValueType& + operator[]( IndexType&& index ) + { + static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); + __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexType >( index ) ); + return array[ index ]; + } + + __cuda_callable__ + const ValueType& + operator[]( IndexType index ) const + { + static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); + __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexType >( index ) ); + return array[ index ]; + } + + template< typename Device2 = DeviceType, typename Func > + void forAll( Func f ) const + { + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >; + dispatch( Begins{}, getSizes(), f ); + } + + template< typename Device2 = DeviceType, typename Func > + void forInternal( Func f ) const + { + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 1 >; + // subtract static sizes + using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type; + // subtract dynamic sizes + Ends ends; + __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolder >::subtract( ends, getSizes() ); + dispatch( Begins{}, ends, f ); + } + + template< typename Device2 = DeviceType, typename Func, typename Begins, typename Ends > + void forInternal( Func f, const Begins& begins, const Ends& ends ) const + { + // TODO: assert "begins <= getSizes()", "ends <= getSizes()" + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, ends, f ); + } + + template< typename Device2 = DeviceType, typename Func > + void forBoundary( Func f ) const + { + using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >; + using SkipBegins 
= ConstStaticSizesHolder< IndexType, getDimension(), 1 >; + // subtract static sizes + using SkipEnds = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type; + // subtract dynamic sizes + SkipEnds skipEnds; + __ndarray_impl::SetSizesSubtractHelper< 1, SkipEnds, SizesHolder >::subtract( skipEnds, getSizes() ); + + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( Begins{}, SkipBegins{}, skipEnds, getSizes(), f ); + } + + template< typename Device2 = DeviceType, typename Func, typename SkipBegins, typename SkipEnds > + void forBoundary( Func f, const SkipBegins& skipBegins, const SkipEnds& skipEnds ) const + { + // TODO: assert "skipBegins <= getSizes()", "skipEnds <= getSizes()" + using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >; + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( Begins{}, skipBegins, skipEnds, getSizes(), f ); + } + +protected: + Value* array = nullptr; + IndexerType indexer; +}; + +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/ndarray/BoundaryExecutors.h b/src/TNL/Containers/ndarray/BoundaryExecutors.h new file mode 100644 index 0000000000000000000000000000000000000000..e4cd93705c7ae83dd36378662fa67b2e618f66eb --- /dev/null +++ b/src/TNL/Containers/ndarray/BoundaryExecutors.h @@ -0,0 +1,413 @@ +/*************************************************************************** + BoundaryExecutors.h - description + ------------------- + begin : Feb 09, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. 
 email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Jakub Klinkovsky

#pragma once

// NOTE(review): the include targets were lost when this diff was flattened to
// plain text; restore the original header paths before compiling.
#include

#include
#include

namespace TNL {
namespace Containers {
namespace __ndarray_impl {

// Recursive helper that iterates over the boundary shell of the box
// [begins, ends) \ [skipBegins, skipEnds), one permuted dimension
// (LevelTag) per recursion step. The runtime `level` selects which
// dimension contributes the two boundary strips in the current pass:
//   - at the dimension equal to `level`: iterate only the two strips
//     [begin, skipBegin) and [skipEnd, end),
//   - at dimensions already processed in earlier passes (LevelTag < level):
//     iterate only the interior [skipBegin, skipEnd),
//   - at dimensions after `level`: iterate the full range [begin, end).
// Looping `level` over all dimensions thus covers the boundary exactly once.
template< typename Permutation,
          typename LevelTag = IndexTag< 0 > >
struct SequentialBoundaryExecutor_inner
{
   template< typename Begins,
             typename SkipBegins,
             typename SkipEnds,
             typename Ends,
             typename Func,
             typename... Indices >
   __cuda_callable__
   void operator()( const Begins& begins,
                    const SkipBegins& skipBegins,
                    const SkipEnds& skipEnds,
                    const Ends& ends,
                    std::size_t level,
                    Func f,
                    Indices&&... indices )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      SequentialBoundaryExecutor_inner< Permutation, IndexTag< LevelTag::value + 1 > > exec;
      const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >();
      const auto skipBegin = skipBegins.template getSize< get< LevelTag::value >( Permutation{} ) >();
      const auto skipEnd = skipEnds.template getSize< get< LevelTag::value >( Permutation{} ) >();
      const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >();
      if( level == LevelTag::value ) {
         // this dimension contributes the two boundary strips
         for( auto i = begin; i < skipBegin; i++ )
            exec( begins, skipBegins, skipEnds, ends, level, f, std::forward< Indices >( indices )..., i );
         for( auto i = skipEnd; i < end; i++ )
            exec( begins, skipBegins, skipEnds, ends, level, f, std::forward< Indices >( indices )..., i );
      }
      else if( level > LevelTag::value ) {
         // dimension handled by an earlier pass: restrict to the interior
         for( auto i = skipBegin; i < skipEnd; i++ )
            exec( begins, skipBegins, skipEnds, ends, level, f, std::forward< Indices >( indices )..., i );
      }
      else {
         // dimension handled by a later pass: iterate the full range
         for( auto i = begin; i < end; i++ )
            exec( begins, skipBegins, skipEnds, ends, level, f, std::forward< Indices >( indices )..., i );
      }
   }
};

// Final recursion step: instead of recursing, invoke the functor with the
// collected indices, un-permuted back into logical order.
template< typename Permutation >
struct SequentialBoundaryExecutor_inner< Permutation, IndexTag< Permutation::size() - 1 > >
{
   template< typename Begins,
             typename SkipBegins,
             typename SkipEnds,
             typename Ends,
             typename Func,
             typename... Indices >
   __cuda_callable__
   void operator()( const Begins& begins,
                    const SkipBegins& skipBegins,
                    const SkipEnds& skipEnds,
                    const Ends& ends,
                    std::size_t level,
                    Func f,
                    Indices&&... indices )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );
      static_assert( sizeof...(indices) == Begins::getDimension() - 1,
                     "invalid number of indices in the final step of the SequentialBoundaryExecutor" );

      using LevelTag = IndexTag< Permutation::size() - 1 >;

      const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >();
      const auto skipBegin = skipBegins.template getSize< get< LevelTag::value >( Permutation{} ) >();
      const auto skipEnd = skipEnds.template getSize< get< LevelTag::value >( Permutation{} ) >();
      const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >();
      if( level == LevelTag::value ) {
         for( auto i = begin; i < skipBegin; i++ )
            call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i );
         for( auto i = skipEnd; i < end; i++ )
            call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i );
      }
      else if( level > LevelTag::value ) {
         for( auto i = skipBegin; i < skipEnd; i++ )
            call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i );
      }
      else {
         for( auto i = begin; i < end; i++ )
            call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i );
      }
   }
};

// Entry point for sequential boundary iteration: runs one pass of the inner
// executor per dimension (see above for the covering argument).
template< typename Permutation,
          std::size_t dim = Permutation::size() >
struct SequentialBoundaryExecutor
{
   template< typename Begins,
             typename SkipBegins,
             typename SkipEnds,
             typename Ends,
             typename Func >
   __cuda_callable__
   void operator()( const Begins& begins,
                    const SkipBegins& skipBegins,
                    const SkipEnds& skipEnds,
                    const Ends& ends,
                    Func f )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      SequentialBoundaryExecutor_inner< Permutation > exec;
      for( std::size_t level = 0; level < Permutation::size(); level++ )
         exec( begins, skipBegins, skipEnds, ends, level, f );
   }
};

// NOTE(review): this specialization is selected for dim == 0, yet its body
// reads the sizes at level 0 of the permutation and invokes f with a single
// index — that is the 1D iteration pattern, which suggests the intended
// specialization argument is 1, not 0. Verify against the upstream history.
template< typename Permutation >
struct SequentialBoundaryExecutor< Permutation, 0 >
{
   template< typename Begins,
             typename SkipBegins,
             typename SkipEnds,
             typename Ends,
             typename Func >
   __cuda_callable__
   void operator()( const Begins& begins,
                    const SkipBegins& skipBegins,
                    const SkipEnds& skipEnds,
                    const Ends& ends,
                    Func f )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      const auto begin = begins.template getSize< get< 0 >( Permutation{} ) >();
      const auto skipBegin = skipBegins.template getSize< get< 0 >( Permutation{} ) >();
      const auto skipEnd = skipEnds.template getSize< get< 0 >( Permutation{} ) >();
      const auto end = ends.template getSize< get< 0 >( Permutation{} ) >();
      for( auto i = begin; i < skipBegin; i++ )
         f( i );
      for( auto i = skipEnd; i < end; i++ )
         f( i );
   }
};


// Primary template: a compile-time error trap for rank > 3, for which the
// parallel specializations below do not exist.
template< typename Permutation,
          typename Device,
          typename DimTag = IndexTag< Permutation::size() > >
struct ParallelBoundaryExecutor
{
   template< typename Begins,
             typename SkipBegins,
             typename SkipEnds,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins,
                    const SkipBegins& skipBegins,
                    const SkipEnds& skipEnds,
                    const Ends& ends,
                    Func f )
   {
      static_assert( Permutation::size() <= 3, "ParallelBoundaryExecutor is implemented only for 1D, 2D, and 3D."
);
   }
};

// 3D: the boundary shell is decomposed into six disjoint box-shaped faces
// (two slabs along the outermost permuted axis, two strips along the middle
// axis restricted to the interior of the first, and two strips along the
// innermost axis restricted to the interior of both), each dispatched as a
// separate ParallelFor3D launch.
template< typename Permutation,
          typename Device >
struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 3 > >
{
   template< typename Begins,
             typename SkipBegins,
             typename SkipEnds,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins,
                    const SkipBegins& skipBegins,
                    const SkipEnds& skipEnds,
                    const Ends& ends,
                    Func f )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      // nvcc does not like nested __cuda_callable__ and normal lambdas...
//      using Index = typename Ends::IndexType;
//      auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 )
//      {
//         call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
//      };
      Kernel< Device > kernel;

      const auto begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
      const auto begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
      const auto begin2 = begins.template getSize< get< 2 >( Permutation{} ) >();
      const auto skipBegin0 = skipBegins.template getSize< get< 0 >( Permutation{} ) >();
      const auto skipBegin1 = skipBegins.template getSize< get< 1 >( Permutation{} ) >();
      const auto skipBegin2 = skipBegins.template getSize< get< 2 >( Permutation{} ) >();
      const auto skipEnd0 = skipEnds.template getSize< get< 0 >( Permutation{} ) >();
      const auto skipEnd1 = skipEnds.template getSize< get< 1 >( Permutation{} ) >();
      const auto skipEnd2 = skipEnds.template getSize< get< 2 >( Permutation{} ) >();
      const auto end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
      const auto end1 = ends.template getSize< get< 1 >( Permutation{} ) >();
      const auto end2 = ends.template getSize< get< 2 >( Permutation{} ) >();

      // bottom/top slabs along axis 2, then side strips along axis 1
      // (restricted to the axis-2 interior), then front/back strips along
      // axis 0 (restricted to the axis-2 and axis-1 interior)
      ParallelFor3D< Device >::exec( begin2, begin1, begin0, skipBegin2, end1, end0, kernel, f );
      ParallelFor3D< Device >::exec( skipEnd2, begin1, begin0, end2, end1, end0, kernel, f );
      ParallelFor3D< Device >::exec( skipBegin2, begin1, begin0, skipEnd2, skipBegin1, end0, kernel, f );
      ParallelFor3D< Device >::exec( skipBegin2, skipEnd1, begin0, skipEnd2, end1, end0, kernel, f );
      ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, begin0, skipEnd2, skipEnd1, skipBegin0, kernel, f );
      ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, skipEnd0, skipEnd2, skipEnd1, end0, kernel, f );
   }

   // Member templates cannot be explicitly specialized in class scope, hence
   // the extra dummy parameter enabling partial specialization below.
   template< typename __Device, typename = void >
   struct Kernel
   {
      template< typename Index, typename Func >
      void operator()( Index i2, Index i1, Index i0, Func f )
      {
         call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
      };
   };

   // dummy specialization to avoid a shitpile of nvcc warnings
   template< typename __unused >
   struct Kernel< Devices::Cuda, __unused >
   {
      template< typename Index, typename Func >
      __cuda_callable__
      void operator()( Index i2, Index i1, Index i0, Func f )
      {
         call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
      };
   };
};

// 2D: the boundary is decomposed into four disjoint rectangles — two full
// strips along axis 1 and two strips along axis 0 restricted to the axis-1
// interior.
template< typename Permutation,
          typename Device >
struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 2 > >
{
   template< typename Begins,
             typename SkipBegins,
             typename SkipEnds,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins,
                    const SkipBegins& skipBegins,
                    const SkipEnds& skipEnds,
                    const Ends& ends,
                    Func f )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      // nvcc does not like nested __cuda_callable__ and normal lambdas...
//      using Index = typename Ends::IndexType;
//      auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 )
//      {
//         call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
//      };
      Kernel< Device > kernel;

      const auto begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
      const auto begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
      const auto skipBegin0 = skipBegins.template getSize< get< 0 >( Permutation{} ) >();
      const auto skipBegin1 = skipBegins.template getSize< get< 1 >( Permutation{} ) >();
      const auto skipEnd0 = skipEnds.template getSize< get< 0 >( Permutation{} ) >();
      const auto skipEnd1 = skipEnds.template getSize< get< 1 >( Permutation{} ) >();
      const auto end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
      const auto end1 = ends.template getSize< get< 1 >( Permutation{} ) >();

      ParallelFor2D< Device >::exec( begin1, begin0, skipBegin1, end0, kernel, f );
      ParallelFor2D< Device >::exec( skipEnd1, begin0, end1, end0, kernel, f );
      ParallelFor2D< Device >::exec( skipBegin1, begin0, skipEnd1, skipBegin0, kernel, f );
      ParallelFor2D< Device >::exec( skipBegin1, skipEnd0, skipEnd1, end0, kernel, f );
   }

   template< typename __Device, typename = void >
   struct Kernel
   {
      template< typename Index, typename Func >
      void operator()( Index i1, Index i0, Func f )
      {
         call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
      };
   };

   // dummy specialization to avoid a shitpile of nvcc warnings
   template< typename __unused >
   struct Kernel< Devices::Cuda, __unused >
   {
      template< typename Index, typename Func >
      __cuda_callable__
      void operator()( Index i1, Index i0, Func f )
      {
         call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
      };
   };
};

// 1D: the boundary is just the two end intervals; the functor is passed
// directly (presumably un-permuting a single index is the identity — TODO
// confirm against call_with_unpermuted_arguments).
template< typename Permutation,
          typename Device >
struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 1 > >
{
   template< typename Begins,
             typename SkipBegins,
             typename SkipEnds,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins,
                    const SkipBegins& skipBegins,
                    const SkipEnds& skipEnds,
                    const Ends& ends,
                    Func f )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      const auto begin = begins.template getSize< get< 0 >( Permutation{} ) >();
      const auto skipBegin = skipBegins.template getSize< get< 0 >( Permutation{} ) >();
      const auto skipEnd = skipEnds.template getSize< get< 0 >( Permutation{} ) >();
      const auto end = ends.template getSize< get< 0 >( Permutation{} ) >();

      ParallelFor< Device >::exec( begin, skipBegin, f );
      ParallelFor< Device >::exec( skipEnd, end, f );
   }
};


// Device may be void which stands for StaticNDArray
template< typename Permutation,
          typename Device >
struct BoundaryExecutorDispatcher
{
   template< typename Begins,
             typename SkipBegins,
             typename SkipEnds,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins,
                    const SkipBegins& skipBegins,
                    const SkipEnds& skipEnds,
                    const Ends& ends,
                    Func f )
   {
      SequentialBoundaryExecutor< Permutation >()( begins, skipBegins, skipEnds, ends, f );
   }
};

// Host: parallelize only when OpenMP is enabled with more than one thread.
template< typename Permutation >
struct BoundaryExecutorDispatcher< Permutation, Devices::Host >
{
   template< typename Begins,
             typename SkipBegins,
             typename SkipEnds,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins,
                    const SkipBegins& skipBegins,
                    const SkipEnds& skipEnds,
                    const Ends& ends,
                    Func f )
   {
      if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 )
         ParallelBoundaryExecutor< Permutation, Devices::Host >()( begins, skipBegins, skipEnds, ends, f );
      else
         SequentialBoundaryExecutor< Permutation >()( begins, skipBegins, skipEnds, ends, f );
   }
};

template< typename Permutation >
struct BoundaryExecutorDispatcher< Permutation, Devices::Cuda >
{
   template< typename Begins,
             typename SkipBegins,
             typename SkipEnds,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins,
                    const SkipBegins& skipBegins,
                    const SkipEnds& skipEnds,
                    const Ends& ends,
                    Func f )
   {
      ParallelBoundaryExecutor< Permutation, Devices::Cuda >()( begins, skipBegins, skipEnds, ends, f );
   }
};

} // namespace __ndarray_impl
} // namespace Containers
} // namespace TNL
diff --git a/src/TNL/Containers/ndarray/Executors.h b/src/TNL/Containers/ndarray/Executors.h new file mode 100644 index 0000000000000000000000000000000000000000..d09b6ec234b8e7ce6a15632f5479130aa0cc1c60 --- /dev/null +++ b/src/TNL/Containers/ndarray/Executors.h
/***************************************************************************
 Executors.h  -  description
 -------------------
 begin                : Dec 24, 2018
 copyright            : (C) 2018 by Tomas Oberhuber et al.
 email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Jakub Klinkovsky

#pragma once

// NOTE(review): include targets lost in the text mangling — restore.
#include

#include
#include

namespace TNL {
namespace Containers {
namespace __ndarray_impl {

// Sequential iteration over the full box [begins, ends), recursing from the
// outermost permuted dimension (LevelTag 0) to the innermost and collecting
// one index per level.
template< typename Permutation,
          typename LevelTag = IndexTag< 0 > >
struct SequentialExecutor
{
   template< typename Begins,
             typename Ends,
             typename Func,
             typename... Indices >
   __cuda_callable__
   void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&...
indices )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      SequentialExecutor< Permutation, IndexTag< LevelTag::value + 1 > > exec;
      const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >();
      const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >();
      for( auto i = begin; i < end; i++ )
         exec( begins, ends, f, std::forward< Indices >( indices )..., i );
   }
};

// Final recursion step: invoke the functor with the collected indices,
// un-permuted back into logical order.
template< typename Permutation >
struct SequentialExecutor< Permutation, IndexTag< Permutation::size() - 1 > >
{
   template< typename Begins,
             typename Ends,
             typename Func,
             typename... Indices >
   __cuda_callable__
   void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );
      static_assert( sizeof...(indices) == Begins::getDimension() - 1,
                     "invalid number of indices in the final step of the SequentialExecutor" );

      using LevelTag = IndexTag< Permutation::size() - 1 >;

      const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >();
      const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >();
      for( auto i = begin; i < end; i++ )
         call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i );
   }
};


// Right-to-left variant: recurses from the innermost permuted dimension to
// the outermost and *prepends* each new index, so the collected indices end
// up in the same order as in SequentialExecutor. Used by the CUDA dispatch
// below to iterate the outer dimensions sequentially inside the kernel.
template< typename Permutation,
          typename LevelTag = IndexTag< Permutation::size() - 1 > >
struct SequentialExecutorRTL
{
   template< typename Begins,
             typename Ends,
             typename Func,
             typename... Indices >
   __cuda_callable__
   void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      SequentialExecutorRTL< Permutation, IndexTag< LevelTag::value - 1 > > exec;
      const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >();
      const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >();
      for( auto i = begin; i < end; i++ )
         exec( begins, ends, f, i, std::forward< Indices >( indices )... );
   }
};

template< typename Permutation >
struct SequentialExecutorRTL< Permutation, IndexTag< 0 > >
{
   template< typename Begins,
             typename Ends,
             typename Func,
             typename... Indices >
   __cuda_callable__
   void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );
      static_assert( sizeof...(indices) == Begins::getDimension() - 1,
                     "invalid number of indices in the final step of the SequentialExecutorRTL" );

      const auto begin = begins.template getSize< get< 0 >( Permutation{} ) >();
      const auto end = ends.template getSize< get< 0 >( Permutation{} ) >();
      for( auto i = begin; i < end; i++ )
         call_with_unpermuted_arguments< Permutation >( f, i, std::forward< Indices >( indices )... );
   }
};


// Dispatch for rank > 3. Host path: parallelize the three *outermost*
// permuted dimensions with ParallelFor3D and iterate the rest sequentially
// inside the lambda (SequentialExecutor starting at level 3).
template< typename Permutation,
          typename Device >
struct ParallelExecutorDeviceDispatch
{
   template< typename Begins,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins, const Ends& ends, Func f )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      using Index = typename Ends::IndexType;

      auto kernel = [=] ( Index i2, Index i1, Index i0 )
      {
         SequentialExecutor< Permutation, IndexTag< 3 > > exec;
         exec( begins, ends, f, i0, i1, i2 );
      };

      const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
      const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
      const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >();
      const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
      const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >();
      const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >();
      ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel );
   }
};

// CUDA path for rank > 3: parallelize the three *innermost* permuted
// dimensions (presumably for memory coalescing — TODO confirm) and iterate
// the outer dimensions sequentially via SequentialExecutorRTL, which
// prepends indices so the final argument order matches.
template< typename Permutation >
struct ParallelExecutorDeviceDispatch< Permutation, Devices::Cuda >
{
   template< typename Begins,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins, const Ends& ends, Func f )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      using Index = typename Ends::IndexType;

      auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 )
      {
         SequentialExecutorRTL< Permutation, IndexTag< Begins::getDimension() - 4 > > exec;
         exec( begins, ends, f, i0, i1, i2 );
      };

      const Index begin0 = begins.template getSize< get< Begins::getDimension() - 3 >( Permutation{} ) >();
      const Index begin1 = begins.template getSize< get< Begins::getDimension() - 2 >( Permutation{} ) >();
      const Index begin2 = begins.template getSize< get< Begins::getDimension() - 1 >( Permutation{} ) >();
      const Index end0 = ends.template getSize< get< Ends::getDimension() - 3 >( Permutation{} ) >();
      const Index end1 = ends.template getSize< get< Ends::getDimension() - 2 >( Permutation{} ) >();
      const Index end2 = ends.template getSize< get< Ends::getDimension() - 1 >( Permutation{} ) >();
      ParallelFor3D< Devices::Cuda >::exec( begin2, begin1, begin0, end2, end1, end0, kernel );
   }
};

// Primary template (rank > 3): defer to the device dispatch above.
template< typename Permutation,
          typename Device,
          typename DimTag = IndexTag< Permutation::size() > >
struct ParallelExecutor
{
   template< typename Begins,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins, const Ends& ends, Func f )
   {
      ParallelExecutorDeviceDispatch< Permutation, Device > dispatch;
      dispatch( begins, ends, f );
   }
};

// 3D: a single ParallelFor3D launch over the whole box.
template< typename Permutation,
          typename Device >
struct ParallelExecutor< Permutation, Device, IndexTag< 3 > >
{
   template< typename Begins,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins, const Ends& ends, Func f )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      using Index = typename Ends::IndexType;

      // nvcc does not like nested __cuda_callable__ and normal lambdas...
//      auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 )
//      {
//         call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
//      };
      Kernel< Device > kernel;

      const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
      const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
      const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >();
      const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
      const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >();
      const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >();
      ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel, f );
   }

   // Member templates cannot be explicitly specialized in class scope, hence
   // the extra dummy parameter enabling partial specialization below.
   template< typename __Device, typename = void >
   struct Kernel
   {
      template< typename Index, typename Func >
      void operator()( Index i2, Index i1, Index i0, Func f )
      {
         call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
      };
   };

   // dummy specialization to avoid a shitpile of nvcc warnings
   template< typename __unused >
   struct Kernel< Devices::Cuda, __unused >
   {
      template< typename Index, typename Func >
      __cuda_callable__
      void operator()( Index i2, Index i1, Index i0, Func f )
      {
         call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
      };
   };
};

// 2D: a single ParallelFor2D launch.
template< typename Permutation,
          typename Device >
struct ParallelExecutor< Permutation, Device, IndexTag< 2 > >
{
   template< typename Begins,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins, const Ends& ends, Func f )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      using Index = typename Ends::IndexType;

      // nvcc does not like nested __cuda_callable__ and normal lambdas...
//      auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 )
//      {
//         call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
//      };
      Kernel< Device > kernel;

      const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
      const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
      const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
      const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >();
      ParallelFor2D< Device >::exec( begin1, begin0, end1, end0, kernel, f );
   }

   template< typename __Device, typename = void >
   struct Kernel
   {
      template< typename Index, typename Func >
      void operator()( Index i1, Index i0, Func f )
      {
         call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
      };
   };

   // dummy specialization to avoid a shitpile of nvcc warnings
   template< typename __unused >
   struct Kernel< Devices::Cuda, __unused >
   {
      template< typename Index, typename Func >
      __cuda_callable__
      void operator()( Index i1, Index i0, Func f )
      {
         call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
      };
   };
};

// 1D: the functor is passed to ParallelFor directly (presumably
// un-permuting a single index is the identity — TODO confirm).
template< typename Permutation,
          typename Device >
struct ParallelExecutor< Permutation, Device, IndexTag< 1 > >
{
   template< typename Begins,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins, const Ends& ends, Func f )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      using Index = typename Ends::IndexType;

//      auto kernel = [=] __cuda_callable__ ( Index i )
//      {
//         call_with_unpermuted_arguments< Permutation >( f, i );
//      };

      const Index begin = begins.template getSize< get< 0 >( Permutation{} ) >();
      const Index end = ends.template getSize< get< 0 >( Permutation{} ) >();
//      ParallelFor< Device >::exec( begin, end, kernel );
      ParallelFor< Device >::exec( begin, end, f );
   }
};


// Device may be void which stands for StaticNDArray
template< typename
Permutation, + typename Device > +struct ExecutorDispatcher +{ + template< typename Begins, typename Ends, typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) + { + SequentialExecutor< Permutation >()( begins, ends, f ); + } +}; + +template< typename Permutation > +struct ExecutorDispatcher< Permutation, Devices::Host > +{ + template< typename Begins, typename Ends, typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) + { + if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) + ParallelExecutor< Permutation, Devices::Host >()( begins, ends, f ); + else + SequentialExecutor< Permutation >()( begins, ends, f ); + } +}; + +template< typename Permutation > +struct ExecutorDispatcher< Permutation, Devices::Cuda > +{ + template< typename Begins, typename Ends, typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) + { + ParallelExecutor< Permutation, Devices::Cuda >()( begins, ends, f ); + } +}; + +} // namespace __ndarray_impl +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/ndarray/Indexing.h b/src/TNL/Containers/ndarray/Indexing.h new file mode 100644 index 0000000000000000000000000000000000000000..a1b83ae518c10ba0000b0aea7a14b808e04cb40b --- /dev/null +++ b/src/TNL/Containers/ndarray/Indexing.h @@ -0,0 +1,295 @@ +/*************************************************************************** + Indexing.h - description + ------------------- + begin : Dec 24, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include + +namespace TNL { +namespace Containers { +namespace __ndarray_impl { + +template< typename OffsetsHolder, + typename Sequence > +struct IndexShiftHelper +{}; + +template< typename OffsetsHolder, + std::size_t... N > +struct IndexShiftHelper< OffsetsHolder, std::index_sequence< N... > > +{ + template< typename Func, + typename... Indices > + __cuda_callable__ + static auto apply( const OffsetsHolder& offsets, Func&& f, Indices&&... indices ) -> decltype(auto) + { + return f( ( std::forward< Indices >( indices ) + offsets.template getSize< N >() )... ); + } + + template< typename Func, + typename... Indices > + static auto apply_host( const OffsetsHolder& offsets, Func&& f, Indices&&... indices ) -> decltype(auto) + { + return f( ( std::forward< Indices >( indices ) + offsets.template getSize< N >() )... ); + } +}; + +template< typename OffsetsHolder, + typename Func, + typename... Indices > +__cuda_callable__ +auto call_with_shifted_indices( const OffsetsHolder& offsets, Func&& f, Indices&&... indices ) -> decltype(auto) +{ + return IndexShiftHelper< OffsetsHolder, std::make_index_sequence< sizeof...( Indices ) > > + ::apply( offsets, std::forward< Func >( f ), std::forward< Indices >( indices )... ); +} + +template< typename OffsetsHolder, + typename Func, + typename... Indices > +auto host_call_with_unshifted_indices( const OffsetsHolder& offsets, Func&& f, Indices&&... indices ) -> decltype(auto) +{ + return IndexShiftHelper< OffsetsHolder, std::make_index_sequence< sizeof...( Indices ) > > + ::apply_host( offsets, std::forward< Func >( f ), std::forward< Indices >( indices )... 
); +} + + +template< typename SizesHolder, + typename Overlaps, + typename Sequence > +struct IndexUnshiftHelper +{}; + +template< typename SizesHolder, + typename Overlaps, + std::size_t... N > +struct IndexUnshiftHelper< SizesHolder, Overlaps, std::index_sequence< N... > > +{ + template< typename Func, + typename... Indices > + __cuda_callable__ + static auto apply( const SizesHolder& begins, Func&& f, Indices&&... indices ) -> decltype(auto) + { + return f( ( get( Overlaps{} ) + std::forward< Indices >( indices ) - begins.template getSize< N >() )... ); + } + + template< typename Func, + typename... Indices > + static auto apply_host( const SizesHolder& begins, Func&& f, Indices&&... indices ) -> decltype(auto) + { + return f( ( get( Overlaps{} ) + std::forward< Indices >( indices ) - begins.template getSize< N >() )... ); + } +}; + +template< typename SizesHolder, + typename Overlaps = make_constant_index_sequence< SizesHolder::getDimension(), 0 >, + typename Func, + typename... Indices > +__cuda_callable__ +auto call_with_unshifted_indices( const SizesHolder& begins, Func&& f, Indices&&... indices ) -> decltype(auto) +{ + return IndexUnshiftHelper< SizesHolder, Overlaps, std::make_index_sequence< sizeof...( Indices ) > > + ::apply( begins, std::forward< Func >( f ), std::forward< Indices >( indices )... ); +} + +template< typename SizesHolder, + typename Overlaps = make_constant_index_sequence< SizesHolder::getDimension(), 0 >, + typename Func, + typename... Indices > +auto host_call_with_unshifted_indices( const SizesHolder& begins, Func&& f, Indices&&... indices ) -> decltype(auto) +{ + return IndexUnshiftHelper< SizesHolder, Overlaps, std::make_index_sequence< sizeof...( Indices ) > > + ::apply_host( begins, std::forward< Func >( f ), std::forward< Indices >( indices )... 
// Recursive computation of the flat storage index from permuted multi-indices.
// The recursion runs over the permutation levels from the last to the first;
// the boolean parameter selects the sliced/unsliced formula per level.
template< typename Permutation,
          typename Overlaps,
          typename Alignment,
          typename SliceInfo,
          std::size_t level = Permutation::size() - 1,
          bool _sliced_level = ( SliceInfo::getSliceSize( get< level >( Permutation{} ) ) > 0 ) >
struct SlicedIndexer
{};

// Unsliced level: plain row-major-style accumulation using the aligned size
// of this dimension as the multiplier for the outer levels.
template< typename Permutation,
          typename Overlaps,
          typename Alignment,
          typename SliceInfo,
          std::size_t level >
struct SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo, level, false >
{
   template< typename SizesHolder, typename StridesHolder, typename... Indices >
   __cuda_callable__
   static typename SizesHolder::IndexType
   getIndex( const SizesHolder& sizes,
             const StridesHolder& strides,
             Indices&&... indices )
   {
      // idx is the array dimension mapped to this permutation level
      static constexpr std::size_t idx = get< level >( Permutation{} );
      static constexpr std::size_t overlap = __ndarray_impl::get< idx >( Overlaps{} );
      const auto alpha = get_from_pack< idx >( std::forward< Indices >( indices )... );
      const auto previous = SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo, level - 1 >::getIndex( sizes, strides, std::forward< Indices >( indices )... );
      return strides.template getStride< idx >( alpha ) * ( alpha + overlap + Alignment::template getAlignedSize< idx >( sizes ) * previous );
   }
};

// Sliced level: the index is decomposed into a slice number and an offset
// within the slice of size S.
template< typename Permutation,
          typename Overlaps,
          typename Alignment,
          typename SliceInfo,
          std::size_t level >
struct SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo, level, true >
{
   template< typename SizesHolder, typename StridesHolder, typename... Indices >
   __cuda_callable__
   static typename SizesHolder::IndexType
   getIndex( const SizesHolder& sizes,
             const StridesHolder& strides,
             Indices&&... indices )
   {
      static_assert( SizesHolder::template getStaticSize< get< level >( Permutation{} ) >() == 0,
                     "Invalid SliceInfo: static dimension cannot be sliced." );

      static constexpr std::size_t idx = get< level >( Permutation{} );
      static constexpr std::size_t overlap = __ndarray_impl::get< idx >( Overlaps{} );
      const auto alpha = get_from_pack< idx >( std::forward< Indices >( indices )... );
      static constexpr std::size_t S = SliceInfo::getSliceSize( idx );
      // TODO: check the calculation with strides
      return strides.template getStride< idx >( alpha ) *
               ( S * ((alpha + overlap) / S) * StorageSizeGetter< SizesHolder, Alignment, Overlaps, IndexTag< level - 1 > >::getPermuted( sizes, Permutation{} )
                 + (alpha + overlap) % S )
             + S * SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo, level - 1 >::getIndex( sizes, strides, std::forward< Indices >( indices )... );
   }
};

// Recursion terminus (level 0, unsliced).
template< typename Permutation,
          typename Overlaps,
          typename Alignment,
          typename SliceInfo >
struct SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo, 0, false >
{
   template< typename SizesHolder, typename StridesHolder, typename... Indices >
   __cuda_callable__
   static typename SizesHolder::IndexType
   getIndex( const SizesHolder& sizes,
             const StridesHolder& strides,
             Indices&&... indices )
   {
      static constexpr std::size_t idx = get< 0 >( Permutation{} );
      static constexpr std::size_t overlap = __ndarray_impl::get< idx >( Overlaps{} );
      const auto alpha = get_from_pack< idx >( std::forward< Indices >( indices )... );
      return strides.template getStride< idx >( alpha ) * (alpha + overlap);
   }
};

// Recursion terminus (level 0, sliced).
// NOTE(review): the body is identical to the unsliced level-0 case, i.e.
// slicing of the innermost level collapses to the same expression -- confirm
// against the SlicedNDArray unit tests.
template< typename Permutation,
          typename Overlaps,
          typename Alignment,
          typename SliceInfo >
struct SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo, 0, true >
{
   template< typename SizesHolder, typename StridesHolder, typename... Indices >
   __cuda_callable__
   static typename SizesHolder::IndexType
   getIndex( const SizesHolder& sizes,
             const StridesHolder& strides,
             Indices&&... indices )
   {
      static constexpr std::size_t idx = get< 0 >( Permutation{} );
      static constexpr std::size_t overlap = __ndarray_impl::get< idx >( Overlaps{} );
      const auto alpha = get_from_pack< idx >( std::forward< Indices >( indices )... );
      return strides.template getStride< idx >( alpha ) * (alpha + overlap);
   }
};


// Base class for unsliced ND arrays.
// SliceInfo should be always empty (i.e. sliceSize == 0)
template< typename SliceInfo >
struct NDArrayBase
{
   template< typename Permutation >
   struct Alignment
   {
      template< std::size_t dimension, typename SizesHolder >
      __cuda_callable__
      static typename SizesHolder::IndexType
      getAlignedSize( const SizesHolder& sizes )
      {
         const auto size = sizes.template getSize< dimension >();
         // round up the last dynamic dimension to improve performance
         // TODO: aligning is good for GPU, but bad for CPU
//         static constexpr decltype(size) mult = 32;
//         if( dimension == get< Permutation::size() - 1 >( Permutation{} )
//             && SizesHolder::template getStaticSize< dimension >() == 0 )
//            return mult * ( size / mult + ( size % mult != 0 ) );
         return size;
      }
   };

   // Maps the given multi-index to the flat index in the underlying storage.
   template< typename Permutation, typename Overlaps, typename SizesHolder, typename StridesHolder, typename... Indices >
   __cuda_callable__
   typename SizesHolder::IndexType
   static getStorageIndex( const SizesHolder& sizes, const StridesHolder& strides, Indices&&... indices )
   {
      static_assert( check_slice_size( SizesHolder::getDimension(), 0 ), "BUG - invalid SliceInfo type passed to NDArrayBase" );
      using Alignment = Alignment< Permutation >;
      return SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo >::getIndex( sizes, strides, std::forward< Indices >( indices )... );
   }

private:
   // verifies that SliceInfo reports the given slice size for all dimensions
   static constexpr bool check_slice_size( std::size_t dim, std::size_t sliceSize )
   {
      for( std::size_t i = 0; i < dim; i++ )
         if( SliceInfo::getSliceSize( i ) != sliceSize )
            return false;
      return true;
   }
};


// Base class for sliced ND arrays: sliced dimensions are padded to a multiple
// of their slice size.
template< typename SliceInfo >
struct SlicedNDArrayBase
{
   template< typename Permutation >
   struct Alignment
   {
      template< std::size_t dimension, typename SizesHolder >
      __cuda_callable__
      static typename SizesHolder::IndexType
      getAlignedSize( const SizesHolder& sizes )
      {
         const auto size = sizes.template getSize< dimension >();
         if( SliceInfo::getSliceSize(dimension) > 0 )
            // round to multiple of SliceSize
            return SliceInfo::getSliceSize(dimension) * (
                     size / SliceInfo::getSliceSize(dimension) +
                     ( size % SliceInfo::getSliceSize(dimension) != 0 )
                   );
         // unmodified
         return size;
      }
   };

   // Maps the given multi-index to the flat index in the underlying storage.
   template< typename Permutation, typename Overlaps, typename SizesHolder, typename StridesHolder, typename... Indices >
   __cuda_callable__
   static typename SizesHolder::IndexType
   getStorageIndex( const SizesHolder& sizes, const StridesHolder& strides, Indices&&... indices )
   {
      using Alignment = Alignment< Permutation >;
      return SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo >::getIndex( sizes, strides, std::forward< Indices >( indices )... );
   }
};

} // namespace __ndarray_impl
} // namespace Containers
} // namespace TNL
/*
 * Access the N-th element of a variadic pack.
 * Reference:
 * http://stackoverflow.com/questions/20162903/template-parameter-packs-access-nth-type-and-nth-element/37836252#37836252
 */
template< std::size_t index, typename T, typename... Ts,
          typename = typename std::enable_if< index == 0 >::type >
constexpr T
get_from_pack( T&& arg, Ts&&... args )
{
   // base case: the head of the pack is the requested element
   return arg;
}

template< std::size_t index, typename T, typename... Ts,
          typename = typename std::enable_if< (index > 0) && index <= sizeof...( Ts ) >::type >
constexpr auto
get_from_pack( T&& arg, Ts&&... args )
{
   // peel off the head and recurse into the tail
   return get_from_pack< index-1 >( std::forward< Ts >( args )... );
}

// complementary specialization for getting a more readable compilation error
// in case calling get with a bad index
template< long long index, typename T, typename... Ts,
          typename = typename std::enable_if< (index < 0) || (index > sizeof...( Ts )) >::type >
constexpr T
get_from_pack( T&& arg, Ts&&... args )
{
   static_assert( index >= 0 && index <= sizeof...( Ts ),
                  "invalid index passed to the get function" );
   return arg;
}


// Access the N-th element of an std::integer_sequence.
template< std::size_t N, typename Index, Index... vals >
constexpr Index
get( std::integer_sequence< Index, vals... > )
{
   return get_from_pack< N >( vals... );
}


// Membership test on a variadic pack.
template< typename Index, typename T >
constexpr bool
is_in_pack( Index value, T&& pack_value )
{
   return value == pack_value;
}

template< typename Index, typename T, typename... Ts >
constexpr bool
is_in_pack( Index value, T&& pack_value, Ts&&... vals )
{
   // short-circuits on the first match
   return value == pack_value || is_in_pack( value, std::forward< Ts >( vals )... );
}


// Membership test on an std::integer_sequence.
template< typename Index, Index... vals >
constexpr bool
is_in_sequence( Index value, std::integer_sequence< Index, vals... > )
{
   return is_in_pack( value, vals... );
}


// Position of the first occurrence of value in a variadic pack
// (returns the pack size when the value is absent).
template< typename V >
constexpr std::size_t
index_in_pack( V&& value )
{
   return 0;
}

template< typename V, typename T, typename... Ts >
constexpr std::size_t
index_in_pack( V&& value, T&& arg, Ts&&... args )
{
   return value == arg ? 0 : 1 + index_in_pack( value, std::forward< Ts >( args )... );
}


// Position of the first occurrence of value in an std::integer_sequence.
template< typename V, typename Index, Index... vals >
constexpr std::size_t
index_in_sequence( V&& value, std::integer_sequence< Index, vals... > )
{
   return index_in_pack( std::forward< V >( value ), vals... );
}


/*
 * Concatenation of std::integer_sequence instances; useful mainly for getting
 * the type of the resulting sequence with `decltype`.
 */
// single (possibly empty) sequence
template< typename Index, Index... s >
constexpr auto
concat_sequences( std::integer_sequence< Index, s... > )
{
   return std::integer_sequence< Index, s... >{};
}

// two sequences, each possibly empty
template< typename Index, Index... s, Index... t >
constexpr auto
concat_sequences( std::integer_sequence< Index, s... >, std::integer_sequence< Index, t... > )
{
   return std::integer_sequence< Index, s..., t... >{};
}
// concatenate more than 2 sequences
template< typename Index, Index... s, Index... t, typename... R >
constexpr auto
concat_sequences( std::integer_sequence< Index, s... >, std::integer_sequence< Index, t... >, R... )
{
   return concat_sequences( std::integer_sequence< Index, s..., t... >{}, R{}... );
}


// Integer wrapper necessary for C++ templates specializations.
// As the C++ standard says:
//    A partially specialized non-type argument expression shall not involve
//    a template parameter of the partial specialization except when the argument
//    expression is a simple identifier.
template< std::size_t v >
struct IndexTag
{
   static constexpr std::size_t value = v;
};


// Expands to f( args[Permutation[0]], args[Permutation[1]], ... ).
template< typename Permutation,
          typename Sequence >
struct CallPermutationHelper
{};

template< typename Permutation,
          std::size_t... N >
struct CallPermutationHelper< Permutation, std::index_sequence< N... > >
{
   template< typename Func,
             typename... Args >
   __cuda_callable__
   static auto apply( Func&& f, Args&&... args ) -> decltype(auto)
   {
      return std::forward< Func >( f )( get_from_pack<
                     get< N >( Permutation{} )
                  >( std::forward< Args >( args )... )... );
   }
};

// Call specified function with permuted arguments.
// [used in ndarray_operations.h]
template< typename Permutation,
          typename Func,
          typename... Args >
__cuda_callable__
// FIXME: does not compile with nvcc 10.0
//auto call_with_permuted_arguments( Func&& f, Args&&... args ) -> decltype(auto)
//{
//   return CallPermutationHelper< Permutation, std::make_index_sequence< sizeof...( Args ) > >
//          ::apply( std::forward< Func >( f ), std::forward< Args >( args )... );
//}
auto call_with_permuted_arguments( Func f, Args&&... args ) -> decltype(auto)
{
   // note: f is taken by value because of the nvcc bug above
   return CallPermutationHelper< Permutation, std::make_index_sequence< sizeof...( Args ) > >
          ::apply( f, std::forward< Args >( args )... );
}


// Expands to f applied to the arguments reordered by the *inverse* of
// Permutation (the N-th parameter is looked up by its position in Permutation).
template< typename Permutation,
          typename Sequence >
struct CallInversePermutationHelper
{};

template< typename Permutation,
          std::size_t... N >
struct CallInversePermutationHelper< Permutation, std::index_sequence< N... > >
{
   template< typename Func,
             typename... Args >
   __cuda_callable__
   static auto apply( Func&& f, Args&&... args ) -> decltype(auto)
   {
      return std::forward< Func >( f )( get_from_pack<
                     index_in_sequence( N, Permutation{} )
                  >( std::forward< Args >( args )... )... );
   }
};

// Call specified function with unpermuted arguments.
// [used in ndarray_operations.h]
template< typename Permutation,
          typename Func,
          typename... Args >
__cuda_callable__
// FIXME: does not compile with nvcc 10.0
//auto call_with_unpermuted_arguments( Func&& f, Args&&... args ) -> decltype(auto)
//{
//   return CallInversePermutationHelper< Permutation, std::make_index_sequence< sizeof...( Args ) > >
//          ::apply( std::forward< Func >( f ), std::forward< Args >( args )... );
//}
auto call_with_unpermuted_arguments( Func f, Args&&... args ) -> decltype(auto)
{
   // note: f is taken by value because of the nvcc bug above
   return CallInversePermutationHelper< Permutation, std::make_index_sequence< sizeof...( Args ) > >
          ::apply( f, std::forward< Args >( args )... );
}


// Check that all elements of the initializer list are equal to the specified value.
// [used in ndarray_operations.h]
constexpr bool
all_elements_equal_to_value( std::size_t value, std::initializer_list< std::size_t > list )
{
   for( auto elem : list )
      if( elem != value )
         return false;
   return true;
}


// Check that all elements of the initializer list are in the specified range [begin, end).
// [used in ndarray.h -- static assertions on permutations]
constexpr bool
all_elements_in_range( std::size_t begin, std::size_t end, std::initializer_list< std::size_t > list )
{
   for( auto elem : list )
      if( elem < begin || elem >= end )
         return false;
   return true;
}
// Check that the elements of the initializer list form a strictly increasing
// sequence. An empty or single-element list is trivially increasing.
// [used in ndarray.h -- static assertion in getSubarrayView()]
constexpr bool
is_increasing_sequence( std::initializer_list< std::size_t > list )
{
   // bug fix: the previous version dereferenced list.begin() unconditionally,
   // which is undefined behavior for an empty list
   if( list.size() == 0 )
      return true;
   const std::size_t* it = list.begin();
   std::size_t prev = *it;
   for( ++it; it != list.end(); ++it ) {
      if( *it <= prev )
         return false;
      prev = *it;
   }
   return true;
}


// Count elements of a variadic pack smaller than a specified value.
// [used in ndarray_subarray.h to generate a subpermutation]
template< typename T, typename V >
constexpr std::size_t
count_smaller( T threshold, V&& value )
{
   return value < threshold ? 1 : 0;
}

template< typename T, typename V, typename... Values >
constexpr std::size_t
count_smaller( T threshold, V&& value, Values&&... vals )
{
   // count the head, then recurse on the tail
   return ( value < threshold ? 1 : 0 ) + count_smaller( threshold, vals... );
}
// C++14 version of sequence filtering, with hard-coded membership predicate:
// keeps val if and only if it is contained in Mask.
template< typename Mask, typename Index, Index val >
constexpr typename std::conditional_t< is_in_sequence( val, Mask{} ),
                                       std::integer_sequence< Index, val >,
                                       std::integer_sequence< Index > >
FilterSingle( std::integer_sequence< Index, val > )
{
   return {};
}

/*
 * Generic function returning a subsequence of a sequence obtained by omitting
 * the elements not contained in the specified mask.
 */
// empty sequence case
template< typename Mask, typename Index >
constexpr auto
filter_sequence( std::integer_sequence< Index > )
{
   return std::integer_sequence< Index >{};
}

// non empty sequence case
template< typename Mask, typename Index, Index... vals >
constexpr auto
filter_sequence( std::integer_sequence< Index, vals... > )
{
   return concat_sequences( FilterSingle< Mask >( std::integer_sequence< Index, vals >{} )... );
}


/*
 * make_constant_integer_sequence, make_constant_index_sequence - helper
 * templates for the generation of constant sequences like
 * std::make_integer_sequence, std::make_index_sequence
 *
 * FIX: the template arguments below were lost in transit (the text between
 * angle brackets was stripped); they are reconstructed from the halving
 * recursion pattern -- confirm against the upstream header.
 */
template< typename T, typename N, T v > struct gen_const_seq;
template< typename T, typename N, T v > using gen_const_seq_t = typename gen_const_seq< T, N, v >::type;

// general case: split the requested length in two halves and concatenate
template< typename T, typename N, T v >
struct gen_const_seq
{
   using type = decltype(concat_sequences(
            gen_const_seq_t< T, std::integral_constant< T, N::value / 2 >, v >{},
            gen_const_seq_t< T, std::integral_constant< T, N::value - N::value / 2 >, v >{}
         ));
};

// terminus: length 0
template< typename T, T v >
struct gen_const_seq< T, std::integral_constant< T, 0 >, v >
{
   using type = std::integer_sequence< T >;
};

// terminus: length 1
template< typename T, T v >
struct gen_const_seq< T, std::integral_constant< T, 1 >, v >
{
   using type = std::integer_sequence< T, v >;
};

template< typename T, T N, T value >
using make_constant_integer_sequence = gen_const_seq_t< T, std::integral_constant< T, N >, value >;

template< std::size_t N, std::size_t value >
using make_constant_index_sequence = gen_const_seq_t< std::size_t, std::integral_constant< std::size_t, N >, value >;
#ifndef __NVCC__
// Evaluates output( i... ) = f( input( i... )... ) over the whole index domain
// of `output`. All views and f are captured by value in the wrapper lambda.
template< typename Output,
          typename Func,
          typename... Input >
void nd_map_view( Output output, Func f, const Input... input )
{
   static_assert( all_elements_equal_to_value( Output::getDimension(), {Input::getDimension()...} ),
                  "all arrays must be of the same dimension" );

   // without mutable, the operator() would be const so output would be const as well
   // https://stackoverflow.com/a/2835645/4180822
   auto wrapper = [=] __cuda_callable__ ( auto... indices ) mutable {
      static_assert( sizeof...( indices ) == Output::getDimension(),
                     "wrong number of indices passed to the wrapper lambda function" );
      output( indices... ) = f( input( indices... )... );
   };

   ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch;
   using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >;
   dispatch( Begins{}, output.getSizes(), wrapper );
}

#else

   // nvcc-only path: the variadic generic lambda above is replaced with
   // explicit functor structs, one per supported number of input arrays (0-3).

   // wrapper for nullary f: output( i... ) = f()
   template< typename Output,
             typename Func >
   struct nvcc_map_helper_0
   {
      Output output;
      Func f;

      nvcc_map_helper_0( Output o, Func f ) : output(o), f(f) {}

      template< typename... Ts >
      __cuda_callable__
      void operator()( Ts... indices )
      {
         static_assert( sizeof...( indices ) == Output::getDimension(),
                        "wrong number of indices passed to the wrapper operator() function" );
         output( indices... ) = f();
      }
   };

   // wrapper for unary f: output( i... ) = f( input1( i... ) )
   template< typename Output,
             typename Func,
             typename Input1 >
   struct nvcc_map_helper_1
   {
      Output output;
      Func f;
      Input1 input1;

      nvcc_map_helper_1( Output o, Func f, Input1 i1 ) : output(o), f(f), input1(i1) {}

      template< typename... Ts >
      __cuda_callable__
      void operator()( Ts... indices )
      {
         static_assert( sizeof...( indices ) == Output::getDimension(),
                        "wrong number of indices passed to the wrapper operator() function" );
         output( indices... ) = f( input1( indices... ) );
      }
   };

   // wrapper for binary f
   template< typename Output,
             typename Func,
             typename Input1,
             typename Input2 >
   struct nvcc_map_helper_2
   {
      Output output;
      Func f;
      Input1 input1;
      Input2 input2;

      nvcc_map_helper_2( Output o, Func f, Input1 i1, Input2 i2 ) : output(o), f(f), input1(i1), input2(i2) {}

      template< typename... Ts >
      __cuda_callable__
      void operator()( Ts... indices )
      {
         static_assert( sizeof...( indices ) == Output::getDimension(),
                        "wrong number of indices passed to the wrapper operator() function" );
         output( indices... ) = f( input1( indices... ), input2( indices... ) );
      }
   };

   // wrapper for ternary f
   template< typename Output,
             typename Func,
             typename Input1,
             typename Input2,
             typename Input3 >
   struct nvcc_map_helper_3
   {
      Output output;
      Func f;
      Input1 input1;
      Input2 input2;
      Input3 input3;

      nvcc_map_helper_3( Output o, Func f, Input1 i1, Input2 i2, Input3 i3 ) : output(o), f(f), input1(i1), input2(i2), input3(i3) {}

      template< typename... Ts >
      __cuda_callable__
      void operator()( Ts... indices )
      {
         static_assert( sizeof...( indices ) == Output::getDimension(),
                        "wrong number of indices passed to the wrapper operator() function" );
         output( indices... ) = f( input1( indices... ), input2( indices... ), input3( indices... ) );
      }
   };

// nvcc variants of nd_map_view, one overload per number of input arrays
template< typename Output,
          typename Func >
void nd_map_view( Output output, Func f )
{
   nvcc_map_helper_0< Output, Func > wrapper( output, f );
   ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch;
   using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >;
   dispatch( Begins{}, output.getSizes(), wrapper );
}

template< typename Output,
          typename Func,
          typename Input1 >
void nd_map_view( Output output, Func f, const Input1 input1 )
{
   static_assert( all_elements_equal_to_value( Output::getDimension(), {Input1::getDimension()} ),
                  "all arrays must be of the same dimension" );

   nvcc_map_helper_1< Output, Func, Input1 > wrapper( output, f, input1 );
   ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch;
   using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >;
   dispatch( Begins{}, output.getSizes(), wrapper );
}

template< typename Output,
          typename Func,
          typename Input1,
          typename Input2 >
void nd_map_view( Output output, Func f, const Input1 input1, const Input2 input2 )
{
   static_assert( all_elements_equal_to_value( Output::getDimension(), {Input1::getDimension(), Input2::getDimension()} ),
                  "all arrays must be of the same dimension" );

   nvcc_map_helper_2< Output, Func, Input1, Input2 > wrapper( output, f, input1, input2 );
   ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch;
   using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >;
   dispatch( Begins{}, output.getSizes(), wrapper );
}

template< typename Output,
          typename Func,
          typename Input1,
          typename Input2,
          typename Input3 >
void nd_map_view( Output output, Func f, const Input1 input1, const Input2 input2, const Input3 input3 )
{
   static_assert( all_elements_equal_to_value( Output::getDimension(), {Input1::getDimension(), Input2::getDimension(), Input3::getDimension()} ),
                  "all arrays must be of the same dimension" );

   nvcc_map_helper_3< Output, Func, Input1, Input2, Input3 > wrapper( output, f, input1, input2, input3 );
   ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch;
   using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >;
   dispatch( Begins{}, output.getSizes(), wrapper );
}

#endif

} // namespace __ndarray_impl


// f must be an N-ary function, where N is the dimension of the output and input arrays:
//    output( i1, ..., iN ) = f( input1( i1, ..., iN ), ... inputM( i1, ..., iN ) )
template< typename Output,
          typename Func,
          typename... Input >
void nd_map( Output& output, Func f, const Input&... input )
{
   __ndarray_impl::nd_map_view( output.getView(), f, input.getConstView()... );
}
); +} + +template< typename Output, + typename Input > +void nd_assign( Output& output, const Input& input ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v ){ return v; }, input ); +#else + using value_type = typename Input::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type v ){ return v; }, input ); +#endif +} + +// Some mathematical functions, inspired by NumPy: +// https://docs.scipy.org/doc/numpy/reference/ufuncs.html#math-operations + +template< typename Output, + typename Input1, + typename Input2 > +void nd_add( Output& output, const Input1& input1, const Input2& input2 ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v1, auto v2 ){ return v1 + v2; }, input1, input2 ); +#else + using value_type_1 = typename Input1::ValueType; + using value_type_2 = typename Input2::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type_1 v1, value_type_2 v2 ){ return v1 + v2; }, input1, input2 ); +#endif +} + +template< typename Output, + typename Input1, + typename Input2 > +void nd_subtract( Output& output, const Input1& input1, const Input2& input2 ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v1, auto v2 ){ return v1 - v2; }, input1, input2 ); +#else + using value_type_1 = typename Input1::ValueType; + using value_type_2 = typename Input2::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type_1 v1, value_type_2 v2 ){ return v1 - v2; }, input1, input2 ); +#endif +} + +template< typename Output, + typename Input1, + typename Input2 > +void nd_multiply( Output& output, const Input1& input1, const Input2& input2 ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v1, auto v2 ){ return v1 * v2; }, input1, input2 ); +#else + using value_type_1 = typename Input1::ValueType; + using value_type_2 = typename Input2::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type_1 v1, value_type_2 v2 ){ return v1 * v2; }, input1, input2 ); +#endif +} + +template< typename 
Output, + typename Input1, + typename Input2 > +void nd_divide( Output& output, const Input1& input1, const Input2& input2 ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v1, auto v2 ){ return v1 / v2; }, input1, input2 ); +#else + using value_type_1 = typename Input1::ValueType; + using value_type_2 = typename Input2::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type_1 v1, value_type_2 v2 ){ return v1 / v2; }, input1, input2 ); +#endif +} + +template< typename Output, + typename Input1, + typename Input2 > +void nd_maximum( Output& output, const Input1& input1, const Input2& input2 ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v1, auto v2 ){ return TNL::max( v1, v2 ); }, input1, input2 ); +#else + using value_type_1 = typename Input1::ValueType; + using value_type_2 = typename Input2::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type_1 v1, value_type_2 v2 ){ return TNL::max( v1, v2 ); }, input1, input2 ); +#endif +} + +template< typename Output, + typename Input1, + typename Input2 > +void nd_minimum( Output& output, const Input1& input1, const Input2& input2 ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v1, auto v2 ){ return TNL::min( v1, v2 ); }, input1, input2 ); +#else + using value_type_1 = typename Input1::ValueType; + using value_type_2 = typename Input2::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type_1 v1, value_type_2 v2 ){ return TNL::min( v1, v2 ); }, input1, input2 ); +#endif +} + +template< typename Output, + typename Input > +void nd_absolute( Output& output, const Input& input ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v ){ return TNL::abs( v ); }, input ); +#else + using value_type = typename Input::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type v ){ return TNL::abs( v ); }, input ); +#endif +} + +template< typename Output, + typename Input > +void nd_sign( Output& output, const Input& input ) +{ 
+#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v ){ return TNL::sign( v ); }, input ); +#else + using value_type = typename Input::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type v ){ return TNL::sign( v ); }, input ); +#endif +} + +template< typename Output, + typename Input1, + typename Input2 > +void nd_pow( Output& output, const Input1& input1, const Input2& input2 ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v1, auto v2 ){ return TNL::pow( v1, v2 ); }, input1, input2 ); +#else + using value_type_1 = typename Input1::ValueType; + using value_type_2 = typename Input2::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type_1 v1, value_type_2 v2 ){ return TNL::pow( v1, v2 ); }, input1, input2 ); +#endif +} + +template< typename Output, + typename Input > +void nd_sqrt( Output& output, const Input& input ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v ){ return TNL::sqrt( v ); }, input ); +#else + using value_type = typename Input::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type v ){ return TNL::sqrt( v ); }, input ); +#endif +} + +template< typename Output, + typename Input > +void nd_square( Output& output, const Input& input ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v ){ return v*v; }, input ); +#else + using value_type = typename Input::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type v ){ return v*v; }, input ); +#endif +} + +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/ndarray/SizesHolder.h b/src/TNL/Containers/ndarray/SizesHolder.h new file mode 100644 index 0000000000000000000000000000000000000000..0b6e1f83d41f07c62306729de5ae0ed65e656f53 --- /dev/null +++ b/src/TNL/Containers/ndarray/SizesHolder.h @@ -0,0 +1,374 @@ +/*************************************************************************** + SizesHolder.h - description + ------------------- + begin : Dec 24, 2018 + copyright : (C) 
2018 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include +#include + +#include + +namespace TNL { +namespace Containers { + +namespace __ndarray_impl { + +template< typename Index, + typename LevelTag, + std::size_t size > +class SizeHolder +{ +public: + __cuda_callable__ + constexpr Index getSize( LevelTag ) const + { + return size; + } + + __cuda_callable__ + void setSize( LevelTag, Index newSize ) + { + TNL_ASSERT_EQ( newSize, 0, "Dynamic size for a static dimension must be 0." ); + } + + __cuda_callable__ + bool operator==( const SizeHolder& ) const + { + return true; + } +}; + +template< typename Index, + typename LevelTag > +class SizeHolder< Index, LevelTag, 0 > +{ +public: + __cuda_callable__ + Index getSize( LevelTag ) const + { + return size; + } + + __cuda_callable__ + void setSize( LevelTag, Index size ) + { + this->size = size; + } + + __cuda_callable__ + bool operator==( const SizeHolder& other ) const + { + return size == other.size; + } + +private: + Index size = 0; +}; + +template< typename Index, + std::size_t currentSize, + std::size_t... otherSizes > +class SizesHolderLayer +: public SizesHolderLayer< Index, otherSizes... >, + public SizeHolder< Index, + IndexTag< sizeof...( otherSizes ) >, // LevelTag + currentSize > +{ + using BaseType = SizesHolderLayer< Index, otherSizes... 
>; + using Layer = SizeHolder< Index, + IndexTag< sizeof...( otherSizes ) >, // LevelTag + currentSize >; +protected: + using BaseType::getSize; + using BaseType::setSize; + using Layer::getSize; + using Layer::setSize; + + __cuda_callable__ + bool operator==( const SizesHolderLayer& other ) const + { + return BaseType::operator==( other ) && + Layer::operator==( other ); + } +}; + +// specializations to terminate the recursive inheritance +template< typename Index, + std::size_t currentSize > +class SizesHolderLayer< Index, currentSize > +: public SizeHolder< Index, + IndexTag< 0 >, // LevelTag + currentSize > +{ + using Layer = SizeHolder< Index, + IndexTag< 0 >, // LevelTag + currentSize >; +protected: + using Layer::getSize; + using Layer::setSize; + + __cuda_callable__ + bool operator==( const SizesHolderLayer& other ) const + { + return Layer::operator==( other ); + } +}; + +template< std::size_t dimension > +struct SizesHolderStaticSizePrinter +{ + template< typename SizesHolder > + static void exec( std::ostream& str, const SizesHolder& holder ) + { + str << holder.template getStaticSize< dimension >() << ", "; + } +}; + +template< std::size_t dimension > +struct SizesHolderSizePrinter +{ + template< typename SizesHolder > + static void exec( std::ostream& str, const SizesHolder& holder ) + { + str << holder.template getSize< dimension >() << ", "; + } +}; + +template< std::size_t level > +struct SizesHolerOperatorPlusHelper +{ + template< typename Result, typename LHS, typename RHS > + static void exec( Result& result, const LHS& lhs, const RHS& rhs ) + { + if( result.template getStaticSize< level >() == 0 ) + result.template setSize< level >( lhs.template getSize< level >() + rhs.template getSize< level >() ); + } +}; + +template< std::size_t level > +struct SizesHolerOperatorMinusHelper +{ + template< typename Result, typename LHS, typename RHS > + static void exec( Result& result, const LHS& lhs, const RHS& rhs ) + { + if( result.template getStaticSize< 
level >() == 0 ) + result.template setSize< level >( lhs.template getSize< level >() - rhs.template getSize< level >() ); + } +}; + +} // namespace __ndarray_impl + + +// dimensions and static sizes are specified as std::size_t, +// the type of dynamic sizes is configurable with Index + +template< typename Index, + std::size_t... sizes > +class SizesHolder +: public __ndarray_impl::SizesHolderLayer< Index, sizes... > +{ + using BaseType = __ndarray_impl::SizesHolderLayer< Index, sizes... >; + +public: + using IndexType = Index; + + static constexpr std::size_t getDimension() + { + return sizeof...( sizes ); + } + + template< std::size_t dimension > + static constexpr std::size_t getStaticSize() + { + static_assert( dimension < sizeof...(sizes), "Invalid dimension passed to getStaticSize()." ); + return __ndarray_impl::get_from_pack< dimension >( sizes... ); + } + + template< std::size_t level > + __cuda_callable__ + Index getSize() const + { + static_assert( level < sizeof...(sizes), "Invalid level passed to getSize()." ); + return BaseType::getSize( __ndarray_impl::IndexTag< getDimension() - level - 1 >() ); + } + + template< std::size_t level > + __cuda_callable__ + void setSize( Index size ) + { + static_assert( level < sizeof...(sizes), "Invalid level passed to setSize()." ); + BaseType::setSize( __ndarray_impl::IndexTag< getDimension() - level - 1 >(), size ); + } + + // methods for convenience + __cuda_callable__ + bool operator==( const SizesHolder& other ) const + { + return BaseType::operator==( other ); + } + + __cuda_callable__ + bool operator!=( const SizesHolder& other ) const + { + return ! operator==( other ); + } +}; + +template< typename Index, + std::size_t... sizes, + typename OtherHolder > +SizesHolder< Index, sizes... > +operator+( const SizesHolder< Index, sizes... >& lhs, const OtherHolder& rhs ) +{ + SizesHolder< Index, sizes... 
> result; + TemplateStaticFor< std::size_t, 0, sizeof...(sizes), __ndarray_impl::SizesHolerOperatorPlusHelper >::execHost( result, lhs, rhs ); + return result; +} + +template< typename Index, + std::size_t... sizes, + typename OtherHolder > +SizesHolder< Index, sizes... > +operator-( const SizesHolder< Index, sizes... >& lhs, const OtherHolder& rhs ) +{ + SizesHolder< Index, sizes... > result; + TemplateStaticFor< std::size_t, 0, sizeof...(sizes), __ndarray_impl::SizesHolerOperatorMinusHelper >::execHost( result, lhs, rhs ); + return result; +} + + +template< typename Index, + std::size_t dimension, + Index constSize > +class ConstStaticSizesHolder +{ +public: + using IndexType = Index; + + static constexpr std::size_t getDimension() + { + return dimension; + } + + template< std::size_t level > + static constexpr std::size_t getStaticSize() + { + static_assert( level < getDimension(), "Invalid level passed to getStaticSize()." ); + return constSize; + } + + template< std::size_t level > + __cuda_callable__ + Index getSize() const + { + static_assert( level < getDimension(), "Invalid dimension passed to getSize()." ); + return constSize; + } + + // methods for convenience + __cuda_callable__ + bool operator==( const ConstStaticSizesHolder& other ) const + { + return true; + } + + __cuda_callable__ + bool operator!=( const ConstStaticSizesHolder& other ) const + { + return false; + } +}; + + +template< typename Index, + std::size_t... sizes > +std::ostream& operator<<( std::ostream& str, const SizesHolder< Index, sizes... 
>& holder ) +{ + str << "SizesHolder< "; + TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, __ndarray_impl::SizesHolderStaticSizePrinter >::execHost( str, holder ); + str << holder.template getStaticSize< sizeof...(sizes) - 1 >() << " >( "; + TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, __ndarray_impl::SizesHolderSizePrinter >::execHost( str, holder ); + str << holder.template getSize< sizeof...(sizes) - 1 >() << " )"; + return str; +} + + +namespace __ndarray_impl { + +// helper for the forInternal method +template< typename SizesHolder, + std::size_t ConstValue > +struct SubtractedSizesHolder +{}; + +template< typename Index, + std::size_t ConstValue, + std::size_t... sizes > +struct SubtractedSizesHolder< SizesHolder< Index, sizes... >, ConstValue > +{ +// using type = SizesHolder< Index, std::max( (std::size_t) 0, sizes - ConstValue )... >; + using type = SizesHolder< Index, ( (sizes >= ConstValue) ? sizes - ConstValue : 0 )... >; +}; + + +// wrapper for localBegins in DistributedNDArray (static sizes cannot be distributed, begins are always 0) +template< typename SizesHolder, + // overridable value is useful in the forInternal method + std::size_t ConstValue = 0 > +struct LocalBeginsHolder : public SizesHolder +{ + template< std::size_t dimension > + static constexpr std::size_t getStaticSize() + { + static_assert( dimension < SizesHolder::getDimension(), "Invalid dimension passed to getStaticSize()." 
); + return ConstValue; + } + + template< std::size_t level > + __cuda_callable__ + typename SizesHolder::IndexType getSize() const + { + if( SizesHolder::template getStaticSize< level >() != 0 ) + return ConstValue; + return SizesHolder::template getSize< level >(); + } + + template< std::size_t level > + __cuda_callable__ + void setSize( typename SizesHolder::IndexType newSize ) + { + if( SizesHolder::template getStaticSize< level >() == 0 ) + SizesHolder::template setSize< level >( newSize ); + else + TNL_ASSERT_EQ( newSize, ConstValue, "Dynamic size for a static dimension must be equal to the specified ConstValue." ); + } +}; + +template< typename Index, + std::size_t... sizes, + std::size_t ConstValue > +std::ostream& operator<<( std::ostream& str, const __ndarray_impl::LocalBeginsHolder< SizesHolder< Index, sizes... >, ConstValue >& holder ) +{ + str << "LocalBeginsHolder< SizesHolder< "; + TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, __ndarray_impl::SizesHolderStaticSizePrinter >::execHost( str, (SizesHolder< Index, sizes... >) holder ); + str << holder.template getStaticSize< sizeof...(sizes) - 1 >() << " >, "; + str << ConstValue << " >( "; + TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, __ndarray_impl::SizesHolderSizePrinter >::execHost( str, holder ); + str << holder.template getSize< sizeof...(sizes) - 1 >() << " )"; + return str; +} + +} // namespace __ndarray_impl + +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/ndarray/SizesHolderHelpers.h b/src/TNL/Containers/ndarray/SizesHolderHelpers.h new file mode 100644 index 0000000000000000000000000000000000000000..2e92ed43dfba5cbe5041313312918464b70fc8e2 --- /dev/null +++ b/src/TNL/Containers/ndarray/SizesHolderHelpers.h @@ -0,0 +1,414 @@ +/*************************************************************************** + SizesHolderHelpers.h - description + ------------------- + begin : Dec 24, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include + +#include +#include +#include + +namespace TNL { +namespace Containers { +namespace __ndarray_impl { + +// Dynamic storage size with alignment +template< typename SizesHolder, + typename Alignment, + typename Overlaps, + typename LevelTag = IndexTag< SizesHolder::getDimension() - 1 > > +struct StorageSizeGetter +{ + static typename SizesHolder::IndexType + __cuda_callable__ + get( const SizesHolder& sizes ) + { + static constexpr std::size_t overlap = __ndarray_impl::get< LevelTag::value >( Overlaps{} ); + const auto size = Alignment::template getAlignedSize< LevelTag::value >( sizes ); + return ( size + 2 * overlap ) + * StorageSizeGetter< SizesHolder, Alignment, Overlaps, IndexTag< LevelTag::value - 1 > >::get( sizes ); + } + + template< typename Permutation > + __cuda_callable__ + static typename SizesHolder::IndexType + getPermuted( const SizesHolder& sizes, Permutation ) + { + static constexpr std::size_t idx = __ndarray_impl::get< LevelTag::value >( Permutation{} ); + static constexpr std::size_t overlap = __ndarray_impl::get< idx >( Overlaps{} ); + const auto size = Alignment::template getAlignedSize< idx >( sizes ); + return ( size + 2 * overlap ) + * StorageSizeGetter< SizesHolder, Alignment, Overlaps, IndexTag< LevelTag::value - 1 > >::get( sizes ); + } +}; + +template< typename SizesHolder, typename Alignment, typename Overlaps > +struct StorageSizeGetter< SizesHolder, Alignment, Overlaps, IndexTag< 0 > > +{ + static typename SizesHolder::IndexType + __cuda_callable__ + get( const SizesHolder& sizes ) + { + static constexpr std::size_t overlap = __ndarray_impl::get< 0 >( Overlaps{} ); + return Alignment::template getAlignedSize< 0 >( sizes ) + 2 * overlap; + } + + template< typename Permutation > + __cuda_callable__ + static 
typename SizesHolder::IndexType + getPermuted( const SizesHolder& sizes, Permutation ) + { + static constexpr std::size_t idx = __ndarray_impl::get< 0 >( Permutation{} ); + static constexpr std::size_t overlap = __ndarray_impl::get< idx >( Overlaps{} ); + return Alignment::template getAlignedSize< idx >( sizes ) + 2 * overlap; + } +}; + + +// Static storage size without alignment, used in StaticNDArray +template< typename SizesHolder, + typename LevelTag = IndexTag< SizesHolder::getDimension() - 1 > > +struct StaticStorageSizeGetter +{ + constexpr static std::size_t get() + { + return SizesHolder::template getStaticSize< LevelTag::value >() * + StaticStorageSizeGetter< SizesHolder, IndexTag< LevelTag::value - 1 > >::get(); + } +}; + +template< typename SizesHolder > +struct StaticStorageSizeGetter< SizesHolder, IndexTag< 0 > > +{ + constexpr static std::size_t get() + { + return SizesHolder::template getStaticSize< 0 >(); + } +}; + + +template< std::size_t level = 0, + typename SizesHolder, + typename Index, + typename... IndexTypes > +void setSizesHelper( SizesHolder& holder, + Index&& size, + IndexTypes&&... otherSizes ) +{ + holder.template setSize< level >( std::forward< Index >( size ) ); + setSizesHelper< level + 1 >( holder, std::forward< IndexTypes >( otherSizes )... ); +} + +template< std::size_t level = 0, + typename SizesHolder, + typename Index > +void setSizesHelper( SizesHolder& holder, + Index&& size ) +{ + holder.template setSize< level >( std::forward< Index >( size ) ); +} + + +// A variadic bounds-checker for indices +template< typename SizesHolder > +__cuda_callable__ +void assertIndicesInBounds( const SizesHolder& ) +{} + +template< typename SizesHolder, + typename Index, + typename... IndexTypes > +__cuda_callable__ +void assertIndicesInBounds( const SizesHolder& sizes, Index&& i, IndexTypes&&... 
indices ) +{ +#ifndef NDEBUG + // sizes.template getSize<...>() cannot be inside the assert macro, but the variables + // shouldn't be declared when compiling without assertions + constexpr std::size_t level = SizesHolder::getDimension() - sizeof...(indices) - 1; + const auto size = sizes.template getSize< level >(); + TNL_ASSERT_LT( i, (Index) size, "Input error - some index is out of bounds." ); +#endif + assertIndicesInBounds( sizes, std::forward< IndexTypes >( indices )... ); +} + + +// A variadic bounds-checker for distributed indices with overlaps +template< typename SizesHolder1, typename SizesHolder2, typename Overlaps > +__cuda_callable__ +void assertIndicesInRange( const SizesHolder1&, const SizesHolder2&, const Overlaps& ) +{} + +template< typename SizesHolder1, + typename SizesHolder2, + typename Overlaps, + typename Index, + typename... IndexTypes > +__cuda_callable__ +void assertIndicesInRange( const SizesHolder1& begins, const SizesHolder2& ends, const Overlaps& overlaps, Index&& i, IndexTypes&&... indices ) +{ + static_assert( SizesHolder1::getDimension() == SizesHolder2::getDimension(), + "Inconsistent begins and ends." ); +#ifndef NDEBUG + // sizes.template getSize<...>() cannot be inside the assert macro, but the variables + // shouldn't be declared when compiling without assertions + constexpr std::size_t level = SizesHolder1::getDimension() - sizeof...(indices) - 1; + const auto begin = begins.template getSize< level >(); + const auto end = ends.template getSize< level >(); + TNL_ASSERT_LE( begin - (decltype(begin)) get( overlaps ), i, "Input error - some index is below the lower bound." ); + TNL_ASSERT_LT( i, end + (decltype(end)) get( overlaps ), "Input error - some index is above the upper bound." ); +#endif + assertIndicesInRange( begins, ends, overlaps, std::forward< IndexTypes >( indices )... 
); +} + + +// helper for the assignment operator in NDArray +template< typename TargetHolder, + typename SourceHolder, + std::size_t level = TargetHolder::getDimension() - 1 > +struct SetSizesCopyHelper +{ + static void copy( TargetHolder& target, + const SourceHolder& source ) + { + if( target.template getStaticSize< level >() == 0 ) { + target.template setSize< level >( source.template getSize< level >() ); + SetSizesCopyHelper< TargetHolder, SourceHolder, level - 1 >::copy( target, source ); + } + else if( target.template getStaticSize< level >() != source.template getSize< level >() ) + throw std::logic_error( "Cannot copy sizes due to inconsistent underlying types (static sizes don't match)." ); + } +}; + +template< typename TargetHolder, + typename SourceHolder > +struct SetSizesCopyHelper< TargetHolder, SourceHolder, 0 > +{ + static void copy( TargetHolder& target, + const SourceHolder& source ) + { + if( target.template getStaticSize< 0 >() == 0 ) + target.template setSize< 0 >( source.template getSize< 0 >() ); + else if( target.template getStaticSize< 0 >() != source.template getSize< 0 >() ) + throw std::logic_error( "Cannot copy sizes due to inconsistent underlying types (static sizes don't match)." ); + } +}; + + +template< std::size_t level > +struct WeakCompareHelper +{ + template< typename SizesHolder1, + typename SizesHolder2 > + __cuda_callable__ + static void exec( const SizesHolder1& sizes1, const SizesHolder2& sizes2, bool& result ) + { + result &= sizes1.template getSize< level >() == sizes2.template getSize< level >(); + } +}; + +// helper for the assignment operator in NDArrayView +template< typename SizesHolder1, + typename SizesHolder2 > +__cuda_callable__ +bool sizesWeakCompare( const SizesHolder1& sizes1, const SizesHolder2& sizes2 ) +{ + static_assert( SizesHolder1::getDimension() == SizesHolder2::getDimension(), + "Cannot compare sizes of different dimensions." 
); + bool result = true; + TemplateStaticFor< std::size_t, 0, SizesHolder1::getDimension(), WeakCompareHelper >::exec( sizes1, sizes2, result ); + return result; +} + + +// helper for the forInternal and forBoundary methods (NDArray and DistributedNDArray) +template< std::size_t ConstValue, + typename TargetHolder, + typename SourceHolder, + typename Overlaps = make_constant_index_sequence< TargetHolder::getDimension(), 0 >, + std::size_t level = TargetHolder::getDimension() - 1 > +struct SetSizesSubtractHelper +{ + static void subtract( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< level >() == 0 ) + target.template setSize< level >( source.template getSize< level >() - ConstValue * ! get< level >( Overlaps{} ) ); + SetSizesSubtractHelper< ConstValue, TargetHolder, SourceHolder, Overlaps, level - 1 >::subtract( target, source ); + } +}; + +template< std::size_t ConstValue, + typename TargetHolder, + typename SourceHolder, + typename Overlaps > +struct SetSizesSubtractHelper< ConstValue, TargetHolder, SourceHolder, Overlaps, 0 > +{ + static void subtract( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< 0 >() == 0 ) + target.template setSize< 0 >( source.template getSize< 0 >() - ConstValue * ! get< 0 >( Overlaps{} ) ); + } +}; + + +// helper for the forInternal and forBoundary methods (DistributedNDArray) +template< std::size_t ConstValue, + typename TargetHolder, + typename SourceHolder, + typename Overlaps = make_constant_index_sequence< TargetHolder::getDimension(), 0 >, + std::size_t level = TargetHolder::getDimension() - 1 > +struct SetSizesAddHelper +{ + static void add( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< level >() == 0 ) + target.template setSize< level >( source.template getSize< level >() + ConstValue * ! 
get< level >( Overlaps{} ) ); + SetSizesAddHelper< ConstValue, TargetHolder, SourceHolder, Overlaps, level - 1 >::add( target, source ); + } +}; + +template< std::size_t ConstValue, + typename TargetHolder, + typename SourceHolder, + typename Overlaps > +struct SetSizesAddHelper< ConstValue, TargetHolder, SourceHolder, Overlaps, 0 > +{ + static void add( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< 0 >() == 0 ) + target.template setSize< 0 >( source.template getSize< 0 >() + ConstValue * ! get< 0 >( Overlaps{} ) ); + } +}; + + +// helper for the forLocalInternal, forLocalBoundary and forOverlaps methods (DistributedNDArray) +template< typename TargetHolder, + typename SourceHolder, + typename Overlaps = make_constant_index_sequence< TargetHolder::getDimension(), 0 >, + std::size_t level = TargetHolder::getDimension() - 1 > +struct SetSizesSubtractOverlapsHelper +{ + static void subtract( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< level >() == 0 ) + target.template setSize< level >( source.template getSize< level >() - get< level >( Overlaps{} ) ); + SetSizesSubtractOverlapsHelper< TargetHolder, SourceHolder, Overlaps, level - 1 >::subtract( target, source ); + } +}; + +template< typename TargetHolder, + typename SourceHolder, + typename Overlaps > +struct SetSizesSubtractOverlapsHelper< TargetHolder, SourceHolder, Overlaps, 0 > +{ + static void subtract( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< 0 >() == 0 ) + target.template setSize< 0 >( source.template getSize< 0 >() - get< 0 >( Overlaps{} ) ); + } +}; + + +// helper for the forLocalInternal, forLocalBoundary and forOverlaps methods (DistributedNDArray) +template< typename TargetHolder, + typename SourceHolder, + typename Overlaps = make_constant_index_sequence< TargetHolder::getDimension(), 0 >, + std::size_t level = TargetHolder::getDimension() - 1 > +struct 
SetSizesAddOverlapsHelper +{ + static void add( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< level >() == 0 ) + target.template setSize< level >( source.template getSize< level >() + get< level >( Overlaps{} ) ); + SetSizesAddOverlapsHelper< TargetHolder, SourceHolder, Overlaps, level - 1 >::add( target, source ); + } +}; + +template< typename TargetHolder, + typename SourceHolder, + typename Overlaps > +struct SetSizesAddOverlapsHelper< TargetHolder, SourceHolder, Overlaps, 0 > +{ + static void add( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< 0 >() == 0 ) + target.template setSize< 0 >( source.template getSize< 0 >() + get< 0 >( Overlaps{} ) ); + } +}; + + +// helper for the forInternal method (DistributedNDArray) +template< typename TargetHolder, + typename SourceHolder, + std::size_t level = TargetHolder::getDimension() - 1 > +struct SetSizesMaxHelper +{ + static void max( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< level >() == 0 ) + target.template setSize< level >( std::max( target.template getSize< level >(), source.template getSize< level >() ) ); + SetSizesMaxHelper< TargetHolder, SourceHolder, level - 1 >::max( target, source ); + } +}; + +template< typename TargetHolder, + typename SourceHolder > +struct SetSizesMaxHelper< TargetHolder, SourceHolder, 0 > +{ + static void max( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< 0 >() == 0 ) + target.template setSize< 0 >( std::max( target.template getSize< 0 >(), source.template getSize< 0 >() ) ); + } +}; + + +// helper for the forInternal method (DistributedNDArray) +template< typename TargetHolder, + typename SourceHolder, + std::size_t level = TargetHolder::getDimension() - 1 > +struct SetSizesMinHelper +{ + static void min( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template 
getStaticSize< level >() == 0 ) + target.template setSize< level >( std::min( target.template getSize< level >(), source.template getSize< level >() ) ); + SetSizesMinHelper< TargetHolder, SourceHolder, level - 1 >::min( target, source ); + } +}; + +template< typename TargetHolder, + typename SourceHolder > +struct SetSizesMinHelper< TargetHolder, SourceHolder, 0 > +{ + static void min( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< 0 >() == 0 ) + target.template setSize< 0 >( std::min( target.template getSize< 0 >(), source.template getSize< 0 >() ) ); + } +}; + +} // namespace __ndarray_impl +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/ndarray/Subarrays.h b/src/TNL/Containers/ndarray/Subarrays.h new file mode 100644 index 0000000000000000000000000000000000000000..d50a30ea1178743342685801f1c3c22af2a52c00 --- /dev/null +++ b/src/TNL/Containers/ndarray/Subarrays.h @@ -0,0 +1,356 @@ +/*************************************************************************** + Subarrays.h - description + ------------------- + begin : Dec 24, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include +#include + +namespace TNL { +namespace Containers { +namespace __ndarray_impl { + +template< typename Dimensions, typename Permutation > +class SubpermutationGetter; + +template< std::size_t... dims, std::size_t... vals > +class SubpermutationGetter< std::index_sequence< dims... >, std::index_sequence< vals... > > +{ +private: + using Dimensions = std::index_sequence< dims... >; + using Permutation = std::index_sequence< vals... >; + using Subsequence = decltype( + filter_sequence< Dimensions >( Permutation{} ) + ); + + template< std::size_t... 
v > + static constexpr auto + get_subpermutation( std::index_sequence< v... > ) + { + using Subpermutation = std::index_sequence< count_smaller( v, v... )... >; + return Subpermutation{}; + } + +public: + using Subpermutation = decltype( + get_subpermutation( Subsequence{} ) + ); +}; + + +template< typename Dimensions, typename SizesHolder > +class SizesFilter; + +template< std::size_t... dims, typename Index, std::size_t... sizes > +class SizesFilter< std::index_sequence< dims... >, SizesHolder< Index, sizes... > > +{ +private: + using Dimensions = std::index_sequence< dims... >; + using SizesSequence = std::index_sequence< sizes... >; + using Subsequence = decltype( + concat_sequences( std::index_sequence< get_from_pack< dims >( sizes... ) >{} ... ) + ); + + template< std::size_t... v > + static constexpr auto + get_sizesholder( std::index_sequence< v... > ) + { + using Sizes = SizesHolder< Index, v... >; + return Sizes{}; + } + + template< std::size_t level = 0, typename = void > + struct SizeSetterHelper + { + template< typename NewSizes, + typename OldSizes > + __cuda_callable__ + static void setSizes( NewSizes& newSizes, + const OldSizes& oldSizes ) + { + if( oldSizes.template getStaticSize< level >() == 0 ) + newSizes.template setSize< level >( oldSizes.template getSize< get< level >( Dimensions{} ) >() ); + SizeSetterHelper< level + 1 >::setSizes( newSizes, oldSizes ); + } + }; + + template< typename _unused > + struct SizeSetterHelper< Dimensions::size() - 1, _unused > + { + template< typename NewSizes, + typename OldSizes > + __cuda_callable__ + static void setSizes( NewSizes& newSizes, + const OldSizes& oldSizes ) + { + static constexpr std::size_t level = Dimensions::size() - 1; + if( oldSizes.template getStaticSize< level >() == 0 ) + newSizes.template setSize< level >( oldSizes.template getSize< get< level >( Dimensions{} ) >() ); + } + }; + + template< std::size_t level = 0, typename = void > + struct IndexChecker + { + template< typename...
IndexTypes > + static bool check( IndexTypes&&... indices ) + { + static constexpr std::size_t d = get< level >( Dimensions{} ); + if( get_from_pack< d >( std::forward< IndexTypes >( indices )... ) != 0 ) + return false; + return IndexChecker< level + 1 >::check( std::forward< IndexTypes >( indices )... ); + } + }; + + template< typename _unused > + struct IndexChecker< Dimensions::size() - 1, _unused > + { + template< typename... IndexTypes > + static bool check( IndexTypes&&... indices ) + { + static constexpr std::size_t d = get< Dimensions::size() - 1 >( Dimensions{} ); + if( get_from_pack< d >( std::forward< IndexTypes >( indices )... ) != 0 ) + return false; + return true; + } + }; + +public: + using Sizes = decltype( + get_sizesholder( Subsequence{} ) + ); + + template< typename... IndexTypes > + __cuda_callable__ + static Sizes filterSizes( const SizesHolder< Index, sizes... >& oldSizes, IndexTypes&&... indices ) + { + Sizes newSizes; + + // assert that indices are 0 for the dimensions in the subarray + // (contraction of dimensions is not supported yet, and it does not + // make sense for static dimensions anyway) + TNL_ASSERT_TRUE( IndexChecker<>::check( std::forward< IndexTypes >( indices )... ), + "Static dimensions of the subarray must start at index 0 of the array." ); + + // set dynamic sizes + // pseudo-python-code: + // for d, D in enumerate(dims...): + // newSizes.setSize< d >( oldSizes.getSize< D >() ) + SizeSetterHelper<>::setSizes( newSizes, oldSizes ); + + return newSizes; + } +}; + + +template< typename Index, std::size_t Dimension > +struct DummyStrideBase +{ + static constexpr std::size_t getDimension() + { + return Dimension; + } + + static constexpr bool isContiguous() + { + return true; + } + + template< std::size_t level > + __cuda_callable__ + constexpr Index getStride( Index i = 0 ) const + { + return 1; + } +}; + +template< typename Index, + std::size_t... sizes > +class StridesHolder +: private SizesHolder< Index, sizes... 
> +{ + using BaseType = SizesHolder< Index, sizes... >; + +public: + using BaseType::getDimension; + + static constexpr bool isContiguous() + { + // a priori not contiguous (otherwise DummyStrideBase would be used) + return false; + } + + template< std::size_t level > + static constexpr std::size_t getStaticStride( Index i = 0 ) + { + return BaseType::template getStaticSize< level >(); + } + + template< std::size_t level > + __cuda_callable__ + Index getStride( Index i = 0 ) const + { + return BaseType::template getSize< level >(); + } + + template< std::size_t level > + __cuda_callable__ + void setStride( Index size ) + { + BaseType::template setSize< level >( size ); + } +}; + +template< typename Base, typename Permutation, std::size_t... Dimensions > +class SubarrayGetter; + +template< typename SliceInfo, typename Permutation, std::size_t... Dimensions > +class SubarrayGetter< NDArrayBase< SliceInfo >, Permutation, Dimensions... > +{ + // returns the number of factors in the stride product + template< std::size_t dim, std::size_t... vals > + static constexpr std::size_t get_end( std::index_sequence< vals... > _perm ) + { + if( dim == get< Permutation::size() - 1 >( Permutation{} ) ) + return 0; + std::size_t i = 0; + std::size_t count = 0; +// FIXME: nvcc chokes on the variadic brace-initialization +#ifndef __NVCC__ + for( auto v : std::initializer_list< std::size_t >{ vals... } ) +#else + for( auto v : (std::size_t [sizeof...(vals)]){ vals... } ) +#endif + { + if( i++ <= index_in_pack( dim, vals... ) ) + continue; + if( is_in_sequence( v, std::index_sequence< Dimensions... 
>{} ) ) + break; + count++; + } + return count; + } + + // static calculation of the stride product + template< typename SizesHolder, + std::size_t start_dim, + std::size_t end = get_end< start_dim >( Permutation{} ), + std::size_t level = 0, + typename = void > + struct StaticStrideGetter + { + static constexpr std::size_t get() + { + constexpr std::size_t start_offset = index_in_sequence( start_dim, Permutation{} ); + constexpr std::size_t dim = __ndarray_impl::get< start_offset + level + 1 >( Permutation{} ); + return SizesHolder::template getStaticSize< dim >() * StaticStrideGetter< SizesHolder, start_dim, end, level + 1 >::get(); + } + }; + + template< typename SizesHolder, std::size_t start_dim, std::size_t end, typename _unused > + struct StaticStrideGetter< SizesHolder, start_dim, end, end, _unused > + { + static constexpr std::size_t get() + { + return 1; + } + }; + + // dynamic calculation of the stride product + template< std::size_t start_dim, + std::size_t end = get_end< start_dim >( Permutation{} ), + std::size_t level = 0, + typename = void > + struct DynamicStrideGetter + { + template< typename SizesHolder > + static constexpr std::size_t get( const SizesHolder& sizes ) + { + constexpr std::size_t start_offset = index_in_sequence( start_dim, Permutation{} ); + constexpr std::size_t dim = __ndarray_impl::get< start_offset + level + 1 >( Permutation{} ); + return sizes.template getSize< dim >() * DynamicStrideGetter< start_dim, end, level + 1 >::get( sizes ); + } + }; + + template< std::size_t start_dim, std::size_t end, typename _unused > + struct DynamicStrideGetter< start_dim, end, end, _unused > + { + template< typename SizesHolder > + static constexpr std::size_t get( const SizesHolder& sizes ) + { + return 1; + } + }; + + // helper class for setting dynamic strides + template< std::size_t level = 0, typename = void > + struct StrideSetterHelper + { + template< typename StridesHolder, typename SizesHolder > + __cuda_callable__ + static void 
setStrides( StridesHolder& strides, const SizesHolder& sizes ) + { + static constexpr std::size_t dim = get_from_pack< level >( Dimensions... ); + if( StridesHolder::template getStaticStride< level >() == 0 ) + strides.template setStride< level >( DynamicStrideGetter< dim >::get( sizes ) ); + StrideSetterHelper< level + 1 >::setStrides( strides, sizes ); + } + }; + + template< typename _unused > + struct StrideSetterHelper< sizeof...(Dimensions) - 1, _unused > + { + template< typename StridesHolder, typename SizesHolder > + __cuda_callable__ + static void setStrides( StridesHolder& strides, const SizesHolder& sizes ) + { + static constexpr std::size_t level = sizeof...(Dimensions) - 1; + static constexpr std::size_t dim = get_from_pack< level >( Dimensions... ); + if( StridesHolder::template getStaticStride< level >() == 0 ) + strides.template setStride< level >( DynamicStrideGetter< dim >::get( sizes ) ); + } + }; + +public: + using Subpermutation = typename SubpermutationGetter< std::index_sequence< Dimensions... >, Permutation >::Subpermutation; + + template< typename SizesHolder, typename... IndexTypes > + __cuda_callable__ + static auto filterSizes( const SizesHolder& sizes, IndexTypes&&... indices ) + { + using Filter = SizesFilter< std::index_sequence< Dimensions... >, SizesHolder >; + return Filter::filterSizes( sizes, std::forward< IndexTypes >( indices )... ); + } + + template< typename SizesHolder, typename... IndexTypes > + __cuda_callable__ + static auto getStrides( const SizesHolder& sizes, IndexTypes&&... indices ) + { + using Strides = StridesHolder< typename SizesHolder::IndexType, + StaticStrideGetter< SizesHolder, Dimensions >::get()... 
>; + Strides strides; + + // set dynamic strides + // pseudo-python-code: + // for i, d in enumerate(Dimensions): + // if is_dynamic_dimension(d): + // strides.setStride< i >( dynamic_stride(d, sizes) ) + StrideSetterHelper<>::setStrides( strides, sizes ); + + return strides; + } +}; + +} // namespace __ndarray_impl +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/ndarray/SynchronizerBuffers.h b/src/TNL/Containers/ndarray/SynchronizerBuffers.h new file mode 100644 index 0000000000000000000000000000000000000000..d54fddfd709dea111c1b4ef7eabdcbf1fdaa08ba --- /dev/null +++ b/src/TNL/Containers/ndarray/SynchronizerBuffers.h @@ -0,0 +1,88 @@ +/*************************************************************************** + SynchronizerBuffers.h - description + ------------------- + begin : Mar 30, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include + +namespace TNL { +namespace Containers { +namespace __ndarray_impl { + +template< typename DistributedNDArray, std::size_t level > +struct SynchronizerBuffersLayer +{ + SynchronizerBuffersLayer& getDimBuffers( std::integral_constant< std::size_t, level > ) + { + return *this; + } + + using NDArrayType = NDArray< typename DistributedNDArray::ValueType, + typename DistributedNDArray::SizesHolderType, + typename DistributedNDArray::PermutationType, + typename DistributedNDArray::DeviceType >; + NDArrayType left_send_buffer, left_recv_buffer, right_send_buffer, right_recv_buffer; + typename DistributedNDArray::LocalBeginsType left_send_offsets, left_recv_offsets, right_send_offsets, right_recv_offsets; + + int left_neighbor = -1; + int right_neighbor = -1; + + void reset() + { + left_send_buffer.reset(); + left_recv_buffer.reset(); + right_send_buffer.reset(); + 
right_recv_buffer.reset(); + + left_send_offsets = left_recv_offsets = right_send_offsets = right_recv_offsets = typename DistributedNDArray::LocalBeginsType{}; + + left_neighbor = right_neighbor = -1; + } +}; + +template< typename DistributedNDArray, + typename LevelTag = std::integral_constant< std::size_t, DistributedNDArray::getDimension() > > +struct SynchronizerBuffersLayerHelper +{}; + +template< typename DistributedNDArray, std::size_t level > +struct SynchronizerBuffersLayerHelper< DistributedNDArray, std::integral_constant< std::size_t, level > > +: public SynchronizerBuffersLayerHelper< DistributedNDArray, std::integral_constant< std::size_t, level - 1 > >, + public SynchronizerBuffersLayer< DistributedNDArray, level > +{ + using SynchronizerBuffersLayerHelper< DistributedNDArray, std::integral_constant< std::size_t, level - 1 > >::getDimBuffers; + using SynchronizerBuffersLayer< DistributedNDArray, level >::getDimBuffers; +}; + +template< typename DistributedNDArray > +struct SynchronizerBuffersLayerHelper< DistributedNDArray, std::integral_constant< std::size_t, 0 > > +: public SynchronizerBuffersLayer< DistributedNDArray, 0 > +{ + using SynchronizerBuffersLayer< DistributedNDArray, 0 >::getDimBuffers; +}; + +template< typename DistributedNDArray > +struct SynchronizerBuffers +: public SynchronizerBuffersLayerHelper< DistributedNDArray > +{ + using SynchronizerBuffersLayerHelper< DistributedNDArray >::getDimBuffers; + + template< std::size_t level > + auto& getDimBuffers() + { + return this->getDimBuffers( std::integral_constant< std::size_t, level >{} ); + } +}; + +} // namespace __ndarray_impl +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/StaticFor.h b/src/TNL/StaticFor.h index 1539e05aa52cd25222098f10a38b2688417bd8dd..0fa3bc0ef7a0f5b916a9a7546d4d6f59758f0681 100644 --- a/src/TNL/StaticFor.h +++ b/src/TNL/StaticFor.h @@ -14,25 +14,47 @@ namespace TNL { +// Manual unrolling does not make sense for loops with a large iterations 
+// count. For a very large iterations count it would trigger the compiler's +// limit on recursive template instantiation. Also note that the compiler +// will (at least partially) unroll loops with static bounds anyway. +template< int Begin, int End, bool unrolled = (End - Begin <= 8) > +struct StaticFor; + template< int Begin, int End > -struct StaticFor +struct StaticFor< Begin, End, true > { - template< typename Function, typename... Args > - __cuda_callable__ - static void exec( const Function& f, Args... args ) - { - static_assert( Begin < End, "Wrong index interval for StaticFor. Being must be lower than end." ); - f( Begin, args... ); - StaticFor< Begin + 1, End >::exec( f, args... ); - }; + static_assert( Begin < End, "Wrong index interval for StaticFor. Begin must be less than end." ); + + template< typename Function, typename... Args > + __cuda_callable__ + static void exec( const Function& f, Args... args ) + { + f( Begin, args... ); + StaticFor< Begin + 1, End >::exec( f, args... ); + } }; template< int End > -struct StaticFor< End, End > +struct StaticFor< End, End, true > { - template< typename Function, typename... Args > - __cuda_callable__ - static void exec( const Function& f, Args... args ){}; + template< typename Function, typename... Args > + __cuda_callable__ + static void exec( const Function& f, Args... args ) {} +}; + +template< int Begin, int End > +struct StaticFor< Begin, End, false > +{ + static_assert( Begin <= End, "Wrong index interval for StaticFor. Begin must be less than or equal to end." ); + + template< typename Function, typename... Args > + __cuda_callable__ + static void exec( const Function& f, Args... args ) + { + for( int i = Begin; i < End; i++ ) + f( i, args... 
); + } }; } //namespace TNL diff --git a/src/UnitTests/Containers/CMakeLists.txt b/src/UnitTests/Containers/CMakeLists.txt index d33f5d2631bea262c9732d2489e02554738a72fb..c8cd88af9f3ae8df5109c439aba858bc059bca2d 100644 --- a/src/UnitTests/Containers/CMakeLists.txt +++ b/src/UnitTests/Containers/CMakeLists.txt @@ -120,6 +120,7 @@ ADD_TEST( StaticVectorOperationsTest ${EXECUTABLE_OUTPUT_PATH}/StaticVectorOpera ADD_SUBDIRECTORY( Multimaps ) +ADD_SUBDIRECTORY( ndarray ) if( ${BUILD_MPI} ) diff --git a/src/UnitTests/Containers/ndarray/CMakeLists.txt b/src/UnitTests/Containers/ndarray/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f87d64a9b79db1b29dad644660da0a94b56d9e4 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/CMakeLists.txt @@ -0,0 +1,81 @@ +if( BUILD_CUDA ) + cuda_add_executable( NDArrayTest NDArrayTest.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + target_link_libraries( NDArrayTest ${GTEST_BOTH_LIBRARIES} ) + add_test( NDArrayTest ${EXECUTABLE_OUTPUT_PATH}/NDArrayTest${CMAKE_EXECUTABLE_SUFFIX} ) +else() + add_executable( NDArrayTest NDArrayTest.cpp ) + target_compile_options( NDArrayTest PRIVATE ${CXX_TESTS_FLAGS} ) + target_link_libraries( NDArrayTest ${GTEST_BOTH_LIBRARIES} ) + add_test( NDArrayTest ${EXECUTABLE_OUTPUT_PATH}/NDArrayTest${CMAKE_EXECUTABLE_SUFFIX} ) +endif() + +add_executable( NDSubarrayTest NDSubarrayTest.cpp ) +target_compile_options( NDSubarrayTest PRIVATE ${CXX_TESTS_FLAGS} ) +target_link_libraries( NDSubarrayTest ${GTEST_BOTH_LIBRARIES} ) +add_test( NDSubarrayTest ${EXECUTABLE_OUTPUT_PATH}/NDSubarrayTest${CMAKE_EXECUTABLE_SUFFIX} ) + +add_executable( SlicedNDArrayTest SlicedNDArrayTest.cpp ) +target_compile_options( SlicedNDArrayTest PRIVATE ${CXX_TESTS_FLAGS} ) +target_link_libraries( SlicedNDArrayTest ${GTEST_BOTH_LIBRARIES} ) +add_test( SlicedNDArrayTest ${EXECUTABLE_OUTPUT_PATH}/SlicedNDArrayTest${CMAKE_EXECUTABLE_SUFFIX} ) + +add_executable( StaticNDArrayTest StaticNDArrayTest.cpp ) 
+target_compile_options( StaticNDArrayTest PRIVATE ${CXX_TESTS_FLAGS} ) +target_link_libraries( StaticNDArrayTest ${GTEST_BOTH_LIBRARIES} ) +add_test( StaticNDArrayTest ${EXECUTABLE_OUTPUT_PATH}/StaticNDArrayTest${CMAKE_EXECUTABLE_SUFFIX} ) + +if( BUILD_CUDA ) + cuda_add_executable( StaticNDArrayCudaTest StaticNDArrayCudaTest.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + target_link_libraries( StaticNDArrayCudaTest ${GTEST_BOTH_LIBRARIES} ) + add_test( StaticNDArrayCudaTest ${EXECUTABLE_OUTPUT_PATH}/StaticNDArrayCudaTest${CMAKE_EXECUTABLE_SUFFIX} ) +endif() + +if( ${BUILD_MPI} ) + if( BUILD_CUDA ) + CUDA_ADD_EXECUTABLE( DistributedNDArray_1D_test DistributedNDArray_1D_test.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArray_1D_test ${GTEST_BOTH_LIBRARIES} ) + + CUDA_ADD_EXECUTABLE( DistributedNDArray_semi1D_test DistributedNDArray_semi1D_test.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArray_semi1D_test ${GTEST_BOTH_LIBRARIES} ) + + CUDA_ADD_EXECUTABLE( DistributedNDArrayOverlaps_1D_test DistributedNDArrayOverlaps_1D_test.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArrayOverlaps_1D_test ${GTEST_BOTH_LIBRARIES} ) + + CUDA_ADD_EXECUTABLE( DistributedNDArrayOverlaps_semi1D_test DistributedNDArrayOverlaps_semi1D_test.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArrayOverlaps_semi1D_test ${GTEST_BOTH_LIBRARIES} ) + else() + ADD_EXECUTABLE( DistributedNDArray_1D_test DistributedNDArray_1D_test.cpp ) + TARGET_COMPILE_OPTIONS( DistributedNDArray_1D_test PRIVATE ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArray_1D_test ${GTEST_BOTH_LIBRARIES} ) + + ADD_EXECUTABLE( DistributedNDArray_semi1D_test DistributedNDArray_semi1D_test.cpp ) + TARGET_COMPILE_OPTIONS( DistributedNDArray_semi1D_test PRIVATE ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArray_semi1D_test ${GTEST_BOTH_LIBRARIES} ) + + ADD_EXECUTABLE( DistributedNDArrayOverlaps_1D_test 
DistributedNDArrayOverlaps_1D_test.cpp ) + TARGET_COMPILE_OPTIONS( DistributedNDArrayOverlaps_1D_test PRIVATE ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArrayOverlaps_1D_test ${GTEST_BOTH_LIBRARIES} ) + + ADD_EXECUTABLE( DistributedNDArrayOverlaps_semi1D_test DistributedNDArrayOverlaps_semi1D_test.cpp ) + TARGET_COMPILE_OPTIONS( DistributedNDArrayOverlaps_semi1D_test PRIVATE ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArrayOverlaps_semi1D_test ${GTEST_BOTH_LIBRARIES} ) + endif() + + SET( mpi_test_parameters -np 4 -H localhost:4 "${EXECUTABLE_OUTPUT_PATH}/DistributedNDArray_1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) + ADD_TEST( NAME DistributedNDArray_1D_test COMMAND "mpirun" ${mpi_test_parameters}) + + SET( mpi_test_parameters -np 4 -H localhost:4 "${EXECUTABLE_OUTPUT_PATH}/DistributedNDArray_semi1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) + ADD_TEST( NAME DistributedNDArray_semi1D_test COMMAND "mpirun" ${mpi_test_parameters}) + + SET( mpi_test_parameters -np 4 -H localhost:4 "${EXECUTABLE_OUTPUT_PATH}/DistributedNDArrayOverlaps_1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) + ADD_TEST( NAME DistributedNDArrayOverlaps_1D_test COMMAND "mpirun" ${mpi_test_parameters}) + + SET( mpi_test_parameters -np 4 -H localhost:4 "${EXECUTABLE_OUTPUT_PATH}/DistributedNDArrayOverlaps_semi1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) + ADD_TEST( NAME DistributedNDArrayOverlaps_semi1D_test COMMAND "mpirun" ${mpi_test_parameters}) +endif() diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.cpp b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b0aa8e8e2d8a8158d7743a508d57acbb73426b2e --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.cpp @@ -0,0 +1 @@ +#include "DistributedNDArrayOverlaps_1D_test.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.cu 
b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..b0aa8e8e2d8a8158d7743a508d57acbb73426b2e --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.cu @@ -0,0 +1 @@ +#include "DistributedNDArrayOverlaps_1D_test.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h new file mode 100644 index 0000000000000000000000000000000000000000..a7609ee7476931a0afed13711532bf162d0984ae --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h @@ -0,0 +1,369 @@ +/*************************************************************************** + DistributedNDArrayOverlaps_1D_test.h - description + ------------------- + begin : Dec 27, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +#ifdef HAVE_GTEST +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace TNL; +using namespace TNL::Containers; + +/* + * Light check of DistributedNDArray. + * + * - Number of processes is not limited. + * - Global size is hardcoded as 97 to force non-uniform distribution. + * - Communication group is hardcoded as AllGroup -- it may be changed as needed. 
+ */ +template< typename DistributedNDArray > +class DistributedNDArrayOverlaps_1D_test +: public ::testing::Test +{ +protected: + using ValueType = typename DistributedNDArray::ValueType; + using DeviceType = typename DistributedNDArray::DeviceType; + using CommunicatorType = typename DistributedNDArray::CommunicatorType; + using IndexType = typename DistributedNDArray::IndexType; + using DistributedNDArrayType = DistributedNDArray; + + // TODO: use ndarray + using LocalArrayType = Array< ValueType, DeviceType, IndexType >; + using LocalArrayViewType = ArrayView< ValueType, DeviceType, IndexType >; + + const int globalSize = 97; // prime number to force non-uniform distribution + const int overlaps = __ndarray_impl::get< 0 >( typename DistributedNDArray::OverlapsType{} ); + + const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + + DistributedNDArrayType distributedNDArray; + + const int rank = CommunicatorType::GetRank(group); + const int nproc = CommunicatorType::GetSize(group); + + DistributedNDArrayOverlaps_1D_test() + { + using LocalRangeType = typename DistributedNDArray::LocalRangeType; + const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); + distributedNDArray.setSizes( globalSize ); + distributedNDArray.template setDistribution< 0 >( localRange.getBegin(), localRange.getEnd(), group ); + distributedNDArray.allocate(); + + EXPECT_EQ( distributedNDArray.template getLocalRange< 0 >(), localRange ); + EXPECT_EQ( distributedNDArray.getCommunicationGroup(), group ); + } +}; + +// types for which DistributedNDArrayOverlaps_1D_test is instantiated +using DistributedNDArrayTypes = ::testing::Types< + DistributedNDArray< NDArray< double, + SizesHolder< int, 0 >, + std::index_sequence< 0 >, + Devices::Host >, + Communicators::MpiCommunicator, + std::index_sequence< 2 > > +// TODO: does it make sense for NoDistrCommunicator? 
+// DistributedNDArray< NDArray< double, +// SizesHolder< int, 0 >, +// std::index_sequence< 0 >, +// Devices::Host >, +// Communicators::NoDistrCommunicator, +// std::index_sequence< 2 > > +#ifdef HAVE_CUDA + , + DistributedNDArray< NDArray< double, + SizesHolder< int, 0 >, + std::index_sequence< 0 >, + Devices::Cuda >, + Communicators::MpiCommunicator, + std::index_sequence< 2 > > +// TODO: does it make sense for NoDistrCommunicator? +// DistributedNDArray< NDArray< double, +// SizesHolder< int, 0 >, +// std::index_sequence< 0 >, +// Devices::Cuda >, +// Communicators::NoDistrCommunicator, +// std::index_sequence< 2 > > +#endif +>; + +TYPED_TEST_SUITE( DistributedNDArrayOverlaps_1D_test, DistributedNDArrayTypes ); + +TYPED_TEST( DistributedNDArrayOverlaps_1D_test, checkSumOfLocalSizes ) +{ + using CommunicatorType = typename TestFixture::CommunicatorType; + + const auto localRange = this->distributedNDArray.template getLocalRange< 0 >(); + const int localSize = localRange.getEnd() - localRange.getBegin(); + int sumOfLocalSizes = 0; + CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); + EXPECT_EQ( sumOfLocalSizes, this->globalSize ); + EXPECT_EQ( this->distributedNDArray.template getSize< 0 >(), this->globalSize ); + + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), 2 * this->overlaps + localSize ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forLocalInternal( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const int overlaps = __ndarray_impl::get< 0 >( typename DistributedArray::OverlapsType{} ); + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + 
a.forLocalInternal( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + + a.setValue( 0 ); + a_view.forLocalInternal( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArrayOverlaps_1D_test, forLocalInternal ) +{ + test_helper_forLocalInternal( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forLocalBoundary( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const int overlaps = __ndarray_impl::get< 0 >( typename DistributedArray::OverlapsType{} ); + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + a.forLocalBoundary( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) + EXPECT_EQ( 
a.getElement( gi ), 0 ) + << "gi = " << gi; + for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + + a.setValue( 0 ); + a_view.forLocalBoundary( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArrayOverlaps_1D_test, forLocalBoundary ) +{ + test_helper_forLocalBoundary( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forOverlaps( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const int overlaps = __ndarray_impl::get< 0 >( typename DistributedArray::OverlapsType{} ); + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + a.forOverlaps( setter ); + + for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + + a.setValue( 0 ); + a_view.forOverlaps( setter ); + + for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) 
+ EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArrayOverlaps_1D_test, forOverlaps ) +{ + test_helper_forOverlaps( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_synchronize( DistributedArray& a, const int rank, const int nproc ) +{ + using IndexType = typename DistributedArray::IndexType; + + const int overlaps = __ndarray_impl::get< 0 >( typename DistributedArray::OverlapsType{} ); + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) = i; + }; + + a.setValue( -1 ); + a.forAll( setter ); + DistributedNDArraySynchronizer< DistributedArray > s1; + s1.synchronize( a ); + + for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) + EXPECT_EQ( a.getElement( gi ), gi + ((rank == 0) ? 97 : 0) ); + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), gi ); + for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), gi - ((rank == nproc-1) ? 97 : 0) ); + + a.setValue( -1 ); + a_view.forAll( setter ); + DistributedNDArraySynchronizer< decltype(a_view) > s2; + s2.synchronize( a_view ); + + for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) + EXPECT_EQ( a.getElement( gi ), gi + ((rank == 0) ? 
97 : 0) ); + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), gi ); + for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), gi - ((rank == nproc-1) ? 97 : 0) ); +} + +TYPED_TEST( DistributedNDArrayOverlaps_1D_test, synchronize ) +{ + test_helper_synchronize( this->distributedNDArray, this->rank, this->nproc ); +} + +#endif // HAVE_GTEST + + +#if (defined(HAVE_GTEST) && defined(HAVE_MPI)) +using CommunicatorType = Communicators::MpiCommunicator; + +#include + +class MinimalistBufferedPrinter +: public ::testing::EmptyTestEventListener +{ +private: + std::stringstream sout; + +public: + // Called before a test starts. + virtual void OnTestStart(const ::testing::TestInfo& test_info) + { + sout << test_info.test_case_name() << "." << test_info.name() << " Start." << std::endl; + } + + // Called after a failed assertion or a SUCCEED() invocation. + virtual void OnTestPartResult(const ::testing::TestPartResult& test_part_result) + { + sout << (test_part_result.failed() ? 
"====Failure=== " : "===Success=== ") + << test_part_result.file_name() << " " + << test_part_result.line_number() <listeners(); + + delete listeners.Release(listeners.default_result_printer()); + listeners.Append(new MinimalistBufferedPrinter); + + Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv); + #endif + return RUN_ALL_TESTS(); +#else + throw GtestMissingError(); +#endif +} diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.cpp b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cf74a71d11fa94ca9b2ad83c1389445c965a6797 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.cpp @@ -0,0 +1 @@ +#include "DistributedNDArrayOverlaps_semi1D_test.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.cu b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..cf74a71d11fa94ca9b2ad83c1389445c965a6797 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.cu @@ -0,0 +1 @@ +#include "DistributedNDArrayOverlaps_semi1D_test.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h new file mode 100644 index 0000000000000000000000000000000000000000..a7f28ead5de63420e97730592948a1cfc2622b11 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h @@ -0,0 +1,409 @@ +/*************************************************************************** + DistributedNDArrayOverlaps_semi1D_test.h - description + ------------------- + begin : Dec 9, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +#ifdef HAVE_GTEST +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace TNL; +using namespace TNL::Containers; + +/* + * Light check of DistributedNDArray. + * + * - Number of processes is not limited. + * - Global size is hardcoded as 97 to force non-uniform distribution. + * - Communication group is hardcoded as AllGroup -- it may be changed as needed. + */ +template< typename DistributedNDArray > +class DistributedNDArrayOverlaps_semi1D_test +: public ::testing::Test +{ +protected: + using ValueType = typename DistributedNDArray::ValueType; + using DeviceType = typename DistributedNDArray::DeviceType; + using CommunicatorType = typename DistributedNDArray::CommunicatorType; + using IndexType = typename DistributedNDArray::IndexType; + using DistributedNDArrayType = DistributedNDArray; + + // TODO: use ndarray + using LocalArrayType = Array< ValueType, DeviceType, IndexType >; + using LocalArrayViewType = ArrayView< ValueType, DeviceType, IndexType >; + + const int globalSize = 97; // prime number to force non-uniform distribution + const int overlaps = __ndarray_impl::get< 1 >( typename DistributedNDArray::OverlapsType{} ); + + const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + + DistributedNDArrayType distributedNDArray; + + const int rank = CommunicatorType::GetRank(group); + const int nproc = CommunicatorType::GetSize(group); + + DistributedNDArrayOverlaps_semi1D_test() + { + using LocalRangeType = typename DistributedNDArray::LocalRangeType; + const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); + distributedNDArray.setSizes( 0, globalSize, globalSize / 2 ); + distributedNDArray.template setDistribution< 1 >( localRange.getBegin(), localRange.getEnd(), group ); + 
distributedNDArray.allocate(); + + EXPECT_EQ( distributedNDArray.template getLocalRange< 1 >(), localRange ); + EXPECT_EQ( distributedNDArray.getCommunicationGroup(), group ); + } +}; + +// types for which DistributedNDArrayOverlaps_semi1D_test is instantiated +using DistributedNDArrayTypes = ::testing::Types< + DistributedNDArray< NDArray< double, + SizesHolder< int, 9, 0, 0 >, // Q, X, Y + std::index_sequence< 0, 1, 2 >, // permutation - should not matter + Devices::Host >, + Communicators::MpiCommunicator, + std::index_sequence< 0, 2, 0 > > +#ifdef HAVE_CUDA + , + DistributedNDArray< NDArray< double, + SizesHolder< int, 9, 0, 0 >, // Q, X, Y + std::index_sequence< 0, 1, 2 >, // permutation - should not matter + Devices::Cuda >, + Communicators::MpiCommunicator, + std::index_sequence< 0, 2, 0 > > +#endif +>; + +TYPED_TEST_SUITE( DistributedNDArrayOverlaps_semi1D_test, DistributedNDArrayTypes ); + +TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, checkSumOfLocalSizes ) +{ + using CommunicatorType = typename TestFixture::CommunicatorType; + + const auto localRange = this->distributedNDArray.template getLocalRange< 1 >(); + const int localSize = localRange.getEnd() - localRange.getBegin(); + int sumOfLocalSizes = 0; + CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); + EXPECT_EQ( sumOfLocalSizes, this->globalSize ); + EXPECT_EQ( this->distributedNDArray.template getSize< 1 >(), this->globalSize ); + + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), 9 * (2 * this->overlaps + localSize) * (this->globalSize / 2) ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forLocalInternal( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const int overlaps = __ndarray_impl::get< 1 >( typename DistributedArray::OverlapsType{} 
); + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + a.forLocalInternal( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + + a.setValue( 0 ); + a_view.forLocalInternal( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, forLocalInternal ) +{ + test_helper_forLocalInternal( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected 
methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forLocalBoundary( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const int overlaps = __ndarray_impl::get< 1 >( typename DistributedArray::OverlapsType{} ); + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + a.forLocalBoundary( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + + a.setValue( 0 ); + a_view.forLocalBoundary( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( 
a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, forLocalBoundary ) +{ + test_helper_forLocalBoundary( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forOverlaps( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const int overlaps = __ndarray_impl::get< 1 >( typename DistributedArray::OverlapsType{} ); + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + a.forOverlaps( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + + a.setValue( 0 ); + a_view.forOverlaps( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + 
EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, forOverlaps ) +{ + test_helper_forOverlaps( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_synchronize( DistributedArray& a, const int rank, const int nproc ) +{ + using IndexType = typename DistributedArray::IndexType; + + const int overlaps = __ndarray_impl::get< 1 >( typename DistributedArray::OverlapsType{} ); + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( i ) = i; + }; + + a.setValue( -1 ); + a.forAll( setter ); + DistributedNDArraySynchronizer< DistributedArray > s1; + s1.synchronize( a ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), gi + ((rank == 0) ? 97 : 0) ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), gi ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), gi - ((rank == nproc-1) ? 
97 : 0) ) + << "gi = " << gi; + + a.setValue( -1 ); + a_view.forAll( setter ); + DistributedNDArraySynchronizer< decltype(a_view) > s2; + s2.synchronize( a_view ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), gi + ((rank == 0) ? 97 : 0) ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), gi ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), gi - ((rank == nproc-1) ? 97 : 0) ) + << "gi = " << gi; +} + +//TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, synchronize ) +//{ +// test_helper_synchronize( this->distributedNDArray, this->rank, this->nproc ); +//} + +#endif // HAVE_GTEST + + +#if (defined(HAVE_GTEST) && defined(HAVE_MPI)) +using CommunicatorType = Communicators::MpiCommunicator; + +#include + +class MinimalistBufferedPrinter +: public ::testing::EmptyTestEventListener +{ +private: + std::stringstream sout; + +public: + // Called before a test starts. + virtual void OnTestStart(const ::testing::TestInfo& test_info) + { + sout << test_info.test_case_name() << "." << test_info.name() << " Start." << std::endl; + } + + // Called after a failed assertion or a SUCCEED() invocation. + virtual void OnTestPartResult(const ::testing::TestPartResult& test_part_result) + { + sout << (test_part_result.failed() ? 
"====Failure=== " : "===Success=== ") + << test_part_result.file_name() << " " + << test_part_result.line_number() <listeners(); + + delete listeners.Release(listeners.default_result_printer()); + listeners.Append(new MinimalistBufferedPrinter); + + Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv); + #endif + return RUN_ALL_TESTS(); +#else + throw GtestMissingError(); +#endif +} diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.cpp b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f234d7711a1056e2bf0298a7ff87bacac0d8717f --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.cpp @@ -0,0 +1 @@ +#include "DistributedNDArray_1D_test.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.cu b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..f234d7711a1056e2bf0298a7ff87bacac0d8717f --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.cu @@ -0,0 +1 @@ +#include "DistributedNDArray_1D_test.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h new file mode 100644 index 0000000000000000000000000000000000000000..3dda2d1b4716e8dc92671567c714b6ac9b1150d5 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h @@ -0,0 +1,555 @@ +/*************************************************************************** + DistributedNDArray_1D_test.h - description + ------------------- + begin : Dec 27, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +#ifdef HAVE_GTEST +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace TNL; +using namespace TNL::Containers; + +/* + * Light check of DistributedNDArray. + * + * - Number of processes is not limited. + * - Global size is hardcoded as 97 to force non-uniform distribution. + * - Communication group is hardcoded as AllGroup -- it may be changed as needed. + */ +template< typename DistributedNDArray > +class DistributedNDArray_1D_test +: public ::testing::Test +{ +protected: + using ValueType = typename DistributedNDArray::ValueType; + using DeviceType = typename DistributedNDArray::DeviceType; + using CommunicatorType = typename DistributedNDArray::CommunicatorType; + using IndexType = typename DistributedNDArray::IndexType; + using DistributedNDArrayType = DistributedNDArray; + + // TODO: use ndarray + using LocalArrayType = Array< ValueType, DeviceType, IndexType >; + using LocalArrayViewType = ArrayView< ValueType, DeviceType, IndexType >; + + const int globalSize = 97; // prime number to force non-uniform distribution + + const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + + DistributedNDArrayType distributedNDArray; + + const int rank = CommunicatorType::GetRank(group); + const int nproc = CommunicatorType::GetSize(group); + + DistributedNDArray_1D_test() + { + using LocalRangeType = typename DistributedNDArray::LocalRangeType; + const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); + distributedNDArray.setSizes( globalSize ); + distributedNDArray.template setDistribution< 0 >( localRange.getBegin(), localRange.getEnd(), group ); + distributedNDArray.allocate(); + + EXPECT_EQ( distributedNDArray.template getLocalRange< 0 >(), localRange ); + EXPECT_EQ( distributedNDArray.getCommunicationGroup(), group 
); + } +}; + +// types for which DistributedNDArray_1D_test is instantiated +using DistributedNDArrayTypes = ::testing::Types< + DistributedNDArray< NDArray< double, + SizesHolder< int, 0 >, + std::index_sequence< 0 >, + Devices::Host >, + Communicators::MpiCommunicator >, + DistributedNDArray< NDArray< double, + SizesHolder< int, 0 >, + std::index_sequence< 0 >, + Devices::Host >, + Communicators::NoDistrCommunicator > +#ifdef HAVE_CUDA + , + DistributedNDArray< NDArray< double, + SizesHolder< int, 0 >, + std::index_sequence< 0 >, + Devices::Cuda >, + Communicators::MpiCommunicator >, + DistributedNDArray< NDArray< double, + SizesHolder< int, 0 >, + std::index_sequence< 0 >, + Devices::Cuda >, + Communicators::NoDistrCommunicator > +#endif +>; + +TYPED_TEST_SUITE( DistributedNDArray_1D_test, DistributedNDArrayTypes ); + +TYPED_TEST( DistributedNDArray_1D_test, checkSumOfLocalSizes ) +{ + using CommunicatorType = typename TestFixture::CommunicatorType; + + const auto localRange = this->distributedNDArray.template getLocalRange< 0 >(); + const int localSize = localRange.getEnd() - localRange.getBegin(); + int sumOfLocalSizes = 0; + CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); + EXPECT_EQ( sumOfLocalSizes, this->globalSize ); + EXPECT_EQ( this->distributedNDArray.template getSize< 0 >(), this->globalSize ); +} + +TYPED_TEST( DistributedNDArray_1D_test, setLike ) +{ + using DistributedNDArrayType = typename TestFixture::DistributedNDArrayType; + + const auto localRange = this->distributedNDArray.template getLocalRange< 0 >(); + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), localRange.getEnd() - localRange.getBegin() ); + DistributedNDArrayType copy; + EXPECT_EQ( copy.getLocalStorageSize(), 0 ); + copy.setLike( this->distributedNDArray ); + EXPECT_EQ( copy.getLocalStorageSize(), localRange.getEnd() - localRange.getBegin() ); +} + +TYPED_TEST( DistributedNDArray_1D_test, reset ) +{ + const auto localRange = 
this->distributedNDArray.template getLocalRange< 0 >(); + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), localRange.getEnd() - localRange.getBegin() ); + this->distributedNDArray.reset(); + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), 0 ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray, typename BufferView > +void test_helper_setValue( DistributedArray& array, BufferView& buffer_view ) +{ + using DeviceType = typename DistributedArray::DeviceType; + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = array.template getLocalRange< 0 >(); + auto array_view = array.getConstView(); + auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable + { + buffer_view[ i - localRange.getBegin() ] = array_view( i ); + }; + ParallelFor< DeviceType >::exec( localRange.getBegin(), localRange.getEnd(), kernel ); +} + +TYPED_TEST( DistributedNDArray_1D_test, setValue ) +{ + using LocalArrayType = typename TestFixture::LocalArrayType; + using LocalArrayViewType = typename TestFixture::LocalArrayViewType; + + this->distributedNDArray.setValue( 1.0 ); + + const auto localRange = this->distributedNDArray.template getLocalRange< 0 >(); + LocalArrayType buffer( localRange.getEnd() - localRange.getBegin() ); + LocalArrayViewType buffer_view( buffer ); + test_helper_setValue( this->distributedNDArray, buffer_view ); + + LocalArrayType expected( localRange.getEnd() - localRange.getBegin() ); + expected.setValue( 1.0 ); + EXPECT_EQ( buffer, expected ); +} + +TYPED_TEST( DistributedNDArray_1D_test, elementwiseAccess ) +{ +// using ArrayViewType = typename TestFixture::ArrayViewType; + using IndexType = typename TestFixture::IndexType; + + this->distributedNDArray.setValue( 0 ); +// ArrayViewType localArrayView = this->distributedNDArray.getLocalArrayView(); + const auto localRange = 
this->distributedNDArray.template getLocalRange< 0 >(); + + // check initial value + for( IndexType gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) { +// EXPECT_EQ( localArrayView.getElement( i ), 0 ); + EXPECT_EQ( this->distributedNDArray.getElement( gi ), 0 ); + if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value ) + EXPECT_EQ( this->distributedNDArray[ gi ], 0 ); + } + + // use operator() + if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value ) { + for( IndexType gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) { + this->distributedNDArray( gi ) = gi + 1; + } + + // check set value + for( IndexType gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) { +// EXPECT_EQ( localArrayView.getElement( i ), gi + 1 ); + EXPECT_EQ( this->distributedNDArray.getElement( gi ), gi + 1 ); + EXPECT_EQ( this->distributedNDArray( gi ), gi + 1 ); + EXPECT_EQ( this->distributedNDArray[ gi ], gi + 1 ); + } + } +} + +TYPED_TEST( DistributedNDArray_1D_test, copyAssignment ) +{ + using DistributedNDArrayType = typename TestFixture::DistributedNDArrayType; + + this->distributedNDArray.setValue( 1 ); + DistributedNDArrayType copy; + copy = this->distributedNDArray; + // no binding, but deep copy +// EXPECT_NE( copy.getLocalArrayView().getData(), this->distributedNDArray.getLocalArrayView().getData() ); +// EXPECT_EQ( copy.getLocalArrayView(), this->distributedNDArray.getLocalArrayView() ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_comparisonOperators( DistributedArray& u, DistributedArray& v, DistributedArray& w ) +{ + using DeviceType = typename DistributedArray::DeviceType; + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = u.template getLocalRange< 0 >(); + auto u_view = u.getView(); + auto v_view = 
v.getView(); + auto w_view = w.getView(); + + auto kernel = [=] __cuda_callable__ ( IndexType gi ) mutable + { + u_view( gi ) = gi; + v_view( gi ) = gi; + w_view( gi ) = 2 * gi; + }; + ParallelFor< DeviceType >::exec( localRange.getBegin(), localRange.getEnd(), kernel ); +} + +TYPED_TEST( DistributedNDArray_1D_test, comparisonOperators ) +{ + using DistributedNDArrayType = typename TestFixture::DistributedNDArrayType; + + DistributedNDArrayType& u = this->distributedNDArray; + DistributedNDArrayType v, w; + v.setLike( u ); + w.setLike( u ); + + test_helper_comparisonOperators( u, v, w ); + + EXPECT_TRUE( u == u ); + EXPECT_TRUE( u == v ); + EXPECT_TRUE( v == u ); + EXPECT_FALSE( u != v ); + EXPECT_FALSE( v != u ); + EXPECT_TRUE( u != w ); + EXPECT_TRUE( w != u ); + EXPECT_FALSE( u == w ); + EXPECT_FALSE( w == u ); + + v.reset(); + EXPECT_FALSE( u == v ); + u.reset(); + EXPECT_TRUE( u == v ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forAll( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + a.forAll( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ); + + a.setValue( 0 ); + a_view.forAll( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ); +} + +TYPED_TEST( DistributedNDArray_1D_test, forAll ) +{ + test_helper_forAll( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< 
typename DistributedArray > +void test_helper_forInternal( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + a.forInternal( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + { + if( gi == 0 || gi == a.template getSize< 0 >() - 1 ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + else + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + } + + a.setValue( 0 ); + a_view.forInternal( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + { + if( gi == 0 || gi == a.template getSize< 0 >() - 1 ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + else + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + } +} + +TYPED_TEST( DistributedNDArray_1D_test, forInternal ) +{ + test_helper_forInternal( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forLocalInternal( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + // equivalent to forAll because all overlaps are 0 + a.forLocalInternal( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + + a.setValue( 0 ); + // equivalent to forAll because all overlaps are 0 + a_view.forLocalInternal( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( 
a.getElement( gi ), 1 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArray_1D_test, forLocalInternal ) +{ + test_helper_forLocalInternal( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forBoundary( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + a.forBoundary( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + { + if( gi == 0 || gi == a.template getSize< 0 >() - 1 ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + else + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + } + + a.setValue( 0 ); + a_view.forBoundary( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + { + if( gi == 0 || gi == a.template getSize< 0 >() - 1 ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + else + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + } +} + +TYPED_TEST( DistributedNDArray_1D_test, forBoundary ) +{ + test_helper_forBoundary( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forLocalBoundary( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + // empty set because all overlaps are 0 + a.forLocalBoundary( setter 
); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + + a.setValue( 0 ); + // empty set because all overlaps are 0 + a_view.forLocalBoundary( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArray_1D_test, forLocalBoundary ) +{ + test_helper_forLocalBoundary( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forOverlaps( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + // empty set because all overlaps are 0 + a.forOverlaps( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + + a.setValue( 0 ); + // empty set because all overlaps are 0 + a_view.forOverlaps( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArray_1D_test, forOverlaps ) +{ + test_helper_forOverlaps( this->distributedNDArray ); +} + +#endif // HAVE_GTEST + + +#if (defined(HAVE_GTEST) && defined(HAVE_MPI)) +using CommunicatorType = Communicators::MpiCommunicator; + +#include + +class MinimalistBufferedPrinter +: public ::testing::EmptyTestEventListener +{ +private: + std::stringstream sout; + +public: + // Called before a test starts. + virtual void OnTestStart(const ::testing::TestInfo& test_info) + { + sout << test_info.test_case_name() << "." 
<< test_info.name() << " Start." << std::endl; + } + + // Called after a failed assertion or a SUCCEED() invocation. + virtual void OnTestPartResult(const ::testing::TestPartResult& test_part_result) + { + sout << (test_part_result.failed() ? "====Failure=== " : "===Success=== ") + << test_part_result.file_name() << " " + << test_part_result.line_number() <listeners(); + + delete listeners.Release(listeners.default_result_printer()); + listeners.Append(new MinimalistBufferedPrinter); + + Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv); + #endif + return RUN_ALL_TESTS(); +#else + throw GtestMissingError(); +#endif +} diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.cpp b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..720641ed9bd8c1122782f6139cecd33d59ef29e9 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.cpp @@ -0,0 +1 @@ +#include "DistributedNDArray_semi1D_test.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.cu b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..720641ed9bd8c1122782f6139cecd33d59ef29e9 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.cu @@ -0,0 +1 @@ +#include "DistributedNDArray_semi1D_test.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h new file mode 100644 index 0000000000000000000000000000000000000000..33390a33c8a230d2946f54569e211a4a711713d0 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h @@ -0,0 +1,541 @@ +/*************************************************************************** + DistributedNDArray_semi1D_test.h - description + ------------------- + begin : Dec 27, 2018 + 
copyright : (C) 2018 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +#ifdef HAVE_GTEST +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace TNL; +using namespace TNL::Containers; + +/* + * Light check of DistributedNDArray. + * + * - Number of processes is not limited. + * - Global size is hardcoded as 97 to force non-uniform distribution. + * - Communication group is hardcoded as AllGroup -- it may be changed as needed. + */ +template< typename DistributedNDArray > +class DistributedNDArray_semi1D_test +: public ::testing::Test +{ +protected: + using ValueType = typename DistributedNDArray::ValueType; + using DeviceType = typename DistributedNDArray::DeviceType; + using CommunicatorType = typename DistributedNDArray::CommunicatorType; + using IndexType = typename DistributedNDArray::IndexType; + using DistributedNDArrayType = DistributedNDArray; + + // TODO: use ndarray + using LocalArrayType = Array< ValueType, DeviceType, IndexType >; + using LocalArrayViewType = ArrayView< ValueType, DeviceType, IndexType >; + + const int globalSize = 97; // prime number to force non-uniform distribution + + const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + + DistributedNDArrayType distributedNDArray; + + const int rank = CommunicatorType::GetRank(group); + const int nproc = CommunicatorType::GetSize(group); + + DistributedNDArray_semi1D_test() + { + using LocalRangeType = typename DistributedNDArray::LocalRangeType; + const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); + distributedNDArray.setSizes( 0, globalSize, globalSize / 2 ); + distributedNDArray.template setDistribution< 1 >( localRange.getBegin(), localRange.getEnd(), group ); + distributedNDArray.allocate(); + + EXPECT_EQ( distributedNDArray.template getLocalRange< 1 >(), 
localRange ); + EXPECT_EQ( distributedNDArray.getCommunicationGroup(), group ); + } +}; + +// types for which DistributedNDArray_semi1D_test is instantiated +using DistributedNDArrayTypes = ::testing::Types< + DistributedNDArray< NDArray< double, + SizesHolder< int, 9, 0, 0 >, // Q, X, Y, Z + std::index_sequence< 0, 1, 2 >, // permutation - should not matter + Devices::Host >, + Communicators::MpiCommunicator > +#ifdef HAVE_CUDA + , + DistributedNDArray< NDArray< double, + SizesHolder< int, 9, 0, 0 >, // Q, X, Y, Z + std::index_sequence< 0, 1, 2 >, // permutation - should not matter + Devices::Cuda >, + Communicators::NoDistrCommunicator > +#endif +>; + +TYPED_TEST_SUITE( DistributedNDArray_semi1D_test, DistributedNDArrayTypes ); + +TYPED_TEST( DistributedNDArray_semi1D_test, checkSumOfLocalSizes ) +{ + using CommunicatorType = typename TestFixture::CommunicatorType; + + const auto localRange = this->distributedNDArray.template getLocalRange< 1 >(); + const int localSize = localRange.getEnd() - localRange.getBegin(); + int sumOfLocalSizes = 0; + CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); + EXPECT_EQ( sumOfLocalSizes, this->globalSize ); + EXPECT_EQ( this->distributedNDArray.template getSize< 1 >(), this->globalSize ); +} + +TYPED_TEST( DistributedNDArray_semi1D_test, setLike ) +{ + using DistributedNDArrayType = typename TestFixture::DistributedNDArrayType; + + const auto localRange = this->distributedNDArray.template getLocalRange< 1 >(); + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), 9 * (localRange.getEnd() - localRange.getBegin()) * (this->globalSize / 2) ); + DistributedNDArrayType copy; + EXPECT_EQ( copy.getLocalStorageSize(), 0 ); + copy.setLike( this->distributedNDArray ); + EXPECT_EQ( copy.getLocalStorageSize(), 9 * (localRange.getEnd() - localRange.getBegin()) * (this->globalSize / 2) ); +} + +TYPED_TEST( DistributedNDArray_semi1D_test, reset ) +{ + const auto localRange = 
this->distributedNDArray.template getLocalRange< 1 >(); + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), 9 * (localRange.getEnd() - localRange.getBegin()) * (this->globalSize / 2) ); + this->distributedNDArray.reset(); + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), 0 ); +} + +TYPED_TEST( DistributedNDArray_semi1D_test, elementwiseAccess ) +{ +// using ArrayViewType = typename TestFixture::ArrayViewType; + using IndexType = typename TestFixture::IndexType; + + this->distributedNDArray.setValue( 0 ); +// ArrayViewType localArrayView = this->distributedNDArray.getLocalArrayView(); + const auto localRange = this->distributedNDArray.template getLocalRange< 1 >(); + + // check initial value + for( int q = 0; q < 9; q++ ) + for( IndexType gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < this->distributedNDArray.template getSize< 2 >(); j++ ) { +// EXPECT_EQ( localArrayView.getElement( i ), 0 ); + EXPECT_EQ( this->distributedNDArray.getElement( q, gi, j ), 0 ); + } + + // use operator() + if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value ) { + for( int q = 0; q < 9; q++ ) + for( IndexType gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < this->distributedNDArray.template getSize< 2 >(); j++ ) { + this->distributedNDArray( q, gi, j ) = gi + 1; + } + + // check set value + for( int q = 0; q < 9; q++ ) + for( IndexType gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < this->distributedNDArray.template getSize< 2 >(); j++ ) { + EXPECT_EQ( this->distributedNDArray.getElement( q, gi, j ), gi + 1 ); + EXPECT_EQ( this->distributedNDArray( q, gi, j ), gi + 1 ); + } + } +} + +TYPED_TEST( DistributedNDArray_semi1D_test, copyAssignment ) +{ + using DistributedNDArrayType = typename TestFixture::DistributedNDArrayType; + + this->distributedNDArray.setValue( 1 ); + DistributedNDArrayType copy; + copy = this->distributedNDArray; + // no 
binding, but deep copy +// EXPECT_NE( copy.getLocalArrayView().getData(), this->distributedNDArray.getLocalArrayView().getData() ); +// EXPECT_EQ( copy.getLocalArrayView(), this->distributedNDArray.getLocalArrayView() ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_comparisonOperators( DistributedArray& u, DistributedArray& v, DistributedArray& w ) +{ + using DeviceType = typename DistributedArray::DeviceType; + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = u.template getLocalRange< 1 >(); + auto u_view = u.getView(); + auto v_view = v.getView(); + auto w_view = w.getView(); + + auto kernel = [=] __cuda_callable__ ( IndexType q, IndexType gi, IndexType j ) mutable + { + u_view( q, gi, j ) = gi; + v_view( q, gi, j ) = gi; + w_view( q, gi, j ) = 2 * gi; + }; + ParallelFor3D< DeviceType >::exec( (IndexType) 0, localRange.getBegin(), (IndexType) 0, + 9, localRange.getEnd(), u.template getSize< 2 >(), + kernel ); +} + +TYPED_TEST( DistributedNDArray_semi1D_test, comparisonOperators ) +{ + using DistributedNDArrayType = typename TestFixture::DistributedNDArrayType; + + DistributedNDArrayType& u = this->distributedNDArray; + DistributedNDArrayType v, w; + v.setLike( u ); + w.setLike( u ); + + test_helper_comparisonOperators( u, v, w ); + + EXPECT_TRUE( u == u ); + EXPECT_TRUE( u == v ); + EXPECT_TRUE( v == u ); + EXPECT_FALSE( u != v ); + EXPECT_FALSE( v != u ); + EXPECT_TRUE( u != w ); + EXPECT_TRUE( w != u ); + EXPECT_FALSE( u == w ); + EXPECT_FALSE( w == u ); + + v.reset(); + EXPECT_FALSE( u == v ); + u.reset(); + EXPECT_TRUE( u == v ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forAll( 
DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + a.forAll( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ); + + a.setValue( 0 ); + a_view.forAll( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ); +} + +TYPED_TEST( DistributedNDArray_semi1D_test, forAll ) +{ + test_helper_forAll( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forInternal( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + a.forInternal( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + { + if( q == 0 || q == 8 || + gi == 0 || gi == a.template getSize< 1 >() - 1 || + j == 0 || j == a.template getSize< 2 >() - 1 ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + else + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + } + + a.setValue( 0 ); + a_view.forInternal( setter ); + + for( int q = 0; q < 9; q++ ) 
+ for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + { + if( q == 0 || q == 8 || + gi == 0 || gi == a.template getSize< 1 >() - 1 || + j == 0 || j == a.template getSize< 2 >() - 1 ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + else + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + } +} + +TYPED_TEST( DistributedNDArray_semi1D_test, forInternal ) +{ + test_helper_forInternal( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forLocalInternal( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + // equivalent to forAll because all overlaps are 0 + a.forLocalInternal( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ); + + a.setValue( 0 ); + // equivalent to forAll because all overlaps are 0 + a_view.forLocalInternal( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ); +} + +TYPED_TEST( DistributedNDArray_semi1D_test, forLocalInternal ) +{ + test_helper_forLocalInternal( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename 
DistributedArray > +void test_helper_forBoundary( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + a.forBoundary( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + { + if( q == 0 || q == 8 || + gi == 0 || gi == a.template getSize< 1 >() - 1 || + j == 0 || j == a.template getSize< 2 >() - 1 ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + else + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + } + + a.setValue( 0 ); + a_view.forBoundary( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + { + if( q == 0 || q == 8 || + gi == 0 || gi == a.template getSize< 1 >() - 1 || + j == 0 || j == a.template getSize< 2 >() - 1 ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + else + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + } +} + +TYPED_TEST( DistributedNDArray_semi1D_test, forBoundary ) +{ + test_helper_forBoundary( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forLocalBoundary( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + // empty set 
because all overlaps are 0 + a.forLocalBoundary( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ); + + a.setValue( 0 ); + // empty set because all overlaps are 0 + a_view.forLocalBoundary( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ); +} + +TYPED_TEST( DistributedNDArray_semi1D_test, forLocalBoundary ) +{ + test_helper_forLocalBoundary( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forOverlaps( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + // empty set because all overlaps are 0 + a.forOverlaps( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ); + + a.setValue( 0 ); + // empty set because all overlaps are 0 + a_view.forOverlaps( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ); +} + +TYPED_TEST( DistributedNDArray_semi1D_test, forOverlaps ) +{ + test_helper_forOverlaps( this->distributedNDArray ); +} + +#endif // HAVE_GTEST + + +#if (defined(HAVE_GTEST) 
&& defined(HAVE_MPI)) +using CommunicatorType = Communicators::MpiCommunicator; + +#include + +class MinimalistBufferedPrinter +: public ::testing::EmptyTestEventListener +{ +private: + std::stringstream sout; + +public: + // Called before a test starts. + virtual void OnTestStart(const ::testing::TestInfo& test_info) + { + sout << test_info.test_case_name() << "." << test_info.name() << " Start." << std::endl; + } + + // Called after a failed assertion or a SUCCEED() invocation. + virtual void OnTestPartResult(const ::testing::TestPartResult& test_part_result) + { + sout << (test_part_result.failed() ? "====Failure=== " : "===Success=== ") + << test_part_result.file_name() << " " + << test_part_result.line_number() <listeners(); + + delete listeners.Release(listeners.default_result_printer()); + listeners.Append(new MinimalistBufferedPrinter); + + Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv); + #endif + return RUN_ALL_TESTS(); +#else + throw GtestMissingError(); +#endif +} diff --git a/src/UnitTests/Containers/ndarray/NDArrayTest.cpp b/src/UnitTests/Containers/ndarray/NDArrayTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d0e9222189724207463b9b3b6a5ebebe3e568521 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/NDArrayTest.cpp @@ -0,0 +1 @@ +#include "NDArrayTest.h" diff --git a/src/UnitTests/Containers/ndarray/NDArrayTest.cu b/src/UnitTests/Containers/ndarray/NDArrayTest.cu new file mode 100644 index 0000000000000000000000000000000000000000..d0e9222189724207463b9b3b6a5ebebe3e568521 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/NDArrayTest.cu @@ -0,0 +1 @@ +#include "NDArrayTest.h" diff --git a/src/UnitTests/Containers/ndarray/NDArrayTest.h b/src/UnitTests/Containers/ndarray/NDArrayTest.h new file mode 100644 index 0000000000000000000000000000000000000000..1e5d9a30cedf4c25b4e56d1c3a44511588848d86 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/NDArrayTest.h @@ -0,0 +1,1338 @@ +#include 
"gtest/gtest.h" + +#include + +using namespace TNL::Containers; +using std::index_sequence; + +template< typename Array > +void expect_identity( const Array& a ) +{ + Array identity; + identity.setLike( a ); + int last = 0; + for( int i = 0; i < identity.getSize(); i++ ) { + // skip negative/invalid entries due to alignment + if( a[ i ] < 0 ) + identity[ i ] = a[ i ]; + else + identity[ i ] = last++; + } + EXPECT_EQ( a, identity ); +} + +TEST( NDArrayTest, setLike ) +{ + int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + + decltype(a) b; + EXPECT_EQ( b.template getSize< 0 >(), 0 ); + EXPECT_EQ( b.template getSize< 1 >(), 0 ); + EXPECT_EQ( b.template getSize< 2 >(), 0 ); + EXPECT_EQ( b.template getSize< 3 >(), 0 ); + EXPECT_EQ( b.template getSize< 4 >(), 0 ); + EXPECT_EQ( b.template getSize< 5 >(), 0 ); + b.setLike( a ); + EXPECT_EQ( b.template getSize< 0 >(), I ); + EXPECT_EQ( b.template getSize< 1 >(), J ); + EXPECT_EQ( b.template getSize< 2 >(), K ); + EXPECT_EQ( b.template getSize< 3 >(), L ); + EXPECT_EQ( b.template getSize< 4 >(), M ); + EXPECT_EQ( b.template getSize< 5 >(), N ); +} + +TEST( NDArrayTest, reset ) +{ + int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + EXPECT_EQ( a.template getSize< 0 >(), I ); + EXPECT_EQ( a.template getSize< 1 >(), J ); + EXPECT_EQ( a.template getSize< 2 >(), K ); + EXPECT_EQ( a.template getSize< 3 >(), L ); + EXPECT_EQ( a.template getSize< 4 >(), M ); + EXPECT_EQ( a.template getSize< 5 >(), N ); + + a.reset(); + EXPECT_EQ( a.template getSize< 0 >(), 0 ); + EXPECT_EQ( a.template getSize< 1 >(), 0 ); + EXPECT_EQ( a.template getSize< 2 >(), 0 ); + EXPECT_EQ( a.template getSize< 3 >(), 0 ); + EXPECT_EQ( a.template getSize< 4 >(), 0 ); + EXPECT_EQ( a.template getSize< 5 >(), 0 
); +} + +TEST( NDArrayTest, Static_1D ) +{ + constexpr int I = 3; + NDArray< int, SizesHolder< int, I > > a; + a.setSizes( 0 ); + + int v = 0; + for( int i = 0; i < I; i++ ) { + a( i ) = v++; + EXPECT_EQ( a[ i ], a( i ) ); + } + + expect_identity( a.getStorageArray() ); +} + +TEST( NDArrayTest, Static_2D_Identity ) +{ + constexpr int I = 3, J = 5; + NDArray< int, SizesHolder< int, I, J > > a; + a.setSizes( 0, 0 ); + + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( NDArrayTest, Static_2D_Permuted ) +{ + constexpr int I = 3, J = 5; + NDArray< int, + SizesHolder< int, I, J >, + index_sequence< 1, 0 > > a; + a.setSizes( 0, 0 ); + + int v = 0; + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( NDArrayTest, Dynamic_6D ) +{ + int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + + // initialize entries invalid due to alignment to -1 + a.getStorageArray().setValue( -1 ); + + int v = 0; + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j, k, l, m, n ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( NDArrayTest, CopySemantics ) +{ + constexpr int I = 3, J = 4; + NDArray< int, SizesHolder< int, 0, 0 > > a; + a.setSizes( I, J ); + + auto a_view = a.getView(); + + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); + + // assignment with zero sizes + NDArray< int, SizesHolder< int, 0, 0 > > b; + b = a; + auto b_view = b.getView(); + EXPECT_EQ( a, b ); + EXPECT_EQ( a_view, b_view ); + EXPECT_EQ( a_view.getView(), b_view ); + 
EXPECT_EQ( a_view.getConstView(), b_view.getConstView() ); + EXPECT_EQ( a.getConstView(), b.getConstView() ); + EXPECT_EQ( a.getConstView(), b_view.getConstView() ); + + // assignment between views + NDArray< int, SizesHolder< int, 0, 0 > > c; + c.setSizes( I, J ); + auto c_view = c.getView(); + c_view = a_view; + EXPECT_EQ( a, c ); + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() ); + + // move-assignment between views should do a deep copy + b_view = a.getView(); + EXPECT_EQ( a_view, b_view ); + EXPECT_EQ( a, b ); + EXPECT_NE( &b_view( 0, 0 ), &a_view( 0, 0 ) ); + + // assignment of view to array + c.setValue( 0 ); + c = a_view; + EXPECT_EQ( a, c ); + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() ); + + // assignment of array to view + c.setValue( 0 ); + c_view = a; + EXPECT_EQ( a, c ); + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() ); + + // assignment with different ValueType + NDArray< double, SizesHolder< int, 0, 0 > > d; + d = a; + expect_identity( d.getStorageArray() ); + + // assignment with different SizesHolder + NDArray< double, SizesHolder< int, I, J > > e; + e = a; + expect_identity( e.getStorageArray() ); + + // assignment with different IndexType + NDArray< double, SizesHolder< short int, 0, 0 > > f; + f = a; + expect_identity( f.getStorageArray() ); + + // assignment with different Permutation + // TODO +} + +#ifdef HAVE_CUDA +TEST( NDArrayTest, CopySemanticsCrossDevice ) +{ + constexpr int I 
= 3, J = 4; + NDArray< int, SizesHolder< int, 0, 0 > > a; + NDArray< int, SizesHolder< int, 0, 0 >, + std::index_sequence< 0, 1 >, + TNL::Devices::Cuda > da; + a.setSizes( I, J ); + da.setSizes( I, J ); + + auto a_view = a.getView(); + auto da_view = da.getView(); + + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); + + // copy to the device, simple check + da = a; + EXPECT_EQ( da.getStorageArray(), a.getStorageArray() ); + + // assignment with zero sizes + NDArray< int, SizesHolder< int, 0, 0 > > b; + b = da; + auto b_view = b.getView(); + EXPECT_EQ( a, b ); + EXPECT_EQ( a_view, b_view ); + EXPECT_EQ( a_view.getView(), b_view ); + EXPECT_EQ( a_view.getConstView(), b_view.getConstView() ); + EXPECT_EQ( a.getConstView(), b.getConstView() ); + EXPECT_EQ( a.getConstView(), b_view.getConstView() ); + + // assignment between views + NDArray< int, SizesHolder< int, 0, 0 > > c; + c.setSizes( I, J ); + auto c_view = c.getView(); + c_view = da_view; + EXPECT_EQ( a, c ); + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() ); + + // move-assignment between views should do a deep copy + b_view = da.getView(); + EXPECT_EQ( a_view, b_view ); + EXPECT_EQ( a, b ); + EXPECT_NE( &b_view( 0, 0 ), &a_view( 0, 0 ) ); + + // assignment of view to array + c.setValue( 0 ); + c = da_view; + EXPECT_EQ( a, c ); + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() ); + + // assignment of array to view + c.setValue( 0 ); + c_view = da; + EXPECT_EQ( a, c ); + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( 
a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() ); + + // assignment with different ValueType + NDArray< double, SizesHolder< int, 0, 0 > > d; + d = da; + expect_identity( d.getStorageArray() ); + + // assignment with different SizesHolder + NDArray< double, SizesHolder< int, I, J > > e; + e = da; + expect_identity( e.getStorageArray() ); + + // assignment with different IndexType + NDArray< double, SizesHolder< short int, 0, 0 > > f; + f = da; + expect_identity( f.getStorageArray() ); + + // assignment with different Permutation + // TODO +} +#endif + +TEST( NDArrayTest, SizesHolderPrinter ) +{ + SizesHolder< int, 0, 1, 2 > holder; + holder.setSize< 0 >( 3 ); + + std::stringstream str; + str << holder; + EXPECT_EQ( str.str(), "SizesHolder< 0, 1, 2 >( 3, 1, 2 )" ); +} + +TEST( NDArrayTest, forAll_dynamic_1D ) +{ + int I = 2; + NDArray< int, + SizesHolder< int, 0 >, + index_sequence< 0 > > a; + a.setSizes( I ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + EXPECT_EQ( a( i ), 1 ); +} + +TEST( NDArrayTest, forAll_dynamic_2D ) +{ + int I = 2, J = 3; + NDArray< int, + SizesHolder< int, 0, 0 >, + index_sequence< 1, 0 > > a; + a.setSizes( I, J ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j ) + { + a( i, j ) += 1; + }; + + a.forAll( setter ); + + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + EXPECT_EQ( a( i, j ), 1 ); +} + +TEST( NDArrayTest, forAll_dynamic_3D ) +{ + int I = 2, J = 3, K = 4; + NDArray< int, + SizesHolder< int, 0, 0, 0 >, + index_sequence< 2, 0, 1 > > a; + a.setSizes( I, J, K ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forAll( setter ); + + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + EXPECT_EQ( a( i, j, k ), 1 ); +} + +TEST( 
NDArrayTest, forAll_dynamic_4D ) +{ + int I = 2, J = 3, K = 4, L = 5; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0 >, + index_sequence< 3, 2, 0, 1 > > a; + a.setSizes( I, J, K, L ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l ) + { + a( i, j, k, l ) += 1; + }; + + a.forAll( setter ); + + for( int l = 0; l < L; l++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + EXPECT_EQ( a( i, j, k, l ), 1 ); +} + +TEST( NDArrayTest, forAll_dynamic_5D ) +{ + int I = 2, J = 3, K = 4, L = 5, M = 6; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0 >, + index_sequence< 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m ) + { + a( i, j, k, l, m ) += 1; + }; + + a.forAll( setter ); + + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + EXPECT_EQ( a( i, j, k, l, m ), 1 ); +} + +TEST( NDArrayTest, forAll_dynamic_6D ) +{ + int I = 2, J = 3, K = 4, L = 5, M = 6, N = 7; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m, int n ) + { + a( i, j, k, l, m, n ) += 1; + }; + + a.forAll( setter ); + + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + EXPECT_EQ( a( i, j, k, l, m, n ), 1 ); +} + +TEST( NDArrayTest, forAll_static_1D ) +{ + constexpr int I = 3; + StaticNDArray< int, SizesHolder< int, I > > a; +// a.setSizes( 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + EXPECT_EQ( a( i ), 1 ); +} + +TEST( NDArrayTest, forAll_static_2D ) +{ + constexpr int I = 3, J = 
4; + StaticNDArray< int, SizesHolder< int, I, J > > a; +// a.setSizes( 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j ) + { + a( i, j ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + EXPECT_EQ( a( i, j ), 1 ); +} + +TEST( NDArrayTest, forAll_static_3D ) +{ + constexpr int I = 3, J = 4, K = 5; + StaticNDArray< int, SizesHolder< int, I, J, K > > a; +// a.setSizes( 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + for( int k = 0; k < K; k++ ) + EXPECT_EQ( a( i, j, k ), 1 ); +} + +TEST( NDArrayTest, forAll_static_4D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6; + StaticNDArray< int, SizesHolder< int, I, J, K, L > > a; +// a.setSizes( 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l ) + { + a( i, j, k, l ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + for( int k = 0; k < K; k++ ) + for( int l = 0; l < L; l++ ) + EXPECT_EQ( a( i, j, k, l ), 1 ); +} + +TEST( NDArrayTest, forAll_static_5D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 7; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M > > a; +// a.setSizes( 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m ) + { + a( i, j, k, l, m ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + for( int k = 0; k < K; k++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + EXPECT_EQ( a( i, j, k, l, m ), 1 ); +} + +TEST( NDArrayTest, forAll_static_6D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M, N > > a; +// a.setSizes( 0, 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m, int n ) + { + a( i, j, k, l, 
m, n ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + for( int k = 0; k < K; k++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int n = 0; n < N; n++ ) + EXPECT_EQ( a( i, j, k, l, m, n ), 1 ); +} + +TEST( NDArrayTest, forInternal_dynamic_1D ) +{ + int I = 3; + NDArray< int, + SizesHolder< int, 0 >, + index_sequence< 0 > > a; + a.setSizes( I ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forInternal( setter ); + + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 ) + EXPECT_EQ( a( i ), 0 ) + << "i = " << i; + else + EXPECT_EQ( a( i ), 1 ) + << "i = " << i; + } +} + +TEST( NDArrayTest, forInternal_dynamic_2D ) +{ + int I = 3, J = 4; + NDArray< int, + SizesHolder< int, 0, 0 >, + index_sequence< 1, 0 > > a; + a.setSizes( I, J ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j ) + { + a( i, j ) += 1; + }; + + a.forInternal( setter ); + + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 ) + EXPECT_EQ( a( i, j ), 0 ) + << "i = " << i << ", j = " << j; + else + EXPECT_EQ( a( i, j ), 1 ) + << "i = " << i << ", j = " << j; + } +} + +TEST( NDArrayTest, forInternal_dynamic_3D ) +{ + int I = 3, J = 4, K = 5; + NDArray< int, + SizesHolder< int, 0, 0, 0 >, + index_sequence< 2, 0, 1 > > a; + a.setSizes( I, J, K ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forInternal( setter ); + + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 ) + EXPECT_EQ( a( i, j, k ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k; + else + EXPECT_EQ( a( i, j, k ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k; + } +} + +TEST( NDArrayTest, forInternal_dynamic_4D ) +{ + int I = 3, J = 4, K = 5, L = 6; + NDArray< 
int, + SizesHolder< int, 0, 0, 0, 0 >, + index_sequence< 3, 2, 0, 1 > > a; + a.setSizes( I, J, K, L ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l ) + { + a( i, j, k, l ) += 1; + }; + + a.forInternal( setter ); + + for( int l = 0; l < L; l++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 ) + EXPECT_EQ( a( i, j, k, l ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + else + EXPECT_EQ( a( i, j, k, l ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + } +} + +TEST( NDArrayTest, forInternal_dynamic_5D ) +{ + int I = 3, J = 4, K = 5, L = 6, M = 7; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0 >, + index_sequence< 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m ) + { + a( i, j, k, l, m ) += 1; + }; + + a.forInternal( setter ); + + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 ) + EXPECT_EQ( a( i, j, k, l, m ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + else + EXPECT_EQ( a( i, j, k, l, m ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + } +} + +TEST( NDArrayTest, forInternal_dynamic_6D ) +{ + int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m, int n ) + { + a( i, j, k, l, m, n ) += 1; + }; + + a.forInternal( setter ); + + for( 
int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 || + n == 0 || n == N - 1 ) + EXPECT_EQ( a( i, j, k, l, m, n ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + else + EXPECT_EQ( a( i, j, k, l, m, n ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + } +} + +TEST( NDArrayTest, forInternal_static_1D ) +{ + constexpr int I = 3; + StaticNDArray< int, SizesHolder< int, I > > a; +// a.setSizes( 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forInternal( setter ); + + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 ) + EXPECT_EQ( a( i ), 0 ) + << "i = " << i; + else + EXPECT_EQ( a( i ), 1 ) + << "i = " << i; + } +} + +TEST( NDArrayTest, forInternal_static_2D ) +{ + constexpr int I = 3, J = 4; + StaticNDArray< int, SizesHolder< int, I, J > > a; +// a.setSizes( 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j ) + { + a( i, j ) += 1; + }; + + a.forInternal( setter ); + + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 ) + EXPECT_EQ( a( i, j ), 0 ) + << "i = " << i << ", j = " << j; + else + EXPECT_EQ( a( i, j ), 1 ) + << "i = " << i << ", j = " << j; + } +} + +TEST( NDArrayTest, forInternal_static_3D ) +{ + constexpr int I = 3, J = 4, K = 5; + StaticNDArray< int, SizesHolder< int, I, J, K > > a; +// a.setSizes( 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forInternal( setter ); + + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I 
- 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 ) + EXPECT_EQ( a( i, j, k ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k; + else + EXPECT_EQ( a( i, j, k ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k; + } +} + +TEST( NDArrayTest, forInternal_static_4D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6; + StaticNDArray< int, SizesHolder< int, I, J, K, L > > a; +// a.setSizes( 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l ) + { + a( i, j, k, l ) += 1; + }; + + a.forInternal( setter ); + + for( int l = 0; l < L; l++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 ) + EXPECT_EQ( a( i, j, k, l ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + else + EXPECT_EQ( a( i, j, k, l ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + } +} + +TEST( NDArrayTest, forInternal_static_5D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 7; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M > > a; +// a.setSizes( 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m ) + { + a( i, j, k, l, m ) += 1; + }; + + a.forInternal( setter ); + + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 ) + EXPECT_EQ( a( i, j, k, l, m ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + else + EXPECT_EQ( a( i, j, k, l, m ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + } +} + +TEST( NDArrayTest, forInternal_static_6D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 
7, N = 8; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M, N > > a; +// a.setSizes( 0, 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m, int n ) + { + a( i, j, k, l, m, n ) += 1; + }; + + a.forInternal( setter ); + + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 || + n == 0 || n == N - 1 ) + EXPECT_EQ( a( i, j, k, l, m, n ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + else + EXPECT_EQ( a( i, j, k, l, m, n ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + } +} + +TEST( NDArrayTest, forBoundary_dynamic_1D ) +{ + int I = 3; + NDArray< int, + SizesHolder< int, 0 >, + index_sequence< 0 > > a; + a.setSizes( I ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forBoundary( setter ); + + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 ) + EXPECT_EQ( a( i ), 1 ) + << "i = " << i; + else + EXPECT_EQ( a( i ), 0 ) + << "i = " << i; + } +} + +TEST( NDArrayTest, forBoundary_dynamic_2D ) +{ + int I = 3, J = 4; + NDArray< int, + SizesHolder< int, 0, 0 >, + index_sequence< 1, 0 > > a; + a.setSizes( I, J ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j ) + { + a( i, j ) += 1; + }; + + a.forBoundary( setter ); + + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 ) + EXPECT_EQ( a( i, j ), 1 ) + << "i = " << i << ", j = " << j; + else + EXPECT_EQ( a( i, j ), 0 ) + << "i = " << i << ", j = " << j; + } +} + +TEST( NDArrayTest, forBoundary_dynamic_3D ) +{ + int I = 3, J = 4, K = 5; + NDArray< int, + SizesHolder< int, 0, 0, 0 
>, + index_sequence< 2, 0, 1 > > a; + a.setSizes( I, J, K ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forBoundary( setter ); + + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 ) + EXPECT_EQ( a( i, j, k ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k; + else + EXPECT_EQ( a( i, j, k ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k; + } +} + +// TODO: implement general ParallelBoundaryExecutor +//TEST( NDArrayTest, forBoundary_dynamic_4D ) +//{ +// int I = 3, J = 4, K = 5, L = 6; +// NDArray< int, +// SizesHolder< int, 0, 0, 0, 0 >, +// index_sequence< 3, 2, 0, 1 > > a; +// a.setSizes( I, J, K, L ); +// a.setValue( 0 ); +// +// auto setter = [&] ( int i, int j, int k, int l ) +// { +// a( i, j, k, l ) += 1; +// }; +// +// a.forBoundary( setter ); +// +// for( int l = 0; l < L; l++ ) +// for( int k = 0; k < K; k++ ) +// for( int i = 0; i < I; i++ ) +// for( int j = 0; j < J; j++ ) +// { +// if( i == 0 || i == I - 1 || +// j == 0 || j == J - 1 || +// k == 0 || k == K - 1 || +// l == 0 || l == L - 1 ) +// EXPECT_EQ( a( i, j, k, l ), 1 ) +// << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; +// else +// EXPECT_EQ( a( i, j, k, l ), 0 ) +// << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; +// } +//} +// +//TEST( NDArrayTest, forBoundary_dynamic_5D ) +//{ +// int I = 3, J = 4, K = 5, L = 6, M = 7; +// NDArray< int, +// SizesHolder< int, 0, 0, 0, 0, 0 >, +// index_sequence< 3, 4, 2, 0, 1 > > a; +// a.setSizes( I, J, K, L, M ); +// a.setValue( 0 ); +// +// auto setter = [&] ( int i, int j, int k, int l, int m ) +// { +// a( i, j, k, l, m ) += 1; +// }; +// +// a.forBoundary( setter ); +// +// for( int l = 0; l < L; l++ ) +// for( int m = 0; m < M; m++ ) +// for( int k = 0; k < K; k++ ) +// for( int i = 0; i < I; i++ ) +// for( int j = 0; j 
< J; j++ ) +// { +// if( i == 0 || i == I - 1 || +// j == 0 || j == J - 1 || +// k == 0 || k == K - 1 || +// l == 0 || l == L - 1 || +// m == 0 || m == M - 1 ) +// EXPECT_EQ( a( i, j, k, l, m ), 1 ) +// << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; +// else +// EXPECT_EQ( a( i, j, k, l, m ), 0 ) +// << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; +// } +//} +// +//TEST( NDArrayTest, forBoundary_dynamic_6D ) +//{ +// int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; +// NDArray< int, +// SizesHolder< int, 0, 0, 0, 0, 0, 0 >, +// index_sequence< 5, 3, 4, 2, 0, 1 > > a; +// a.setSizes( I, J, K, L, M, N ); +// a.setValue( 0 ); +// +// auto setter = [&] ( int i, int j, int k, int l, int m, int n ) +// { +// a( i, j, k, l, m, n ) += 1; +// }; +// +// a.forBoundary( setter ); +// +// for( int n = 0; n < N; n++ ) +// for( int l = 0; l < L; l++ ) +// for( int m = 0; m < M; m++ ) +// for( int k = 0; k < K; k++ ) +// for( int i = 0; i < I; i++ ) +// for( int j = 0; j < J; j++ ) +// { +// if( i == 0 || i == I - 1 || +// j == 0 || j == J - 1 || +// k == 0 || k == K - 1 || +// l == 0 || l == L - 1 || +// m == 0 || m == M - 1 || +// n == 0 || n == N - 1 ) +// EXPECT_EQ( a( i, j, k, l, m, n ), 1 ) +// << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; +// else +// EXPECT_EQ( a( i, j, k, l, m, n ), 0 ) +// << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; +// } +//} + +TEST( NDArrayTest, forBoundary_static_1D ) +{ + constexpr int I = 3; + StaticNDArray< int, SizesHolder< int, I > > a; +// a.setSizes( 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forBoundary( setter ); + + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 ) + EXPECT_EQ( a( i ), 1 ) + << "i = " << i; + else + EXPECT_EQ( a( i ), 0 ) + << "i = " << i; + } +} + +TEST( NDArrayTest, forBoundary_static_2D 
) +{ + constexpr int I = 3, J = 4; + StaticNDArray< int, SizesHolder< int, I, J > > a; +// a.setSizes( 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j ) + { + a( i, j ) += 1; + }; + + a.forBoundary( setter ); + + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 ) + EXPECT_EQ( a( i, j ), 1 ) + << "i = " << i << ", j = " << j; + else + EXPECT_EQ( a( i, j ), 0 ) + << "i = " << i << ", j = " << j; + } +} + +TEST( NDArrayTest, forBoundary_static_3D ) +{ + constexpr int I = 3, J = 4, K = 5; + StaticNDArray< int, SizesHolder< int, I, J, K > > a; +// a.setSizes( 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forBoundary( setter ); + + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 ) + EXPECT_EQ( a( i, j, k ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k; + else + EXPECT_EQ( a( i, j, k ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k; + } +} + +TEST( NDArrayTest, forBoundary_static_4D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6; + StaticNDArray< int, SizesHolder< int, I, J, K, L > > a; +// a.setSizes( 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l ) + { + a( i, j, k, l ) += 1; + }; + + a.forBoundary( setter ); + + for( int l = 0; l < L; l++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 ) + EXPECT_EQ( a( i, j, k, l ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + else + EXPECT_EQ( a( i, j, k, l ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + } +} + +TEST( NDArrayTest, forBoundary_static_5D ) +{ + constexpr int I = 3, J = 4, K = 5, L 
= 6, M = 7; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M > > a; +// a.setSizes( 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m ) + { + a( i, j, k, l, m ) += 1; + }; + + a.forBoundary( setter ); + + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 ) + EXPECT_EQ( a( i, j, k, l, m ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + else + EXPECT_EQ( a( i, j, k, l, m ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + } +} + +TEST( NDArrayTest, forBoundary_static_6D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M, N > > a; +// a.setSizes( 0, 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m, int n ) + { + a( i, j, k, l, m, n ) += 1; + }; + + a.forBoundary( setter ); + + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 || + n == 0 || n == N - 1 ) + EXPECT_EQ( a( i, j, k, l, m, n ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + else + EXPECT_EQ( a( i, j, k, l, m, n ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + } +} + +//#include "GtestMissingError.h" +int main( int argc, char* argv[] ) +{ +//#ifdef HAVE_GTEST + ::testing::InitGoogleTest( &argc, argv ); + return RUN_ALL_TESTS(); +//#else +// 
throw GtestMissingError(); +//#endif +} diff --git a/src/UnitTests/Containers/ndarray/NDSubarrayTest.cpp b/src/UnitTests/Containers/ndarray/NDSubarrayTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1b57eed281183f4163501fabdc9027254e255eb9 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/NDSubarrayTest.cpp @@ -0,0 +1,405 @@ +#include "gtest/gtest.h" + +#include <TNL/Containers/NDArray.h> + +using namespace TNL::Containers; +using std::index_sequence; + +// wrapper around static_assert to get the type names in the error message +template< typename Permutation, typename ExpectedPermutation > +void check_permutation() +{ + static_assert( std::is_same< Permutation, ExpectedPermutation >::value, + "The permutation is not the same as the expected permutation." ); +} + +TEST( NDArraySubarrayTest, StaticAsserts ) +{ + using namespace TNL::Containers::__ndarray_impl; + +// auto is_even = [](int _in) {return _in % 2 == 0;}; + using expected_type = std::integer_sequence< int, 0, 2, 4, 6 >; + using test_type = std::integer_sequence< int, 0, 1, 2, 3, 4, 5, 6 >; +// constexpr auto result = filter_sequence(test_type{}, is_even); + constexpr auto result = filter_sequence< expected_type >(test_type{}); + using result_type = std::decay_t< decltype(result) >; + static_assert(std::is_same< result_type, expected_type >::value, "Integer sequences should be equal"); + + + + using Permutation = std::integer_sequence< std::size_t, 5, 3, 1, 4, 2, 6, 0 >; + { + using Dimensions = std::integer_sequence< std::size_t, 3, 4, 6 >; + using Subpermutation = typename SubpermutationGetter< Dimensions, Permutation >::Subpermutation; + check_permutation< Subpermutation, + std::integer_sequence< std::size_t, 0, 1, 2 > >(); + } + { + using Dimensions = std::integer_sequence< std::size_t, 1, 4, 2 >; + using Subpermutation = typename SubpermutationGetter< Dimensions, Permutation >::Subpermutation; + check_permutation< Subpermutation, + std::integer_sequence< std::size_t, 0, 2, 1 > >(); + } + { + using Dimensions = std::integer_sequence< std::size_t, 5, 1, 6 >; + using Subpermutation = typename
SubpermutationGetter< Dimensions, Permutation >::Subpermutation; + check_permutation< Subpermutation, + std::integer_sequence< std::size_t, 1, 0, 2 > >(); + } + { + using Dimensions = std::integer_sequence< std::size_t, 5, 1, 2 >; + using Subpermutation = typename SubpermutationGetter< Dimensions, Permutation >::Subpermutation; + check_permutation< Subpermutation, + std::integer_sequence< std::size_t, 2, 0, 1 > >(); + } + { + using Dimensions = std::integer_sequence< std::size_t, 2, 3, 4 >; + using Subpermutation = typename SubpermutationGetter< Dimensions, Permutation >::Subpermutation; + check_permutation< Subpermutation, + std::integer_sequence< std::size_t, 1, 2, 0 > >(); + } + { + using Dimensions = std::integer_sequence< std::size_t, 0, 1, 5 >; + using Subpermutation = typename SubpermutationGetter< Dimensions, Permutation >::Subpermutation; + check_permutation< Subpermutation, + std::integer_sequence< std::size_t, 2, 1, 0 > >(); + } + + static_assert( is_increasing_sequence( {0, 1, 2, 3, 4} ), "bug" ); + static_assert( ! is_increasing_sequence( {0, 1, 2, 0, 4} ), "bug" ); + static_assert( ! 
is_increasing_sequence( {1, 0, 2, 3, 4} ), "bug" ); +} + +TEST( NDArraySubarrayTest, Dynamic_6D ) +{ + int I = 2, J = 3, K = 4, L = 5, M = 6, N = 7; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + a.setValue( 0 ); + + auto v = a.getView(); + + auto s1 = v.template getSubarrayView< 0 >( 0, 0, 0, 0, 0, 0 ); + const int size1 = s1.template getSize< 0 >(); + const int stride1 = s1.template getStride< 0 >(); + EXPECT_EQ( size1, I ); + EXPECT_EQ( stride1, J ); + for( int i = 0; i < I; i++ ) { + s1( i ) = 1 + i; + EXPECT_EQ( v( i, 0, 0, 0, 0, 0 ), 1 + i ); + } + a.setValue( 0 ); + + auto s2 = v.template getSubarrayView< 1 >( 0, 0, 0, 0, 0, 0 ); + const int size2 = s2.template getSize< 0 >(); + const int stride2 = s2.template getStride< 0 >(); + EXPECT_EQ( size2, J ); + EXPECT_EQ( stride2, 1 ); + for( int j = 0; j < J; j++ ) { + s2( j ) = 1 + j; + EXPECT_EQ( v( 0, j, 0, 0, 0, 0 ), 1 + j ); + } + a.setValue( 0 ); + + auto s3 = v.template getSubarrayView< 2 >( 0, 0, 0, 0, 0, 0 ); + const int size3 = s3.template getSize< 0 >(); + const int stride3 = s3.template getStride< 0 >(); + EXPECT_EQ( size3, K ); + EXPECT_EQ( stride3, I*J ); + for( int k = 0; k < K; k++ ) { + s3( k ) = 1 + k; + EXPECT_EQ( v( 0, 0, k, 0, 0, 0 ), 1 + k ); + } + a.setValue( 0 ); + + auto s4 = v.template getSubarrayView< 3 >( 0, 0, 0, 0, 0, 0 ); + const int size4 = s4.template getSize< 0 >(); + const int stride4 = s4.template getStride< 0 >(); + EXPECT_EQ( size4, L ); + EXPECT_EQ( stride4, I*J*K*M ); + for( int l = 0; l < L; l++ ) { + s4( l ) = 1 + l; + EXPECT_EQ( v( 0, 0, 0, l, 0, 0 ), 1 + l ); + } + a.setValue( 0 ); + + auto s5 = v.template getSubarrayView< 4 >( 0, 0, 0, 0, 0, 0 ); + const int size5 = s5.template getSize< 0 >(); + const int stride5 = s5.template getStride< 0 >(); + EXPECT_EQ( size5, M ); + EXPECT_EQ( stride5, I*J*K ); + for( int m = 0; m < M; m++ ) { + s5( m ) = 1 + m; + EXPECT_EQ( v( 0, 0, 0, 0, m, 0 
), 1 + m ); + } + a.setValue( 0 ); + + auto s6 = v.template getSubarrayView< 5 >( 0, 0, 0, 0, 0, 0 ); + const int size6 = s6.template getSize< 0 >(); + const int stride6 = s6.template getStride< 0 >(); + EXPECT_EQ( size6, N ); + EXPECT_EQ( stride6, I*J*K*L*M ); + for( int n = 0; n < N; n++ ) { + s6( n ) = 1 + n; + EXPECT_EQ( v( 0, 0, 0, 0, 0, n ), 1 + n ); + } + a.setValue( 0 ); + + + auto s_ij = v.template getSubarrayView< 0, 1 >( 0, 0, 0, 0, 0, 0 ); + const int size_ij_0 = s_ij.template getSize< 0 >(); + const int size_ij_1 = s_ij.template getSize< 1 >(); + const int stride_ij_0 = s_ij.template getStride< 0 >(); + const int stride_ij_1 = s_ij.template getStride< 1 >(); + EXPECT_EQ( size_ij_0, I ); + EXPECT_EQ( size_ij_1, J ); + EXPECT_EQ( stride_ij_0, 1 ); + EXPECT_EQ( stride_ij_1, 1 ); + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) { + s_ij( i, j ) = 1; + EXPECT_EQ( v( i, j, 0, 0, 0, 0 ), 1 ); + } + a.setValue( 0 ); + + auto s_ik = v.template getSubarrayView< 0, 2 >( 0, 0, 0, 0, 0, 0 ); + const int size_ik_0 = s_ik.template getSize< 0 >(); + const int size_ik_1 = s_ik.template getSize< 1 >(); + const int stride_ik_0 = s_ik.template getStride< 0 >(); + const int stride_ik_1 = s_ik.template getStride< 1 >(); + EXPECT_EQ( size_ik_0, I ); + EXPECT_EQ( size_ik_1, K ); + EXPECT_EQ( stride_ik_0, J ); + EXPECT_EQ( stride_ik_1, 1 ); + for( int i = 0; i < I; i++ ) + for( int k = 0; k < K; k++ ) { + s_ik( i, k ) = 1 + k; + EXPECT_EQ( v( i, 0, k, 0, 0, 0 ), 1 + k ); + } + a.setValue( 0 ); + + auto s_il = v.template getSubarrayView< 0, 3 >( 0, 0, 0, 0, 0, 0 ); + const int size_il_0 = s_il.template getSize< 0 >(); + const int size_il_1 = s_il.template getSize< 1 >(); + const int stride_il_0 = s_il.template getStride< 0 >(); + const int stride_il_1 = s_il.template getStride< 1 >(); + EXPECT_EQ( size_il_0, I ); + EXPECT_EQ( size_il_1, L ); + EXPECT_EQ( stride_il_0, J ); + EXPECT_EQ( stride_il_1, K*M ); + for( int i = 0; i < I; i++ ) + for( int l = 0; l < L; l++ ) 
{ + s_il( i, l ) = 1 + l; + EXPECT_EQ( v( i, 0, 0, l, 0, 0 ), 1 + l ); + } + a.setValue( 0 ); + + auto s_im = v.template getSubarrayView< 0, 4 >( 0, 0, 0, 0, 0, 0 ); + const int size_im_0 = s_im.template getSize< 0 >(); + const int size_im_1 = s_im.template getSize< 1 >(); + const int stride_im_0 = s_im.template getStride< 0 >(); + const int stride_im_1 = s_im.template getStride< 1 >(); + EXPECT_EQ( size_im_0, I ); + EXPECT_EQ( size_im_1, M ); + EXPECT_EQ( stride_im_0, J ); + EXPECT_EQ( stride_im_1, K ); + for( int i = 0; i < I; i++ ) + for( int m = 0; m < M; m++ ) { + s_im( i, m ) = 1 + m; + EXPECT_EQ( v( i, 0, 0, 0, m, 0 ), 1 + m ); + } + a.setValue( 0 ); + + auto s_in = v.template getSubarrayView< 0, 5 >( 0, 0, 0, 0, 0, 0 ); + const int size_in_0 = s_in.template getSize< 0 >(); + const int size_in_1 = s_in.template getSize< 1 >(); + const int stride_in_0 = s_in.template getStride< 0 >(); + const int stride_in_1 = s_in.template getStride< 1 >(); + EXPECT_EQ( size_in_0, I ); + EXPECT_EQ( size_in_1, N ); + EXPECT_EQ( stride_in_0, J ); + EXPECT_EQ( stride_in_1, K*L*M ); + for( int i = 0; i < I; i++ ) + for( int n = 0; n < N; n++ ) { + s_in( i, n ) = 1 + n; + EXPECT_EQ( v( i, 0, 0, 0, 0, n ), 1 + n ); + } + a.setValue( 0 ); + + + auto s_jk = v.template getSubarrayView< 1, 2 >( 0, 0, 0, 0, 0, 0 ); + const int size_jk_0 = s_jk.template getSize< 0 >(); + const int size_jk_1 = s_jk.template getSize< 1 >(); + const int stride_jk_0 = s_jk.template getStride< 0 >(); + const int stride_jk_1 = s_jk.template getStride< 1 >(); + EXPECT_EQ( size_jk_0, J ); + EXPECT_EQ( size_jk_1, K ); + EXPECT_EQ( stride_jk_0, 1 ); + EXPECT_EQ( stride_jk_1, I ); + for( int j = 0; j < J; j++ ) + for( int k = 0; k < K; k++ ) { + s_jk( j, k ) = 1 + k; + EXPECT_EQ( v( 0, j, k, 0, 0, 0 ), 1 + k ); + } + a.setValue( 0 ); + + auto s_jl = v.template getSubarrayView< 1, 3 >( 0, 0, 0, 0, 0, 0 ); + const int size_jl_0 = s_jl.template getSize< 0 >(); + const int size_jl_1 = s_jl.template getSize< 1 >(); + 
const int stride_jl_0 = s_jl.template getStride< 0 >(); + const int stride_jl_1 = s_jl.template getStride< 1 >(); + EXPECT_EQ( size_jl_0, J ); + EXPECT_EQ( size_jl_1, L ); + EXPECT_EQ( stride_jl_0, 1 ); + EXPECT_EQ( stride_jl_1, I*K*M ); + for( int j = 0; j < J; j++ ) + for( int l = 0; l < L; l++ ) { + s_jl( j, l ) = 1 + l; + EXPECT_EQ( v( 0, j, 0, l, 0, 0 ), 1 + l ); + } + a.setValue( 0 ); + + auto s_jm = v.template getSubarrayView< 1, 4 >( 0, 0, 0, 0, 0, 0 ); + const int size_jm_0 = s_jm.template getSize< 0 >(); + const int size_jm_1 = s_jm.template getSize< 1 >(); + const int stride_jm_0 = s_jm.template getStride< 0 >(); + const int stride_jm_1 = s_jm.template getStride< 1 >(); + EXPECT_EQ( size_jm_0, J ); + EXPECT_EQ( size_jm_1, M ); + EXPECT_EQ( stride_jm_0, 1 ); + EXPECT_EQ( stride_jm_1, I*K ); + for( int j = 0; j < J; j++ ) + for( int m = 0; m < M; m++ ) { + s_jm( j, m ) = 1 + m; + EXPECT_EQ( v( 0, j, 0, 0, m, 0 ), 1 + m ); + } + a.setValue( 0 ); + + auto s_jn = v.template getSubarrayView< 1, 5 >( 0, 0, 0, 0, 0, 0 ); + const int size_jn_0 = s_jn.template getSize< 0 >(); + const int size_jn_1 = s_jn.template getSize< 1 >(); + const int stride_jn_0 = s_jn.template getStride< 0 >(); + const int stride_jn_1 = s_jn.template getStride< 1 >(); + EXPECT_EQ( size_jn_0, J ); + EXPECT_EQ( size_jn_1, N ); + EXPECT_EQ( stride_jn_0, 1 ); + EXPECT_EQ( stride_jn_1, I*K*L*M ); + for( int j = 0; j < J; j++ ) + for( int n = 0; n < N; n++ ) { + s_jn( j, n ) = 1 + n; + EXPECT_EQ( v( 0, j, 0, 0, 0, n ), 1 + n ); + } + a.setValue( 0 ); + + + auto s_kl = v.template getSubarrayView< 2, 3 >( 0, 0, 0, 0, 0, 0 ); + const int size_kl_0 = s_kl.template getSize< 0 >(); + const int size_kl_1 = s_kl.template getSize< 1 >(); + const int stride_kl_0 = s_kl.template getStride< 0 >(); + const int stride_kl_1 = s_kl.template getStride< 1 >(); + EXPECT_EQ( size_kl_0, K ); + EXPECT_EQ( size_kl_1, L ); + EXPECT_EQ( stride_kl_0, I*J ); + EXPECT_EQ( stride_kl_1, M ); + for( int k = 0; k < K; k++ ) + 
for( int l = 0; l < L; l++ ) { + s_kl( k, l ) = 1 + l; + EXPECT_EQ( v( 0, 0, k, l, 0, 0 ), 1 + l ); + } + a.setValue( 0 ); + + auto s_km = v.template getSubarrayView< 2, 4 >( 0, 0, 0, 0, 0, 0 ); + const int size_km_0 = s_km.template getSize< 0 >(); + const int size_km_1 = s_km.template getSize< 1 >(); + const int stride_km_0 = s_km.template getStride< 0 >(); + const int stride_km_1 = s_km.template getStride< 1 >(); + EXPECT_EQ( size_km_0, K ); + EXPECT_EQ( size_km_1, M ); + EXPECT_EQ( stride_km_0, I*J ); + EXPECT_EQ( stride_km_1, 1 ); + for( int k = 0; k < K; k++ ) + for( int m = 0; m < M; m++ ) { + s_km( k, m ) = 1 + m; + EXPECT_EQ( v( 0, 0, k, 0, m, 0 ), 1 + m ); + } + a.setValue( 0 ); + + auto s_kn = v.template getSubarrayView< 2, 5 >( 0, 0, 0, 0, 0, 0 ); + const int size_kn_0 = s_kn.template getSize< 0 >(); + const int size_kn_1 = s_kn.template getSize< 1 >(); + const int stride_kn_0 = s_kn.template getStride< 0 >(); + const int stride_kn_1 = s_kn.template getStride< 1 >(); + EXPECT_EQ( size_kn_0, K ); + EXPECT_EQ( size_kn_1, N ); + EXPECT_EQ( stride_kn_0, I*J ); + EXPECT_EQ( stride_kn_1, L*M ); + for( int k = 0; k < K; k++ ) + for( int n = 0; n < N; n++ ) { + s_kn( k, n ) = 1 + n; + EXPECT_EQ( v( 0, 0, k, 0, 0, n ), 1 + n ); + } + a.setValue( 0 ); + + + auto s_lm = v.template getSubarrayView< 3, 4 >( 0, 0, 0, 0, 0, 0 ); + const int size_lm_0 = s_lm.template getSize< 0 >(); + const int size_lm_1 = s_lm.template getSize< 1 >(); + const int stride_lm_0 = s_lm.template getStride< 0 >(); + const int stride_lm_1 = s_lm.template getStride< 1 >(); + EXPECT_EQ( size_lm_0, L ); + EXPECT_EQ( size_lm_1, M ); + EXPECT_EQ( stride_lm_0, 1 ); + EXPECT_EQ( stride_lm_1, I*J*K ); + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) { + s_lm( l, m ) = 1 + m; + EXPECT_EQ( v( 0, 0, 0, l, m, 0 ), 1 + m ); + } + a.setValue( 0 ); + + auto s_ln = v.template getSubarrayView< 3, 5 >( 0, 0, 0, 0, 0, 0 ); + const int size_ln_0 = s_ln.template getSize< 0 >(); + const int size_ln_1 
= s_ln.template getSize< 1 >(); + const int stride_ln_0 = s_ln.template getStride< 0 >(); + const int stride_ln_1 = s_ln.template getStride< 1 >(); + EXPECT_EQ( size_ln_0, L ); + EXPECT_EQ( size_ln_1, N ); + EXPECT_EQ( stride_ln_0, I*J*K*M ); + EXPECT_EQ( stride_ln_1, 1 ); + for( int l = 0; l < L; l++ ) + for( int n = 0; n < N; n++ ) { + s_ln( l, n ) = 1 + n; + EXPECT_EQ( v( 0, 0, 0, l, 0, n ), 1 + n ); + } + a.setValue( 0 ); + + + auto s_mn = v.template getSubarrayView< 4, 5 >( 0, 0, 0, 0, 0, 0 ); + const int size_mn_0 = s_mn.template getSize< 0 >(); + const int size_mn_1 = s_mn.template getSize< 1 >(); + const int stride_mn_0 = s_mn.template getStride< 0 >(); + const int stride_mn_1 = s_mn.template getStride< 1 >(); + EXPECT_EQ( size_mn_0, M ); + EXPECT_EQ( size_mn_1, N ); + EXPECT_EQ( stride_mn_0, I*J*K ); + EXPECT_EQ( stride_mn_1, L ); + for( int m = 0; m < M; m++ ) + for( int n = 0; n < N; n++ ) { + s_mn( m, n ) = 1 + n; + EXPECT_EQ( v( 0, 0, 0, 0, m, n ), 1 + n ); + } + a.setValue( 0 ); +} + +//#include "GtestMissingError.h" +int main( int argc, char* argv[] ) +{ +//#ifdef HAVE_GTEST + ::testing::InitGoogleTest( &argc, argv ); + return RUN_ALL_TESTS(); +//#else +// throw GtestMissingError(); +//#endif +} diff --git a/src/UnitTests/Containers/ndarray/SlicedNDArrayTest.cpp b/src/UnitTests/Containers/ndarray/SlicedNDArrayTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8574a56021c6da57331dc63bb7747aa1ba262cb1 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/SlicedNDArrayTest.cpp @@ -0,0 +1,251 @@ +#include "gtest/gtest.h" + +#include + +using namespace TNL::Containers; +using std::index_sequence; + +template< typename Array > +void expect_identity( const Array& a ) +{ + Array identity; + identity.setLike( a ); + for( int i = 0; i < identity.getSize(); i++ ) + identity[ i ] = i; + EXPECT_EQ( a, identity ); +} + +template< typename Array, typename Seq > +void expect_seq( const Array& a, const Seq& seq ) +{ + for( int i = 0; i < 
a.getSize(); i++ ) + EXPECT_EQ( a[ i ], seq[ i ] ); +} + +TEST( SlicedNDArrayTest, 2D_Static_Identity ) +{ + constexpr int I = 3, J = 5; + SlicedNDArray< int, SizesHolder< int, I, J > > a; + a.setSizes( 0, 0 ); + + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( SlicedNDArrayTest, 2D_Static_Permuted ) +{ + constexpr int I = 3, J = 5; + SlicedNDArray< int, + SizesHolder< int, I, J >, + index_sequence< 1, 0 > > a; + a.setSizes( 0, 0 ); + + int v = 0; + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( SlicedNDArrayTest, 6D_Dynamic ) +{ + int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2; + SlicedNDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + + int v = 0; + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j, k, l, m, n ) = v++; + + expect_identity( a.getStorageArray() ); +} + + +TEST( SlicedNDArrayTest, Sliced2D_Dynamic_Identity ) +{ + const int I = 3, J = 5; + SlicedNDArray< int, + SizesHolder< int, 0, 0 >, + index_sequence< 0, 1 >, + SliceInfo< 1, 2 > > a; // J is sliced + a.setSizes( I, J ); + + a.getStorageArray().setValue(-1); + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + const std::vector< int > seq({ + // first slice + 0, 1, + 5, 6, + 10, 11, + // second slice + 2, 3, + 7, 8, + 12, 13, + // third slice + 4, -1, + 9, -1, + 14, -1, + }); + expect_seq( a.getStorageArray(), seq ); +} + +TEST( SlicedNDArrayTest, Sliced2D_HalfStatic_Identity ) +{ + constexpr int I = 3; + const int J = 5; + SlicedNDArray< int, + SizesHolder< int, I, 0 >, + index_sequence< 0, 1 >, + SliceInfo< 1, 2 > > a; // J is sliced + a.setSizes( 
0, J ); + + a.getStorageArray().setValue(-1); + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + const std::vector< int > seq({ + // first slice + 0, 1, + 5, 6, + 10, 11, + // second slice + 2, 3, + 7, 8, + 12, 13, + // third slice + 4, -1, + 9, -1, + 14, -1, + }); + expect_seq( a.getStorageArray(), seq ); +} + +TEST( SlicedNDArrayTest, Sliced2D_Dynamic_Permuted ) +{ + const int I = 3, J = 5; + SlicedNDArray< int, + SizesHolder< int, 0, 0 >, + index_sequence< 1, 0 >, + SliceInfo< 0, 2 > > a; // I is sliced + a.setSizes( I, J ); + + a.getStorageArray().setValue(-1); + int v = 0; + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + a( i, j ) = v++; + + const std::vector< int > seq({ + // first slice (transposed) + 0, 1, + 3, 4, + 6, 7, + 9, 10, + 12, 13, + // second slice (transposed) + 2, -1, + 5, -1, + 8, -1, + 11, -1, + 14, -1, + }); + expect_seq( a.getStorageArray(), seq ); +} + +TEST( SlicedNDArrayTest, Sliced2D_HalfStatic_Permuted ) +{ + const int I = 3; + constexpr int J = 5; + SlicedNDArray< int, + SizesHolder< int, 0, J >, + index_sequence< 1, 0 >, + SliceInfo< 0, 2 > > a; // I is sliced + a.setSizes( I, 0 ); + + a.getStorageArray().setValue(-1); + int v = 0; + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + a( i, j ) = v++; + + const std::vector< int > seq({ + // first slice (transposed) + 0, 1, + 3, 4, + 6, 7, + 9, 10, + 12, 13, + // second slice (transposed) + 2, -1, + 5, -1, + 8, -1, + 11, -1, + 14, -1, + }); + expect_seq( a.getStorageArray(), seq ); +} + + +TEST( SlicedNDArrayTest, CopySemantics ) +{ + const int I = 3, J = 4; + SlicedNDArray< int, + SizesHolder< int, 0, 0 >, + index_sequence< 0, 1 >, + SliceInfo< 1, 2 > > a, b, c; // J is sliced + a.setSizes( I, J ); + + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + b = a; + EXPECT_EQ( a, b ); + + auto a_view = a.getView(); + auto b_view = b.getView(); + EXPECT_EQ( a_view, b_view ); 
+ EXPECT_EQ( a_view.getView(), b_view ); + EXPECT_EQ( a_view.getConstView(), b_view.getConstView() ); + EXPECT_EQ( a.getConstView(), b.getConstView() ); + EXPECT_EQ( a.getConstView(), b_view.getConstView() ); + + c.setSizes( I, J ); + auto c_view = c.getView(); + c_view = b_view; + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() ); +} + +//#include "GtestMissingError.h" +int main( int argc, char* argv[] ) +{ +//#ifdef HAVE_GTEST + ::testing::InitGoogleTest( &argc, argv ); + return RUN_ALL_TESTS(); +//#else +// throw GtestMissingError(); +//#endif +} diff --git a/src/UnitTests/Containers/ndarray/StaticNDArrayCudaTest.cu b/src/UnitTests/Containers/ndarray/StaticNDArrayCudaTest.cu new file mode 100644 index 0000000000000000000000000000000000000000..0a0a83dd83fae72ff0f1b5c349d81ba05ed0da65 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/StaticNDArrayCudaTest.cu @@ -0,0 +1,88 @@ +#include "gtest/gtest.h" + +#include + +#include + +using namespace TNL::Containers; +using std::index_sequence; + +template< typename Array > +void expect_identity( const Array& a ) +{ + Array identity; + identity.setSize( a.getSize() ); + for( int i = 0; i < identity.getSize(); i++ ) + identity.setElement( i, i ); + EXPECT_EQ( a, identity ); +} + +// nvcc fuck-up: __host__ __device__ lambdas cannot be inside protected/private class methods +void __test_SetThroughView() +{ + constexpr int I = 3, J = 5; + using ViewType = typename StaticNDArray< int, SizesHolder< int, I, J > >::ViewType; + NDArray< int, + SizesHolder< int, I, J >, + std::make_index_sequence< 2 >, + TNL::Devices::Cuda > a; + a.setSizes( 0, 0 ); + ViewType a_view( a.getStorageArray().getData(), SizesHolder< int, I, J >{} ); + + auto kernel = [] __cuda_callable__ ( int, ViewType a ) { + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 
0; j < J; j++ ) + a( i, j ) = v++; + }; + + a.setValue(0); + TNL::ParallelFor< TNL::Devices::Cuda >::exec( 0, 1, kernel, a_view ); + expect_identity( a.getStorageArray() ); +} +TEST( StaticNDArrayCudaTest, SetThroughView ) +{ + __test_SetThroughView(); +} + +// nvcc fuck-up: __host__ __device__ lambdas cannot be inside protected/private class methods +void __test_CopyFromArray() +{ + constexpr int I = 3, J = 5; + using ViewType = typename StaticNDArray< int, SizesHolder< int, I, J > >::ViewType; + NDArray< int, + SizesHolder< int, I, J >, + std::make_index_sequence< 2 >, + TNL::Devices::Cuda > a; + a.setSizes( 0, 0 ); + ViewType a_view( a.getStorageArray().getData(), SizesHolder< int, I, J >{} ); + + auto kernel = [] __cuda_callable__ ( int, ViewType a ) { + StaticNDArray< int, SizesHolder< int, I, J > > b; + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + b( i, j ) = v++; + a = b.getView(); + a( 0, 0 ) = a != b.getView(); + }; + + a.setValue(0); + TNL::ParallelFor< TNL::Devices::Cuda >::exec( 0, 1, kernel, a_view ); + expect_identity( a.getStorageArray() ); +} +TEST( StaticNDArrayCudaTest, CopyFromArray ) +{ + __test_CopyFromArray(); +} + +//#include "GtestMissingError.h" +int main( int argc, char* argv[] ) +{ +//#ifdef HAVE_GTEST + ::testing::InitGoogleTest( &argc, argv ); + return RUN_ALL_TESTS(); +//#else +// throw GtestMissingError(); +//#endif +} diff --git a/src/UnitTests/Containers/ndarray/StaticNDArrayTest.cpp b/src/UnitTests/Containers/ndarray/StaticNDArrayTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e3ea290f26b92723d0490873f7caa263fee35f36 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/StaticNDArrayTest.cpp @@ -0,0 +1,105 @@ +#include "gtest/gtest.h" + +#include + +using namespace TNL::Containers; +using std::index_sequence; + +template< typename Array > +void expect_identity( const Array& a ) +{ + Array identity; + for( int i = 0; i < identity.getSize(); i++ ) + identity[ i ] = i; + 
EXPECT_EQ( a, identity ); +} + +TEST( StaticNDArrayTest, Static_2D_Identity ) +{ + constexpr int I = 3, J = 5; + StaticNDArray< int, SizesHolder< int, I, J > > a; + + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( StaticNDArrayTest, Static_2D_Permuted ) +{ + constexpr int I = 3, J = 5; + StaticNDArray< int, + SizesHolder< int, I, J >, + index_sequence< 1, 0 > > a; + + int v = 0; + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( StaticNDArrayTest, Static_6D_Permuted ) +{ + constexpr int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2; + StaticNDArray< int, + SizesHolder< int, I, J, K, L, M, N >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + + int v = 0; + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j, k, l, m, n ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( StaticNDArrayTest, CopySemantics ) +{ + constexpr int I = 3, J = 5; + StaticNDArray< int, SizesHolder< int, I, J > > a, b, c; + + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); + + b = a; + EXPECT_EQ( a, b ); + + auto a_view = a.getView(); + auto b_view = b.getView(); + EXPECT_EQ( a_view, b_view ); + EXPECT_EQ( a_view.getView(), b_view ); + EXPECT_EQ( a_view.getConstView(), b_view.getConstView() ); + EXPECT_EQ( a.getConstView(), b.getConstView() ); + EXPECT_EQ( a.getConstView(), b_view.getConstView() ); + + auto c_view = c.getView(); + c_view = b_view; + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() 
); +} + +//#include "GtestMissingError.h" +int main( int argc, char* argv[] ) +{ +//#ifdef HAVE_GTEST + ::testing::InitGoogleTest( &argc, argv ); + return RUN_ALL_TESTS(); +//#else +// throw GtestMissingError(); +//#endif +}