From 061263a6412a51314282418be75e3ac9b61b9307 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <>
Date: Sat, 9 Feb 2019 15:17:39 +0100
Subject: [PATCH] NDArray: split implementation of executors into a separate
 header file

 src/TNL/Containers/NDArrayView.h        |   1 +
 src/TNL/Containers/ndarray/Executors.h  | 310 ++++++++++++++++++++++++
 src/TNL/Containers/ndarray/Operations.h | 289 +---------------------
 3 files changed, 312 insertions(+), 288 deletions(-)
 create mode 100644 src/TNL/Containers/ndarray/Executors.h

diff --git a/src/TNL/Containers/NDArrayView.h b/src/TNL/Containers/NDArrayView.h
index cafaabe9b2..50119eda42 100644
--- a/src/TNL/Containers/NDArrayView.h
+++ b/src/TNL/Containers/NDArrayView.h
@@ -15,6 +15,7 @@
 #include <TNL/Containers/ndarray/Indexing.h>
 #include <TNL/Containers/ndarray/SizesHolder.h>
 #include <TNL/Containers/ndarray/Subarrays.h>
+#include <TNL/Containers/ndarray/Executors.h>
 #include <TNL/Containers/ndarray/Operations.h>
 namespace TNL {
diff --git a/src/TNL/Containers/ndarray/Executors.h b/src/TNL/Containers/ndarray/Executors.h
new file mode 100644
index 0000000000..ba37fe345a
--- /dev/null
+++ b/src/TNL/Containers/ndarray/Executors.h
@@ -0,0 +1,310 @@
+/***************************************************************************
+                          Executors.h  -  description
+                             -------------------
+    begin                : Dec 24, 2018
+    copyright            : (C) 2018 by Tomas Oberhuber et al.
+    email                :
+ ***************************************************************************/
+/* See Copyright Notice in tnl/Copyright */
+// Implemented by: Jakub Klinkovsky
+#pragma once
+#include <TNL/ParallelFor.h>
+#include <TNL/Containers/ndarray/Meta.h>
+#include <TNL/Containers/ndarray/SizesHolder.h>
+namespace TNL {
+namespace Containers {
+namespace __ndarray_impl {
+// Recursive step of the sequential executor: runs the loop for one level and
+// hands the remaining levels over to the executor of the next level.
+//
+// Template parameters:
+//   Permutation - compile-time sequence; level L loops over the dimension
+//                 get< L >( Permutation{} )
+//   LevelTag    - IndexTag carrying the current level (defaults to level 0,
+//                 which is therefore the outermost loop)
+template< typename Permutation,
+          typename LevelTag = IndexTag< 0 > >
+struct SequentialExecutor
+{
+   // begins, ends - holders queried via getSize< dim >() for the loop bounds;
+   //                the loop covers the half-open range [begin, end)
+   // f            - functor passed down unchanged to the next level
+   // indices      - loop indices already fixed by the enclosing levels
+   template< typename Begins,
+             typename Ends,
+             typename Func,
+             typename... Indices >
+   __cuda_callable__
+   void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices )
+   {
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
+      // executor responsible for the next (more deeply nested) level
+      SequentialExecutor< Permutation, IndexTag< LevelTag::value + 1 > > exec;
+      const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >();
+      const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >();
+      // the current index is appended, so the pack stays ordered by level
+      for( auto i = begin; i < end; i++ )
+         exec( begins, ends, f, std::forward< Indices >( indices )..., i );
+   }
+};
+// Terminal step of the sequential executor (level Permutation::size() - 1):
+// runs the innermost loop and invokes the functor for every index tuple.
+template< typename Permutation >
+struct SequentialExecutor< Permutation, IndexTag< Permutation::size() - 1 > >
+{
+   // begins, ends - holders queried via getSize< dim >() for the loop bounds;
+   //                the loop covers the half-open range [begin, end)
+   // f            - functor to be invoked
+   // indices      - indices of all enclosing levels; exactly
+   //                getDimension() - 1 of them are required here
+   template< typename Begins,
+             typename Ends,
+             typename Func,
+             typename... Indices >
+   __cuda_callable__
+   void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices )
+   {
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
+      static_assert( sizeof...(indices) == Begins::getDimension() - 1,
+                     "invalid number of indices in the final step of the SequentialExecutor" );
+      using LevelTag = IndexTag< Permutation::size() - 1 >;
+      const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >();
+      const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >();
+      // NOTE(review): call_with_unpermuted_arguments is defined elsewhere;
+      // presumably it maps the indices from traversal order back to the
+      // natural dimension order before calling f - confirm in Meta.h
+      for( auto i = begin; i < end; i++ )
+         call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i );
+   }
+};
+// Recursive step of the "right-to-left" sequential executor: the levels are
+// processed in descending order, i.e. the loop of the highest level is the
+// outermost one and level 0 is the innermost one.
+//
+// Template parameters:
+//   Permutation - compile-time sequence; level L loops over the dimension
+//                 get< L >( Permutation{} )
+//   LevelTag    - IndexTag carrying the current level (defaults to the last
+//                 level, Permutation::size() - 1)
+template< typename Permutation,
+          typename LevelTag = IndexTag< Permutation::size() - 1 > >
+struct SequentialExecutorRTL
+{
+   // begins, ends - holders queried via getSize< dim >() for the loop bounds;
+   //                the loop covers the half-open range [begin, end)
+   // f            - functor passed down unchanged to the next step
+   // indices      - indices of the higher levels that are already fixed
+   template< typename Begins,
+             typename Ends,
+             typename Func,
+             typename... Indices >
+   __cuda_callable__
+   void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices )
+   {
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
+      // executor responsible for the next lower level
+      SequentialExecutorRTL< Permutation, IndexTag< LevelTag::value - 1 > > exec;
+      const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >();
+      const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >();
+      // the current index is prepended, so the pack stays ordered by
+      // ascending level even though the levels are visited in descending order
+      for( auto i = begin; i < end; i++ )
+         exec( begins, ends, f, i, std::forward< Indices >( indices )... );
+   }
+};
+// Terminal step of the "right-to-left" sequential executor (level 0): runs
+// the innermost loop and invokes the functor for every index tuple.
+template< typename Permutation >
+struct SequentialExecutorRTL< Permutation, IndexTag< 0 > >
+{
+   // begins, ends - holders queried via getSize< dim >() for the loop bounds;
+   //                the loop covers the half-open range [begin, end)
+   // f            - functor to be invoked
+   // indices      - indices of levels 1 and above; exactly
+   //                getDimension() - 1 of them are required here
+   template< typename Begins,
+             typename Ends,
+             typename Func,
+             typename... Indices >
+   __cuda_callable__
+   void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices )
+   {
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
+      static_assert( sizeof...(indices) == Begins::getDimension() - 1,
+                     "invalid number of indices in the final step of the SequentialExecutorRTL" );
+      const auto begin = begins.template getSize< get< 0 >( Permutation{} ) >();
+      const auto end = ends.template getSize< get< 0 >( Permutation{} ) >();
+      // the level-0 index goes first, in front of the indices of higher levels
+      for( auto i = begin; i < end; i++ )
+         call_with_unpermuted_arguments< Permutation >( f, i, std::forward< Indices >( indices )... );
+   }
+};
+// Generic parallel dispatch: levels 0, 1 and 2 are distributed by
+// ParallelFor3D< Device >, all remaining levels (3 and above) are executed
+// sequentially inside the kernel by SequentialExecutor.
+//
+// The kernel lambda is not marked __cuda_callable__; Devices::Cuda is served
+// by a dedicated specialization of this template.
+template< typename Permutation,
+          typename Device >
+struct ParallelExecutorDeviceDispatch
+{
+   // begins, ends - holders queried via getSize< dim >() for the loop bounds
+   // f            - functor captured by value in the kernel
+   template< typename Begins,
+             typename Ends,
+             typename Func >
+   void operator()( const Begins& begins, const Ends& ends, Func f )
+   {
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
+      using Index = typename Ends::IndexType;
+      // the kernel receives the indices in the order (i2, i1, i0), matching
+      // the order of the bounds passed to ParallelFor3D below
+      auto kernel = [=] ( Index i2, Index i1, Index i0 )
+      {
+         // continue sequentially from level 3 with the first three indices fixed
+         SequentialExecutor< Permutation, IndexTag< 3 > > exec;
+         exec( begins, ends, f, i0, i1, i2 );
+      };
+      const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
+      const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
+      const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >();
+      const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
+      const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >();
+      const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >();
+      ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel );
+   }
+};
+// CUDA parallel dispatch: the last three levels (dimension - 3, dimension - 2
+// and dimension - 1) are distributed by ParallelFor3D< Devices::Cuda >, the
+// remaining lower levels are executed sequentially inside the kernel by
+// SequentialExecutorRTL, starting at level dimension - 4.
+//
+// NOTE(review): IndexTag< Begins::getDimension() - 4 > is only meaningful for
+// at least 4 dimensions; ParallelExecutor has dedicated specializations for
+// 1, 2 and 3 dimensions, so this is presumably never instantiated for fewer.
+template< typename Permutation >
+struct ParallelExecutorDeviceDispatch< Permutation, Devices::Cuda >
+{
+   // begins, ends - holders queried via getSize< dim >() for the loop bounds
+   // f            - functor captured by value in the kernel
+   template< typename Begins,
+             typename Ends,
+             typename Func >
+   void operator()( const Begins& begins, const Ends& ends, Func f )
+   {
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
+      using Index = typename Ends::IndexType;
+      // the kernel receives the indices in the order (i2, i1, i0), matching
+      // the order of the bounds passed to ParallelFor3D below
+      auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 )
+      {
+         // i0, i1, i2 are the indices of the last three levels; the RTL
+         // executor prepends the indices of the lower levels in front of them
+         SequentialExecutorRTL< Permutation, IndexTag< Begins::getDimension() - 4 > > exec;
+         exec( begins, ends, f, i0, i1, i2 );
+      };
+      const Index begin0 = begins.template getSize< get< Begins::getDimension() - 3 >( Permutation{} ) >();
+      const Index begin1 = begins.template getSize< get< Begins::getDimension() - 2 >( Permutation{} ) >();
+      const Index begin2 = begins.template getSize< get< Begins::getDimension() - 1 >( Permutation{} ) >();
+      const Index end0 = ends.template getSize< get< Ends::getDimension() - 3 >( Permutation{} ) >();
+      const Index end1 = ends.template getSize< get< Ends::getDimension() - 2 >( Permutation{} ) >();
+      const Index end2 = ends.template getSize< get< Ends::getDimension() - 1 >( Permutation{} ) >();
+      ParallelFor3D< Devices::Cuda >::exec( begin2, begin1, begin0, end2, end1, end0, kernel );
+   }
+};
+// Parallel executor, general case: selected whenever no specialization for
+// the given DimTag applies (specializations exist for IndexTag< 1 >,
+// IndexTag< 2 > and IndexTag< 3 >). The work is delegated to
+// ParallelExecutorDeviceDispatch.
+//
+// Template parameters:
+//   Permutation - compile-time sequence determining the traversal order
+//   Device      - device type forwarded to the dispatch
+//   DimTag      - IndexTag holding Permutation::size(); used only to select
+//                 a specialization
+template< typename Permutation,
+          typename Device,
+          typename DimTag = IndexTag< Permutation::size() > >
+struct ParallelExecutor
+{
+   // begins, ends - holders of the loop bounds, forwarded to the dispatch
+   // f            - functor forwarded to the dispatch
+   template< typename Begins,
+             typename Ends,
+             typename Func >
+   void operator()( const Begins& begins, const Ends& ends, Func f )
+   {
+      ParallelExecutorDeviceDispatch< Permutation, Device > dispatch;
+      dispatch( begins, ends, f );
+   }
+};
+// Parallel executor for DimTag = IndexTag< 3 >: all three levels are
+// distributed by ParallelFor3D< Device >, no sequential part remains.
+template< typename Permutation,
+          typename Device >
+struct ParallelExecutor< Permutation, Device, IndexTag< 3 > >
+{
+   // begins, ends - holders queried via getSize< dim >() for the loop bounds
+   // f            - functor captured by value in the kernel
+   template< typename Begins,
+             typename Ends,
+             typename Func >
+   void operator()( const Begins& begins, const Ends& ends, Func f )
+   {
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
+      using Index = typename Ends::IndexType;
+      // the kernel receives the indices in the order (i2, i1, i0), matching
+      // the order of the bounds passed to ParallelFor3D below
+      auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 )
+      {
+         call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
+      };
+      const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
+      const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
+      const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >();
+      const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
+      const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >();
+      const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >();
+      ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel );
+   }
+};
+// Parallel executor for DimTag = IndexTag< 2 >: both levels are distributed
+// by ParallelFor2D< Device >, no sequential part remains.
+template< typename Permutation,
+          typename Device >
+struct ParallelExecutor< Permutation, Device, IndexTag< 2 > >
+{
+   // begins, ends - holders queried via getSize< dim >() for the loop bounds
+   // f            - functor captured by value in the kernel
+   template< typename Begins,
+             typename Ends,
+             typename Func >
+   void operator()( const Begins& begins, const Ends& ends, Func f )
+   {
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
+      using Index = typename Ends::IndexType;
+      // the kernel receives the indices in the order (i1, i0), matching the
+      // order of the bounds passed to ParallelFor2D below
+      auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 )
+      {
+         call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
+      };
+      const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
+      const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
+      const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
+      const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >();
+      ParallelFor2D< Device >::exec( begin1, begin0, end1, end0, kernel );
+   }
+};
+// Parallel executor for DimTag = IndexTag< 1 >: the single level is
+// distributed by ParallelFor< Device > and the functor is passed to it
+// directly, without a wrapping kernel.
+template< typename Permutation,
+          typename Device >
+struct ParallelExecutor< Permutation, Device, IndexTag< 1 > >
+{
+   // begins, ends - holders queried via getSize< dim >() for the loop bounds
+   // f            - functor handed over to ParallelFor as the kernel
+   template< typename Begins,
+             typename Ends,
+             typename Func >
+   void operator()( const Begins& begins, const Ends& ends, Func f )
+   {
+      static_assert( Begins::getDimension() == Ends::getDimension(),
+                     "wrong begins or ends" );
+      using Index = typename Ends::IndexType;
+      // NOTE(review): the wrapper below is disabled and f is called directly;
+      // presumably this is equivalent because a permutation of a single
+      // index cannot reorder anything - confirm before removing the dead code
+//      auto kernel = [=] __cuda_callable__ ( Index i )
+//      {
+//         call_with_unpermuted_arguments< Permutation >( f, i );
+//      };
+      const Index begin = begins.template getSize< get< 0 >( Permutation{} ) >();
+      const Index end = ends.template getSize< get< 0 >( Permutation{} ) >();
+//      ParallelFor< Device >::exec( begin, end, kernel );
+      ParallelFor< Device >::exec( begin, end, f );
+   }
+};
+// Selects the executor according to the device. This primary template is the
+// fallback for every Device without a dedicated specialization and always
+// runs the SequentialExecutor.
+// Device may be void which stands for StaticNDArray
+template< typename Permutation,
+          typename Device >
+struct ExecutorDispatcher
+{
+   // begins, ends - holders of the loop bounds, forwarded to the executor
+   // f            - functor forwarded to the executor
+   template< typename Begins, typename Ends, typename Func >
+   void operator()( const Begins& begins, const Ends& ends, Func f )
+   {
+      SequentialExecutor< Permutation >()( begins, ends, f );
+   }
+};
+// Executor selection for Devices::Host: the ParallelExecutor is used only
+// when Devices::Host::isOMPEnabled() holds and
+// Devices::Host::getMaxThreadsCount() exceeds 1; otherwise the
+// SequentialExecutor is used.
+template< typename Permutation >
+struct ExecutorDispatcher< Permutation, Devices::Host >
+{
+   // begins, ends - holders of the loop bounds, forwarded to the executor
+   // f            - functor forwarded to the executor
+   template< typename Begins, typename Ends, typename Func >
+   void operator()( const Begins& begins, const Ends& ends, Func f )
+   {
+      // the decision is made at run time on every call
+      if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 )
+         ParallelExecutor< Permutation, Devices::Host >()( begins, ends, f );
+      else
+         SequentialExecutor< Permutation >()( begins, ends, f );
+   }
+};
+// Executor selection for Devices::Cuda: always uses the ParallelExecutor.
+template< typename Permutation >
+struct ExecutorDispatcher< Permutation, Devices::Cuda >
+{
+   // begins, ends - holders of the loop bounds, forwarded to the executor
+   // f            - functor forwarded to the executor
+   template< typename Begins, typename Ends, typename Func >
+   void operator()( const Begins& begins, const Ends& ends, Func f )
+   {
+      ParallelExecutor< Permutation, Devices::Cuda >()( begins, ends, f );
+   }
+};
+} // namespace __ndarray_impl
+} // namespace Containers
+} // namespace TNL
diff --git a/src/TNL/Containers/ndarray/Operations.h b/src/TNL/Containers/ndarray/Operations.h
index b1f793405d..eb219b6e01 100644
--- a/src/TNL/Containers/ndarray/Operations.h
+++ b/src/TNL/Containers/ndarray/Operations.h
@@ -12,300 +12,13 @@
 #pragma once
-#include <TNL/ParallelFor.h>
-#include <TNL/Containers/ndarray/Meta.h>
-#include <TNL/Containers/ndarray/SizesHolder.h>
+#include <TNL/Containers/ndarray/Executors.h>
 namespace TNL {
 namespace Containers {
 namespace __ndarray_impl {
-template< typename Permutation,
-          typename LevelTag = IndexTag< 0 > >
-struct SequentialExecutor
-   template< typename Begins,
-             typename Ends,
-             typename Func,
-             typename... Indices >
-   __cuda_callable__
-   void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices )
-   {
-      static_assert( Begins::getDimension() == Ends::getDimension(),
-                     "wrong begins or ends" );
-      SequentialExecutor< Permutation, IndexTag< LevelTag::value + 1 > > exec;
-      const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >();
-      const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >();
-      for( auto i = begin; i < end; i++ )
-         exec( begins, ends, f, std::forward< Indices >( indices )..., i );
-   }
-template< typename Permutation >
-struct SequentialExecutor< Permutation, IndexTag< Permutation::size() - 1 > >
-   template< typename Begins,
-             typename Ends,
-             typename Func,
-             typename... Indices >
-   __cuda_callable__
-   void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices )
-   {
-      static_assert( Begins::getDimension() == Ends::getDimension(),
-                     "wrong begins or ends" );
-      static_assert( sizeof...(indices) == Begins::getDimension() - 1,
-                     "invalid number of indices in the final step of the SequentialExecutor" );
-      using LevelTag = IndexTag< Permutation::size() - 1 >;
-      const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >();
-      const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >();
-      for( auto i = begin; i < end; i++ )
-         call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i );
-   }
-template< typename Permutation,
-          typename LevelTag = IndexTag< Permutation::size() - 1 > >
-struct SequentialExecutorRTL
-   template< typename Begins,
-             typename Ends,
-             typename Func,
-             typename... Indices >
-   __cuda_callable__
-   void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices )
-   {
-      static_assert( Begins::getDimension() == Ends::getDimension(),
-                     "wrong begins or ends" );
-      SequentialExecutorRTL< Permutation, IndexTag< LevelTag::value - 1 > > exec;
-      const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >();
-      const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >();
-      for( auto i = begin; i < end; i++ )
-         exec( begins, ends, f, i, std::forward< Indices >( indices )... );
-   }
-template< typename Permutation >
-struct SequentialExecutorRTL< Permutation, IndexTag< 0 > >
-   template< typename Begins,
-             typename Ends,
-             typename Func,
-             typename... Indices >
-   __cuda_callable__
-   void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices )
-   {
-      static_assert( Begins::getDimension() == Ends::getDimension(),
-                     "wrong begins or ends" );
-      static_assert( sizeof...(indices) == Begins::getDimension() - 1,
-                     "invalid number of indices in the final step of the SequentialExecutorRTL" );
-      const auto begin = begins.template getSize< get< 0 >( Permutation{} ) >();
-      const auto end = ends.template getSize< get< 0 >( Permutation{} ) >();
-      for( auto i = begin; i < end; i++ )
-         call_with_unpermuted_arguments< Permutation >( f, i, std::forward< Indices >( indices )... );
-   }
-template< typename Permutation,
-          typename Device >
-struct ParallelExecutorDeviceDispatch
-   template< typename Begins,
-             typename Ends,
-             typename Func >
-   void operator()( const Begins& begins, const Ends& ends, Func f )
-   {
-      static_assert( Begins::getDimension() == Ends::getDimension(),
-                     "wrong begins or ends" );
-      using Index = typename Ends::IndexType;
-      auto kernel = [=] ( Index i2, Index i1, Index i0 )
-      {
-         SequentialExecutor< Permutation, IndexTag< 3 > > exec;
-         exec( begins, ends, f, i0, i1, i2 );
-      };
-      const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
-      const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
-      const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >();
-      const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
-      const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >();
-      const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >();
-      ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel );
-   }
-template< typename Permutation >
-struct ParallelExecutorDeviceDispatch< Permutation, Devices::Cuda >
-   template< typename Begins,
-             typename Ends,
-             typename Func >
-   void operator()( const Begins& begins, const Ends& ends, Func f )
-   {
-      static_assert( Begins::getDimension() == Ends::getDimension(),
-                     "wrong begins or ends" );
-      using Index = typename Ends::IndexType;
-      auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 )
-      {
-         SequentialExecutorRTL< Permutation, IndexTag< Begins::getDimension() - 4 > > exec;
-         exec( begins, ends, f, i0, i1, i2 );
-      };
-      const Index begin0 = begins.template getSize< get< Begins::getDimension() - 3 >( Permutation{} ) >();
-      const Index begin1 = begins.template getSize< get< Begins::getDimension() - 2 >( Permutation{} ) >();
-      const Index begin2 = begins.template getSize< get< Begins::getDimension() - 1 >( Permutation{} ) >();
-      const Index end0 = ends.template getSize< get< Ends::getDimension() - 3 >( Permutation{} ) >();
-      const Index end1 = ends.template getSize< get< Ends::getDimension() - 2 >( Permutation{} ) >();
-      const Index end2 = ends.template getSize< get< Ends::getDimension() - 1 >( Permutation{} ) >();
-      ParallelFor3D< Devices::Cuda >::exec( begin2, begin1, begin0, end2, end1, end0, kernel );
-   }
-template< typename Permutation,
-          typename Device,
-          typename DimTag = IndexTag< Permutation::size() > >
-struct ParallelExecutor
-   template< typename Begins,
-             typename Ends,
-             typename Func >
-   void operator()( const Begins& begins, const Ends& ends, Func f )
-   {
-      ParallelExecutorDeviceDispatch< Permutation, Device > dispatch;
-      dispatch( begins, ends, f );
-   }
-template< typename Permutation,
-          typename Device >
-struct ParallelExecutor< Permutation, Device, IndexTag< 3 > >
-   template< typename Begins,
-             typename Ends,
-             typename Func >
-   void operator()( const Begins& begins, const Ends& ends, Func f )
-   {
-      static_assert( Begins::getDimension() == Ends::getDimension(),
-                     "wrong begins or ends" );
-      using Index = typename Ends::IndexType;
-      auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 )
-      {
-         call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
-      };
-      const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
-      const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
-      const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >();
-      const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
-      const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >();
-      const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >();
-      ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel );
-   }
-template< typename Permutation,
-          typename Device >
-struct ParallelExecutor< Permutation, Device, IndexTag< 2 > >
-   template< typename Begins,
-             typename Ends,
-             typename Func >
-   void operator()( const Begins& begins, const Ends& ends, Func f )
-   {
-      static_assert( Begins::getDimension() == Ends::getDimension(),
-                     "wrong begins or ends" );
-      using Index = typename Ends::IndexType;
-      auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 )
-      {
-         call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
-      };
-      const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
-      const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
-      const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
-      const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >();
-      ParallelFor2D< Device >::exec( begin1, begin0, end1, end0, kernel );
-   }
-template< typename Permutation,
-          typename Device >
-struct ParallelExecutor< Permutation, Device, IndexTag< 1 > >
-   template< typename Begins,
-             typename Ends,
-             typename Func >
-   void operator()( const Begins& begins, const Ends& ends, Func f )
-   {
-      static_assert( Begins::getDimension() == Ends::getDimension(),
-                     "wrong begins or ends" );
-      using Index = typename Ends::IndexType;
-//      auto kernel = [=] __cuda_callable__ ( Index i )
-//      {
-//         call_with_unpermuted_arguments< Permutation >( f, i );
-//      };
-      const Index begin = begins.template getSize< get< 0 >( Permutation{} ) >();
-      const Index end = ends.template getSize< get< 0 >( Permutation{} ) >();
-//      ParallelFor< Device >::exec( begin, end, kernel );
-      ParallelFor< Device >::exec( begin, end, f );
-   }
-// Device may be void which stands for StaticNDArray
-template< typename Permutation,
-          typename Device >
-struct ExecutorDispatcher
-   template< typename Begins, typename Ends, typename Func >
-   void operator()( const Begins& begins, const Ends& ends, Func f )
-   {
-      SequentialExecutor< Permutation >()( begins, ends, f );
-   }
-template< typename Permutation >
-struct ExecutorDispatcher< Permutation, Devices::Host >
-   template< typename Begins, typename Ends, typename Func >
-   void operator()( const Begins& begins, const Ends& ends, Func f )
-   {
-      if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 )
-         ParallelExecutor< Permutation, Devices::Host >()( begins, ends, f );
-      else
-         SequentialExecutor< Permutation >()( begins, ends, f );
-   }
-template< typename Permutation >
-struct ExecutorDispatcher< Permutation, Devices::Cuda >
-   template< typename Begins, typename Ends, typename Func >
-   void operator()( const Begins& begins, const Ends& ends, Func f )
-   {
-      ParallelExecutor< Permutation, Devices::Cuda >()( begins, ends, f );
-   }
 #ifndef __NVCC__
 template< typename Output,
           typename Func,