Commit f4d2f8c6 authored by Jakub Klinkovský, committed by Jakub Klinkovský
Browse files

NDArray: added forInternal method

- fixed executors for operations: use inverse permutation when calling
  the wrapped lambda function
- custom internal region can be specified with custom begins/ends
  multiindices
parent 0ef45b13
Loading
Loading
Loading
Loading
+23 −1
Original line number Diff line number Diff line
@@ -177,7 +177,29 @@ public:
   void forAll( Func f ) const
   {
      __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch;
      dispatch( sizes, f );
      using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >;
      dispatch( Begins{}, sizes, f );
   }

   template< typename Device2 = DeviceType, typename Func >
   void forInternal( Func f ) const
   {
      __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch;
      using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 1 >;
      // subtract static sizes
      using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type;
      // subtract dynamic sizes
      Ends ends;
      __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolder >::subtract( ends, sizes );
      dispatch( Begins{}, ends, f );
   }

   template< typename Device2 = DeviceType, typename Func, typename Begins, typename Ends >
   void forInternal( Func f, const Begins& begins, const Ends& ends ) const
   {
      // TODO: assert "begins <= sizes", "ends <= sizes"
      __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch;
      dispatch( begins, ends, f );
   }


+23 −1
Original line number Diff line number Diff line
@@ -231,7 +231,29 @@ public:
   void forAll( Func f ) const
   {
      __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch;
      dispatch( sizes, f );
      using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >;
      dispatch( Begins{}, sizes, f );
   }

   template< typename Device2 = DeviceType, typename Func >
   void forInternal( Func f ) const
   {
      __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch;
      using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 1 >;
      // subtract static sizes
      using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type;
      // subtract dynamic sizes
      Ends ends;
      __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolder >::subtract( ends, sizes );
      dispatch( Begins{}, ends, f );
   }

   template< typename Device2 = DeviceType, typename Func, typename Begins, typename Ends >
   void forInternal( Func f, const Begins& begins, const Ends& ends ) const
   {
      // TODO: assert "begins <= sizes", "ends <= sizes"
      __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch;
      dispatch( begins, ends, f );
   }

protected:
+30 −0
Original line number Diff line number Diff line
@@ -111,6 +111,36 @@ void setSizesHelper( SizesHolder& holder,
}


// Helper for the forInternal method.
//
// Walks the levels of the holders from `level` down to 0. For every level
// whose static size in SourceHolder is 0 (the size is set at run time), it
// stores "source size - ConstValue" into the target. Levels with a non-zero
// static size are left untouched here; forInternal encodes their reduced
// size in the TargetHolder type via SubtractedSizesHolder.
//
// ConstValue    the amount subtracted from each dynamic size
// TargetHolder  holder that receives the reduced dynamic sizes
// SourceHolder  holder the sizes are read from
// level         the highest level to process (defaults to the last one)
template< std::size_t ConstValue,
          typename TargetHolder,
          typename SourceHolder,
          std::size_t level = TargetHolder::getDimension() - 1 >
struct SetSizesSubtractHelper
{
   static_assert( TargetHolder::getDimension() == SourceHolder::getDimension(),
                  "the target and source holders must have the same dimension" );

   static void subtract( TargetHolder& target,
                         const SourceHolder& source )
   {
      // Subtract in the index type: mixing a signed index with the
      // std::size_t template parameter would promote the whole expression to
      // an unsigned type and rely on an implicit narrowing conversion.
      using Index = typename TargetHolder::IndexType;
      if( source.template getStaticSize< level >() == 0 )
         target.template setSize< level >( source.template getSize< level >() - static_cast< Index >( ConstValue ) );
      // continue with the next lower level
      SetSizesSubtractHelper< ConstValue, TargetHolder, SourceHolder, level - 1 >::subtract( target, source );
   }
};

// Level 0 terminates the recursion.
template< std::size_t ConstValue,
          typename TargetHolder,
          typename SourceHolder >
struct SetSizesSubtractHelper< ConstValue, TargetHolder, SourceHolder, 0 >
{
   static_assert( TargetHolder::getDimension() == SourceHolder::getDimension(),
                  "the target and source holders must have the same dimension" );

   static void subtract( TargetHolder& target,
                         const SourceHolder& source )
   {
      // same reasoning as in the primary template: stay in the index type
      using Index = typename TargetHolder::IndexType;
      if( source.template getStaticSize< 0 >() == 0 )
         target.template setSize< 0 >( source.template getSize< 0 >() - static_cast< Index >( ConstValue ) );
   }
};


// A variadic bounds-checker for indices
template< typename SizesHolder >
__cuda_callable__
+43 −2
Original line number Diff line number Diff line
@@ -93,13 +93,15 @@ is_in_sequence( Index value, std::integer_sequence< Index, vals... > )

// Get index of the first occurrence of value in a variadic pack.
// Terminating overload: only the searched value is passed and no candidate
// arguments remain, so the result is 0.
// NOTE(review): this span comes from a rendered diff; the one-line signature
// and the two-line signature below look like the removed and the added form
// of the same declaration - confirm which one the current file contains.
template< typename V >
constexpr std::size_t index_in_pack( V&& value )
constexpr std::size_t
index_in_pack( V&& value )
{
   return 0;
}

template< typename V, typename T, typename... Ts >
constexpr std::size_t index_in_pack( V&& value, T&& arg, Ts&&... args )
constexpr std::size_t
index_in_pack( V&& value, T&& arg, Ts&&... args )
{
   if( value == arg )
      return 0;
@@ -196,6 +198,45 @@ auto call_with_permuted_arguments( Func f, Args&&... args ) -> decltype(auto)
}


// Implementation detail of call_with_unpermuted_arguments.
// The primary template is intentionally empty; only the specialization for
// std::index_sequence below provides apply().
template< typename Permutation,
          typename Sequence >
struct CallInversePermutationHelper
{};

// Specialization unpacking the parameter positions N... of the callee.
template< typename Permutation,
          std::size_t... N >
struct CallInversePermutationHelper< Permutation, std::index_sequence< N... > >
{
   // Calls f with one argument per position N. The argument for position N is
   // taken from args at the index returned by
   // index_in_sequence( N, Permutation{} ) - presumably the position at which
   // the value N occurs in Permutation, i.e. the inverse permutation (the
   // helper is not visible here; confirm).
   // Returns whatever f returns (decltype(auto)).
   template< typename Func,
             typename... Args >
   __cuda_callable__
   static auto apply( Func&& f, Args&&... args ) -> decltype(auto)
   {
      // the outer "..." expands over N, the inner one forwards the whole pack
      // to get_from_pack, which selects a single element of it
      return std::forward< Func >( f )( get_from_pack<
                  index_in_sequence( N, Permutation{} )
                >( std::forward< Args >( args )... )... );
   }
};

// Call the specified function with inversely permuted arguments: for every
// parameter position N of f, CallInversePermutationHelper selects the argument
// found at the index given by index_in_sequence( N, Permutation{} ).
// [used in ndarray_operations.h]
//
// FIXME: the perfectly-forwarding variant below does not compile with nvcc 10.0,
// so the functor is taken by value instead.
//auto call_with_unpermuted_arguments( Func&& f, Args&&... args ) -> decltype(auto)
//{
//   return CallInversePermutationHelper< Permutation, std::make_index_sequence< sizeof...( Args ) > >
//          ::apply( std::forward< Func >( f ), std::forward< Args >( args )... );
//}
template< typename Permutation,
          typename Func,
          typename... Args >
__cuda_callable__
auto call_with_unpermuted_arguments( Func f, Args&&... args ) -> decltype(auto)
{
   // one position per passed argument: 0, 1, ..., sizeof...( Args ) - 1
   using Positions = std::make_index_sequence< sizeof...( Args ) >;
   using Helper = CallInversePermutationHelper< Permutation, Positions >;
   return Helper::apply( f, std::forward< Args >( args )... );
}


// Check that all elements of the initializer list are equal to the specified value.
// [used in ndarray_operations.h]
constexpr bool
+149 −83
Original line number Diff line number Diff line
@@ -15,6 +15,7 @@
#include <TNL/ParallelFor.h>

#include <TNL/Containers/ndarray/Meta.h>
#include <TNL/Containers/ndarray/SizesHolder.h>

namespace TNL {
namespace Containers {
@@ -25,34 +26,45 @@ template< typename Permutation,
          typename LevelTag = IndexTag< 0 > >
struct SequentialExecutor
{
   template< typename SizesHolder,
   template< typename Begins,
             typename Ends,
             typename Func,
             typename... Indices >
   __cuda_callable__
   void operator()( const SizesHolder& sizes, Func f, Indices&&... indices )
   void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      SequentialExecutor< Permutation, IndexTag< LevelTag::value + 1 > > exec;
      const auto size = sizes.template getSize< get< LevelTag::value >( Permutation{} ) >();
      for( typename SizesHolder::IndexType i = 0; i < size; i++ )
         exec( sizes, f, std::forward< Indices >( indices )..., i );
      const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >();
      const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >();
      for( auto i = begin; i < end; i++ )
         exec( begins, ends, f, std::forward< Indices >( indices )..., i );
   }
};

// Terminal specialization of SequentialExecutor for the last level
// (Permutation::size() - 1): runs the innermost loop and invokes f once per
// iteration with the collected indices followed by the loop index.
// NOTE(review): this span comes from a rendered diff in which removed and
// added lines are interleaved. The lines referring to SizesHolder / sizes /
// size and to call_with_permuted_arguments appear to be the removed
// pre-change version - confirm against the repository.
template< typename Permutation >
struct SequentialExecutor< Permutation, IndexTag< Permutation::size() - 1 > >
{
   template< typename SizesHolder,
   template< typename Begins,
             typename Ends,
             typename Func,
             typename... Indices >
   __cuda_callable__
   void operator()( const SizesHolder& sizes, Func f, Indices&&... indices )
   void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices )
   {
      static_assert( sizeof...(indices) == SizesHolder::getDimension() - 1,
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );
      static_assert( sizeof...(indices) == Begins::getDimension() - 1,
                     "invalid number of indices in the final step of the SequentialExecutor" );

      const auto size = sizes.template getSize< get< SizesHolder::getDimension() - 1 >( Permutation{} ) >();
      for( typename SizesHolder::IndexType i = 0; i < size; i++ )
         call_with_permuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i );
      using LevelTag = IndexTag< Permutation::size() - 1 >;

      // bounds of the axis that the permutation assigns to the last level
      const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >();
      const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >();
      // half-open range [begin, end); the indices were collected in loop
      // order, so they are passed through the inverse permutation before
      // reaching f
      for( auto i = begin; i < end; i++ )
         call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i );
   }
};

@@ -61,34 +73,43 @@ template< typename Permutation,
          typename LevelTag = IndexTag< Permutation::size() - 1 > >
struct SequentialExecutorRTL
{
   template< typename SizesHolder,
   template< typename Begins,
             typename Ends,
             typename Func,
             typename... Indices >
   __cuda_callable__
   void operator()( const SizesHolder& sizes, Func f, Indices&&... indices )
   void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      SequentialExecutorRTL< Permutation, IndexTag< LevelTag::value - 1 > > exec;
      const auto size = sizes.template getSize< get< LevelTag::value >( Permutation{} ) >();
      for( typename SizesHolder::IndexType i = 0; i < size; i++ )
         exec( sizes, f, i, std::forward< Indices >( indices )... );
      const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >();
      const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >();
      for( auto i = begin; i < end; i++ )
         exec( begins, ends, f, i, std::forward< Indices >( indices )... );
   }
};

// Terminal specialization of SequentialExecutorRTL for level 0: runs the
// last remaining loop and invokes f once per iteration with the loop index
// placed in front of the already collected indices.
// NOTE(review): this span comes from a rendered diff in which removed and
// added lines are interleaved. The lines referring to SizesHolder / sizes /
// size and to call_with_permuted_arguments appear to be the removed
// pre-change version - confirm against the repository.
template< typename Permutation >
struct SequentialExecutorRTL< Permutation, IndexTag< 0 > >
{
   template< typename SizesHolder,
   template< typename Begins,
             typename Ends,
             typename Func,
             typename... Indices >
   __cuda_callable__
   void operator()( const SizesHolder& sizes, Func f, Indices&&... indices )
   void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices )
   {
      static_assert( sizeof...(indices) == SizesHolder::getDimension() - 1,
                     "invalid number of indices in the final step of the SequentialExecutor" );
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );
      static_assert( sizeof...(indices) == Begins::getDimension() - 1,
                     "invalid number of indices in the final step of the SequentialExecutorRTL" );

      const auto size = sizes.template getSize< get< 0 >( Permutation{} ) >();
      for( typename SizesHolder::IndexType i = 0; i < size; i++ )
         call_with_permuted_arguments< Permutation >( f, i, std::forward< Indices >( indices )... );
      // bounds of the axis that the permutation assigns to level 0
      const auto begin = begins.template getSize< get< 0 >( Permutation{} ) >();
      const auto end = ends.template getSize< get< 0 >( Permutation{} ) >();
      // half-open range [begin, end); arguments go through the inverse
      // permutation before reaching f
      for( auto i = begin; i < end; i++ )
         call_with_unpermuted_arguments< Permutation >( f, i, std::forward< Indices >( indices )... );
   }
};

@@ -97,42 +118,58 @@ template< typename Permutation,
          typename Device >
struct ParallelExecutorDeviceDispatch
{
   template< typename SizesHolder, typename Func >
   void operator()( const SizesHolder& sizes, Func f )
   template< typename Begins,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins, const Ends& ends, Func f )
   {
      using Index = typename SizesHolder::IndexType;
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      using Index = typename Ends::IndexType;

      auto kernel = [=] ( Index i2, Index i1, Index i0 )
      {
         SequentialExecutor< Permutation, IndexTag< 3 > > exec;
         exec( sizes, f, i0, i1, i2 );
         exec( begins, ends, f, i0, i1, i2 );
      };

      const Index size0 = sizes.template getSize< get< 0 >( Permutation{} ) >();
      const Index size1 = sizes.template getSize< get< 1 >( Permutation{} ) >();
      const Index size2 = sizes.template getSize< get< 2 >( Permutation{} ) >();
      ParallelFor3D< Device >::exec( (Index) 0, (Index) 0, (Index) 0, size2, size1, size0, kernel );
      const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
      const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
      const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >();
      const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
      const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >();
      const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >();
      ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel );
   }
};

// Specialization of ParallelExecutorDeviceDispatch for Devices::Cuda.
// The three highest levels (dimension - 3, dimension - 2, dimension - 1) are
// handed to ParallelFor3D; the remaining levels are processed inside the
// kernel by SequentialExecutorRTL, starting at level dimension - 4.
// NOTE(review): this span comes from a rendered diff in which removed and
// added lines are interleaved. The lines referring to SizesHolder / sizes /
// size0..size2 appear to be the removed pre-change version - confirm against
// the repository.
template< typename Permutation >
struct ParallelExecutorDeviceDispatch< Permutation, Devices::Cuda >
{
   template< typename SizesHolder, typename Func >
   void operator()( const SizesHolder& sizes, Func f )
   template< typename Begins,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins, const Ends& ends, Func f )
   {
      using Index = typename SizesHolder::IndexType;
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      using Index = typename Ends::IndexType;

      // captures begins, ends and f by value ([=]); the kernel receives the
      // three indices in the order i2, i1, i0
      auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 )
      {
         SequentialExecutorRTL< Permutation, IndexTag< SizesHolder::getDimension() - 4 > > exec;
         exec( sizes, f, i0, i1, i2 );
         SequentialExecutorRTL< Permutation, IndexTag< Begins::getDimension() - 4 > > exec;
         exec( begins, ends, f, i0, i1, i2 );
      };

      const Index size0 = sizes.template getSize< get< SizesHolder::getDimension() - 3 >( Permutation{} ) >();
      const Index size1 = sizes.template getSize< get< SizesHolder::getDimension() - 2 >( Permutation{} ) >();
      const Index size2 = sizes.template getSize< get< SizesHolder::getDimension() - 1 >( Permutation{} ) >();
      ParallelFor3D< Devices::Cuda >::exec( (Index) 0, (Index) 0, (Index) 0, size2, size1, size0, kernel );
      // bounds of the three parallelized levels, looked up through the permutation
      const Index begin0 = begins.template getSize< get< Begins::getDimension() - 3 >( Permutation{} ) >();
      const Index begin1 = begins.template getSize< get< Begins::getDimension() - 2 >( Permutation{} ) >();
      const Index begin2 = begins.template getSize< get< Begins::getDimension() - 1 >( Permutation{} ) >();
      const Index end0 = ends.template getSize< get< Ends::getDimension() - 3 >( Permutation{} ) >();
      const Index end1 = ends.template getSize< get< Ends::getDimension() - 2 >( Permutation{} ) >();
      const Index end2 = ends.template getSize< get< Ends::getDimension() - 1 >( Permutation{} ) >();
      // the bounds are passed in the same 2, 1, 0 order as the kernel parameters
      ParallelFor3D< Devices::Cuda >::exec( begin2, begin1, begin0, end2, end1, end0, kernel );
   }
};

@@ -141,11 +178,13 @@ template< typename Permutation,
          typename DimTag = IndexTag< Permutation::size() > >
struct ParallelExecutor
{
   template< typename SizesHolder, typename Func >
   void operator()( const SizesHolder& sizes, Func f )
   template< typename Begins,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins, const Ends& ends, Func f )
   {
      ParallelExecutorDeviceDispatch< Permutation, Device > dispatch;
      dispatch( sizes, f );
      dispatch( begins, ends, f );
   }
};

@@ -153,20 +192,28 @@ template< typename Permutation,
          typename Device >
struct ParallelExecutor< Permutation, Device, IndexTag< 3 > >
{
   template< typename SizesHolder, typename Func >
   void operator()( const SizesHolder& sizes, Func f )
   template< typename Begins,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins, const Ends& ends, Func f )
   {
      using Index = typename SizesHolder::IndexType;
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      using Index = typename Ends::IndexType;

      auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 )
      {
         call_with_permuted_arguments< Permutation >( f, i0, i1, i2 );
         call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
      };

      const Index size0 = sizes.template getSize< get< 0 >( Permutation{} ) >();
      const Index size1 = sizes.template getSize< get< 1 >( Permutation{} ) >();
      const Index size2 = sizes.template getSize< get< 2 >( Permutation{} ) >();
      ParallelFor3D< Device >::exec( (Index) 0, (Index) 0, (Index) 0, size2, size1, size0, kernel );
      const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
      const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
      const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >();
      const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
      const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >();
      const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >();
      ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel );
   }
};

@@ -174,19 +221,26 @@ template< typename Permutation,
          typename Device >
struct ParallelExecutor< Permutation, Device, IndexTag< 2 > >
{
   template< typename SizesHolder, typename Func >
   void operator()( const SizesHolder& sizes, Func f )
   template< typename Begins,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins, const Ends& ends, Func f )
   {
      using Index = typename SizesHolder::IndexType;
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      using Index = typename Ends::IndexType;

      auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 )
      {
         call_with_permuted_arguments< Permutation >( f, i0, i1 );
         call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
      };

      const Index size0 = sizes.template getSize< get< 0 >( Permutation{} ) >();
      const Index size1 = sizes.template getSize< get< 1 >( Permutation{} ) >();
      ParallelFor2D< Device >::exec( (Index) 0, (Index) 0, size1, size0, kernel );
      const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
      const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
      const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
      const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >();
      ParallelFor2D< Device >::exec( begin1, begin0, end1, end0, kernel );
   }
};

@@ -194,18 +248,25 @@ template< typename Permutation,
          typename Device >
struct ParallelExecutor< Permutation, Device, IndexTag< 1 > >
{
   template< typename SizesHolder, typename Func >
   void operator()( const SizesHolder& sizes, Func f )
   template< typename Begins,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins, const Ends& ends, Func f )
   {
      using Index = typename SizesHolder::IndexType;
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      auto kernel = [=] __cuda_callable__ ( Index i )
      {
         call_with_permuted_arguments< Permutation >( f, i );
      };
      using Index = typename Ends::IndexType;

//      auto kernel = [=] __cuda_callable__ ( Index i )
//      {
//         call_with_unpermuted_arguments< Permutation >( f, i );
//      };

      const Index size = sizes.template getSize< get< 0 >( Permutation{} ) >();
      ParallelFor< Device >::exec( (Index) 0, size, kernel );
      const Index begin = begins.template getSize< get< 0 >( Permutation{} ) >();
      const Index end = ends.template getSize< get< 0 >( Permutation{} ) >();
//      ParallelFor< Device >::exec( begin, end, kernel );
      ParallelFor< Device >::exec( begin, end, f );
   }
};

@@ -215,33 +276,33 @@ template< typename Permutation,
          typename Device >
struct ExecutorDispatcher
{
   template< typename SizesHolder, typename Func >
   void operator()( const SizesHolder& sizes, Func f )
   template< typename Begins, typename Ends, typename Func >
   void operator()( const Begins& begins, const Ends& ends, Func f )
   {
      SequentialExecutor< Permutation >()( sizes, f );
      SequentialExecutor< Permutation >()( begins, ends, f );
   }
};

// Specialization of ExecutorDispatcher for Devices::Host: uses
// ParallelExecutor when Devices::Host::isOMPEnabled() is true and
// Devices::Host::getMaxThreadsCount() is greater than 1, otherwise falls back
// to SequentialExecutor.
// NOTE(review): this span comes from a rendered diff in which removed and
// added lines are interleaved. The lines taking a single SizesHolder / sizes
// argument appear to be the removed pre-change version - confirm against the
// repository.
template< typename Permutation >
struct ExecutorDispatcher< Permutation, Devices::Host >
{
   template< typename SizesHolder, typename Func >
   void operator()( const SizesHolder& sizes, Func f )
   template< typename Begins, typename Ends, typename Func >
   void operator()( const Begins& begins, const Ends& ends, Func f )
   {
      if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 )
         ParallelExecutor< Permutation, Devices::Host >()( sizes, f );
         ParallelExecutor< Permutation, Devices::Host >()( begins, ends, f );
      else
         SequentialExecutor< Permutation >()( sizes, f );
         SequentialExecutor< Permutation >()( begins, ends, f );
   }
};

// Specialization of ExecutorDispatcher for Devices::Cuda: always forwards to
// ParallelExecutor< Permutation, Devices::Cuda >.
// NOTE(review): this span comes from a rendered diff in which removed and
// added lines are interleaved. The lines taking a single SizesHolder / sizes
// argument appear to be the removed pre-change version - confirm against the
// repository.
template< typename Permutation >
struct ExecutorDispatcher< Permutation, Devices::Cuda >
{
   template< typename SizesHolder, typename Func >
   void operator()( const SizesHolder& sizes, Func f )
   template< typename Begins, typename Ends, typename Func >
   void operator()( const Begins& begins, const Ends& ends, Func f )
   {
      ParallelExecutor< Permutation, Devices::Cuda >()( sizes, f );
      ParallelExecutor< Permutation, Devices::Cuda >()( begins, ends, f );
   }
};

@@ -263,7 +324,8 @@ void nd_map_view( Output output, Func f, const Input... input )
   };

   ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch;
   dispatch( output.getSizes(), wrapper );
   using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >;
   dispatch( Begins{}, output.getSizes(), wrapper );
}

#else
@@ -362,7 +424,8 @@ void nd_map_view( Output output, Func f )
{
   nvcc_map_helper_0< Output, Func > wrapper( output, f );
   ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch;
   dispatch( output.getSizes(), wrapper );
   using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >;
   dispatch( Begins{}, output.getSizes(), wrapper );
}

template< typename Output,
@@ -375,7 +438,8 @@ void nd_map_view( Output output, Func f, const Input1 input1 )

   nvcc_map_helper_1< Output, Func, Input1 > wrapper( output, f, input1 );
   ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch;
   dispatch( output.getSizes(), wrapper );
   using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >;
   dispatch( Begins{}, output.getSizes(), wrapper );
}

template< typename Output,
@@ -389,7 +453,8 @@ void nd_map_view( Output output, Func f, const Input1 input1, const Input2 input

   nvcc_map_helper_2< Output, Func, Input1, Input2 > wrapper( output, f, input1, input2 );
   ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch;
   dispatch( output.getSizes(), wrapper );
   using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >;
   dispatch( Begins{}, output.getSizes(), wrapper );
}

template< typename Output,
@@ -404,7 +469,8 @@ void nd_map_view( Output output, Func f, const Input1 input1, const Input2 input

   nvcc_map_helper_3< Output, Func, Input1, Input2, Input3 > wrapper( output, f, input1, input2, input3 );
   ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch;
   dispatch( output.getSizes(), wrapper );
   using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >;
   dispatch( Begins{}, output.getSizes(), wrapper );
}

#endif
Loading