Loading src/TNL/Containers/NDArrayView.h +1 −0 Original line number Diff line number Diff line Loading @@ -15,6 +15,7 @@ #include <TNL/Containers/ndarray/Indexing.h> #include <TNL/Containers/ndarray/SizesHolder.h> #include <TNL/Containers/ndarray/Subarrays.h> #include <TNL/Containers/ndarray/Executors.h> #include <TNL/Containers/ndarray/Operations.h> namespace TNL { Loading src/TNL/Containers/ndarray/Executors.h 0 → 100644 +310 −0 Original line number Diff line number Diff line /*************************************************************************** Executors.h - description ------------------- begin : Dec 24, 2018 copyright : (C) 2018 by Tomas Oberhuber et al. email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ // Implemented by: Jakub Klinkovsky #pragma once #include <TNL/ParallelFor.h> #include <TNL/Containers/ndarray/Meta.h> #include <TNL/Containers/ndarray/SizesHolder.h> namespace TNL { namespace Containers { namespace __ndarray_impl { template< typename Permutation, typename LevelTag = IndexTag< 0 > > struct SequentialExecutor { template< typename Begins, typename Ends, typename Func, typename... Indices > __cuda_callable__ void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); SequentialExecutor< Permutation, IndexTag< LevelTag::value + 1 > > exec; const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); for( auto i = begin; i < end; i++ ) exec( begins, ends, f, std::forward< Indices >( indices )..., i ); } }; template< typename Permutation > struct SequentialExecutor< Permutation, IndexTag< Permutation::size() - 1 > > { template< typename Begins, typename Ends, typename Func, typename... Indices > __cuda_callable__ void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); static_assert( sizeof...(indices) == Begins::getDimension() - 1, "invalid number of indices in the final step of the SequentialExecutor" ); using LevelTag = IndexTag< Permutation::size() - 1 >; const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); for( auto i = begin; i < end; i++ ) call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i ); } }; template< typename Permutation, typename LevelTag = IndexTag< Permutation::size() - 1 > > struct SequentialExecutorRTL { template< typename Begins, typename Ends, typename Func, typename... Indices > __cuda_callable__ void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); SequentialExecutorRTL< Permutation, IndexTag< LevelTag::value - 1 > > exec; const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); for( auto i = begin; i < end; i++ ) exec( begins, ends, f, i, std::forward< Indices >( indices )... ); } }; template< typename Permutation > struct SequentialExecutorRTL< Permutation, IndexTag< 0 > > { template< typename Begins, typename Ends, typename Func, typename... Indices > __cuda_callable__ void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); static_assert( sizeof...(indices) == Begins::getDimension() - 1, "invalid number of indices in the final step of the SequentialExecutorRTL" ); const auto begin = begins.template getSize< get< 0 >( Permutation{} ) >(); const auto end = ends.template getSize< get< 0 >( Permutation{} ) >(); for( auto i = begin; i < end; i++ ) call_with_unpermuted_arguments< Permutation >( f, i, std::forward< Indices >( indices )... ); } }; template< typename Permutation, typename Device > struct ParallelExecutorDeviceDispatch { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); using Index = typename Ends::IndexType; auto kernel = [=] ( Index i2, Index i1, Index i0 ) { SequentialExecutor< Permutation, IndexTag< 3 > > exec; exec( begins, ends, f, i0, i1, i2 ); }; const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >(); const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >(); ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); } }; template< typename Permutation > struct ParallelExecutorDeviceDispatch< Permutation, Devices::Cuda > { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); using Index = typename Ends::IndexType; auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) { SequentialExecutorRTL< Permutation, IndexTag< Begins::getDimension() - 4 > > exec; exec( begins, ends, f, i0, i1, i2 ); }; const Index begin0 = begins.template getSize< get< Begins::getDimension() - 3 >( Permutation{} ) >(); const Index begin1 = begins.template getSize< get< Begins::getDimension() - 2 >( Permutation{} ) >(); const Index begin2 = begins.template getSize< get< Begins::getDimension() - 1 >( Permutation{} ) >(); const Index end0 = ends.template getSize< get< Ends::getDimension() - 3 >( Permutation{} ) >(); const Index end1 = ends.template getSize< get< Ends::getDimension() - 2 >( Permutation{} ) >(); const Index end2 = ends.template getSize< get< Ends::getDimension() - 1 >( Permutation{} ) >(); ParallelFor3D< Devices::Cuda >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); } }; template< typename Permutation, typename Device, typename DimTag = IndexTag< Permutation::size() > > struct ParallelExecutor { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { ParallelExecutorDeviceDispatch< Permutation, Device > dispatch; dispatch( begins, ends, f ); } }; template< typename Permutation, typename Device > struct ParallelExecutor< Permutation, Device, IndexTag< 3 > > { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); using Index = typename Ends::IndexType; auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) { call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); }; const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >(); const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >(); ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); } }; template< typename Permutation, typename Device > struct ParallelExecutor< Permutation, Device, IndexTag< 2 > > { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); using Index = typename Ends::IndexType; auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 ) { call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); }; const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); ParallelFor2D< Device >::exec( begin1, begin0, end1, end0, kernel ); } }; template< typename Permutation, typename Device > struct ParallelExecutor< Permutation, Device, IndexTag< 1 > > { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); using Index = typename Ends::IndexType; // auto kernel = [=] __cuda_callable__ ( Index i ) // { // call_with_unpermuted_arguments< Permutation >( f, i ); // }; const Index begin = begins.template getSize< get< 0 >( Permutation{} ) >(); const Index end = ends.template getSize< get< 0 >( Permutation{} ) >(); // ParallelFor< Device >::exec( begin, end, kernel ); ParallelFor< Device >::exec( begin, end, f ); } }; // Device may be void which stands for StaticNDArray template< typename Permutation, typename Device > struct ExecutorDispatcher { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { SequentialExecutor< Permutation >()( begins, ends, f ); } }; template< typename Permutation > struct ExecutorDispatcher< Permutation, Devices::Host > { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) ParallelExecutor< Permutation, Devices::Host >()( begins, ends, f ); else SequentialExecutor< Permutation >()( begins, ends, f ); } }; template< typename Permutation > struct ExecutorDispatcher< Permutation, Devices::Cuda > { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { ParallelExecutor< Permutation, Devices::Cuda >()( begins, ends, f ); } }; } // namespace __ndarray_impl } // namespace Containers } // namespace TNL src/TNL/Containers/ndarray/Operations.h +1 −288 Original line number Diff line number Diff line Loading @@ -12,300 +12,13 @@ #pragma once #include <TNL/ParallelFor.h> #include <TNL/Containers/ndarray/Meta.h> #include <TNL/Containers/ndarray/SizesHolder.h> #include <TNL/Containers/ndarray/Executors.h> namespace TNL { namespace Containers { namespace __ndarray_impl { template< typename Permutation, typename LevelTag = IndexTag< 0 > > struct SequentialExecutor { template< typename Begins, typename Ends, typename Func, typename... Indices > __cuda_callable__ void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); SequentialExecutor< Permutation, IndexTag< LevelTag::value + 1 > > exec; const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); for( auto i = begin; i < end; i++ ) exec( begins, ends, f, std::forward< Indices >( indices )..., i ); } }; template< typename Permutation > struct SequentialExecutor< Permutation, IndexTag< Permutation::size() - 1 > > { template< typename Begins, typename Ends, typename Func, typename... Indices > __cuda_callable__ void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); static_assert( sizeof...(indices) == Begins::getDimension() - 1, "invalid number of indices in the final step of the SequentialExecutor" ); using LevelTag = IndexTag< Permutation::size() - 1 >; const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); for( auto i = begin; i < end; i++ ) call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i ); } }; template< typename Permutation, typename LevelTag = IndexTag< Permutation::size() - 1 > > struct SequentialExecutorRTL { template< typename Begins, typename Ends, typename Func, typename... Indices > __cuda_callable__ void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); SequentialExecutorRTL< Permutation, IndexTag< LevelTag::value - 1 > > exec; const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); for( auto i = begin; i < end; i++ ) exec( begins, ends, f, i, std::forward< Indices >( indices )... ); } }; template< typename Permutation > struct SequentialExecutorRTL< Permutation, IndexTag< 0 > > { template< typename Begins, typename Ends, typename Func, typename... Indices > __cuda_callable__ void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); static_assert( sizeof...(indices) == Begins::getDimension() - 1, "invalid number of indices in the final step of the SequentialExecutorRTL" ); const auto begin = begins.template getSize< get< 0 >( Permutation{} ) >(); const auto end = ends.template getSize< get< 0 >( Permutation{} ) >(); for( auto i = begin; i < end; i++ ) call_with_unpermuted_arguments< Permutation >( f, i, std::forward< Indices >( indices )... ); } }; template< typename Permutation, typename Device > struct ParallelExecutorDeviceDispatch { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); using Index = typename Ends::IndexType; auto kernel = [=] ( Index i2, Index i1, Index i0 ) { SequentialExecutor< Permutation, IndexTag< 3 > > exec; exec( begins, ends, f, i0, i1, i2 ); }; const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >(); const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >(); ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); } }; template< typename Permutation > struct ParallelExecutorDeviceDispatch< Permutation, Devices::Cuda > { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); using Index = typename Ends::IndexType; auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) { SequentialExecutorRTL< Permutation, IndexTag< Begins::getDimension() - 4 > > exec; exec( begins, ends, f, i0, i1, i2 ); }; const Index begin0 = begins.template getSize< get< Begins::getDimension() - 3 >( Permutation{} ) >(); const Index begin1 = begins.template getSize< get< Begins::getDimension() - 2 >( Permutation{} ) >(); const Index begin2 = begins.template getSize< get< Begins::getDimension() - 1 >( Permutation{} ) >(); const Index end0 = ends.template getSize< get< Ends::getDimension() - 3 >( Permutation{} ) >(); const Index end1 = ends.template getSize< get< Ends::getDimension() - 2 >( Permutation{} ) >(); const Index end2 = ends.template getSize< get< Ends::getDimension() - 1 >( Permutation{} ) >(); ParallelFor3D< Devices::Cuda >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); } }; template< typename Permutation, typename Device, typename DimTag = IndexTag< Permutation::size() > > struct ParallelExecutor { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { ParallelExecutorDeviceDispatch< Permutation, Device > dispatch; dispatch( begins, ends, f ); } }; template< typename Permutation, typename Device > struct ParallelExecutor< Permutation, Device, IndexTag< 3 > > { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); using Index = typename Ends::IndexType; auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) { call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); }; const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >(); const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >(); ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); } }; template< typename Permutation, typename Device > struct ParallelExecutor< Permutation, Device, IndexTag< 2 > > { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); using Index = typename Ends::IndexType; auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 ) { call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); }; const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); ParallelFor2D< Device >::exec( begin1, begin0, end1, end0, kernel ); } }; template< typename Permutation, typename Device > struct ParallelExecutor< Permutation, Device, IndexTag< 1 > > { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); using Index = typename Ends::IndexType; // auto kernel = [=] __cuda_callable__ ( Index i ) // { // call_with_unpermuted_arguments< Permutation >( f, i ); // }; const Index begin = begins.template getSize< get< 0 >( Permutation{} ) >(); const Index end = ends.template getSize< get< 0 >( Permutation{} ) >(); // ParallelFor< Device >::exec( begin, end, kernel ); ParallelFor< Device >::exec( begin, end, f ); } }; // Device may be void which stands for StaticNDArray template< typename Permutation, typename Device > struct ExecutorDispatcher { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { SequentialExecutor< Permutation >()( begins, ends, f ); } }; template< typename Permutation > struct ExecutorDispatcher< Permutation, Devices::Host > { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) ParallelExecutor< Permutation, Devices::Host >()( begins, ends, f ); else SequentialExecutor< Permutation >()( begins, ends, f ); } }; template< typename Permutation > struct ExecutorDispatcher< Permutation, Devices::Cuda > { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { ParallelExecutor< Permutation, Devices::Cuda >()( begins, ends, f ); } }; #ifndef __NVCC__ template< typename Output, typename Func, Loading Loading
src/TNL/Containers/NDArrayView.h +1 −0 Original line number Diff line number Diff line Loading @@ -15,6 +15,7 @@ #include <TNL/Containers/ndarray/Indexing.h> #include <TNL/Containers/ndarray/SizesHolder.h> #include <TNL/Containers/ndarray/Subarrays.h> #include <TNL/Containers/ndarray/Executors.h> #include <TNL/Containers/ndarray/Operations.h> namespace TNL { Loading
src/TNL/Containers/ndarray/Executors.h 0 → 100644 +310 −0 Original line number Diff line number Diff line /*************************************************************************** Executors.h - description ------------------- begin : Dec 24, 2018 copyright : (C) 2018 by Tomas Oberhuber et al. email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ // Implemented by: Jakub Klinkovsky #pragma once #include <TNL/ParallelFor.h> #include <TNL/Containers/ndarray/Meta.h> #include <TNL/Containers/ndarray/SizesHolder.h> namespace TNL { namespace Containers { namespace __ndarray_impl { template< typename Permutation, typename LevelTag = IndexTag< 0 > > struct SequentialExecutor { template< typename Begins, typename Ends, typename Func, typename... Indices > __cuda_callable__ void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); SequentialExecutor< Permutation, IndexTag< LevelTag::value + 1 > > exec; const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); for( auto i = begin; i < end; i++ ) exec( begins, ends, f, std::forward< Indices >( indices )..., i ); } }; template< typename Permutation > struct SequentialExecutor< Permutation, IndexTag< Permutation::size() - 1 > > { template< typename Begins, typename Ends, typename Func, typename... Indices > __cuda_callable__ void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); static_assert( sizeof...(indices) == Begins::getDimension() - 1, "invalid number of indices in the final step of the SequentialExecutor" ); using LevelTag = IndexTag< Permutation::size() - 1 >; const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); for( auto i = begin; i < end; i++ ) call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i ); } }; template< typename Permutation, typename LevelTag = IndexTag< Permutation::size() - 1 > > struct SequentialExecutorRTL { template< typename Begins, typename Ends, typename Func, typename... Indices > __cuda_callable__ void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); SequentialExecutorRTL< Permutation, IndexTag< LevelTag::value - 1 > > exec; const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); for( auto i = begin; i < end; i++ ) exec( begins, ends, f, i, std::forward< Indices >( indices )... ); } }; template< typename Permutation > struct SequentialExecutorRTL< Permutation, IndexTag< 0 > > { template< typename Begins, typename Ends, typename Func, typename... Indices > __cuda_callable__ void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); static_assert( sizeof...(indices) == Begins::getDimension() - 1, "invalid number of indices in the final step of the SequentialExecutorRTL" ); const auto begin = begins.template getSize< get< 0 >( Permutation{} ) >(); const auto end = ends.template getSize< get< 0 >( Permutation{} ) >(); for( auto i = begin; i < end; i++ ) call_with_unpermuted_arguments< Permutation >( f, i, std::forward< Indices >( indices )... ); } }; template< typename Permutation, typename Device > struct ParallelExecutorDeviceDispatch { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); using Index = typename Ends::IndexType; auto kernel = [=] ( Index i2, Index i1, Index i0 ) { SequentialExecutor< Permutation, IndexTag< 3 > > exec; exec( begins, ends, f, i0, i1, i2 ); }; const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >(); const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >(); ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); } }; template< typename Permutation > struct ParallelExecutorDeviceDispatch< Permutation, Devices::Cuda > { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); using Index = typename Ends::IndexType; auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) { SequentialExecutorRTL< Permutation, IndexTag< Begins::getDimension() - 4 > > exec; exec( begins, ends, f, i0, i1, i2 ); }; const Index begin0 = begins.template getSize< get< Begins::getDimension() - 3 >( Permutation{} ) >(); const Index begin1 = begins.template getSize< get< Begins::getDimension() - 2 >( Permutation{} ) >(); const Index begin2 = begins.template getSize< get< Begins::getDimension() - 1 >( Permutation{} ) >(); const Index end0 = ends.template getSize< get< Ends::getDimension() - 3 >( Permutation{} ) >(); const Index end1 = ends.template getSize< get< Ends::getDimension() - 2 >( Permutation{} ) >(); const Index end2 = ends.template getSize< get< Ends::getDimension() - 1 >( Permutation{} ) >(); ParallelFor3D< Devices::Cuda >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); } }; template< typename Permutation, typename Device, typename DimTag = IndexTag< Permutation::size() > > struct ParallelExecutor { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { ParallelExecutorDeviceDispatch< Permutation, Device > dispatch; dispatch( begins, ends, f ); } }; template< typename Permutation, typename Device > struct ParallelExecutor< Permutation, Device, IndexTag< 3 > > { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); using Index = typename Ends::IndexType; auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) { call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); }; const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >(); const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >(); ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); } }; template< typename Permutation, typename Device > struct ParallelExecutor< Permutation, Device, IndexTag< 2 > > { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); using Index = typename Ends::IndexType; auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 ) { call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); }; const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); ParallelFor2D< Device >::exec( begin1, begin0, end1, end0, kernel ); } }; template< typename Permutation, typename Device > struct ParallelExecutor< Permutation, Device, IndexTag< 1 > > { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); using Index = typename Ends::IndexType; // auto kernel = [=] __cuda_callable__ ( Index i ) // { // call_with_unpermuted_arguments< Permutation >( f, i ); // }; const Index begin = begins.template getSize< get< 0 >( Permutation{} ) >(); const Index end = ends.template getSize< get< 0 >( Permutation{} ) >(); // ParallelFor< Device >::exec( begin, end, kernel ); ParallelFor< Device >::exec( begin, end, f ); } }; // Device may be void which stands for StaticNDArray template< typename Permutation, typename Device > struct ExecutorDispatcher { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { SequentialExecutor< Permutation >()( begins, ends, f ); } }; template< typename Permutation > struct ExecutorDispatcher< Permutation, Devices::Host > { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) ParallelExecutor< Permutation, Devices::Host >()( begins, ends, f ); else SequentialExecutor< Permutation >()( begins, ends, f ); } }; template< typename Permutation > struct ExecutorDispatcher< Permutation, Devices::Cuda > { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { ParallelExecutor< Permutation, Devices::Cuda >()( begins, ends, f ); } }; } // namespace __ndarray_impl } // namespace Containers } // namespace TNL
src/TNL/Containers/ndarray/Operations.h +1 −288 Original line number Diff line number Diff line Loading @@ -12,300 +12,13 @@ #pragma once #include <TNL/ParallelFor.h> #include <TNL/Containers/ndarray/Meta.h> #include <TNL/Containers/ndarray/SizesHolder.h> #include <TNL/Containers/ndarray/Executors.h> namespace TNL { namespace Containers { namespace __ndarray_impl { template< typename Permutation, typename LevelTag = IndexTag< 0 > > struct SequentialExecutor { template< typename Begins, typename Ends, typename Func, typename... Indices > __cuda_callable__ void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); SequentialExecutor< Permutation, IndexTag< LevelTag::value + 1 > > exec; const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); for( auto i = begin; i < end; i++ ) exec( begins, ends, f, std::forward< Indices >( indices )..., i ); } }; template< typename Permutation > struct SequentialExecutor< Permutation, IndexTag< Permutation::size() - 1 > > { template< typename Begins, typename Ends, typename Func, typename... Indices > __cuda_callable__ void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); static_assert( sizeof...(indices) == Begins::getDimension() - 1, "invalid number of indices in the final step of the SequentialExecutor" ); using LevelTag = IndexTag< Permutation::size() - 1 >; const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); for( auto i = begin; i < end; i++ ) call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i ); } }; template< typename Permutation, typename LevelTag = IndexTag< Permutation::size() - 1 > > struct SequentialExecutorRTL { template< typename Begins, typename Ends, typename Func, typename... Indices > __cuda_callable__ void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); SequentialExecutorRTL< Permutation, IndexTag< LevelTag::value - 1 > > exec; const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); for( auto i = begin; i < end; i++ ) exec( begins, ends, f, i, std::forward< Indices >( indices )... ); } }; template< typename Permutation > struct SequentialExecutorRTL< Permutation, IndexTag< 0 > > { template< typename Begins, typename Ends, typename Func, typename... Indices > __cuda_callable__ void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); static_assert( sizeof...(indices) == Begins::getDimension() - 1, "invalid number of indices in the final step of the SequentialExecutorRTL" ); const auto begin = begins.template getSize< get< 0 >( Permutation{} ) >(); const auto end = ends.template getSize< get< 0 >( Permutation{} ) >(); for( auto i = begin; i < end; i++ ) call_with_unpermuted_arguments< Permutation >( f, i, std::forward< Indices >( indices )... ); } }; template< typename Permutation, typename Device > struct ParallelExecutorDeviceDispatch { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); using Index = typename Ends::IndexType; auto kernel = [=] ( Index i2, Index i1, Index i0 ) { SequentialExecutor< Permutation, IndexTag< 3 > > exec; exec( begins, ends, f, i0, i1, i2 ); }; const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >(); const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >(); ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); } }; template< typename Permutation > struct ParallelExecutorDeviceDispatch< Permutation, Devices::Cuda > { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); using Index = typename Ends::IndexType; auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) { SequentialExecutorRTL< Permutation, IndexTag< Begins::getDimension() - 4 > > exec; exec( begins, ends, f, i0, i1, i2 ); }; const Index begin0 = begins.template getSize< get< Begins::getDimension() - 3 >( Permutation{} ) >(); const Index begin1 = begins.template getSize< get< Begins::getDimension() - 2 >( Permutation{} ) >(); const Index begin2 = begins.template getSize< get< Begins::getDimension() - 1 >( Permutation{} ) >(); const Index end0 = ends.template getSize< get< Ends::getDimension() - 3 >( Permutation{} ) >(); const Index end1 = ends.template getSize< get< Ends::getDimension() - 2 >( Permutation{} ) >(); const Index end2 = ends.template getSize< get< Ends::getDimension() - 1 >( Permutation{} ) >(); ParallelFor3D< Devices::Cuda >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); } }; template< typename Permutation, typename Device, typename DimTag = IndexTag< Permutation::size() > > struct ParallelExecutor { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { ParallelExecutorDeviceDispatch< Permutation, Device > dispatch; dispatch( begins, ends, f ); } }; template< typename Permutation, typename Device > struct ParallelExecutor< Permutation, Device, IndexTag< 3 > > { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); using Index = typename Ends::IndexType; auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) { call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); }; const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >(); const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >(); ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); } }; template< typename Permutation, typename Device > struct ParallelExecutor< Permutation, Device, IndexTag< 2 > > { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); using Index = typename Ends::IndexType; auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 ) { call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); }; const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); ParallelFor2D< Device >::exec( begin1, begin0, end1, end0, kernel ); } }; template< typename Permutation, typename Device > struct ParallelExecutor< Permutation, Device, IndexTag< 1 > > { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); using Index = typename Ends::IndexType; // auto kernel = [=] __cuda_callable__ ( Index i ) // { // call_with_unpermuted_arguments< Permutation >( f, i ); // }; const Index begin = begins.template getSize< get< 0 >( Permutation{} ) >(); const Index end = ends.template getSize< get< 0 >( Permutation{} ) >(); // ParallelFor< Device >::exec( begin, end, kernel ); ParallelFor< Device >::exec( begin, end, f ); } }; // Device may be void which stands for StaticNDArray template< typename Permutation, typename Device > struct ExecutorDispatcher { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { SequentialExecutor< Permutation >()( begins, ends, f ); } }; template< typename Permutation > struct ExecutorDispatcher< Permutation, Devices::Host > { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) ParallelExecutor< Permutation, Devices::Host >()( begins, ends, f ); else SequentialExecutor< Permutation >()( begins, ends, f ); } }; template< typename Permutation > struct ExecutorDispatcher< Permutation, Devices::Cuda > { template< typename Begins, typename Ends, typename Func > void operator()( const Begins& begins, const Ends& ends, Func f ) { ParallelExecutor< Permutation, Devices::Cuda >()( begins, ends, f ); } }; #ifndef __NVCC__ template< typename Output, typename Func, Loading