From 061263a6412a51314282418be75e3ac9b61b9307 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz> Date: Sat, 9 Feb 2019 15:17:39 +0100 Subject: [PATCH] NDArray: split implementation of executors into a separate header file --- src/TNL/Containers/NDArrayView.h | 1 + src/TNL/Containers/ndarray/Executors.h | 310 ++++++++++++++++++++++++ src/TNL/Containers/ndarray/Operations.h | 289 +--------------------- 3 files changed, 312 insertions(+), 288 deletions(-) create mode 100644 src/TNL/Containers/ndarray/Executors.h diff --git a/src/TNL/Containers/NDArrayView.h b/src/TNL/Containers/NDArrayView.h index cafaabe9b2..50119eda42 100644 --- a/src/TNL/Containers/NDArrayView.h +++ b/src/TNL/Containers/NDArrayView.h @@ -15,6 +15,7 @@ #include <TNL/Containers/ndarray/Indexing.h> #include <TNL/Containers/ndarray/SizesHolder.h> #include <TNL/Containers/ndarray/Subarrays.h> +#include <TNL/Containers/ndarray/Executors.h> #include <TNL/Containers/ndarray/Operations.h> namespace TNL { diff --git a/src/TNL/Containers/ndarray/Executors.h b/src/TNL/Containers/ndarray/Executors.h new file mode 100644 index 0000000000..ba37fe345a --- /dev/null +++ b/src/TNL/Containers/ndarray/Executors.h @@ -0,0 +1,310 @@ +/*************************************************************************** + Executors.h - description + ------------------- + begin : Dec 24, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include <TNL/ParallelFor.h> + +#include <TNL/Containers/ndarray/Meta.h> +#include <TNL/Containers/ndarray/SizesHolder.h> + +namespace TNL { +namespace Containers { +namespace __ndarray_impl { + +template< typename Permutation, + typename LevelTag = IndexTag< 0 > > +struct SequentialExecutor +{ + template< typename Begins, + typename Ends, + typename Func, + typename... Indices > + __cuda_callable__ + void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) + { + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + + SequentialExecutor< Permutation, IndexTag< LevelTag::value + 1 > > exec; + const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); + const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); + for( auto i = begin; i < end; i++ ) + exec( begins, ends, f, std::forward< Indices >( indices )..., i ); + } +}; + +template< typename Permutation > +struct SequentialExecutor< Permutation, IndexTag< Permutation::size() - 1 > > +{ + template< typename Begins, + typename Ends, + typename Func, + typename... Indices > + __cuda_callable__ + void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) + { + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + static_assert( sizeof...(indices) == Begins::getDimension() - 1, + "invalid number of indices in the final step of the SequentialExecutor" ); + + using LevelTag = IndexTag< Permutation::size() - 1 >; + + const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); + const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); + for( auto i = begin; i < end; i++ ) + call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i ); + } +}; + + +template< typename Permutation, + typename LevelTag = IndexTag< Permutation::size() - 1 > > +struct SequentialExecutorRTL +{ + template< typename Begins, + typename Ends, + typename Func, + typename... Indices > + __cuda_callable__ + void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) + { + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + + SequentialExecutorRTL< Permutation, IndexTag< LevelTag::value - 1 > > exec; + const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); + const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); + for( auto i = begin; i < end; i++ ) + exec( begins, ends, f, i, std::forward< Indices >( indices )... ); + } +}; + +template< typename Permutation > +struct SequentialExecutorRTL< Permutation, IndexTag< 0 > > +{ + template< typename Begins, + typename Ends, + typename Func, + typename... Indices > + __cuda_callable__ + void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) + { + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + static_assert( sizeof...(indices) == Begins::getDimension() - 1, + "invalid number of indices in the final step of the SequentialExecutorRTL" ); + + const auto begin = begins.template getSize< get< 0 >( Permutation{} ) >(); + const auto end = ends.template getSize< get< 0 >( Permutation{} ) >(); + for( auto i = begin; i < end; i++ ) + call_with_unpermuted_arguments< Permutation >( f, i, std::forward< Indices >( indices )... ); + } +}; + + +template< typename Permutation, + typename Device > +struct ParallelExecutorDeviceDispatch +{ + template< typename Begins, + typename Ends, + typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) + { + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + + using Index = typename Ends::IndexType; + + auto kernel = [=] ( Index i2, Index i1, Index i0 ) + { + SequentialExecutor< Permutation, IndexTag< 3 > > exec; + exec( begins, ends, f, i0, i1, i2 ); + }; + + const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); + const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); + const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >(); + const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); + const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); + const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >(); + ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); + } +}; + +template< typename Permutation > +struct ParallelExecutorDeviceDispatch< Permutation, Devices::Cuda > +{ + template< typename Begins, + typename Ends, + typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) + { + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + + using Index = typename Ends::IndexType; + + auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) + { + SequentialExecutorRTL< Permutation, IndexTag< Begins::getDimension() - 4 > > exec; + exec( begins, ends, f, i0, i1, i2 ); + }; + + const Index begin0 = begins.template getSize< get< Begins::getDimension() - 3 >( Permutation{} ) >(); + const Index begin1 = begins.template getSize< get< Begins::getDimension() - 2 >( Permutation{} ) >(); + const Index begin2 = begins.template getSize< get< Begins::getDimension() - 1 >( Permutation{} ) >(); + const Index end0 = ends.template getSize< get< Ends::getDimension() - 3 >( Permutation{} ) >(); + const Index end1 = ends.template getSize< get< Ends::getDimension() - 2 >( Permutation{} ) >(); + const Index end2 = ends.template getSize< get< Ends::getDimension() - 1 >( Permutation{} ) >(); + ParallelFor3D< Devices::Cuda >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); + } +}; + +template< typename Permutation, + typename Device, + typename DimTag = IndexTag< Permutation::size() > > +struct ParallelExecutor +{ + template< typename Begins, + typename Ends, + typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) + { + ParallelExecutorDeviceDispatch< Permutation, Device > dispatch; + dispatch( begins, ends, f ); + } +}; + +template< typename Permutation, + typename Device > +struct ParallelExecutor< Permutation, Device, IndexTag< 3 > > +{ + template< typename Begins, + typename Ends, + typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) + { + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + + using Index = typename Ends::IndexType; + + auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) + { + call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); + }; + + const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); + const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); + const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >(); + const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); + const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); + const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >(); + ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); + } +}; + +template< typename Permutation, + typename Device > +struct ParallelExecutor< Permutation, Device, IndexTag< 2 > > +{ + template< typename Begins, + typename Ends, + typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) + { + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + + using Index = typename Ends::IndexType; + + auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 ) + { + call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); + }; + + const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); + const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); + const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); + const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); + ParallelFor2D< Device >::exec( begin1, begin0, end1, end0, kernel ); + } +}; + +template< typename Permutation, + typename Device > +struct ParallelExecutor< Permutation, Device, IndexTag< 1 > > +{ + template< typename Begins, + typename Ends, + typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) + { + static_assert( Begins::getDimension() == Ends::getDimension(), + "wrong begins or ends" ); + + using Index = typename Ends::IndexType; + +// auto kernel = [=] __cuda_callable__ ( Index i ) +// { +// call_with_unpermuted_arguments< Permutation >( f, i ); +// }; + + const Index begin = begins.template getSize< get< 0 >( Permutation{} ) >(); + const Index end = ends.template getSize< get< 0 >( Permutation{} ) >(); +// ParallelFor< Device >::exec( begin, end, kernel ); + ParallelFor< Device >::exec( begin, end, f ); + } +}; + + +// Device may be void which stands for StaticNDArray +template< typename Permutation, + typename Device > +struct ExecutorDispatcher +{ + template< typename Begins, typename Ends, typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) + { + SequentialExecutor< Permutation >()( begins, ends, f ); + } +}; + +template< typename Permutation > +struct ExecutorDispatcher< Permutation, Devices::Host > +{ + template< typename Begins, typename Ends, typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) + { + if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) + ParallelExecutor< Permutation, Devices::Host >()( begins, ends, f ); + else + SequentialExecutor< Permutation >()( begins, ends, f ); + } +}; + +template< typename Permutation > +struct ExecutorDispatcher< Permutation, Devices::Cuda > +{ + template< typename Begins, typename Ends, typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) + { + ParallelExecutor< Permutation, Devices::Cuda >()( begins, ends, f ); + } +}; + +} // namespace __ndarray_impl +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/ndarray/Operations.h b/src/TNL/Containers/ndarray/Operations.h index b1f793405d..eb219b6e01 100644 --- a/src/TNL/Containers/ndarray/Operations.h +++ b/src/TNL/Containers/ndarray/Operations.h @@ -12,300 +12,13 @@ #pragma once -#include <TNL/ParallelFor.h> - -#include <TNL/Containers/ndarray/Meta.h> -#include <TNL/Containers/ndarray/SizesHolder.h> +#include <TNL/Containers/ndarray/Executors.h> namespace TNL { namespace Containers { namespace __ndarray_impl { -template< typename Permutation, - typename LevelTag = IndexTag< 0 > > -struct SequentialExecutor -{ - template< typename Begins, - typename Ends, - typename Func, - typename... Indices > - __cuda_callable__ - void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) - { - static_assert( Begins::getDimension() == Ends::getDimension(), - "wrong begins or ends" ); - - SequentialExecutor< Permutation, IndexTag< LevelTag::value + 1 > > exec; - const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); - const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); - for( auto i = begin; i < end; i++ ) - exec( begins, ends, f, std::forward< Indices >( indices )..., i ); - } -}; - -template< typename Permutation > -struct SequentialExecutor< Permutation, IndexTag< Permutation::size() - 1 > > -{ - template< typename Begins, - typename Ends, - typename Func, - typename... Indices > - __cuda_callable__ - void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) - { - static_assert( Begins::getDimension() == Ends::getDimension(), - "wrong begins or ends" ); - static_assert( sizeof...(indices) == Begins::getDimension() - 1, - "invalid number of indices in the final step of the SequentialExecutor" ); - - using LevelTag = IndexTag< Permutation::size() - 1 >; - - const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); - const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); - for( auto i = begin; i < end; i++ ) - call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i ); - } -}; - - -template< typename Permutation, - typename LevelTag = IndexTag< Permutation::size() - 1 > > -struct SequentialExecutorRTL -{ - template< typename Begins, - typename Ends, - typename Func, - typename... Indices > - __cuda_callable__ - void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) - { - static_assert( Begins::getDimension() == Ends::getDimension(), - "wrong begins or ends" ); - - SequentialExecutorRTL< Permutation, IndexTag< LevelTag::value - 1 > > exec; - const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >(); - const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >(); - for( auto i = begin; i < end; i++ ) - exec( begins, ends, f, i, std::forward< Indices >( indices )... ); - } -}; - -template< typename Permutation > -struct SequentialExecutorRTL< Permutation, IndexTag< 0 > > -{ - template< typename Begins, - typename Ends, - typename Func, - typename... Indices > - __cuda_callable__ - void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices ) - { - static_assert( Begins::getDimension() == Ends::getDimension(), - "wrong begins or ends" ); - static_assert( sizeof...(indices) == Begins::getDimension() - 1, - "invalid number of indices in the final step of the SequentialExecutorRTL" ); - - const auto begin = begins.template getSize< get< 0 >( Permutation{} ) >(); - const auto end = ends.template getSize< get< 0 >( Permutation{} ) >(); - for( auto i = begin; i < end; i++ ) - call_with_unpermuted_arguments< Permutation >( f, i, std::forward< Indices >( indices )... ); - } -}; - - -template< typename Permutation, - typename Device > -struct ParallelExecutorDeviceDispatch -{ - template< typename Begins, - typename Ends, - typename Func > - void operator()( const Begins& begins, const Ends& ends, Func f ) - { - static_assert( Begins::getDimension() == Ends::getDimension(), - "wrong begins or ends" ); - - using Index = typename Ends::IndexType; - - auto kernel = [=] ( Index i2, Index i1, Index i0 ) - { - SequentialExecutor< Permutation, IndexTag< 3 > > exec; - exec( begins, ends, f, i0, i1, i2 ); - }; - - const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); - const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); - const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >(); - const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); - const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); - const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >(); - ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); - } -}; - -template< typename Permutation > -struct ParallelExecutorDeviceDispatch< Permutation, Devices::Cuda > -{ - template< typename Begins, - typename Ends, - typename Func > - void operator()( const Begins& begins, const Ends& ends, Func f ) - { - static_assert( Begins::getDimension() == Ends::getDimension(), - "wrong begins or ends" ); - - using Index = typename Ends::IndexType; - - auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) - { - SequentialExecutorRTL< Permutation, IndexTag< Begins::getDimension() - 4 > > exec; - exec( begins, ends, f, i0, i1, i2 ); - }; - - const Index begin0 = begins.template getSize< get< Begins::getDimension() - 3 >( Permutation{} ) >(); - const Index begin1 = begins.template getSize< get< Begins::getDimension() - 2 >( Permutation{} ) >(); - const Index begin2 = begins.template getSize< get< Begins::getDimension() - 1 >( Permutation{} ) >(); - const Index end0 = ends.template getSize< get< Ends::getDimension() - 3 >( Permutation{} ) >(); - const Index end1 = ends.template getSize< get< Ends::getDimension() - 2 >( Permutation{} ) >(); - const Index end2 = ends.template getSize< get< Ends::getDimension() - 1 >( Permutation{} ) >(); - ParallelFor3D< Devices::Cuda >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); - } -}; - -template< typename Permutation, - typename Device, - typename DimTag = IndexTag< Permutation::size() > > -struct ParallelExecutor -{ - template< typename Begins, - typename Ends, - typename Func > - void operator()( const Begins& begins, const Ends& ends, Func f ) - { - ParallelExecutorDeviceDispatch< Permutation, Device > dispatch; - dispatch( begins, ends, f ); - } -}; - -template< typename Permutation, - typename Device > -struct ParallelExecutor< Permutation, Device, IndexTag< 3 > > -{ - template< typename Begins, - typename Ends, - typename Func > - void operator()( const Begins& begins, const Ends& ends, Func f ) - { - static_assert( Begins::getDimension() == Ends::getDimension(), - "wrong begins or ends" ); - - using Index = typename Ends::IndexType; - - auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) - { - call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); - }; - - const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); - const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); - const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >(); - const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); - const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); - const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >(); - ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel ); - } -}; - -template< typename Permutation, - typename Device > -struct ParallelExecutor< Permutation, Device, IndexTag< 2 > > -{ - template< typename Begins, - typename Ends, - typename Func > - void operator()( const Begins& begins, const Ends& ends, Func f ) - { - static_assert( Begins::getDimension() == Ends::getDimension(), - "wrong begins or ends" ); - - using Index = typename Ends::IndexType; - - auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 ) - { - call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); - }; - - const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); - const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); - const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); - const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); - ParallelFor2D< Device >::exec( begin1, begin0, end1, end0, kernel ); - } -}; - -template< typename Permutation, - typename Device > -struct ParallelExecutor< Permutation, Device, IndexTag< 1 > > -{ - template< typename Begins, - typename Ends, - typename Func > - void operator()( const Begins& begins, const Ends& ends, Func f ) - { - static_assert( Begins::getDimension() == Ends::getDimension(), - "wrong begins or ends" ); - - using Index = typename Ends::IndexType; - -// auto kernel = [=] __cuda_callable__ ( Index i ) -// { -// call_with_unpermuted_arguments< Permutation >( f, i ); -// }; - - const Index begin = begins.template getSize< get< 0 >( Permutation{} ) >(); - const Index end = ends.template getSize< get< 0 >( Permutation{} ) >(); -// ParallelFor< Device >::exec( begin, end, kernel ); - ParallelFor< Device >::exec( begin, end, f ); - } -}; - - -// Device may be void which stands for StaticNDArray -template< typename Permutation, - typename Device > -struct ExecutorDispatcher -{ - template< typename Begins, typename Ends, typename Func > - void operator()( const Begins& begins, const Ends& ends, Func f ) - { - SequentialExecutor< Permutation >()( begins, ends, f ); - } -}; - -template< typename Permutation > -struct ExecutorDispatcher< Permutation, Devices::Host > -{ - template< typename Begins, typename Ends, typename Func > - void operator()( const Begins& begins, const Ends& ends, Func f ) - { - if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) - ParallelExecutor< Permutation, Devices::Host >()( begins, ends, f ); - else - SequentialExecutor< Permutation >()( begins, ends, f ); - } -}; - -template< typename Permutation > -struct ExecutorDispatcher< Permutation, Devices::Cuda > -{ - template< typename Begins, typename Ends, typename Func > - void operator()( const Begins& begins, const Ends& ends, Func f ) - { - ParallelExecutor< Permutation, Devices::Cuda >()( begins, ends, f ); - } -}; - #ifndef __NVCC__ template< typename Output, typename Func, -- GitLab