From ffc002609b5d8e91446113161982473e477773ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Sun, 3 Feb 2019 16:33:48 +0100
Subject: [PATCH] NDArray: simplified executors for operations

---
 src/TNL/Containers/ndarray/Operations.h | 114 ++++++++----------------
 1 file changed, 36 insertions(+), 78 deletions(-)

diff --git a/src/TNL/Containers/ndarray/Operations.h b/src/TNL/Containers/ndarray/Operations.h
index 2462771d38..04c66a7f4e 100644
--- a/src/TNL/Containers/ndarray/Operations.h
+++ b/src/TNL/Containers/ndarray/Operations.h
@@ -90,86 +90,29 @@ struct SequentialExecutorRTL< Array, IndexTag< 0 > >
 
 
 template< typename Array,
-          typename DimTag = IndexTag< Array::getDimension() > >
-struct OpenMPExecutor
+          typename Device = typename Array::DeviceType >
+struct ParallelExecutorDeviceDispatch
 {
    template< typename Func >
    void operator()( const Array& array, Func f )
    {
-      SequentialExecutor< Array, IndexTag< 3 > > exec;
-
-      const auto size0 = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >();
-      const auto size1 = array.template getSize< get< 1 >( typename Array::PermutationType{} ) >();
-      const auto size2 = array.template getSize< get< 2 >( typename Array::PermutationType{} ) >();
-
-      #ifdef HAVE_OPENMP
-      #pragma omp parallel for collapse(3)
-      #endif
-      for( typename Array::IndexType i0 = 0; i0 < size0; i0++ )
-      for( typename Array::IndexType i1 = 0; i1 < size1; i1++ )
-      for( typename Array::IndexType i2 = 0; i2 < size2; i2++ )
-         exec( array, f, i0, i1, i2 );
-   }
-};
+      using Index = typename Array::IndexType;
 
-template< typename Array >
-struct OpenMPExecutor< Array, IndexTag< 3 > >
-{
-   template< typename Func >
-   void operator()( const Array& array, Func f )
-   {
-      const auto size0 = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >();
-      const auto size1 = array.template getSize< get< 1 >( typename Array::PermutationType{} ) >();
-      const auto size2 = array.template getSize< get< 2 >( typename Array::PermutationType{} ) >();
-
-      #ifdef HAVE_OPENMP
-      #pragma omp parallel for collapse(2)
-      #endif
-      for( typename Array::IndexType i0 = 0; i0 < size0; i0++ )
-      for( typename Array::IndexType i1 = 0; i1 < size1; i1++ )
-      for( typename Array::IndexType i2 = 0; i2 < size2; i2++ )
-         call_with_permuted_arguments< typename Array::PermutationType >( f, i0, i1, i2 );
-   }
-};
+      auto kernel = [=] ( Index i2, Index i1, Index i0 )
+      {
+         SequentialExecutor< Array, IndexTag< 3 > > exec;
+         exec( array, f, i0, i1, i2 );
+      };
 
-template< typename Array >
-struct OpenMPExecutor< Array, IndexTag< 2 > >
-{
-   template< typename Func >
-   void operator()( const Array& array, Func f )
-   {
-      const auto size0 = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >();
-      const auto size1 = array.template getSize< get< 1 >( typename Array::PermutationType{} ) >();
-
-      #ifdef HAVE_OPENMP
-      #pragma omp parallel for
-      #endif
-      for( typename Array::IndexType i0 = 0; i0 < size0; i0++ )
-      for( typename Array::IndexType i1 = 0; i1 < size1; i1++ )
-         call_with_permuted_arguments< typename Array::PermutationType >( f, i0, i1 );
+      const Index size0 = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >();
+      const Index size1 = array.template getSize< get< 1 >( typename Array::PermutationType{} ) >();
+      const Index size2 = array.template getSize< get< 2 >( typename Array::PermutationType{} ) >();
+      ParallelFor3D< Device >::exec( (Index) 0, (Index) 0, (Index) 0, size2, size1, size0, kernel );
    }
 };
 
 template< typename Array >
-struct OpenMPExecutor< Array, IndexTag< 1 > >
-{
-   template< typename Func >
-   void operator()( const Array& array, Func f )
-   {
-      const auto size0 = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >();
-
-      #ifdef HAVE_OPENMP
-      #pragma omp parallel for
-      #endif
-      for( typename Array::IndexType i0 = 0; i0 < size0; i0++ )
-         call_with_permuted_arguments< typename Array::PermutationType >( f, i0 );
-   }
-};
-
-
-template< typename Array,
-          typename DimTag = IndexTag< Array::getDimension() > >
-struct CudaExecutor
+struct ParallelExecutorDeviceDispatch< Array, Devices::Cuda >
 {
    template< typename Func >
    void operator()( const Array& array, Func f )
@@ -189,12 +132,25 @@ struct CudaExecutor
    }
 };
 
+template< typename Array,
+          typename DimTag = IndexTag< Array::getDimension() > >
+struct ParallelExecutor
+{
+   template< typename Func >
+   void operator()( const Array& array, Func f )
+   {
+      ParallelExecutorDeviceDispatch< Array > dispatch;
+      dispatch( array, f );
+   }
+};
+
 template< typename Array >
-struct CudaExecutor< Array, IndexTag< 3 > >
+struct ParallelExecutor< Array, IndexTag< 3 > >
 {
    template< typename Func >
    void operator()( const Array& array, Func f )
    {
+      using Device = typename Array::DeviceType;
       using Index = typename Array::IndexType;
 
       auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 )
@@ -205,16 +161,17 @@ struct CudaExecutor< Array, IndexTag< 3 > >
       const Index size0 = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >();
       const Index size1 = array.template getSize< get< 1 >( typename Array::PermutationType{} ) >();
       const Index size2 = array.template getSize< get< 2 >( typename Array::PermutationType{} ) >();
-      ParallelFor3D< Devices::Cuda >::exec( (Index) 0, (Index) 0, (Index) 0, size2, size1, size0, kernel );
+      ParallelFor3D< Device >::exec( (Index) 0, (Index) 0, (Index) 0, size2, size1, size0, kernel );
    }
 };
 
 template< typename Array >
-struct CudaExecutor< Array, IndexTag< 2 > >
+struct ParallelExecutor< Array, IndexTag< 2 > >
 {
    template< typename Func >
    void operator()( const Array& array, Func f )
    {
+      using Device = typename Array::DeviceType;
       using Index = typename Array::IndexType;
 
       auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 )
@@ -224,16 +181,17 @@ struct CudaExecutor< Array, IndexTag< 2 > >
 
       const Index size0 = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >();
       const Index size1 = array.template getSize< get< 1 >( typename Array::PermutationType{} ) >();
-      ParallelFor2D< Devices::Cuda >::exec( (Index) 0, (Index) 0, size1, size0, kernel );
+      ParallelFor2D< Device >::exec( (Index) 0, (Index) 0, size1, size0, kernel );
    }
 };
 
 template< typename Array >
-struct CudaExecutor< Array, IndexTag< 1 > >
+struct ParallelExecutor< Array, IndexTag< 1 > >
 {
    template< typename Func >
    void operator()( const Array& array, Func f )
    {
+      using Device = typename Array::DeviceType;
       using Index = typename Array::IndexType;
 
       auto kernel = [=] __cuda_callable__ ( Index i )
@@ -242,7 +200,7 @@ struct CudaExecutor< Array, IndexTag< 1 > >
       };
 
       const Index size = array.template getSize< get< 0 >( typename Array::PermutationType{} ) >();
-      ParallelFor< Devices::Cuda >::exec( (Index) 0, size, kernel );
+      ParallelFor< Device >::exec( (Index) 0, size, kernel );
    }
 };
 
@@ -265,7 +223,7 @@ struct ExecutorDispatcher< Array, Devices::Host >
    void operator()( const Array& array, Func f )
    {
       if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 )
-         OpenMPExecutor< Array >()( array, f );
+         ParallelExecutor< Array >()( array, f );
       else
          SequentialExecutor< Array >()( array, f );
    }
@@ -277,7 +235,7 @@ struct ExecutorDispatcher< Array, Devices::Cuda >
    template< typename Func >
    void operator()( const Array& array, Func f )
    {
-      CudaExecutor< Array >()( array, f );
+      ParallelExecutor< Array >()( array, f );
    }
 };
 
-- 
GitLab