Commit e4ba543b authored by Jakub Klinkovský
Browse files

Refactoring StaticFor

- used variadic templates to avoid code bloat
- added execHost static method to avoid nvcc warnings
parent 2b6cb697
Loading
Loading
Loading
Loading
+33 −141
Original line number Diff line number Diff line
@@ -10,17 +10,17 @@

#pragma once

#undef __INTEL_COMPILER
#include <utility>

#include <TNL/Devices/CudaCallable.h>

namespace TNL {

// Compile-time tag type carrying the constant `val` of type `IndexType`.
// `value` exposes the constant and `Decrement` names the tag for `val - 1`.
// NOTE: this span comes from a rendered diff, so the removed form
// (`class` / `public:` / `static const` / `typedef`) and the added form
// (`struct` / `static constexpr` / `using`) of the same members both appear.
template< typename IndexType, IndexType val >
class StaticForIndexTag
struct StaticForIndexTag
{
public:
   static const IndexType value = val;

   typedef StaticForIndexTag<IndexType, val - 1> Decrement;
   static constexpr IndexType value = val;
   using Decrement = StaticForIndexTag<IndexType, val - 1>;
};


@@ -28,175 +28,67 @@ template< typename IndexType,
          typename Begin,
          typename N,
          template< IndexType > class LoopBody >
class StaticForExecutor
{
   public:

   __cuda_callable__
   static void exec()
   {
      StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::exec();
      LoopBody< Begin::value + N::value - 1 >::exec();
   }

   template< typename T >
   __cuda_callable__
   static void exec( T& p )
   {
      StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::exec( p );
      LoopBody< Begin::value + N::value - 1 >::exec( p );
   }

   template< typename T0,
             typename T1 >
   __cuda_callable__
   static void exec( T0& p0, T1& p1 )
struct StaticForExecutor
{
      StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::exec( p0, p1 );
      LoopBody< Begin::value + N::value - 1 >::exec( p0, p1 );
   }

   template< typename T0,
             typename T1,
             typename T2 >
   // Recursive step of the compile-time loop (variadic replacement for the
   // fixed-arity overloads interleaved around it in this diff).
   // The recursion on N::Decrement runs first, then
   // LoopBody< Begin::value + N::value - 1 >::exec is called, so loop bodies
   // execute in increasing index order.
   // NOTE(review): the same argument pack is passed through std::forward to
   // both calls; an rvalue argument consumed by the first call would reach the
   // second call in a moved-from state -- confirm callers pass lvalues only.
   template< typename... Args >
   __cuda_callable__
   static void exec( T0& p0, T1& p1, T2& p2 )
   static void exec( Args&&... args )
   {
      StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::exec( p0, p1, p2 );
      LoopBody< Begin::value + N::value - 1 >::exec( p0, p1, p2 );
      StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::exec( std::forward< Args >( args )... );
      LoopBody< Begin::value + N::value - 1 >::exec( std::forward< Args >( args )... );
   }

   template< typename T0,
             typename T1,
             typename T2,
             typename T3 >
   __cuda_callable__
   static void exec( T0& p0, T1& p1, T2& p2, T3& p3 )
   // Same recursion as exec above, but declared without __cuda_callable__.
   // It recurses through execHost and still invokes LoopBody< ... >::exec.
   template< typename... Args >
   static void execHost( Args&&... args )
   {
      StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::exec( p0, p1, p2, p3 );
      LoopBody< Begin::value + N::value - 1 >::exec( p0, p1, p2, p3 );
      StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::execHost( std::forward< Args >( args )... );
      LoopBody< Begin::value + N::value - 1 >::exec( std::forward< Args >( args )... );
   }
};

// Partial specialization for N == StaticForIndexTag< IndexType, 0 >.
// It terminates the recursion of the primary template: every exec / execHost
// here has an empty body, so a zero-length loop invokes no LoopBody at all.
// NOTE: this span comes from a rendered diff; the removed fixed-arity exec
// overloads and the added variadic exec / execHost are interleaved below.
template< typename IndexType,
          typename Begin,
          template< IndexType > class LoopBody >
class StaticForExecutor< IndexType,
struct StaticForExecutor< IndexType,
                          Begin,
                          StaticForIndexTag< IndexType, 0 >,
                          LoopBody >
{
   public:

   __cuda_callable__
   static void exec() {}

   template< typename T >
   __cuda_callable__
   static void exec( T& p ) {}

   template< typename T0,
             typename T1 >
   template< typename... Args >
   __cuda_callable__
   static void exec( T0& p0, T1& p1 ) {}
   static void exec( Args&&... args )
   {}

   template< typename T0,
             typename T1,
             typename T2 >
   __cuda_callable__
   static void exec( T0& p0, T1& p1, T2& p2 ) {}

   template< typename T0,
             typename T1,
             typename T2,
             typename T3 >
   __cuda_callable__
   static void exec( T0& p0, T1& p1, T2& p2, T3& p3 ) {}
   template< typename... Args >
   static void execHost( Args&&... args )
   {}
};

// Entry point of the compile-time loop over the indices begin .. end - 1.
// exec and execHost delegate to StaticForExecutor with
// Begin = StaticForIndexTag< IndexType, begin > and
// N = StaticForIndexTag< IndexType, end - begin >, forwarding all arguments.
// NOTE: this span comes from a rendered diff; the removed fixed-arity
// overloads (with their __INTEL_COMPILER guards) and the added variadic
// exec / execHost are interleaved below.
template< typename IndexType,
          IndexType begin,
          IndexType end,
          template< IndexType > class LoopBody >
class StaticFor
struct StaticFor
{
   public:

   __cuda_callable__
   static void exec()
   {
#ifndef __INTEL_COMPILER
      StaticForExecutor< IndexType,
                         StaticForIndexTag< IndexType, begin >,
                         StaticForIndexTag< IndexType, end - begin >,
                         LoopBody >::exec();
#else
     TNL_ASSERT( false, );
#endif
   }

   template< typename T >
   __cuda_callable__
   static void exec( T &p )
   {
#ifndef __INTEL_COMPILER
      StaticForExecutor< IndexType,
                         StaticForIndexTag< IndexType, begin >,
                         StaticForIndexTag< IndexType, end - begin >,
                         LoopBody >::exec( p );
#else
     TNL_ASSERT( false, );
#endif
   }

   template< typename T0,
             typename T1 >
   __cuda_callable__
   static void exec( T0& p0, T1& p1 )
   {
#ifndef __INTEL_COMPILER
      StaticForExecutor< IndexType,
                         StaticForIndexTag< IndexType, begin >,
                         StaticForIndexTag< IndexType, end - begin >,
                         LoopBody >::exec( p0, p1 );
#else
     TNL_ASSERT( false, );
#endif
   }

   template< typename T0,
             typename T1,
             typename T2 >
   template< typename... Args >
   __cuda_callable__
   static void exec( T0& p0, T1& p1, T2& p2 )
   static void exec( Args&&... args )
   {
#ifndef __INTEL_COMPILER
      StaticForExecutor< IndexType,
                         StaticForIndexTag< IndexType, begin >,
                         StaticForIndexTag< IndexType, end - begin >,
                         LoopBody >::exec( p0, p1, p2 );
#else
     TNL_ASSERT( false, );
#endif
                         LoopBody >::exec( std::forward< Args >( args )... );
   }

   template< typename T0,
             typename T1,
             typename T2,
             typename T3 >
   __cuda_callable__
   static void exec( T0& p0, T1& p1, T2& p2, T3& p3 )
   // nvcc would complain if we wanted to call a host-only function from the __cuda_callable__ exec above,
   // so execHost is a separate entry point declared without __cuda_callable__.
   template< typename... Args >
   static void execHost( Args&&... args )
   {
#ifndef __INTEL_COMPILER
      StaticForExecutor< IndexType,
                         StaticForIndexTag< IndexType, begin >,
                         StaticForIndexTag< IndexType, end - begin >,
                         LoopBody >::exec( p0, p1, p2, p3 );
#else
     TNL_ASSERT( false, );
#endif
                         LoopBody >::execHost( std::forward< Args >( args )... );
   }
};


} // namespace TNL