Loading src/TNL/StaticFor.h +33 −141 Original line number Diff line number Diff line Loading @@ -10,17 +10,17 @@ #pragma once #undef __INTEL_COMPILER #include <utility> #include <TNL/Devices/CudaCallable.h> namespace TNL { template< typename IndexType, IndexType val > class StaticForIndexTag struct StaticForIndexTag { public: static const IndexType value = val; typedef StaticForIndexTag<IndexType, val - 1> Decrement; static constexpr IndexType value = val; using Decrement = StaticForIndexTag<IndexType, val - 1>; }; Loading @@ -28,175 +28,67 @@ template< typename IndexType, typename Begin, typename N, template< IndexType > class LoopBody > class StaticForExecutor { public: __cuda_callable__ static void exec() { StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::exec(); LoopBody< Begin::value + N::value - 1 >::exec(); } template< typename T > __cuda_callable__ static void exec( T& p ) { StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::exec( p ); LoopBody< Begin::value + N::value - 1 >::exec( p ); } template< typename T0, typename T1 > __cuda_callable__ static void exec( T0& p0, T1& p1 ) struct StaticForExecutor { StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::exec( p0, p1 ); LoopBody< Begin::value + N::value - 1 >::exec( p0, p1 ); } template< typename T0, typename T1, typename T2 > template< typename... Args > __cuda_callable__ static void exec( T0& p0, T1& p1, T2& p2 ) static void exec( Args&&... args ) { StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::exec( p0, p1, p2 ); LoopBody< Begin::value + N::value - 1 >::exec( p0, p1, p2 ); StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::exec( std::forward< Args >( args )... ); LoopBody< Begin::value + N::value - 1 >::exec( std::forward< Args >( args )... ); } template< typename T0, typename T1, typename T2, typename T3 > __cuda_callable__ static void exec( T0& p0, T1& p1, T2& p2, T3& p3 ) template< typename... Args > static void execHost( Args&&... args ) { StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::exec( p0, p1, p2, p3 ); LoopBody< Begin::value + N::value - 1 >::exec( p0, p1, p2, p3 ); StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::execHost( std::forward< Args >( args )... ); LoopBody< Begin::value + N::value - 1 >::exec( std::forward< Args >( args )... ); } }; template< typename IndexType, typename Begin, template< IndexType > class LoopBody > class StaticForExecutor< IndexType, struct StaticForExecutor< IndexType, Begin, StaticForIndexTag< IndexType, 0 >, LoopBody > { public: __cuda_callable__ static void exec() {} template< typename T > __cuda_callable__ static void exec( T& p ) {} template< typename T0, typename T1 > template< typename... Args > __cuda_callable__ static void exec( T0& p0, T1& p1 ) {} static void exec( Args&&... args ) {} template< typename T0, typename T1, typename T2 > __cuda_callable__ static void exec( T0& p0, T1& p1, T2& p2 ) {} template< typename T0, typename T1, typename T2, typename T3 > __cuda_callable__ static void exec( T0& p0, T1& p1, T2& p2, T3& p3 ) {} template< typename... Args > static void execHost( Args&&... args ) {} }; template< typename IndexType, IndexType begin, IndexType end, template< IndexType > class LoopBody > class StaticFor struct StaticFor { public: __cuda_callable__ static void exec() { #ifndef __INTEL_COMPILER StaticForExecutor< IndexType, StaticForIndexTag< IndexType, begin >, StaticForIndexTag< IndexType, end - begin >, LoopBody >::exec(); #else TNL_ASSERT( false, ); #endif } template< typename T > __cuda_callable__ static void exec( T &p ) { #ifndef __INTEL_COMPILER StaticForExecutor< IndexType, StaticForIndexTag< IndexType, begin >, StaticForIndexTag< IndexType, end - begin >, LoopBody >::exec( p ); #else TNL_ASSERT( false, ); #endif } template< typename T0, typename T1 > __cuda_callable__ static void exec( T0& p0, T1& p1 ) { #ifndef __INTEL_COMPILER StaticForExecutor< IndexType, StaticForIndexTag< IndexType, begin >, StaticForIndexTag< IndexType, end - begin >, LoopBody >::exec( p0, p1 ); #else TNL_ASSERT( false, ); #endif } template< typename T0, typename T1, typename T2 > template< typename... Args > __cuda_callable__ static void exec( T0& p0, T1& p1, T2& p2 ) static void exec( Args&&... args ) { #ifndef __INTEL_COMPILER StaticForExecutor< IndexType, StaticForIndexTag< IndexType, begin >, StaticForIndexTag< IndexType, end - begin >, LoopBody >::exec( p0, p1, p2 ); #else TNL_ASSERT( false, ); #endif LoopBody >::exec( std::forward< Args >( args )... ); } template< typename T0, typename T1, typename T2, typename T3 > __cuda_callable__ static void exec( T0& p0, T1& p1, T2& p2, T3& p3 ) // nvcc would complain if we wonted to call a host-only function from the __cuda_callable__ exec above template< typename... Args > static void execHost( Args&&... args ) { #ifndef __INTEL_COMPILER StaticForExecutor< IndexType, StaticForIndexTag< IndexType, begin >, StaticForIndexTag< IndexType, end - begin >, LoopBody >::exec( p0, p1, p2, p3 ); #else TNL_ASSERT( false, ); #endif LoopBody >::execHost( std::forward< Args >( args )... ); } }; } // namespace TNL Loading
src/TNL/StaticFor.h +33 −141 Original line number Diff line number Diff line Loading @@ -10,17 +10,17 @@ #pragma once #undef __INTEL_COMPILER #include <utility> #include <TNL/Devices/CudaCallable.h> namespace TNL { template< typename IndexType, IndexType val > class StaticForIndexTag struct StaticForIndexTag { public: static const IndexType value = val; typedef StaticForIndexTag<IndexType, val - 1> Decrement; static constexpr IndexType value = val; using Decrement = StaticForIndexTag<IndexType, val - 1>; }; Loading @@ -28,175 +28,67 @@ template< typename IndexType, typename Begin, typename N, template< IndexType > class LoopBody > class StaticForExecutor { public: __cuda_callable__ static void exec() { StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::exec(); LoopBody< Begin::value + N::value - 1 >::exec(); } template< typename T > __cuda_callable__ static void exec( T& p ) { StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::exec( p ); LoopBody< Begin::value + N::value - 1 >::exec( p ); } template< typename T0, typename T1 > __cuda_callable__ static void exec( T0& p0, T1& p1 ) struct StaticForExecutor { StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::exec( p0, p1 ); LoopBody< Begin::value + N::value - 1 >::exec( p0, p1 ); } template< typename T0, typename T1, typename T2 > template< typename... Args > __cuda_callable__ static void exec( T0& p0, T1& p1, T2& p2 ) static void exec( Args&&... args ) { StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::exec( p0, p1, p2 ); LoopBody< Begin::value + N::value - 1 >::exec( p0, p1, p2 ); StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::exec( std::forward< Args >( args )... ); LoopBody< Begin::value + N::value - 1 >::exec( std::forward< Args >( args )... ); } template< typename T0, typename T1, typename T2, typename T3 > __cuda_callable__ static void exec( T0& p0, T1& p1, T2& p2, T3& p3 ) template< typename... Args > static void execHost( Args&&... args ) { StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::exec( p0, p1, p2, p3 ); LoopBody< Begin::value + N::value - 1 >::exec( p0, p1, p2, p3 ); StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::execHost( std::forward< Args >( args )... ); LoopBody< Begin::value + N::value - 1 >::exec( std::forward< Args >( args )... ); } }; template< typename IndexType, typename Begin, template< IndexType > class LoopBody > class StaticForExecutor< IndexType, struct StaticForExecutor< IndexType, Begin, StaticForIndexTag< IndexType, 0 >, LoopBody > { public: __cuda_callable__ static void exec() {} template< typename T > __cuda_callable__ static void exec( T& p ) {} template< typename T0, typename T1 > template< typename... Args > __cuda_callable__ static void exec( T0& p0, T1& p1 ) {} static void exec( Args&&... args ) {} template< typename T0, typename T1, typename T2 > __cuda_callable__ static void exec( T0& p0, T1& p1, T2& p2 ) {} template< typename T0, typename T1, typename T2, typename T3 > __cuda_callable__ static void exec( T0& p0, T1& p1, T2& p2, T3& p3 ) {} template< typename... Args > static void execHost( Args&&... args ) {} }; template< typename IndexType, IndexType begin, IndexType end, template< IndexType > class LoopBody > class StaticFor struct StaticFor { public: __cuda_callable__ static void exec() { #ifndef __INTEL_COMPILER StaticForExecutor< IndexType, StaticForIndexTag< IndexType, begin >, StaticForIndexTag< IndexType, end - begin >, LoopBody >::exec(); #else TNL_ASSERT( false, ); #endif } template< typename T > __cuda_callable__ static void exec( T &p ) { #ifndef __INTEL_COMPILER StaticForExecutor< IndexType, StaticForIndexTag< IndexType, begin >, StaticForIndexTag< IndexType, end - begin >, LoopBody >::exec( p ); #else TNL_ASSERT( false, ); #endif } template< typename T0, typename T1 > __cuda_callable__ static void exec( T0& p0, T1& p1 ) { #ifndef __INTEL_COMPILER StaticForExecutor< IndexType, StaticForIndexTag< IndexType, begin >, StaticForIndexTag< IndexType, end - begin >, LoopBody >::exec( p0, p1 ); #else TNL_ASSERT( false, ); #endif } template< typename T0, typename T1, typename T2 > template< typename... Args > __cuda_callable__ static void exec( T0& p0, T1& p1, T2& p2 ) static void exec( Args&&... args ) { #ifndef __INTEL_COMPILER StaticForExecutor< IndexType, StaticForIndexTag< IndexType, begin >, StaticForIndexTag< IndexType, end - begin >, LoopBody >::exec( p0, p1, p2 ); #else TNL_ASSERT( false, ); #endif LoopBody >::exec( std::forward< Args >( args )... ); } template< typename T0, typename T1, typename T2, typename T3 > __cuda_callable__ static void exec( T0& p0, T1& p1, T2& p2, T3& p3 ) // nvcc would complain if we wonted to call a host-only function from the __cuda_callable__ exec above template< typename... Args > static void execHost( Args&&... args ) { #ifndef __INTEL_COMPILER StaticForExecutor< IndexType, StaticForIndexTag< IndexType, begin >, StaticForIndexTag< IndexType, end - begin >, LoopBody >::exec( p0, p1, p2, p3 ); #else TNL_ASSERT( false, ); #endif LoopBody >::execHost( std::forward< Args >( args )... ); } }; } // namespace TNL