Skip to content
Snippets Groups Projects
Commit e4ba543b authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Refactoring StaticFor

- used variadic templates to avoid code bloat
- added execHost static method to avoid nvcc warnings
parent 2b6cb697
No related branches found
No related tags found
No related merge requests found
......@@ -10,17 +10,17 @@
#pragma once
#undef __INTEL_COMPILER
#include <utility>
#include <TNL/Devices/CudaCallable.h>
namespace TNL {
template< typename IndexType, IndexType val >
class StaticForIndexTag
struct StaticForIndexTag
{
public:
static const IndexType value = val;
typedef StaticForIndexTag<IndexType, val - 1> Decrement;
static constexpr IndexType value = val;
using Decrement = StaticForIndexTag<IndexType, val - 1>;
};
......@@ -28,175 +28,67 @@ template< typename IndexType,
typename Begin,
typename N,
template< IndexType > class LoopBody >
class StaticForExecutor
struct StaticForExecutor
{
public:
__cuda_callable__
static void exec()
{
StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::exec();
LoopBody< Begin::value + N::value - 1 >::exec();
}
template< typename T >
__cuda_callable__
static void exec( T& p )
{
StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::exec( p );
LoopBody< Begin::value + N::value - 1 >::exec( p );
}
template< typename T0,
typename T1 >
__cuda_callable__
static void exec( T0& p0, T1& p1 )
{
StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::exec( p0, p1 );
LoopBody< Begin::value + N::value - 1 >::exec( p0, p1 );
}
template< typename T0,
typename T1,
typename T2 >
template< typename... Args >
__cuda_callable__
static void exec( T0& p0, T1& p1, T2& p2 )
static void exec( Args&&... args )
{
StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::exec( p0, p1, p2 );
LoopBody< Begin::value + N::value - 1 >::exec( p0, p1, p2 );
StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::exec( std::forward< Args >( args )... );
LoopBody< Begin::value + N::value - 1 >::exec( std::forward< Args >( args )... );
}
template< typename T0,
typename T1,
typename T2,
typename T3 >
__cuda_callable__
static void exec( T0& p0, T1& p1, T2& p2, T3& p3 )
template< typename... Args >
static void execHost( Args&&... args )
{
StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::exec( p0, p1, p2, p3 );
LoopBody< Begin::value + N::value - 1 >::exec( p0, p1, p2, p3 );
StaticForExecutor< IndexType, Begin, typename N::Decrement, LoopBody >::execHost( std::forward< Args >( args )... );
LoopBody< Begin::value + N::value - 1 >::exec( std::forward< Args >( args )... );
}
};
template< typename IndexType,
typename Begin,
template< IndexType > class LoopBody >
class StaticForExecutor< IndexType,
Begin,
StaticForIndexTag< IndexType, 0 >,
LoopBody >
struct StaticForExecutor< IndexType,
Begin,
StaticForIndexTag< IndexType, 0 >,
LoopBody >
{
public:
__cuda_callable__
static void exec() {}
template< typename T >
__cuda_callable__
static void exec( T& p ) {}
template< typename T0,
typename T1 >
template< typename... Args >
__cuda_callable__
static void exec( T0& p0, T1& p1 ) {}
static void exec( Args&&... args )
{}
template< typename T0,
typename T1,
typename T2 >
__cuda_callable__
static void exec( T0& p0, T1& p1, T2& p2 ) {}
template< typename T0,
typename T1,
typename T2,
typename T3 >
__cuda_callable__
static void exec( T0& p0, T1& p1, T2& p2, T3& p3 ) {}
template< typename... Args >
static void execHost( Args&&... args )
{}
};
template< typename IndexType,
IndexType begin,
IndexType end,
template< IndexType > class LoopBody >
class StaticFor
struct StaticFor
{
public:
__cuda_callable__
static void exec()
{
#ifndef __INTEL_COMPILER
StaticForExecutor< IndexType,
StaticForIndexTag< IndexType, begin >,
StaticForIndexTag< IndexType, end - begin >,
LoopBody >::exec();
#else
TNL_ASSERT( false, );
#endif
}
template< typename T >
__cuda_callable__
static void exec( T &p )
{
#ifndef __INTEL_COMPILER
StaticForExecutor< IndexType,
StaticForIndexTag< IndexType, begin >,
StaticForIndexTag< IndexType, end - begin >,
LoopBody >::exec( p );
#else
TNL_ASSERT( false, );
#endif
}
template< typename T0,
typename T1 >
__cuda_callable__
static void exec( T0& p0, T1& p1 )
{
#ifndef __INTEL_COMPILER
StaticForExecutor< IndexType,
StaticForIndexTag< IndexType, begin >,
StaticForIndexTag< IndexType, end - begin >,
LoopBody >::exec( p0, p1 );
#else
TNL_ASSERT( false, );
#endif
}
template< typename T0,
typename T1,
typename T2 >
template< typename... Args >
__cuda_callable__
static void exec( T0& p0, T1& p1, T2& p2 )
static void exec( Args&&... args )
{
#ifndef __INTEL_COMPILER
StaticForExecutor< IndexType,
StaticForIndexTag< IndexType, begin >,
StaticForIndexTag< IndexType, end - begin >,
LoopBody >::exec( p0, p1, p2 );
#else
TNL_ASSERT( false, );
#endif
LoopBody >::exec( std::forward< Args >( args )... );
}
template< typename T0,
typename T1,
typename T2,
typename T3 >
__cuda_callable__
static void exec( T0& p0, T1& p1, T2& p2, T3& p3 )
// nvcc would complain if we wanted to call a host-only function from the __cuda_callable__ exec above
template< typename... Args >
static void execHost( Args&&... args )
{
#ifndef __INTEL_COMPILER
StaticForExecutor< IndexType,
StaticForIndexTag< IndexType, begin >,
StaticForIndexTag< IndexType, end - begin >,
LoopBody >::exec( p0, p1, p2, p3 );
#else
TNL_ASSERT( false, );
#endif
LoopBody >::execHost( std::forward< Args >( args )... );
}
};
} // namespace TNL
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment