Commit ea84053e authored by Jakub Klinkovský
Browse files

Implemented MemoryOperations::construct and MemoryOperations::destruct for HIP

parent e74c2686
Loading
Loading
Loading
Loading
+17 −0
Original line number Diff line number Diff line
@@ -242,6 +242,23 @@ struct MemoryOperations< Devices::Cuda >
template<>
struct MemoryOperations< Devices::Hip >
{
   // Constructs `size` objects of type `Element` without constructor
   // arguments in the storage starting at `data` (elements data[0] up to
   // data[size-1]).
   // NOTE(review): `data` is presumably a pointer to memory accessible from
   // the HIP device and is expected to be non-null -- confirm against the
   // definition and the callers.
   template< typename Element, typename Index >
   static void construct( Element* data,
                          const Index size );

   // Constructs `size` objects of type `Element` in the storage starting at
   // `data`, passing the same `args` to the constructor of every element.
   //
   // Note that the args reach the element constructor by value, not via
   // std::forward or by reference: move-semantics does not apply when
   // multiple elements are constructed from the same arguments, and
   // pass-by-reference cannot be used with GPU kernels. (This function itself
   // accepts the args by const reference.)
   template< typename Element, typename Index, typename... Args >
   static void construct( Element* data,
                          const Index size,
                          const Args&... args );

   // Destroys `size` objects of type `Element` stored at data[0] up to
   // data[size-1] by invoking their destructors; the declaration does not
   // indicate any deallocation of the storage itself.
   // NOTE(review): `data` is presumably a pointer to memory accessible from
   // the HIP device and is expected to be non-null -- confirm against the
   // definition and the callers.
   template< typename Element, typename Index >
   static void destruct( Element* data,
                         const Index size );

   template< typename Element >
   __device_callable__
   static void setElement( Element* data,
+50 −0
Original line number Diff line number Diff line
@@ -23,6 +23,56 @@
namespace TNL {
namespace Algorithms {

// Value-initializes `size` objects of type `Element` in the (already
// allocated) storage data[0] ... data[size-1].
//
// Each element is created by a placement-new expression; the per-index work is
// dispatched through ParallelFor< Devices::Hip > over the range [0, size).
// `data` must not be a null pointer (checked by an assertion).
template< typename Element, typename Index >
void
MemoryOperations< Devices::Hip >::
construct( Element* data,
           const Index size )
{
   TNL_ASSERT_TRUE( data, "Attempted to create elements through a nullptr." );

   // one invocation per element: build an Element in-place at data[ idx ]
   auto constructAt = [data] __device_callable__ ( Index idx )
   {
      ::new( static_cast< void* >( data + idx ) ) Element();
   };

   ParallelFor< Devices::Hip >::exec( static_cast< Index >( 0 ), size, constructAt );
}

// Constructs `size` objects of type `Element` in the (already allocated)
// storage data[0] ... data[size-1], calling the constructor of every element
// with the same arguments `args`.
//
// Each element is created by a placement-new expression; the per-index work is
// dispatched through ParallelFor< Devices::Hip > over the range [0, size).
// `data` must not be a null pointer (checked by an assertion).
template< typename Element, typename Index, typename... Args >
void
MemoryOperations< Devices::Hip >::
construct( Element* data,
           const Index size,
           const Args&... args )
{
   TNL_ASSERT_TRUE( data, "Attempted to create elements through a nullptr." );

   // The constructor arguments are handed to the kernel as additional
   // by-value parameters rather than being captured by the lambda. They are
   // taken by value (no std::forward, no references) because move-semantics
   // does not apply when many elements are built from the same arguments and
   // pass-by-reference cannot be used with GPU kernels.
   // NOTE(review): the original remark blamed nvcc for rejecting variadic
   // captures in device-callable lambdas -- whether the HIP compiler has the
   // same limitation is not established here.
   auto constructAt = [data] __device_callable__ ( Index idx, Args... ctorArgs )
   {
      // build an Element in-place at data[ idx ]
      ::new( static_cast< void* >( data + idx ) ) Element( ctorArgs... );
   };

   ParallelFor< Devices::Hip >::exec( static_cast< Index >( 0 ), size, constructAt, args... );
}

// Invokes the destructor of each of the `size` objects of type `Element`
// stored at data[0] ... data[size-1]. The storage itself is left untouched
// (nothing is deallocated here).
//
// The per-index work is dispatched through ParallelFor< Devices::Hip > over
// the range [0, size). `data` must not be a null pointer (checked by an
// assertion).
template< typename Element, typename Index >
void
MemoryOperations< Devices::Hip >::
destruct( Element* data,
          const Index size )
{
   TNL_ASSERT_TRUE( data, "Attempted to destroy data through a nullptr." );

   // one invocation per element: explicit destructor call on data[ idx ]
   auto destroyAt = [data] __device_callable__ ( Index idx )
   {
      data[ idx ].~Element();
   };

   ParallelFor< Devices::Hip >::exec( static_cast< Index >( 0 ), size, destroyAt );
}

template< typename Element >
__device_callable__ void
MemoryOperations< Devices::Hip >::