Loading src/TNL/Algorithms/MemoryOperations.h +17 −0 Original line number Diff line number Diff line Loading @@ -242,6 +242,23 @@ struct MemoryOperations< Devices::Cuda > template<> struct MemoryOperations< Devices::Hip > { template< typename Element, typename Index > static void construct( Element* data, const Index size ); // note that args are passed by value to the constructor, not via // std::forward or even by reference, since move-semantics does not apply for // the construction of multiple elements and pass-by-reference cannot be used // with CUDA kernels template< typename Element, typename Index, typename... Args > static void construct( Element* data, const Index size, const Args&... args ); template< typename Element, typename Index > static void destruct( Element* data, const Index size ); template< typename Element > __device_callable__ static void setElement( Element* data, Loading src/TNL/Algorithms/MemoryOperationsHip.hpp +50 −0 Original line number Diff line number Diff line Loading @@ -23,6 +23,56 @@ namespace TNL { namespace Algorithms { template< typename Element, typename Index > void MemoryOperations< Devices::Hip >:: construct( Element* data, const Index size ) { TNL_ASSERT_TRUE( data, "Attempted to create elements through a nullptr." ); auto kernel = [data] __device_callable__ ( Index i ) { // placement-new ::new( (void*) (data + i) ) Element(); }; ParallelFor< Devices::Hip >::exec( (Index) 0, size, kernel ); } template< typename Element, typename Index, typename... Args > void MemoryOperations< Devices::Hip >:: construct( Element* data, const Index size, const Args&... args ) { TNL_ASSERT_TRUE( data, "Attempted to create elements through a nullptr." ); // NOTE: nvcc does not allow __device_callable__ lambdas with a variadic capture auto kernel = [data] __device_callable__ ( Index i, Args... args ) { // placement-new // (note that args are passed by value to the constructor, not via // std::forward or even by reference, since move-semantics does not apply for // the construction of multiple elements and pass-by-reference cannot be used // with GPU kernels) ::new( (void*) (data + i) ) Element( args... ); }; ParallelFor< Devices::Hip >::exec( (Index) 0, size, kernel, args... ); } template< typename Element, typename Index > void MemoryOperations< Devices::Hip >:: destruct( Element* data, const Index size ) { TNL_ASSERT_TRUE( data, "Attempted to destroy data through a nullptr." ); auto kernel = [data] __device_callable__ ( Index i ) { (data + i)->~Element(); }; ParallelFor< Devices::Hip >::exec( (Index) 0, size, kernel ); } template< typename Element > __device_callable__ void MemoryOperations< Devices::Hip >:: Loading Loading
src/TNL/Algorithms/MemoryOperations.h +17 −0 Original line number Diff line number Diff line Loading @@ -242,6 +242,23 @@ struct MemoryOperations< Devices::Cuda > template<> struct MemoryOperations< Devices::Hip > { template< typename Element, typename Index > static void construct( Element* data, const Index size ); // note that args are passed by value to the constructor, not via // std::forward or even by reference, since move-semantics does not apply for // the construction of multiple elements and pass-by-reference cannot be used // with CUDA kernels template< typename Element, typename Index, typename... Args > static void construct( Element* data, const Index size, const Args&... args ); template< typename Element, typename Index > static void destruct( Element* data, const Index size ); template< typename Element > __device_callable__ static void setElement( Element* data, Loading
src/TNL/Algorithms/MemoryOperationsHip.hpp +50 −0 Original line number Diff line number Diff line Loading @@ -23,6 +23,56 @@ namespace TNL { namespace Algorithms { template< typename Element, typename Index > void MemoryOperations< Devices::Hip >:: construct( Element* data, const Index size ) { TNL_ASSERT_TRUE( data, "Attempted to create elements through a nullptr." ); auto kernel = [data] __device_callable__ ( Index i ) { // placement-new ::new( (void*) (data + i) ) Element(); }; ParallelFor< Devices::Hip >::exec( (Index) 0, size, kernel ); } template< typename Element, typename Index, typename... Args > void MemoryOperations< Devices::Hip >:: construct( Element* data, const Index size, const Args&... args ) { TNL_ASSERT_TRUE( data, "Attempted to create elements through a nullptr." ); // NOTE: nvcc does not allow __device_callable__ lambdas with a variadic capture auto kernel = [data] __device_callable__ ( Index i, Args... args ) { // placement-new // (note that args are passed by value to the constructor, not via // std::forward or even by reference, since move-semantics does not apply for // the construction of multiple elements and pass-by-reference cannot be used // with GPU kernels) ::new( (void*) (data + i) ) Element( args... ); }; ParallelFor< Devices::Hip >::exec( (Index) 0, size, kernel, args... ); } template< typename Element, typename Index > void MemoryOperations< Devices::Hip >:: destruct( Element* data, const Index size ) { TNL_ASSERT_TRUE( data, "Attempted to destroy data through a nullptr." ); auto kernel = [data] __device_callable__ ( Index i ) { (data + i)->~Element(); }; ParallelFor< Devices::Hip >::exec( (Index) 0, size, kernel ); } template< typename Element > __device_callable__ void MemoryOperations< Devices::Hip >:: Loading