Implemented MemoryOperations::construct and MemoryOperations::destruct for HIP (ea84053e) · Commits · TNL / tnl-dev

src/TNL/Algorithms/MemoryOperations.h

+17 −0

Original line number	Diff line number	Diff line
		@@ -242,6 +242,23 @@ struct MemoryOperations< Devices::Cuda >
		template<>
		struct MemoryOperations< Devices::Hip >
		{
		// Default-constructs `size` objects of type `Element` in place, in the
		// storage beginning at `data`. `data` must not be a null pointer.
		template< typename Element, typename Index >
		static void construct( Element* data,
		const Index size );

		// Constructs `size` objects of type `Element` in place, in the storage
		// beginning at `data`; every object is initialized from the same `args`.
		//
		// Note that args are passed by value to the constructor, not via
		// std::forward or even by reference, since move-semantics does not apply for
		// the construction of multiple elements and pass-by-reference cannot be used
		// with GPU kernels. (This function itself accepts them as const references;
		// the by-value copy is made when they are handed over to the kernel.)
		template< typename Element, typename Index, typename... Args >
		static void construct( Element* data,
		const Index size,
		const Args&... args );

		// Calls the destructor of each of the `size` objects of type `Element`
		// stored from `data` onwards. Only the destructors are invoked here.
		// `data` must not be a null pointer.
		template< typename Element, typename Index >
		static void destruct( Element* data,
		const Index size );

		template< typename Element >
		__device_callable__
		static void setElement( Element* data,

src/TNL/Algorithms/MemoryOperationsHip.hpp

+50 −0

Original line number	Diff line number	Diff line
		@@ -23,6 +23,56 @@
		namespace TNL {
		namespace Algorithms {

		/**
		 * Default-constructs `size` objects of type `Element` in the storage that
		 * begins at `data`.
		 *
		 * The construction of the individual elements is dispatched through
		 * ParallelFor< Devices::Hip > over the index range [0, size).
		 *
		 * \param data  pointer to the first element slot; must not be null
		 *              (verified by an assertion)
		 * \param size  number of elements to construct
		 */
		template< typename Element, typename Index >
		void
		MemoryOperations< Devices::Hip >::
		construct( Element* data,
		           const Index size )
		{
		   TNL_ASSERT_TRUE( data, "Attempted to create elements through a nullptr." );

		   // per-index worker: value-initializes one element via placement-new
		   auto constructAt = [data] __device_callable__ ( Index idx )
		   {
		      ::new( (void*) (data + idx) ) Element();
		   };

		   ParallelFor< Devices::Hip >::exec( static_cast< Index >( 0 ), size, constructAt );
		}

		/**
		 * Constructs `size` objects of type `Element` in the storage that begins
		 * at `data`, initializing every one of them from the same `args`.
		 *
		 * The construction of the individual elements is dispatched through
		 * ParallelFor< Devices::Hip > over the index range [0, size).
		 *
		 * \param data  pointer to the first element slot; must not be null
		 *              (verified by an assertion)
		 * \param size  number of elements to construct
		 * \param args  constructor arguments shared by all elements
		 */
		template< typename Element, typename Index, typename... Args >
		void
		MemoryOperations< Devices::Hip >::
		construct( Element* data,
		           const Index size,
		           const Args&... args )
		{
		   TNL_ASSERT_TRUE( data, "Attempted to create elements through a nullptr." );

		   // NOTE: nvcc does not allow __device_callable__ lambdas with a variadic
		   // capture, so the constructor arguments are not captured; they are given
		   // to ParallelFor, which hands them to the lambda as extra parameters.
		   //
		   // The lambda receives them by value (not via std::forward or by
		   // reference), since move-semantics does not apply for the construction
		   // of multiple elements and pass-by-reference cannot be used with GPU
		   // kernels.
		   auto constructAt = [data] __device_callable__ ( Index idx, Args... ctorArgs )
		   {
		      // placement-new with the shared constructor arguments
		      ::new( (void*) (data + idx) ) Element( ctorArgs... );
		   };

		   ParallelFor< Devices::Hip >::exec( static_cast< Index >( 0 ), size, constructAt, args... );
		}

		/**
		 * Invokes the destructor of each of the `size` objects of type `Element`
		 * stored from `data` onwards.
		 *
		 * The destructor calls are dispatched through ParallelFor< Devices::Hip >
		 * over the index range [0, size). Nothing besides the destructors is run
		 * here.
		 *
		 * \param data  pointer to the first element; must not be null
		 *              (verified by an assertion)
		 * \param size  number of elements to destroy
		 */
		template< typename Element, typename Index >
		void
		MemoryOperations< Devices::Hip >::
		destruct( Element* data,
		          const Index size )
		{
		   TNL_ASSERT_TRUE( data, "Attempted to destroy data through a nullptr." );

		   // per-index worker: explicit destructor call on one element
		   auto destroyAt = [data] __device_callable__ ( Index idx )
		   {
		      data[ idx ].~Element();
		   };

		   ParallelFor< Devices::Hip >::exec( static_cast< Index >( 0 ), size, destroyAt );
		}

		template< typename Element >
		__device_callable__ void
		MemoryOperations< Devices::Hip >::