Added scan and multireduction for HIP. (a85da5ed) · Commits · TNL / tnl-dev

src/TNL/Algorithms/Multireduction.h

+30 −0

Original line number	Diff line number	Diff line
		@@ -17,6 +17,7 @@
		#include <TNL/Devices/Sequential.h>
		#include <TNL/Devices/Host.h>
		#include <TNL/Devices/Cuda.h>
		#include <TNL/Devices/Hip.h>

		namespace TNL {
		namespace Algorithms {
		@@ -111,6 +112,35 @@ struct Multireduction< Devices::Cuda >
		Result* hostResult );
		};

		template<>
		struct Multireduction< Devices::Hip >
		{
		/**
		* Parameters:
		* zero: starting value for reduction
		* dataFetcher: callable object such that `dataFetcher( i, j )` yields
		* the i-th value to be reduced from the j-th dataset
		* (i = 0,...,size-1; j = 0,...,n-1)
		* reduction: callable object representing the reduction operation
		* for example, it can be an instance of std::plus, std::logical_and,
		* std::logical_or etc.
		* size: the size of each dataset
		* n: number of datasets to be reduced
		* hostResult: output array of size = n
		*/
		template< typename Result,
		typename DataFetcher,
		typename Reduction,
		typename Index >
		static void
		reduce( const Result zero,
		DataFetcher dataFetcher,
		const Reduction reduction,
		const Index size,
		const int n,
		Result* hostResult );
		};

		} // namespace Algorithms
		} // namespace TNL

src/TNL/Algorithms/Multireduction.hpp

+57 −1

Original line number	Diff line number	Diff line
		@@ -15,13 +15,15 @@
		#include <memory> // std::unique_ptr

		//#define CUDA_REDUCTION_PROFILING
		//#define HIP_REDUCTION_PROFILING

		#include <TNL/Assert.h>
		#include <TNL/Algorithms/Multireduction.h>
		#include <TNL/Algorithms/MultiDeviceMemoryOperations.h>
		#include <TNL/Algorithms/detail/CudaMultireductionKernel.h>
		#include <TNL/Algorithms/detail/HipMultireductionKernel.h>

		#ifdef CUDA_REDUCTION_PROFILING
		#if defined CUDA_REDUCTION_PROFILING \|\| defined HIP_REDUCTION_PROFILING
		#include <TNL/Timer.h>
		#include <iostream>
		#endif
		@@ -242,5 +244,59 @@ reduce( const Result zero,
		#endif
		};

		template< typename Result,
		typename DataFetcher,
		typename Reduction,
		typename Index >
		void
		Multireduction< Devices::Hip >::
		reduce( const Result zero,
		DataFetcher dataFetcher,
		const Reduction reduction,
		const Index size,
		const int n,
		Result* hostResult )
		{
		TNL_ASSERT_GT( size, 0, "The size of datasets must be positive." );
		TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." );

		#ifdef HIP_REDUCTION_PROFILING
		Timer timer;
		timer.reset();
		timer.start();
		#endif

		// start the reduction on the GPU
		Result* deviceAux1 = nullptr;
		const int reducedSize = detail::HipMultireductionKernelLauncher( zero, dataFetcher, reduction, size, n, deviceAux1 );

		#ifdef HIP_REDUCTION_PROFILING
		timer.stop();
		std::cout << " Multireduction of " << n << " datasets on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl;
		timer.reset();
		timer.start();
		#endif

		// transfer the reduced data from device to host
		std::unique_ptr< Result[] > resultArray{ new Result[ n * reducedSize ] };
		MultiDeviceMemoryOperations< void, Devices::Hip >::copy( resultArray.get(), deviceAux1, n * reducedSize );

		#ifdef HIP_REDUCTION_PROFILING
		timer.stop();
		std::cout << " Transferring data to CPU took " << timer.getRealTime() << " sec. " << std::endl;
		timer.reset();
		timer.start();
		#endif

		// finish the reduction on the host
		auto dataFetcherFinish = [&] ( int i, int k ) { return resultArray[ i + k * reducedSize ]; };
		Multireduction< Devices::Sequential >::reduce( zero, dataFetcherFinish, reduction, reducedSize, n, hostResult );

		#ifdef HIP_REDUCTION_PROFILING
		timer.stop();
		std::cout << " Multireduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl;
		#endif
		};

		} // namespace Algorithms
		} // namespace TNL

src/TNL/Algorithms/Scan.h

+107 −0

Original line number	Diff line number	Diff line
		@@ -15,6 +15,7 @@
		#include <TNL/Devices/Sequential.h>
		#include <TNL/Devices/Host.h>
		#include <TNL/Devices/Cuda.h>
		#include <TNL/Devices/Hip.h>

		namespace TNL {
		namespace Algorithms {
		@@ -277,6 +278,66 @@ struct Scan< Devices::Cuda, Type >
		const typename Vector::RealType shift );
		};

		template< ScanType Type >
		struct Scan< Devices::Hip, Type >
		{
		/**
		* \brief Computes scan (prefix sum) on GPU.
		*
		* \tparam Vector type vector being used for the scan.
		* \tparam Reduction lambda function defining the reduction operation
		*
		* \param v input vector, the result of scan is stored in the same vector
		* \param begin the first element in the array to be scanned
		* \param end the last element in the array to be scanned
		* \param reduction lambda function implementing the reduction operation
		* \param zero is the idempotent element for the reduction operation, i.e. element which
		* does not change the result of the reduction.
		*
		* The reduction lambda function takes two variables which are supposed to be reduced:
		*
		* ```
		* auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
		* ```
		*
		* \par Example
		*
		* \include ReductionAndScan/ScanExample.cpp
		*
		* \par Output
		*
		* \include ScanExample.out
		*/
		template< typename Vector,
		typename Reduction >
		static void
		perform( Vector& v,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType zero );

		template< typename Vector,
		typename Reduction >
		static auto
		performFirstPhase( Vector& v,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType zero );

		template< typename Vector,
		typename BlockShifts,
		typename Reduction >
		static void
		performSecondPhase( Vector& v,
		const BlockShifts& blockShifts,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType shift );
		};

		template< ScanType Type >
		struct SegmentedScan< Devices::Sequential, Type >
		{
		@@ -411,6 +472,52 @@ struct SegmentedScan< Devices::Cuda, Type >
		const typename Vector::RealType zero );
		};

		template< ScanType Type >
		struct SegmentedScan< Devices::Hip, Type >
		{
		/**
		* \brief Computes segmented scan (prefix sum) on GPU.
		*
		* \tparam Vector type vector being used for the scan.
		* \tparam Reduction lambda function defining the reduction operation
		* \tparam Flags array type containing zeros and ones defining the segments begining
		*
		* \param v input vector, the result of scan is stored in the same vector
		* \param flags is an array with zeros and ones defining the segments begining
		* \param begin the first element in the array to be scanned
		* \param end the last element in the array to be scanned
		* \param reduction lambda function implementing the reduction operation
		* \param zero is the idempotent element for the reduction operation, i.e. element which
		* does not change the result of the reduction.
		*
		* The reduction lambda function takes two variables which are supposed to be reduced:
		*
		* ```
		* auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
		* ```
		*
		* \par Example
		*
		* \include ReductionAndScan/SegmentedScanExample.cpp
		*
		* \par Output
		*
		* \include SegmentedScanExample.out
		*
		* Note: Segmented scan is not implemented for HIP yet.
		*/
		template< typename Vector,
		typename Reduction,
		typename Flags >
		static void
		perform( Vector& v,
		Flags& flags,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType zero );
		};

		} // namespace Algorithms
		} // namespace TNL

src/TNL/Algorithms/Scan.hpp

+109 −0

Original line number	Diff line number	Diff line
		@@ -18,7 +18,9 @@
		#include <TNL/Containers/Array.h>
		#include <TNL/Containers/StaticArray.h>
		#include <TNL/Algorithms/detail/CudaScanKernel.h>
		#include <TNL/Algorithms/detail/HipScanKernel.h>
		#include <TNL/Exceptions/CudaSupportMissing.h>
		#include <TNL/Exceptions/HipSupportMissing.h>
		#include <TNL/Exceptions/NotImplementedError.h>

		namespace TNL {
		@@ -212,6 +214,8 @@ performSecondPhase( Vector& v,
		#endif
		}

		/////
		// CUDA specialization
		template< ScanType Type >
		template< typename Vector,
		typename Reduction >
		@@ -292,6 +296,88 @@ performSecondPhase( Vector& v,
		#endif
		}

		/////
		// HIP specialization
		template< ScanType Type >
		template< typename Vector,
		typename Reduction >
		void
		Scan< Devices::Hip, Type >::
		perform( Vector& v,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType zero )
		{
		#ifdef HAVE_HIP
		using RealType = typename Vector::RealType;
		using IndexType = typename Vector::IndexType;

		detail::HipScanKernelLauncher< Type, RealType, IndexType >::perform(
		end - begin,
		&v.getData()[ begin ], // input
		&v.getData()[ begin ], // output
		reduction,
		zero );
		#else
		throw Exceptions::HipSupportMissing();
		#endif
		}

		template< ScanType Type >
		template< typename Vector,
		typename Reduction >
		auto
		Scan< Devices::Hip, Type >::
		performFirstPhase( Vector& v,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType zero )
		{
		#ifdef HAVE_HIP
		using RealType = typename Vector::RealType;
		using IndexType = typename Vector::IndexType;

		return detail::HipScanKernelLauncher< Type, RealType, IndexType >::performFirstPhase(
		end - begin,
		&v.getData()[ begin ], // input
		&v.getData()[ begin ], // output
		reduction,
		zero );
		#else
		throw Exceptions::HipSupportMissing();
		#endif
		}

		template< ScanType Type >
		template< typename Vector,
		typename BlockShifts,
		typename Reduction >
		void
		Scan< Devices::Hip, Type >::
		performSecondPhase( Vector& v,
		const BlockShifts& blockShifts,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType shift )
		{
		#ifdef HAVE_HIP
		using RealType = typename Vector::RealType;
		using IndexType = typename Vector::IndexType;

		detail::HipScanKernelLauncher< Type, RealType, IndexType >::performSecondPhase(
		end - begin,
		&v.getData()[ begin ], // output
		blockShifts.getData(),
		reduction,
		shift );
		#else
		throw Exceptions::HipSupportMissing();
		#endif
		}


		template< ScanType Type >
		template< typename Vector,
		@@ -374,5 +460,28 @@ perform( Vector& v,
		#endif
		}

		template< ScanType Type >
		template< typename Vector,
		typename Reduction,
		typename Flags >
		void
		SegmentedScan< Devices::Hip, Type >::
		perform( Vector& v,
		Flags& flags,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::RealType zero )
		{
		#ifdef HAVE_CUDA
		using RealType = typename Vector::RealType;
		using IndexType = typename Vector::IndexType;

		throw Exceptions::NotImplementedError( "Segmented scan (prefix sum) is not implemented for HIP." );
		#else
		throw Exceptions::HipSupportMissing();
		#endif
		}

		} // namespace Algorithms
		} // namespace TNL

src/TNL/Algorithms/detail/CudaMultireductionKernel.h

+3 −3

Original line number	Diff line number	Diff line
		@@ -35,9 +35,9 @@ static constexpr int Multireduction_registersPerThread = 32; // empirically de
		// __CUDA_ARCH__ is defined only in device code!
		#if (__CUDA_ARCH__ == 750 )
		// Turing has a limit of 1024 threads per multiprocessor
		static constexpr int Multireduction_minBlocksPerMultiprocessor = 4;
		static constexpr int cudaMultireduction_minBlocksPerMultiprocessor = 4;
		#else
		static constexpr int Multireduction_minBlocksPerMultiprocessor = 8;
		static constexpr int cudaMultireduction_minBlocksPerMultiprocessor = 8;
		#endif

		template< int blockSizeX,
		@@ -46,7 +46,7 @@ template< int blockSizeX,
		typename Reduction,
		typename Index >
		__global__ void
		__launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor )
		__launch_bounds__( Multireduction_maxThreadsPerBlock, cudaMultireduction_minBlocksPerMultiprocessor )
		CudaMultireductionKernel( const Result zero,
		DataFetcher dataFetcher,
		const Reduction reduction,