Loading src/TNL/Algorithms/Multireduction.h +30 −0 Original line number Diff line number Diff line Loading @@ -17,6 +17,7 @@ #include <TNL/Devices/Sequential.h> #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> #include <TNL/Devices/Hip.h> namespace TNL { namespace Algorithms { Loading Loading @@ -111,6 +112,35 @@ struct Multireduction< Devices::Cuda > Result* hostResult ); }; template<> struct Multireduction< Devices::Hip > { /** * Parameters: * zero: starting value for reduction * dataFetcher: callable object such that `dataFetcher( i, j )` yields * the i-th value to be reduced from the j-th dataset * (i = 0,...,size-1; j = 0,...,n-1) * reduction: callable object representing the reduction operation * for example, it can be an instance of std::plus, std::logical_and, * std::logical_or etc. * size: the size of each dataset * n: number of datasets to be reduced * hostResult: output array of size = n */ template< typename Result, typename DataFetcher, typename Reduction, typename Index > static void reduce( const Result zero, DataFetcher dataFetcher, const Reduction reduction, const Index size, const int n, Result* hostResult ); }; } // namespace Algorithms } // namespace TNL Loading src/TNL/Algorithms/Multireduction.hpp +57 −1 Original line number Diff line number Diff line Loading @@ -15,13 +15,15 @@ #include <memory> // std::unique_ptr //#define CUDA_REDUCTION_PROFILING //#define HIP_REDUCTION_PROFILING #include <TNL/Assert.h> #include <TNL/Algorithms/Multireduction.h> #include <TNL/Algorithms/MultiDeviceMemoryOperations.h> #include <TNL/Algorithms/detail/CudaMultireductionKernel.h> #include <TNL/Algorithms/detail/HipMultireductionKernel.h> #ifdef CUDA_REDUCTION_PROFILING #if defined CUDA_REDUCTION_PROFILING || defined HIP_REDUCTION_PROFILING #include <TNL/Timer.h> #include <iostream> #endif Loading Loading @@ -242,5 +244,59 @@ reduce( const Result zero, #endif }; template< typename Result, typename DataFetcher, typename Reduction, typename Index > void Multireduction< Devices::Hip >:: reduce( const Result zero, DataFetcher dataFetcher, const Reduction reduction, const Index size, const int n, Result* hostResult ) { TNL_ASSERT_GT( size, 0, "The size of datasets must be positive." ); TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." ); #ifdef HIP_REDUCTION_PROFILING Timer timer; timer.reset(); timer.start(); #endif // start the reduction on the GPU Result* deviceAux1 = nullptr; const int reducedSize = detail::HipMultireductionKernelLauncher( zero, dataFetcher, reduction, size, n, deviceAux1 ); #ifdef HIP_REDUCTION_PROFILING timer.stop(); std::cout << " Multireduction of " << n << " datasets on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl; timer.reset(); timer.start(); #endif // transfer the reduced data from device to host std::unique_ptr< Result[] > resultArray{ new Result[ n * reducedSize ] }; MultiDeviceMemoryOperations< void, Devices::Hip >::copy( resultArray.get(), deviceAux1, n * reducedSize ); #ifdef HIP_REDUCTION_PROFILING timer.stop(); std::cout << " Transferring data to CPU took " << timer.getRealTime() << " sec. " << std::endl; timer.reset(); timer.start(); #endif // finish the reduction on the host auto dataFetcherFinish = [&] ( int i, int k ) { return resultArray[ i + k * reducedSize ]; }; Multireduction< Devices::Sequential >::reduce( zero, dataFetcherFinish, reduction, reducedSize, n, hostResult ); #ifdef HIP_REDUCTION_PROFILING timer.stop(); std::cout << " Multireduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl; #endif }; } // namespace Algorithms } // namespace TNL src/TNL/Algorithms/Scan.h +107 −0 Original line number Diff line number Diff line Loading @@ -15,6 +15,7 @@ #include <TNL/Devices/Sequential.h> #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> #include <TNL/Devices/Hip.h> namespace TNL { namespace Algorithms { Loading Loading @@ -277,6 +278,66 @@ struct Scan< Devices::Cuda, Type > const typename Vector::RealType shift ); }; template< ScanType Type > struct Scan< Devices::Hip, Type > { /** * \brief Computes scan (prefix sum) on GPU. * * \tparam Vector type vector being used for the scan. * \tparam Reduction lambda function defining the reduction operation * * \param v input vector, the result of scan is stored in the same vector * \param begin the first element in the array to be scanned * \param end the last element in the array to be scanned * \param reduction lambda function implementing the reduction operation * \param zero is the idempotent element for the reduction operation, i.e. element which * does not change the result of the reduction. * * The reduction lambda function takes two variables which are supposed to be reduced: * * ``` * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... }; * ``` * * \par Example * * \include ReductionAndScan/ScanExample.cpp * * \par Output * * \include ScanExample.out */ template< typename Vector, typename Reduction > static void perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ); template< typename Vector, typename Reduction > static auto performFirstPhase( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ); template< typename Vector, typename BlockShifts, typename Reduction > static void performSecondPhase( Vector& v, const BlockShifts& blockShifts, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType shift ); }; template< ScanType Type > struct SegmentedScan< Devices::Sequential, Type > { Loading Loading @@ -411,6 +472,52 @@ struct SegmentedScan< Devices::Cuda, Type > const typename Vector::RealType zero ); }; template< ScanType Type > struct SegmentedScan< Devices::Hip, Type > { /** * \brief Computes segmented scan (prefix sum) on GPU. * * \tparam Vector type vector being used for the scan. * \tparam Reduction lambda function defining the reduction operation * \tparam Flags array type containing zeros and ones defining the segments begining * * \param v input vector, the result of scan is stored in the same vector * \param flags is an array with zeros and ones defining the segments begining * \param begin the first element in the array to be scanned * \param end the last element in the array to be scanned * \param reduction lambda function implementing the reduction operation * \param zero is the idempotent element for the reduction operation, i.e. element which * does not change the result of the reduction. * * The reduction lambda function takes two variables which are supposed to be reduced: * * ``` * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... }; * ``` * * \par Example * * \include ReductionAndScan/SegmentedScanExample.cpp * * \par Output * * \include SegmentedScanExample.out * * **Note: Segmented scan is not implemented for HIP yet.** */ template< typename Vector, typename Reduction, typename Flags > static void perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ); }; } // namespace Algorithms } // namespace TNL Loading src/TNL/Algorithms/Scan.hpp +109 −0 Original line number Diff line number Diff line Loading @@ -18,7 +18,9 @@ #include <TNL/Containers/Array.h> #include <TNL/Containers/StaticArray.h> #include <TNL/Algorithms/detail/CudaScanKernel.h> #include <TNL/Algorithms/detail/HipScanKernel.h> #include <TNL/Exceptions/CudaSupportMissing.h> #include <TNL/Exceptions/HipSupportMissing.h> #include <TNL/Exceptions/NotImplementedError.h> namespace TNL { Loading Loading @@ -212,6 +214,8 @@ performSecondPhase( Vector& v, #endif } ///// // CUDA specialization template< ScanType Type > template< typename Vector, typename Reduction > Loading Loading @@ -292,6 +296,88 @@ performSecondPhase( Vector& v, #endif } ///// // HIP specialization template< ScanType Type > template< typename Vector, typename Reduction > void Scan< Devices::Hip, Type >:: perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ) { #ifdef HAVE_HIP using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; detail::HipScanKernelLauncher< Type, RealType, IndexType >::perform( end - begin, &v.getData()[ begin ], // input &v.getData()[ begin ], // output reduction, zero ); #else throw Exceptions::HipSupportMissing(); #endif } template< ScanType Type > template< typename Vector, typename Reduction > auto Scan< Devices::Hip, Type >:: performFirstPhase( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ) { #ifdef HAVE_HIP using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; return detail::HipScanKernelLauncher< Type, RealType, IndexType >::performFirstPhase( end - begin, &v.getData()[ begin ], // input &v.getData()[ begin ], // output reduction, zero ); #else throw Exceptions::HipSupportMissing(); #endif } template< ScanType Type > template< typename Vector, typename BlockShifts, typename Reduction > void Scan< Devices::Hip, Type >:: performSecondPhase( Vector& v, const BlockShifts& blockShifts, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType shift ) { #ifdef HAVE_HIP using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; detail::HipScanKernelLauncher< Type, RealType, IndexType >::performSecondPhase( end - begin, &v.getData()[ begin ], // output blockShifts.getData(), reduction, shift ); #else throw Exceptions::HipSupportMissing(); #endif } template< ScanType Type > template< typename Vector, Loading Loading @@ -374,5 +460,28 @@ perform( Vector& v, #endif } template< ScanType Type > template< typename Vector, typename Reduction, typename Flags > void SegmentedScan< Devices::Hip, Type >:: perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ) { #ifdef HAVE_CUDA using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; throw Exceptions::NotImplementedError( "Segmented scan (prefix sum) is not implemented for HIP." ); #else throw Exceptions::HipSupportMissing(); #endif } } // namespace Algorithms } // namespace TNL src/TNL/Algorithms/detail/CudaMultireductionKernel.h +3 −3 Original line number Diff line number Diff line Loading @@ -35,9 +35,9 @@ static constexpr int Multireduction_registersPerThread = 32; // empirically de // __CUDA_ARCH__ is defined only in device code! #if (__CUDA_ARCH__ == 750 ) // Turing has a limit of 1024 threads per multiprocessor static constexpr int Multireduction_minBlocksPerMultiprocessor = 4; static constexpr int cudaMultireduction_minBlocksPerMultiprocessor = 4; #else static constexpr int Multireduction_minBlocksPerMultiprocessor = 8; static constexpr int cudaMultireduction_minBlocksPerMultiprocessor = 8; #endif template< int blockSizeX, Loading @@ -46,7 +46,7 @@ template< int blockSizeX, typename Reduction, typename Index > __global__ void __launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor ) __launch_bounds__( Multireduction_maxThreadsPerBlock, cudaMultireduction_minBlocksPerMultiprocessor ) CudaMultireductionKernel( const Result zero, DataFetcher dataFetcher, const Reduction reduction, Loading Loading
src/TNL/Algorithms/Multireduction.h +30 −0 Original line number Diff line number Diff line Loading @@ -17,6 +17,7 @@ #include <TNL/Devices/Sequential.h> #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> #include <TNL/Devices/Hip.h> namespace TNL { namespace Algorithms { Loading Loading @@ -111,6 +112,35 @@ struct Multireduction< Devices::Cuda > Result* hostResult ); }; template<> struct Multireduction< Devices::Hip > { /** * Parameters: * zero: starting value for reduction * dataFetcher: callable object such that `dataFetcher( i, j )` yields * the i-th value to be reduced from the j-th dataset * (i = 0,...,size-1; j = 0,...,n-1) * reduction: callable object representing the reduction operation * for example, it can be an instance of std::plus, std::logical_and, * std::logical_or etc. * size: the size of each dataset * n: number of datasets to be reduced * hostResult: output array of size = n */ template< typename Result, typename DataFetcher, typename Reduction, typename Index > static void reduce( const Result zero, DataFetcher dataFetcher, const Reduction reduction, const Index size, const int n, Result* hostResult ); }; } // namespace Algorithms } // namespace TNL Loading
src/TNL/Algorithms/Multireduction.hpp +57 −1 Original line number Diff line number Diff line Loading @@ -15,13 +15,15 @@ #include <memory> // std::unique_ptr //#define CUDA_REDUCTION_PROFILING //#define HIP_REDUCTION_PROFILING #include <TNL/Assert.h> #include <TNL/Algorithms/Multireduction.h> #include <TNL/Algorithms/MultiDeviceMemoryOperations.h> #include <TNL/Algorithms/detail/CudaMultireductionKernel.h> #include <TNL/Algorithms/detail/HipMultireductionKernel.h> #ifdef CUDA_REDUCTION_PROFILING #if defined CUDA_REDUCTION_PROFILING || defined HIP_REDUCTION_PROFILING #include <TNL/Timer.h> #include <iostream> #endif Loading Loading @@ -242,5 +244,59 @@ reduce( const Result zero, #endif }; template< typename Result, typename DataFetcher, typename Reduction, typename Index > void Multireduction< Devices::Hip >:: reduce( const Result zero, DataFetcher dataFetcher, const Reduction reduction, const Index size, const int n, Result* hostResult ) { TNL_ASSERT_GT( size, 0, "The size of datasets must be positive." ); TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." ); #ifdef HIP_REDUCTION_PROFILING Timer timer; timer.reset(); timer.start(); #endif // start the reduction on the GPU Result* deviceAux1 = nullptr; const int reducedSize = detail::HipMultireductionKernelLauncher( zero, dataFetcher, reduction, size, n, deviceAux1 ); #ifdef HIP_REDUCTION_PROFILING timer.stop(); std::cout << " Multireduction of " << n << " datasets on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl; timer.reset(); timer.start(); #endif // transfer the reduced data from device to host std::unique_ptr< Result[] > resultArray{ new Result[ n * reducedSize ] }; MultiDeviceMemoryOperations< void, Devices::Hip >::copy( resultArray.get(), deviceAux1, n * reducedSize ); #ifdef HIP_REDUCTION_PROFILING timer.stop(); std::cout << " Transferring data to CPU took " << timer.getRealTime() << " sec. " << std::endl; timer.reset(); timer.start(); #endif // finish the reduction on the host auto dataFetcherFinish = [&] ( int i, int k ) { return resultArray[ i + k * reducedSize ]; }; Multireduction< Devices::Sequential >::reduce( zero, dataFetcherFinish, reduction, reducedSize, n, hostResult ); #ifdef HIP_REDUCTION_PROFILING timer.stop(); std::cout << " Multireduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl; #endif }; } // namespace Algorithms } // namespace TNL
src/TNL/Algorithms/Scan.h +107 −0 Original line number Diff line number Diff line Loading @@ -15,6 +15,7 @@ #include <TNL/Devices/Sequential.h> #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> #include <TNL/Devices/Hip.h> namespace TNL { namespace Algorithms { Loading Loading @@ -277,6 +278,66 @@ struct Scan< Devices::Cuda, Type > const typename Vector::RealType shift ); }; template< ScanType Type > struct Scan< Devices::Hip, Type > { /** * \brief Computes scan (prefix sum) on GPU. * * \tparam Vector type vector being used for the scan. * \tparam Reduction lambda function defining the reduction operation * * \param v input vector, the result of scan is stored in the same vector * \param begin the first element in the array to be scanned * \param end the last element in the array to be scanned * \param reduction lambda function implementing the reduction operation * \param zero is the idempotent element for the reduction operation, i.e. element which * does not change the result of the reduction. * * The reduction lambda function takes two variables which are supposed to be reduced: * * ``` * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... }; * ``` * * \par Example * * \include ReductionAndScan/ScanExample.cpp * * \par Output * * \include ScanExample.out */ template< typename Vector, typename Reduction > static void perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ); template< typename Vector, typename Reduction > static auto performFirstPhase( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ); template< typename Vector, typename BlockShifts, typename Reduction > static void performSecondPhase( Vector& v, const BlockShifts& blockShifts, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType shift ); }; template< ScanType Type > struct SegmentedScan< Devices::Sequential, Type > { Loading Loading @@ -411,6 +472,52 @@ struct SegmentedScan< Devices::Cuda, Type > const typename Vector::RealType zero ); }; template< ScanType Type > struct SegmentedScan< Devices::Hip, Type > { /** * \brief Computes segmented scan (prefix sum) on GPU. * * \tparam Vector type vector being used for the scan. * \tparam Reduction lambda function defining the reduction operation * \tparam Flags array type containing zeros and ones defining the segments begining * * \param v input vector, the result of scan is stored in the same vector * \param flags is an array with zeros and ones defining the segments begining * \param begin the first element in the array to be scanned * \param end the last element in the array to be scanned * \param reduction lambda function implementing the reduction operation * \param zero is the idempotent element for the reduction operation, i.e. element which * does not change the result of the reduction. * * The reduction lambda function takes two variables which are supposed to be reduced: * * ``` * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... }; * ``` * * \par Example * * \include ReductionAndScan/SegmentedScanExample.cpp * * \par Output * * \include SegmentedScanExample.out * * **Note: Segmented scan is not implemented for HIP yet.** */ template< typename Vector, typename Reduction, typename Flags > static void perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ); }; } // namespace Algorithms } // namespace TNL Loading
src/TNL/Algorithms/Scan.hpp +109 −0 Original line number Diff line number Diff line Loading @@ -18,7 +18,9 @@ #include <TNL/Containers/Array.h> #include <TNL/Containers/StaticArray.h> #include <TNL/Algorithms/detail/CudaScanKernel.h> #include <TNL/Algorithms/detail/HipScanKernel.h> #include <TNL/Exceptions/CudaSupportMissing.h> #include <TNL/Exceptions/HipSupportMissing.h> #include <TNL/Exceptions/NotImplementedError.h> namespace TNL { Loading Loading @@ -212,6 +214,8 @@ performSecondPhase( Vector& v, #endif } ///// // CUDA specialization template< ScanType Type > template< typename Vector, typename Reduction > Loading Loading @@ -292,6 +296,88 @@ performSecondPhase( Vector& v, #endif } ///// // HIP specialization template< ScanType Type > template< typename Vector, typename Reduction > void Scan< Devices::Hip, Type >:: perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ) { #ifdef HAVE_HIP using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; detail::HipScanKernelLauncher< Type, RealType, IndexType >::perform( end - begin, &v.getData()[ begin ], // input &v.getData()[ begin ], // output reduction, zero ); #else throw Exceptions::HipSupportMissing(); #endif } template< ScanType Type > template< typename Vector, typename Reduction > auto Scan< Devices::Hip, Type >:: performFirstPhase( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ) { #ifdef HAVE_HIP using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; return detail::HipScanKernelLauncher< Type, RealType, IndexType >::performFirstPhase( end - begin, &v.getData()[ begin ], // input &v.getData()[ begin ], // output reduction, zero ); #else throw Exceptions::HipSupportMissing(); #endif } template< ScanType Type > template< typename Vector, typename BlockShifts, typename Reduction > void Scan< Devices::Hip, Type >:: performSecondPhase( Vector& v, const BlockShifts& blockShifts, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType shift ) { #ifdef HAVE_HIP using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; detail::HipScanKernelLauncher< Type, RealType, IndexType >::performSecondPhase( end - begin, &v.getData()[ begin ], // output blockShifts.getData(), reduction, shift ); #else throw Exceptions::HipSupportMissing(); #endif } template< ScanType Type > template< typename Vector, Loading Loading @@ -374,5 +460,28 @@ perform( Vector& v, #endif } template< ScanType Type > template< typename Vector, typename Reduction, typename Flags > void SegmentedScan< Devices::Hip, Type >:: perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::RealType zero ) { #ifdef HAVE_CUDA using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; throw Exceptions::NotImplementedError( "Segmented scan (prefix sum) is not implemented for HIP." ); #else throw Exceptions::HipSupportMissing(); #endif } } // namespace Algorithms } // namespace TNL
src/TNL/Algorithms/detail/CudaMultireductionKernel.h +3 −3 Original line number Diff line number Diff line Loading @@ -35,9 +35,9 @@ static constexpr int Multireduction_registersPerThread = 32; // empirically de // __CUDA_ARCH__ is defined only in device code! #if (__CUDA_ARCH__ == 750 ) // Turing has a limit of 1024 threads per multiprocessor static constexpr int Multireduction_minBlocksPerMultiprocessor = 4; static constexpr int cudaMultireduction_minBlocksPerMultiprocessor = 4; #else static constexpr int Multireduction_minBlocksPerMultiprocessor = 8; static constexpr int cudaMultireduction_minBlocksPerMultiprocessor = 8; #endif template< int blockSizeX, Loading @@ -46,7 +46,7 @@ template< int blockSizeX, typename Reduction, typename Index > __global__ void __launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor ) __launch_bounds__( Multireduction_maxThreadsPerBlock, cudaMultireduction_minBlocksPerMultiprocessor ) CudaMultireductionKernel( const Result zero, DataFetcher dataFetcher, const Reduction reduction, Loading