Commit a85da5ed authored by Tomáš Oberhuber's avatar Tomáš Oberhuber Committed by Jakub Klinkovský
Browse files

Added scan and multireduction for HIP.

parent a9a566f7
Loading
Loading
Loading
Loading
+30 −0
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@
#include <TNL/Devices/Sequential.h>
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Devices/Hip.h>

namespace TNL {
namespace Algorithms {
@@ -111,6 +112,35 @@ struct Multireduction< Devices::Cuda >
           Result* hostResult );
};

template<>
struct Multireduction< Devices::Hip >
{
   /**
    * Parameters:
    *    zero: starting value for reduction
    *    dataFetcher: callable object such that `dataFetcher( i, j )` yields
    *                 the i-th value to be reduced from the j-th dataset
    *                 (i = 0,...,size-1; j = 0,...,n-1)
    *    reduction: callable object representing the reduction operation
    *               for example, it can be an instance of std::plus, std::logical_and,
    *               std::logical_or etc.
    *    size: the size of each dataset
    *    n: number of datasets to be reduced
    *    hostResult: output array of size = n
    */
   template< typename Result,
             typename DataFetcher,
             typename Reduction,
             typename Index >
   static void
   reduce( const Result zero,
           DataFetcher dataFetcher,
           const Reduction reduction,
           const Index size,
           const int n,
           Result* hostResult );
};

} // namespace Algorithms
} // namespace TNL

+57 −1
Original line number Diff line number Diff line
@@ -15,13 +15,15 @@
#include <memory>  // std::unique_ptr

//#define CUDA_REDUCTION_PROFILING
//#define HIP_REDUCTION_PROFILING

#include <TNL/Assert.h>
#include <TNL/Algorithms/Multireduction.h>
#include <TNL/Algorithms/MultiDeviceMemoryOperations.h>
#include <TNL/Algorithms/detail/CudaMultireductionKernel.h>
#include <TNL/Algorithms/detail/HipMultireductionKernel.h>

#ifdef CUDA_REDUCTION_PROFILING
#if defined CUDA_REDUCTION_PROFILING || defined HIP_REDUCTION_PROFILING
#include <TNL/Timer.h>
#include <iostream>
#endif
@@ -242,5 +244,59 @@ reduce( const Result zero,
   #endif
};

template< typename Result,
          typename DataFetcher,
          typename Reduction,
          typename Index >
void
Multireduction< Devices::Hip >::
reduce( const Result zero,
        DataFetcher dataFetcher,
        const Reduction reduction,
        const Index size,
        const int n,
        Result* hostResult )
{
   TNL_ASSERT_GT( size, 0, "The size of datasets must be positive." );
   TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." );

   #ifdef HIP_REDUCTION_PROFILING
      Timer timer;
      timer.reset();
      timer.start();
   #endif

   // start the reduction on the GPU
   Result* deviceAux1 = nullptr;
   const int reducedSize = detail::HipMultireductionKernelLauncher( zero, dataFetcher, reduction, size, n, deviceAux1 );

   #ifdef HIP_REDUCTION_PROFILING
      timer.stop();
      std::cout << "   Multireduction of " << n << " datasets on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl;
      timer.reset();
      timer.start();
   #endif

   // transfer the reduced data from device to host
   std::unique_ptr< Result[] > resultArray{ new Result[ n * reducedSize ] };
   MultiDeviceMemoryOperations< void, Devices::Hip >::copy( resultArray.get(), deviceAux1, n * reducedSize );

   #ifdef HIP_REDUCTION_PROFILING
      timer.stop();
      std::cout << "   Transferring data to CPU took " << timer.getRealTime() << " sec. " << std::endl;
      timer.reset();
      timer.start();
   #endif

   // finish the reduction on the host
   auto dataFetcherFinish = [&] ( int i, int k ) { return resultArray[ i + k * reducedSize ]; };
   Multireduction< Devices::Sequential >::reduce( zero, dataFetcherFinish, reduction, reducedSize, n, hostResult );

   #ifdef HIP_REDUCTION_PROFILING
      timer.stop();
      std::cout << "   Multireduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl;
   #endif
};

} // namespace Algorithms
} // namespace TNL
+107 −0
Original line number Diff line number Diff line
@@ -15,6 +15,7 @@
#include <TNL/Devices/Sequential.h>
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Devices/Hip.h>

namespace TNL {
namespace Algorithms {
@@ -277,6 +278,66 @@ struct Scan< Devices::Cuda, Type >
                       const typename Vector::RealType shift );
};

template< ScanType Type >
struct Scan< Devices::Hip, Type >
{
   /**
    * \brief Computes scan (prefix sum) on GPU.
    *
    * \tparam Vector type vector being used for the scan.
    * \tparam Reduction lambda function defining the reduction operation
    *
    * \param v input vector, the result of scan is stored in the same vector
    * \param begin the first element in the array to be scanned
    * \param end the last element in the array to be scanned
    * \param reduction lambda function implementing the reduction operation
    * \param zero is the idempotent element for the reduction operation, i.e. element which
    *             does not change the result of the reduction.
    *
    * The reduction lambda function takes two variables which are supposed to be reduced:
    *
    * ```
    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
    * ```
    *
    * \par Example
    *
    * \include ReductionAndScan/ScanExample.cpp
    *
    * \par Output
    *
    * \include ScanExample.out
    */
   template< typename Vector,
             typename Reduction >
   static void
   perform( Vector& v,
            const typename Vector::IndexType begin,
            const typename Vector::IndexType end,
            const Reduction& reduction,
            const typename Vector::RealType zero );

   template< typename Vector,
             typename Reduction >
   static auto
   performFirstPhase( Vector& v,
                      const typename Vector::IndexType begin,
                      const typename Vector::IndexType end,
                      const Reduction& reduction,
                      const typename Vector::RealType zero );

   template< typename Vector,
             typename BlockShifts,
             typename Reduction >
   static void
   performSecondPhase( Vector& v,
                       const BlockShifts& blockShifts,
                       const typename Vector::IndexType begin,
                       const typename Vector::IndexType end,
                       const Reduction& reduction,
                       const typename Vector::RealType shift );
};

template< ScanType Type >
struct SegmentedScan< Devices::Sequential, Type >
{
@@ -411,6 +472,52 @@ struct SegmentedScan< Devices::Cuda, Type >
            const typename Vector::RealType zero );
};

template< ScanType Type >
struct SegmentedScan< Devices::Hip, Type >
{
   /**
    * \brief Computes segmented scan (prefix sum) on GPU.
    *
    * \tparam Vector type vector being used for the scan.
    * \tparam Reduction lambda function defining the reduction operation
    * \tparam Flags array type containing zeros and ones defining the segments begining
    *
    * \param v input vector, the result of scan is stored in the same vector
    * \param flags is an array with zeros and ones defining the segments begining
    * \param begin the first element in the array to be scanned
    * \param end the last element in the array to be scanned
    * \param reduction lambda function implementing the reduction operation
    * \param zero is the idempotent element for the reduction operation, i.e. element which
    *             does not change the result of the reduction.
    *
    * The reduction lambda function takes two variables which are supposed to be reduced:
    *
    * ```
    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
    * ```
    *
    * \par Example
    *
    * \include ReductionAndScan/SegmentedScanExample.cpp
    *
    * \par Output
    *
    * \include SegmentedScanExample.out
    *
    * **Note: Segmented scan is not implemented for HIP yet.**
    */
   template< typename Vector,
             typename Reduction,
             typename Flags >
   static void
   perform( Vector& v,
            Flags& flags,
            const typename Vector::IndexType begin,
            const typename Vector::IndexType end,
            const Reduction& reduction,
            const typename Vector::RealType zero );
};

} // namespace Algorithms
} // namespace TNL

+109 −0
Original line number Diff line number Diff line
@@ -18,7 +18,9 @@
#include <TNL/Containers/Array.h>
#include <TNL/Containers/StaticArray.h>
#include <TNL/Algorithms/detail/CudaScanKernel.h>
#include <TNL/Algorithms/detail/HipScanKernel.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
#include <TNL/Exceptions/HipSupportMissing.h>
#include <TNL/Exceptions/NotImplementedError.h>

namespace TNL {
@@ -212,6 +214,8 @@ performSecondPhase( Vector& v,
#endif
}

/////
// CUDA specialization
template< ScanType Type >
   template< typename Vector,
             typename Reduction >
@@ -292,6 +296,88 @@ performSecondPhase( Vector& v,
#endif
}

/////
// HIP specialization
template< ScanType Type >
   template< typename Vector,
             typename Reduction >
void
Scan< Devices::Hip, Type >::
perform( Vector& v,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType end,
         const Reduction& reduction,
         const typename Vector::RealType zero )
{
#ifdef HAVE_HIP
   using RealType = typename Vector::RealType;
   using IndexType = typename Vector::IndexType;

   detail::HipScanKernelLauncher< Type, RealType, IndexType >::perform(
      end - begin,
      &v.getData()[ begin ],  // input
      &v.getData()[ begin ],  // output
      reduction,
      zero );
#else
   throw Exceptions::HipSupportMissing();
#endif
}

template< ScanType Type >
   template< typename Vector,
             typename Reduction >
auto
Scan< Devices::Hip, Type >::
performFirstPhase( Vector& v,
                   const typename Vector::IndexType begin,
                   const typename Vector::IndexType end,
                   const Reduction& reduction,
                   const typename Vector::RealType zero )
{
#ifdef HAVE_HIP
   using RealType = typename Vector::RealType;
   using IndexType = typename Vector::IndexType;

   return detail::HipScanKernelLauncher< Type, RealType, IndexType >::performFirstPhase(
      end - begin,
      &v.getData()[ begin ],  // input
      &v.getData()[ begin ],  // output
      reduction,
      zero );
#else
   throw Exceptions::HipSupportMissing();
#endif
}

template< ScanType Type >
   template< typename Vector,
             typename BlockShifts,
             typename Reduction >
void
Scan< Devices::Hip, Type >::
performSecondPhase( Vector& v,
                    const BlockShifts& blockShifts,
                    const typename Vector::IndexType begin,
                    const typename Vector::IndexType end,
                    const Reduction& reduction,
                    const typename Vector::RealType shift )
{
#ifdef HAVE_HIP
   using RealType = typename Vector::RealType;
   using IndexType = typename Vector::IndexType;

   detail::HipScanKernelLauncher< Type, RealType, IndexType >::performSecondPhase(
      end - begin,
      &v.getData()[ begin ],  // output
      blockShifts.getData(),
      reduction,
      shift );
#else
   throw Exceptions::HipSupportMissing();
#endif
}


template< ScanType Type >
   template< typename Vector,
@@ -374,5 +460,28 @@ perform( Vector& v,
#endif
}

template< ScanType Type >
   template< typename Vector,
             typename Reduction,
             typename Flags >
void
SegmentedScan< Devices::Hip, Type >::
perform( Vector& v,
         Flags& flags,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType end,
         const Reduction& reduction,
         const typename Vector::RealType zero )
{
#ifdef HAVE_CUDA
   using RealType = typename Vector::RealType;
   using IndexType = typename Vector::IndexType;

   throw Exceptions::NotImplementedError( "Segmented scan (prefix sum) is not implemented for HIP." );
#else
   throw Exceptions::HipSupportMissing();
#endif
}

} // namespace Algorithms
} // namespace TNL
+3 −3
Original line number Diff line number Diff line
@@ -35,9 +35,9 @@ static constexpr int Multireduction_registersPerThread = 32; // empirically de
// __CUDA_ARCH__ is defined only in device code!
#if (__CUDA_ARCH__ == 750 )
   // Turing has a limit of 1024 threads per multiprocessor
   static constexpr int Multireduction_minBlocksPerMultiprocessor = 4;
   static constexpr int cudaMultireduction_minBlocksPerMultiprocessor = 4;
#else
   static constexpr int Multireduction_minBlocksPerMultiprocessor = 8;
   static constexpr int cudaMultireduction_minBlocksPerMultiprocessor = 8;
#endif

template< int blockSizeX,
@@ -46,7 +46,7 @@ template< int blockSizeX,
          typename Reduction,
          typename Index >
__global__ void
__launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor )
__launch_bounds__( Multireduction_maxThreadsPerBlock, cudaMultireduction_minBlocksPerMultiprocessor )
CudaMultireductionKernel( const Result zero,
                          DataFetcher dataFetcher,
                          const Reduction reduction,
Loading