Commit e347b486 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Moved segmented scan into its own header file under Algorithms

Also moved the test under Algorithms and made sure it is actually
being compiled.
parent a4e15b08
Loading
Loading
Loading
Loading
+0 −178
Original line number Diff line number Diff line
@@ -53,50 +53,6 @@ template< typename Device,
          ScanType Type = ScanType::Inclusive >
struct Scan;

/**
 * \brief Computes segmented scan (or prefix sum) on a vector.
 *
 * Segmented scan is a modification of the common scan. The input sequence is
 * divided into segments, for example
 *
 * ```
 * [1,3,5][2,4,6,9][3,5][3,6,9,12,15]
 * ```
 *
 * and the inclusive or exclusive scan is computed independently within each
 * segment. For the inclusive segmented prefix sum we get
 *
 * ```
 * [1,4,9][2,6,12,21][3,8][3,9,18,30,45]
 * ```
 *
 * and for the exclusive segmented prefix sum we get
 *
 * ```
 * [0,1,4][0,2,6,12][0,3][0,3,9,18,30]
 * ```
 *
 * In addition to the common scan, the segments of the input sequence have to be
 * encoded. This is done by an auxiliary array of flags (it can be an array of
 * booleans) having `1` at the beginning of each segment and `0` at all other
 * positions. In our example, the flags (first row) and the input (second row) are:
 *
 * ```
 * [1,0,0,1,0,0,0,1,0,1,0,0, 0, 0]
 * [1,3,5,2,4,6,9,3,5,3,6,9,12,15]
 * ```
 *
 * \tparam Device specifies the device on which the scan is performed.
 * \tparam Type specifies whether the inclusive or the exclusive scan is computed.
 *
 * See \ref SegmentedScan< Devices::Sequential, Type >,
 * \ref SegmentedScan< Devices::Host, Type > and
 * \ref SegmentedScan< Devices::Cuda, Type >.
 *
 * **Note: Segmented scan is not implemented for CUDA yet.**
 */
template< typename Device,
          ScanType Type = ScanType::Inclusive >
struct SegmentedScan;


template< ScanType Type >
struct Scan< Devices::Sequential, Type >
{
@@ -277,140 +233,6 @@ struct Scan< Devices::Cuda, Type >
                       const typename Vector::ValueType zero );
};

template< ScanType Type >
struct SegmentedScan< Devices::Sequential, Type >
{
   /**
    * \brief Computes segmented scan (prefix sum) sequentially.
    *
    * \tparam Vector type of the vector being scanned.
    * \tparam Reduction type of the lambda function defining the reduction operation.
    * \tparam Flags type of the array of zeros and ones marking the beginnings of the segments.
    *
    * \param v input vector; the result of the scan is stored in the same vector
    * \param flags array having `1` at the beginning of each segment and `0` elsewhere
    * \param begin index of the first element to be scanned
    * \param end index one past the last element to be scanned, i.e. the
    *            half-open range [begin, end) is processed
    * \param reduction lambda function implementing the reduction operation
    * \param zero the identity (neutral) element of the reduction operation, i.e. the
    *             element which does not change the result of the reduction. In the
    *             exclusive scan it is stored at the first position of each segment.
    *
    * The element at position \e begin is always treated as the beginning of a
    * segment, its flag is not inspected.
    *
    * The reduction lambda function takes two variables which are supposed to be reduced:
    *
    * ```
    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
    * ```
    *
    * \par Example
    *
    * \include ReductionAndScan/SegmentedScanExample.cpp
    *
    * \par Output
    *
    * \include SegmentedScanExample.out
    */
   template< typename Vector,
             typename Reduction,
             typename Flags >
   static void
   perform( Vector& v,
            Flags& flags,
            const typename Vector::IndexType begin,
            const typename Vector::IndexType end,
            const Reduction& reduction,
            const typename Vector::ValueType zero );
};

template< ScanType Type >
struct SegmentedScan< Devices::Host, Type >
{
   /**
    * \brief Computes segmented scan (prefix sum) on the host.
    *
    * **Note: The OpenMP parallelization is not implemented yet, the computation
    * is delegated to the sequential implementation.**
    *
    * \tparam Vector type of the vector being scanned.
    * \tparam Reduction type of the lambda function defining the reduction operation.
    * \tparam Flags type of the array of zeros and ones marking the beginnings of the segments.
    *
    * \param v input vector; the result of the scan is stored in the same vector
    * \param flags array having `1` at the beginning of each segment and `0` elsewhere
    * \param begin index of the first element to be scanned
    * \param end index one past the last element to be scanned, i.e. the
    *            half-open range [begin, end) is processed
    * \param reduction lambda function implementing the reduction operation
    * \param zero the identity (neutral) element of the reduction operation, i.e. the
    *             element which does not change the result of the reduction.
    *
    * The reduction lambda function takes two variables which are supposed to be reduced:
    *
    * ```
    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
    * ```
    *
    * \par Example
    *
    * \include ReductionAndScan/SegmentedScanExample.cpp
    *
    * \par Output
    *
    * \include SegmentedScanExample.out
    */
   template< typename Vector,
             typename Reduction,
             typename Flags >
   static void
   perform( Vector& v,
            Flags& flags,
            const typename Vector::IndexType begin,
            const typename Vector::IndexType end,
            const Reduction& reduction,
            const typename Vector::ValueType zero );
};

template< ScanType Type >
struct SegmentedScan< Devices::Cuda, Type >
{
   /**
    * \brief Computes segmented scan (prefix sum) on GPU.
    *
    * **Note: Segmented scan is not implemented for CUDA yet.** The call always
    * throws: `Exceptions::NotImplementedError` when compiled with CUDA support
    * and `Exceptions::CudaSupportMissing` otherwise.
    *
    * \tparam Vector type of the vector being scanned.
    * \tparam Reduction type of the lambda function defining the reduction operation.
    * \tparam Flags type of the array of zeros and ones marking the beginnings of the segments.
    *
    * \param v input vector; the result of the scan is stored in the same vector
    * \param flags array having `1` at the beginning of each segment and `0` elsewhere
    * \param begin index of the first element to be scanned
    * \param end end of the range to be scanned
    * \param reduction lambda function implementing the reduction operation
    * \param zero the identity (neutral) element of the reduction operation, i.e. the
    *             element which does not change the result of the reduction.
    *
    * The reduction lambda function takes two variables which are supposed to be reduced:
    *
    * ```
    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
    * ```
    *
    * \par Example
    *
    * \include ReductionAndScan/SegmentedScanExample.cpp
    *
    * \par Output
    *
    * \include SegmentedScanExample.out
    */
   template< typename Vector,
             typename Reduction,
             typename Flags >
   static void
   perform( Vector& v,
            Flags& flags,
            const typename Vector::IndexType begin,
            const typename Vector::IndexType end,
            const Reduction& reduction,
            const typename Vector::ValueType zero );
};

} // namespace Algorithms
} // namespace TNL

+0 −83
Original line number Diff line number Diff line
@@ -20,7 +20,6 @@
#include <TNL/Containers/StaticArray.h>
#include <TNL/Algorithms/detail/CudaScanKernel.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
#include <TNL/Exceptions/NotImplementedError.h>

namespace TNL {
namespace Algorithms {
@@ -306,87 +305,5 @@ performSecondPhase( Vector& v,
#endif
}


/**
 * Sequential implementation of the segmented scan on the range [begin, end).
 *
 * The scan is computed in place in `v`. A non-zero `flags[ i ]` marks the
 * beginning of a new segment at position `i`. The element at `begin` is
 * always treated as the beginning of a segment, its flag is not inspected.
 * An empty range (`end <= begin`) is a no-op.
 */
template< ScanType Type >
   template< typename Vector,
             typename Reduction,
             typename Flags >
void
SegmentedScan< Devices::Sequential, Type >::
perform( Vector& v,
         Flags& flags,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType end,
         const Reduction& reduction,
         const typename Vector::ValueType zero )
{
   using ValueType = typename Vector::ValueType;
   using IndexType = typename Vector::IndexType;

   // Nothing to do for an empty range. Without this guard the exclusive branch
   // would read and overwrite v[ begin ], which lies outside of [begin, end).
   if( end <= begin )
      return;

   if( Type == ScanType::Inclusive )
   {
      // Elements starting a segment stay untouched, all other elements are
      // combined with their already scanned predecessor.
      // NOTE(review): the current element is passed as the first argument here,
      // whereas the exclusive branch passes the accumulator first. Both agree
      // only for commutative reductions - confirm that this is intended.
      for( IndexType i = begin + 1; i < end; i++ )
         if( ! flags[ i ] )
            v[ i ] = reduction( v[ i ], v[ i - 1 ] );
   }
   else // Exclusive scan
   {
      // aux accumulates the elements of the current segment preceding position i
      ValueType aux( v[ begin ] );
      v[ begin ] = zero;
      for( IndexType i = begin + 1; i < end; i++ )
      {
         // save the input value before it is overwritten by the result
         ValueType x = v[ i ];
         // a new segment starts here, restart the accumulation
         if( flags[ i ] )
            aux = zero;
         v[ i ] = aux;
         aux = reduction( aux, x );
      }
   }
}

/**
 * Host implementation of the segmented scan.
 *
 * There is no OpenMP version yet, so the work is forwarded to the sequential
 * implementation regardless of whether OpenMP is enabled.
 */
template< ScanType Type >
   template< typename Vector,
             typename Reduction,
             typename Flags >
void
SegmentedScan< Devices::Host, Type >::
perform( Vector& v,
         Flags& flags,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType end,
         const Reduction& reduction,
         const typename Vector::ValueType zero )
{
   // TODO: parallelize with OpenMP
   using SequentialScan = SegmentedScan< Devices::Sequential, Type >;
   SequentialScan::perform( v, flags, begin, end, reduction, zero );
}

/**
 * CUDA placeholder for the segmented scan.
 *
 * The algorithm is not implemented for CUDA, so the function never returns
 * normally: it throws Exceptions::NotImplementedError when compiled with
 * HAVE_CUDA and Exceptions::CudaSupportMissing otherwise. None of the
 * arguments is accessed.
 */
template< ScanType Type >
   template< typename Vector,
             typename Reduction,
             typename Flags >
void
SegmentedScan< Devices::Cuda, Type >::
perform( Vector& v,
         Flags& flags,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType end,
         const Reduction& reduction,
         const typename Vector::ValueType zero )
{
#ifdef HAVE_CUDA
   throw Exceptions::NotImplementedError( "Segmented scan (prefix sum) is not implemented for CUDA." );
#else
   throw Exceptions::CudaSupportMissing();
#endif
}

} // namespace Algorithms
} // namespace TNL
+204 −0
Original line number Diff line number Diff line
/***************************************************************************
                          SegmentedScan.h  -  description
                             -------------------
    begin                : May 9, 2019
    copyright            : (C) 2019 by Tomas Oberhuber et al.
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Tomas Oberhuber, Jakub Klinkovsky

#pragma once

#include <TNL/Devices/Sequential.h>
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>

#include "Scan.h"  // only for the ScanType

namespace TNL {
namespace Algorithms {

/**
 * \brief Computes segmented scan (or prefix sum) on a vector.
 *
 * Segmented scan is a modification of the common scan. The input sequence is
 * divided into segments, for example
 *
 * ```
 * [1,3,5][2,4,6,9][3,5][3,6,9,12,15]
 * ```
 *
 * and the inclusive or exclusive scan is computed independently within each
 * segment. For the inclusive segmented prefix sum we get
 *
 * ```
 * [1,4,9][2,6,12,21][3,8][3,9,18,30,45]
 * ```
 *
 * and for the exclusive segmented prefix sum we get
 *
 * ```
 * [0,1,4][0,2,6,12][0,3][0,3,9,18,30]
 * ```
 *
 * In addition to the common scan, the segments of the input sequence have to be
 * encoded. This is done by an auxiliary array of flags (it can be an array of
 * booleans) having `1` at the beginning of each segment and `0` at all other
 * positions. In our example, the flags (first row) and the input (second row) are:
 *
 * ```
 * [1,0,0,1,0,0,0,1,0,1,0,0, 0, 0]
 * [1,3,5,2,4,6,9,3,5,3,6,9,12,15]
 * ```
 *
 * \tparam Device specifies the device on which the scan is performed.
 * \tparam Type specifies whether the inclusive or the exclusive scan is computed.
 *
 * See \ref SegmentedScan< Devices::Sequential, Type >,
 * \ref SegmentedScan< Devices::Host, Type > and
 * \ref SegmentedScan< Devices::Cuda, Type >.
 *
 * **Note: Segmented scan is not implemented for CUDA yet.**
 */
template< typename Device,
          ScanType Type = ScanType::Inclusive >
struct SegmentedScan;

template< ScanType Type >
struct SegmentedScan< Devices::Sequential, Type >
{
   /**
    * \brief Computes segmented scan (prefix sum) sequentially.
    *
    * \tparam Vector type of the vector being scanned.
    * \tparam Reduction type of the lambda function defining the reduction operation.
    * \tparam Flags type of the array of zeros and ones marking the beginnings of the segments.
    *
    * \param v input vector; the result of the scan is stored in the same vector
    * \param flags array having `1` at the beginning of each segment and `0` elsewhere
    * \param begin index of the first element to be scanned
    * \param end index one past the last element to be scanned, i.e. the
    *            half-open range [begin, end) is processed
    * \param reduction lambda function implementing the reduction operation
    * \param zero the identity (neutral) element of the reduction operation, i.e. the
    *             element which does not change the result of the reduction. In the
    *             exclusive scan it is stored at the first position of each segment.
    *
    * The element at position \e begin is always treated as the beginning of a
    * segment, its flag is not inspected.
    *
    * The reduction lambda function takes two variables which are supposed to be reduced:
    *
    * ```
    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
    * ```
    *
    * \par Example
    *
    * \include ReductionAndScan/SegmentedScanExample.cpp
    *
    * \par Output
    *
    * \include SegmentedScanExample.out
    */
   template< typename Vector,
             typename Reduction,
             typename Flags >
   static void
   perform( Vector& v,
            Flags& flags,
            const typename Vector::IndexType begin,
            const typename Vector::IndexType end,
            const Reduction& reduction,
            const typename Vector::ValueType zero );
};

template< ScanType Type >
struct SegmentedScan< Devices::Host, Type >
{
   /**
    * \brief Computes segmented scan (prefix sum) on the host.
    *
    * **Note: The OpenMP parallelization is not implemented yet, the computation
    * is delegated to the sequential implementation.**
    *
    * \tparam Vector type of the vector being scanned.
    * \tparam Reduction type of the lambda function defining the reduction operation.
    * \tparam Flags type of the array of zeros and ones marking the beginnings of the segments.
    *
    * \param v input vector; the result of the scan is stored in the same vector
    * \param flags array having `1` at the beginning of each segment and `0` elsewhere
    * \param begin index of the first element to be scanned
    * \param end index one past the last element to be scanned, i.e. the
    *            half-open range [begin, end) is processed
    * \param reduction lambda function implementing the reduction operation
    * \param zero the identity (neutral) element of the reduction operation, i.e. the
    *             element which does not change the result of the reduction.
    *
    * The reduction lambda function takes two variables which are supposed to be reduced:
    *
    * ```
    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
    * ```
    *
    * \par Example
    *
    * \include ReductionAndScan/SegmentedScanExample.cpp
    *
    * \par Output
    *
    * \include SegmentedScanExample.out
    */
   template< typename Vector,
             typename Reduction,
             typename Flags >
   static void
   perform( Vector& v,
            Flags& flags,
            const typename Vector::IndexType begin,
            const typename Vector::IndexType end,
            const Reduction& reduction,
            const typename Vector::ValueType zero );
};

template< ScanType Type >
struct SegmentedScan< Devices::Cuda, Type >
{
   /**
    * \brief Computes segmented scan (prefix sum) on GPU.
    *
    * **Note: Segmented scan is not implemented for CUDA yet.** The call always
    * throws: `Exceptions::NotImplementedError` when compiled with CUDA support
    * and `Exceptions::CudaSupportMissing` otherwise.
    *
    * \tparam Vector type of the vector being scanned.
    * \tparam Reduction type of the lambda function defining the reduction operation.
    * \tparam Flags type of the array of zeros and ones marking the beginnings of the segments.
    *
    * \param v input vector; the result of the scan is stored in the same vector
    * \param flags array having `1` at the beginning of each segment and `0` elsewhere
    * \param begin index of the first element to be scanned
    * \param end end of the range to be scanned
    * \param reduction lambda function implementing the reduction operation
    * \param zero the identity (neutral) element of the reduction operation, i.e. the
    *             element which does not change the result of the reduction.
    *
    * The reduction lambda function takes two variables which are supposed to be reduced:
    *
    * ```
    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
    * ```
    *
    * \par Example
    *
    * \include ReductionAndScan/SegmentedScanExample.cpp
    *
    * \par Output
    *
    * \include SegmentedScanExample.out
    */
   template< typename Vector,
             typename Reduction,
             typename Flags >
   static void
   perform( Vector& v,
            Flags& flags,
            const typename Vector::IndexType begin,
            const typename Vector::IndexType end,
            const Reduction& reduction,
            const typename Vector::ValueType zero );
};

} // namespace Algorithms
} // namespace TNL

#include <TNL/Algorithms/SegmentedScan.hpp>
+104 −0
Original line number Diff line number Diff line
/***************************************************************************
                          SegmentedScan.hpp  -  description
                             -------------------
    begin                : Mar 24, 2013
    copyright            : (C) 2013 by Tomas Oberhuber et al.
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Tomas Oberhuber, Jakub Klinkovsky

#pragma once

#include "SegmentedScan.h"

#include <TNL/Exceptions/CudaSupportMissing.h>
#include <TNL/Exceptions/NotImplementedError.h>

namespace TNL {
namespace Algorithms {

/**
 * Sequential implementation of the segmented scan on the range [begin, end).
 *
 * The scan is computed in place in `v`. A non-zero `flags[ i ]` marks the
 * beginning of a new segment at position `i`. The element at `begin` is
 * always treated as the beginning of a segment, its flag is not inspected.
 * An empty range (`end <= begin`) is a no-op.
 */
template< ScanType Type >
   template< typename Vector,
             typename Reduction,
             typename Flags >
void
SegmentedScan< Devices::Sequential, Type >::
perform( Vector& v,
         Flags& flags,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType end,
         const Reduction& reduction,
         const typename Vector::ValueType zero )
{
   using ValueType = typename Vector::ValueType;
   using IndexType = typename Vector::IndexType;

   // Nothing to do for an empty range. Without this guard the exclusive branch
   // would read and overwrite v[ begin ], which lies outside of [begin, end).
   if( end <= begin )
      return;

   if( Type == ScanType::Inclusive )
   {
      // Elements starting a segment stay untouched, all other elements are
      // combined with their already scanned predecessor.
      // NOTE(review): the current element is passed as the first argument here,
      // whereas the exclusive branch passes the accumulator first. Both agree
      // only for commutative reductions - confirm that this is intended.
      for( IndexType i = begin + 1; i < end; i++ )
         if( ! flags[ i ] )
            v[ i ] = reduction( v[ i ], v[ i - 1 ] );
   }
   else // Exclusive scan
   {
      // aux accumulates the elements of the current segment preceding position i
      ValueType aux( v[ begin ] );
      v[ begin ] = zero;
      for( IndexType i = begin + 1; i < end; i++ )
      {
         // save the input value before it is overwritten by the result
         ValueType x = v[ i ];
         // a new segment starts here, restart the accumulation
         if( flags[ i ] )
            aux = zero;
         v[ i ] = aux;
         aux = reduction( aux, x );
      }
   }
}

/**
 * Host implementation of the segmented scan.
 *
 * There is no OpenMP version yet, so the work is forwarded to the sequential
 * implementation regardless of whether OpenMP is enabled.
 */
template< ScanType Type >
   template< typename Vector,
             typename Reduction,
             typename Flags >
void
SegmentedScan< Devices::Host, Type >::
perform( Vector& v,
         Flags& flags,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType end,
         const Reduction& reduction,
         const typename Vector::ValueType zero )
{
   // TODO: parallelize with OpenMP
   using SequentialScan = SegmentedScan< Devices::Sequential, Type >;
   SequentialScan::perform( v, flags, begin, end, reduction, zero );
}

/**
 * CUDA placeholder for the segmented scan.
 *
 * The algorithm is not implemented for CUDA, so the function never returns
 * normally: it throws Exceptions::NotImplementedError when compiled with
 * HAVE_CUDA and Exceptions::CudaSupportMissing otherwise. None of the
 * arguments is accessed.
 */
template< ScanType Type >
   template< typename Vector,
             typename Reduction,
             typename Flags >
void
SegmentedScan< Devices::Cuda, Type >::
perform( Vector& v,
         Flags& flags,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType end,
         const Reduction& reduction,
         const typename Vector::ValueType zero )
{
#ifdef HAVE_CUDA
   throw Exceptions::NotImplementedError( "Segmented scan (prefix sum) is not implemented for CUDA." );
#else
   throw Exceptions::CudaSupportMissing();
#endif
}

} // namespace Algorithms
} // namespace TNL
+1 −0
Original line number Diff line number Diff line
@@ -15,6 +15,7 @@
#include <TNL/Containers/ArrayView.h>
#include <TNL/Containers/Expressions/ExpressionTemplates.h>
#include <TNL/Algorithms/Scan.h>
#include <TNL/Algorithms/SegmentedScan.h>

namespace TNL {
namespace Containers {
Loading