Moved segmented scan into its own header file under Algorithms (e347b486) · Commits · TNL / tnl-dev

src/TNL/Algorithms/Scan.h

+0 −178

Original line number	Diff line number	Diff line
		@@ -53,50 +53,6 @@ template< typename Device,
		ScanType Type = ScanType::Inclusive >
		struct Scan;

		/**
		* \brief Computes segmented scan (or prefix sum) on a vector.
		*
		* Segmented scan is a modification of the common scan. In this case the sequence of
		* numbers in hand is divided into segments like this, for example
		*
		* ```
		* [1,3,5][2,4,6,9][3,5][3,6,9,12,15]
		* ```
		*
		* and we want to compute the inclusive or exclusive scan of each segment separately.
		* For the inclusive segmented prefix sum we get
		*
		* ```
		* [1,4,9][2,6,12,21][3,8][3,9,18,30,45]
		* ```
		*
		* and for the exclusive segmented prefix sum it is
		*
		* ```
		* [0,1,4][0,2,6,12][0,3][0,3,9,18,30]
		* ```
		*
		* In addition to the input of the common scan, we need to encode the segments of the input sequence.
		* It is done by an auxiliary flags array (it can be an array of booleans) having `1` at the
		* beginning of each segment and `0` at all other positions. In our example, it would be like this:
		*
		* ```
		* [1,0,0,1,0,0,0,1,0,1,0,0, 0, 0]
		* [1,3,5,2,4,6,9,3,5,3,6,9,12,15]
		*
		* ```
		*
		* \tparam Device parameter says on what device the scan is going to be performed.
		* \tparam Type parameter says if the inclusive or exclusive scan is to be computed.
		*
		* See \ref SegmentedScan< Devices::Sequential, Type >, \ref SegmentedScan< Devices::Host, Type >
		* and \ref SegmentedScan< Devices::Cuda, Type >.
		*
		* Note: Segmented scan is not implemented for CUDA yet.
		*/
		template< typename Device,
		ScanType Type = ScanType::Inclusive >
		struct SegmentedScan;


		template< ScanType Type >
		struct Scan< Devices::Sequential, Type >
		{
		@@ -277,140 +233,6 @@ struct Scan< Devices::Cuda, Type >
		const typename Vector::ValueType zero );
		};

		template< ScanType Type >
		struct SegmentedScan< Devices::Sequential, Type >
		{
		/**
		* \brief Computes segmented scan (prefix sum) sequentially.
		*
		* \tparam Vector type of the vector being used for the scan.
		* \tparam Reduction type of the lambda function defining the reduction operation
		* \tparam Flags array type containing zeros and ones marking the beginning of each segment
		*
		* \param v input vector, the result of the scan is stored in the same vector
		* \param flags array with zeros and ones marking the beginning of each segment;
		* the flag at index \e begin is not read, the first scanned element always starts a segment
		* \param begin index of the first element to be scanned
		* \param end index one past the last element to be scanned, i.e. the half-open
		* range `[begin, end)` is processed
		* \param reduction lambda function implementing the reduction operation
		* \param zero the identity (neutral) element of the reduction operation, i.e. the element which
		* does not change the result of the reduction.
		*
		* The reduction lambda function takes two variables which are supposed to be reduced:
		*
		* ```
		* auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
		* ```
		*
		* \par Example
		*
		* \include ReductionAndScan/SegmentedScanExample.cpp
		*
		* \par Output
		*
		* \include SegmentedScanExample.out
		*/
		template< typename Vector,
		typename Reduction,
		typename Flags >
		static void
		perform( Vector& v,
		Flags& flags,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::ValueType zero );
		};

		template< ScanType Type >
		struct SegmentedScan< Devices::Host, Type >
		{
		/**
		* \brief Computes segmented scan (prefix sum) on the host.
		*
		* Note: the computation is not parallelized with OpenMP yet, the call is
		* forwarded to the sequential implementation.
		*
		* \tparam Vector type of the vector being used for the scan.
		* \tparam Reduction type of the lambda function defining the reduction operation
		* \tparam Flags array type containing zeros and ones marking the beginning of each segment
		*
		* \param v input vector, the result of the scan is stored in the same vector
		* \param flags array with zeros and ones marking the beginning of each segment
		* \param begin index of the first element to be scanned
		* \param end index one past the last element to be scanned, i.e. the half-open
		* range `[begin, end)` is processed
		* \param reduction lambda function implementing the reduction operation
		* \param zero the identity (neutral) element of the reduction operation, i.e. the element which
		* does not change the result of the reduction.
		*
		* The reduction lambda function takes two variables which are supposed to be reduced:
		*
		* ```
		* auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
		* ```
		*
		* \par Example
		*
		* \include ReductionAndScan/SegmentedScanExample.cpp
		*
		* \par Output
		*
		* \include SegmentedScanExample.out
		*/
		template< typename Vector,
		typename Reduction,
		typename Flags >
		static void
		perform( Vector& v,
		Flags& flags,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::ValueType zero );
		};

		template< ScanType Type >
		struct SegmentedScan< Devices::Cuda, Type >
		{
		/**
		* \brief Computes segmented scan (prefix sum) on GPU.
		*
		* \tparam Vector type of the vector being used for the scan.
		* \tparam Reduction type of the lambda function defining the reduction operation
		* \tparam Flags array type containing zeros and ones marking the beginning of each segment
		*
		* \param v input vector, the result of the scan is stored in the same vector
		* \param flags array with zeros and ones marking the beginning of each segment
		* \param begin index of the first element to be scanned
		* \param end index one past the last element to be scanned, i.e. the half-open
		* range `[begin, end)` is processed
		* \param reduction lambda function implementing the reduction operation
		* \param zero the identity (neutral) element of the reduction operation, i.e. the element which
		* does not change the result of the reduction.
		*
		* The reduction lambda function takes two variables which are supposed to be reduced:
		*
		* ```
		* auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
		* ```
		*
		* \par Example
		*
		* \include ReductionAndScan/SegmentedScanExample.cpp
		*
		* \par Output
		*
		* \include SegmentedScanExample.out
		*
		* Note: Segmented scan is not implemented for CUDA yet, calling this method
		* always throws an exception.
		*/
		template< typename Vector,
		typename Reduction,
		typename Flags >
		static void
		perform( Vector& v,
		Flags& flags,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::ValueType zero );
		};

		} // namespace Algorithms
		} // namespace TNL

src/TNL/Algorithms/Scan.hpp

+0 −83

Original line number	Diff line number	Diff line
		@@ -20,7 +20,6 @@
		#include <TNL/Containers/StaticArray.h>
		#include <TNL/Algorithms/detail/CudaScanKernel.h>
		#include <TNL/Exceptions/CudaSupportMissing.h>
		#include <TNL/Exceptions/NotImplementedError.h>

		namespace TNL {
		namespace Algorithms {
		@@ -306,87 +305,5 @@ performSecondPhase( Vector& v,
		#endif
		}


		// Sequential segmented scan over the half-open index range [begin, end).
		//
		// v         - input vector, overwritten in place with the scan result
		// flags     - non-zero at the index where a new segment starts; flags[ begin ]
		//             is never read, the first scanned element always starts a segment
		// reduction - binary operation, always called as
		//             reduction( accumulated value, current element )
		// zero      - identity element of the reduction, the first value of every
		//             segment in the exclusive scan
		template< ScanType Type >
		template< typename Vector,
		          typename Reduction,
		          typename Flags >
		void
		SegmentedScan< Devices::Sequential, Type >::
		perform( Vector& v,
		         Flags& flags,
		         const typename Vector::IndexType begin,
		         const typename Vector::IndexType end,
		         const Reduction& reduction,
		         const typename Vector::ValueType zero )
		{
		   using ValueType = typename Vector::ValueType;
		   using IndexType = typename Vector::IndexType;

		   // An empty range must not touch v[ begin ] - it may be out of bounds.
		   if( begin >= end )
		      return;

		   if( Type == ScanType::Inclusive )
		   {
		      for( IndexType i = begin + 1; i < end; i++ )
		         if( ! flags[ i ] )
		            // accumulated prefix goes first, the same operand order as in the exclusive branch
		            v[ i ] = reduction( v[ i - 1 ], v[ i ] );
		   }
		   else // Exclusive scan
		   {
		      // aux holds the reduction of all elements of the current segment seen so far
		      ValueType aux( v[ begin ] );
		      v[ begin ] = zero;
		      for( IndexType i = begin + 1; i < end; i++ )
		      {
		         const ValueType x = v[ i ];
		         if( flags[ i ] )
		            aux = zero;   // a new segment starts, drop the accumulated value
		         v[ i ] = aux;
		         aux = reduction( aux, x );
		      }
		   }
		}

		// Host segmented scan over the index range [begin, end).
		//
		// There is no OpenMP implementation yet, so the call is forwarded to the
		// sequential implementation regardless of whether HAVE_OPENMP is defined.
		template< ScanType Type >
		template< typename Vector,
		          typename Reduction,
		          typename Flags >
		void
		SegmentedScan< Devices::Host, Type >::
		perform( Vector& v,
		         Flags& flags,
		         const typename Vector::IndexType begin,
		         const typename Vector::IndexType end,
		         const Reduction& reduction,
		         const typename Vector::ValueType zero )
		{
		   // TODO: parallelize with OpenMP
		   SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, zero );
		}

		// CUDA segmented scan - not implemented yet.
		//
		// Always throws: Exceptions::NotImplementedError when built with HAVE_CUDA,
		// Exceptions::CudaSupportMissing otherwise. None of the arguments is used.
		template< ScanType Type >
		template< typename Vector,
		          typename Reduction,
		          typename Flags >
		void
		SegmentedScan< Devices::Cuda, Type >::
		perform( Vector& v,
		         Flags& flags,
		         const typename Vector::IndexType begin,
		         const typename Vector::IndexType end,
		         const Reduction& reduction,
		         const typename Vector::ValueType zero )
		{
		#ifdef HAVE_CUDA
		   throw Exceptions::NotImplementedError( "Segmented scan (prefix sum) is not implemented for CUDA." );
		#else
		   throw Exceptions::CudaSupportMissing();
		#endif
		}

		} // namespace Algorithms
		} // namespace TNL

src/TNL/Algorithms/SegmentedScan.h

0 → 100644

+204 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		SegmentedScan.h - description
		-------------------
		begin : May 9, 2019
		copyright : (C) 2019 by Tomas Oberhuber et al.
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/* See Copyright Notice in tnl/Copyright */

		// Implemented by: Tomas Oberhuber, Jakub Klinkovsky

		#pragma once

		#include <TNL/Devices/Sequential.h>
		#include <TNL/Devices/Host.h>
		#include <TNL/Devices/Cuda.h>

		#include "Scan.h" // only for the ScanType

		namespace TNL {
		namespace Algorithms {

		/**
		* \brief Computes segmented scan (or prefix sum) on a vector.
		*
		* Segmented scan is a modification of the common scan. In this case the sequence of
		* numbers in hand is divided into segments like this, for example
		*
		* ```
		* [1,3,5][2,4,6,9][3,5][3,6,9,12,15]
		* ```
		*
		* and we want to compute the inclusive or exclusive scan of each segment separately.
		* For the inclusive segmented prefix sum we get
		*
		* ```
		* [1,4,9][2,6,12,21][3,8][3,9,18,30,45]
		* ```
		*
		* and for the exclusive segmented prefix sum it is
		*
		* ```
		* [0,1,4][0,2,6,12][0,3][0,3,9,18,30]
		* ```
		*
		* In addition to the input of the common scan, we need to encode the segments of the input sequence.
		* It is done by an auxiliary flags array (it can be an array of booleans) having `1` at the
		* beginning of each segment and `0` at all other positions. In our example, it would be like this:
		*
		* ```
		* [1,0,0,1,0,0,0,1,0,1,0,0, 0, 0]
		* [1,3,5,2,4,6,9,3,5,3,6,9,12,15]
		*
		* ```
		*
		* \tparam Device parameter says on what device the scan is going to be performed.
		* \tparam Type parameter says if the inclusive or exclusive scan is to be computed.
		*
		* See \ref SegmentedScan< Devices::Sequential, Type >, \ref SegmentedScan< Devices::Host, Type >
		* and \ref SegmentedScan< Devices::Cuda, Type >.
		*
		* Note: Segmented scan is not implemented for CUDA yet.
		*/
		template< typename Device,
		ScanType Type = ScanType::Inclusive >
		struct SegmentedScan;

		template< ScanType Type >
		struct SegmentedScan< Devices::Sequential, Type >
		{
		/**
		* \brief Computes segmented scan (prefix sum) sequentially.
		*
		* \tparam Vector type of the vector being used for the scan.
		* \tparam Reduction type of the lambda function defining the reduction operation
		* \tparam Flags array type containing zeros and ones marking the beginning of each segment
		*
		* \param v input vector, the result of the scan is stored in the same vector
		* \param flags array with zeros and ones marking the beginning of each segment;
		* the flag at index \e begin is not read, the first scanned element always starts a segment
		* \param begin index of the first element to be scanned
		* \param end index one past the last element to be scanned, i.e. the half-open
		* range `[begin, end)` is processed
		* \param reduction lambda function implementing the reduction operation
		* \param zero the identity (neutral) element of the reduction operation, i.e. the element which
		* does not change the result of the reduction.
		*
		* The reduction lambda function takes two variables which are supposed to be reduced:
		*
		* ```
		* auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
		* ```
		*
		* \par Example
		*
		* \include ReductionAndScan/SegmentedScanExample.cpp
		*
		* \par Output
		*
		* \include SegmentedScanExample.out
		*/
		template< typename Vector,
		typename Reduction,
		typename Flags >
		static void
		perform( Vector& v,
		Flags& flags,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::ValueType zero );
		};

		template< ScanType Type >
		struct SegmentedScan< Devices::Host, Type >
		{
		/**
		* \brief Computes segmented scan (prefix sum) on the host.
		*
		* Note: the computation is not parallelized with OpenMP yet, the call is
		* forwarded to the sequential implementation.
		*
		* \tparam Vector type of the vector being used for the scan.
		* \tparam Reduction type of the lambda function defining the reduction operation
		* \tparam Flags array type containing zeros and ones marking the beginning of each segment
		*
		* \param v input vector, the result of the scan is stored in the same vector
		* \param flags array with zeros and ones marking the beginning of each segment
		* \param begin index of the first element to be scanned
		* \param end index one past the last element to be scanned, i.e. the half-open
		* range `[begin, end)` is processed
		* \param reduction lambda function implementing the reduction operation
		* \param zero the identity (neutral) element of the reduction operation, i.e. the element which
		* does not change the result of the reduction.
		*
		* The reduction lambda function takes two variables which are supposed to be reduced:
		*
		* ```
		* auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
		* ```
		*
		* \par Example
		*
		* \include ReductionAndScan/SegmentedScanExample.cpp
		*
		* \par Output
		*
		* \include SegmentedScanExample.out
		*/
		template< typename Vector,
		typename Reduction,
		typename Flags >
		static void
		perform( Vector& v,
		Flags& flags,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::ValueType zero );
		};

		template< ScanType Type >
		struct SegmentedScan< Devices::Cuda, Type >
		{
		/**
		* \brief Computes segmented scan (prefix sum) on GPU.
		*
		* \tparam Vector type of the vector being used for the scan.
		* \tparam Reduction type of the lambda function defining the reduction operation
		* \tparam Flags array type containing zeros and ones marking the beginning of each segment
		*
		* \param v input vector, the result of the scan is stored in the same vector
		* \param flags array with zeros and ones marking the beginning of each segment
		* \param begin index of the first element to be scanned
		* \param end index one past the last element to be scanned, i.e. the half-open
		* range `[begin, end)` is processed
		* \param reduction lambda function implementing the reduction operation
		* \param zero the identity (neutral) element of the reduction operation, i.e. the element which
		* does not change the result of the reduction.
		*
		* The reduction lambda function takes two variables which are supposed to be reduced:
		*
		* ```
		* auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
		* ```
		*
		* \par Example
		*
		* \include ReductionAndScan/SegmentedScanExample.cpp
		*
		* \par Output
		*
		* \include SegmentedScanExample.out
		*
		* Note: Segmented scan is not implemented for CUDA yet, calling this method
		* always throws an exception.
		*/
		template< typename Vector,
		typename Reduction,
		typename Flags >
		static void
		perform( Vector& v,
		Flags& flags,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::ValueType zero );
		};

		} // namespace Algorithms
		} // namespace TNL

		#include <TNL/Algorithms/SegmentedScan.hpp>

src/TNL/Algorithms/SegmentedScan.hpp

0 → 100644

+104 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		SegmentedScan.hpp - description
		-------------------
		begin : Mar 24, 2013
		copyright : (C) 2013 by Tomas Oberhuber et al.
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/* See Copyright Notice in tnl/Copyright */

		// Implemented by: Tomas Oberhuber, Jakub Klinkovsky

		#pragma once

		#include "SegmentedScan.h"

		#include <TNL/Exceptions/CudaSupportMissing.h>
		#include <TNL/Exceptions/NotImplementedError.h>

		namespace TNL {
		namespace Algorithms {

		// Sequential segmented scan over the half-open index range [begin, end).
		//
		// v         - input vector, overwritten in place with the scan result
		// flags     - non-zero at the index where a new segment starts; flags[ begin ]
		//             is never read, the first scanned element always starts a segment
		// reduction - binary operation, always called as
		//             reduction( accumulated value, current element )
		// zero      - identity element of the reduction, the first value of every
		//             segment in the exclusive scan
		template< ScanType Type >
		template< typename Vector,
		          typename Reduction,
		          typename Flags >
		void
		SegmentedScan< Devices::Sequential, Type >::
		perform( Vector& v,
		         Flags& flags,
		         const typename Vector::IndexType begin,
		         const typename Vector::IndexType end,
		         const Reduction& reduction,
		         const typename Vector::ValueType zero )
		{
		   using ValueType = typename Vector::ValueType;
		   using IndexType = typename Vector::IndexType;

		   // An empty range must not touch v[ begin ] - it may be out of bounds.
		   if( begin >= end )
		      return;

		   if( Type == ScanType::Inclusive )
		   {
		      for( IndexType i = begin + 1; i < end; i++ )
		         if( ! flags[ i ] )
		            // accumulated prefix goes first, the same operand order as in the exclusive branch
		            v[ i ] = reduction( v[ i - 1 ], v[ i ] );
		   }
		   else // Exclusive scan
		   {
		      // aux holds the reduction of all elements of the current segment seen so far
		      ValueType aux( v[ begin ] );
		      v[ begin ] = zero;
		      for( IndexType i = begin + 1; i < end; i++ )
		      {
		         const ValueType x = v[ i ];
		         if( flags[ i ] )
		            aux = zero;   // a new segment starts, drop the accumulated value
		         v[ i ] = aux;
		         aux = reduction( aux, x );
		      }
		   }
		}

		// Host segmented scan over the index range [begin, end).
		//
		// There is no OpenMP implementation yet, so the call is forwarded to the
		// sequential implementation regardless of whether HAVE_OPENMP is defined.
		template< ScanType Type >
		template< typename Vector,
		          typename Reduction,
		          typename Flags >
		void
		SegmentedScan< Devices::Host, Type >::
		perform( Vector& v,
		         Flags& flags,
		         const typename Vector::IndexType begin,
		         const typename Vector::IndexType end,
		         const Reduction& reduction,
		         const typename Vector::ValueType zero )
		{
		   // TODO: parallelize with OpenMP
		   SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, zero );
		}

		// CUDA segmented scan - not implemented yet.
		//
		// Always throws: Exceptions::NotImplementedError when built with HAVE_CUDA,
		// Exceptions::CudaSupportMissing otherwise. None of the arguments is used.
		template< ScanType Type >
		template< typename Vector,
		          typename Reduction,
		          typename Flags >
		void
		SegmentedScan< Devices::Cuda, Type >::
		perform( Vector& v,
		         Flags& flags,
		         const typename Vector::IndexType begin,
		         const typename Vector::IndexType end,
		         const Reduction& reduction,
		         const typename Vector::ValueType zero )
		{
		#ifdef HAVE_CUDA
		   throw Exceptions::NotImplementedError( "Segmented scan (prefix sum) is not implemented for CUDA." );
		#else
		   throw Exceptions::CudaSupportMissing();
		#endif
		}

		} // namespace Algorithms
		} // namespace TNL

src/TNL/Containers/VectorView.h

+1 −0

Original line number	Diff line number	Diff line
		@@ -15,6 +15,7 @@
		#include <TNL/Containers/ArrayView.h>
		#include <TNL/Containers/Expressions/ExpressionTemplates.h>
		#include <TNL/Algorithms/Scan.h>
		#include <TNL/Algorithms/SegmentedScan.h>

		namespace TNL {
		namespace Containers {