Loading src/TNL/Containers/Algorithms/Reduction.h +162 −0 Original line number Diff line number Diff line Loading @@ -22,12 +22,59 @@ namespace TNL { namespace Containers { namespace Algorithms { /** * \brief Reduction implements [(parallel) reduction](https://en.wikipedia.org/wiki/Reduce_(parallel_pattern)) for vectors and arrays. * * Reduction can be used for operations taking elements of one or more vectors (or arrays) as input and returning * one number (or element) as output. Some examples of such operations are vectors/arrays comparison, * vector norm, scalar product of two vectors or computing minimum or maximum. If one also needs to know the * position of the smallest or the largest element, reduction with argument can be used. * * \tparam Device this parameter says on what device the reduction is going to be performed. * * See \ref Reduction< Devices::Host > and \ref Reduction< Devices::Cuda >. */ template< typename Device > struct Reduction; template<> struct Reduction< Devices::Host > { /** * \brief Computes reduction on CPU. * * \tparam Index is a type for indexing. * \tparam Result is a type of the reduction result. * \tparam ReductionOperation is a lambda function performing the reduction. * \tparam DataFetcher is a lambda function for fetching the input data. * * \param size is the number of elements to be reduced. * \param reduction is a lambda function defining the reduction operation. * \param dataFetcher is a lambda function fetching the input data. * \param zero is the identity element for the reduction operation, i.e. the element which * does not change the result of the reduction. * \return result of the reduction * * The dataFetcher lambda function takes one argument which is the index of the element to be fetched: * * ``` * auto dataFetcher1 = [=] __cuda_callable__ ( Index i ) { return ... 
}; * ``` * * The reduction lambda function takes two variables which are supposed to be reduced: * * ``` * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... }; * ``` * * \par Example * * \include ReductionAndScan/SumExample.cpp * * \par Output * * \include SumExample.out */ template< typename Index, typename Result, typename ReductionOperation, Loading @@ -38,6 +85,46 @@ struct Reduction< Devices::Host > DataFetcher& dataFetcher, const Result& zero ); /** * \brief Computes reduction on CPU and returns the position of an element of interest. * * For example, in case of computing the minimal or maximal element in an array/vector, * the position of the element having the given value can be obtained. The use of this method * is, however, more flexible. * * \tparam Index is a type for indexing. * \tparam Result is a type of the reduction result. * \tparam ReductionOperation is a lambda function performing the reduction. * \tparam DataFetcher is a lambda function for fetching the input data. * * \param size is the number of elements to be reduced. * \param reduction is a lambda function defining the reduction operation and managing the element positions. * \param dataFetcher is a lambda function fetching the input data. * \param zero is the identity element for the reduction operation, i.e. the element which * does not change the result of the reduction. * \return result of the reduction in the form of a std::pair< Index, Result > structure. `pair.first` * is the element position and `pair.second` is the reduction result. * * The dataFetcher lambda function takes one argument which is the index of the element to be fetched: * * ``` * auto dataFetcher1 = [=] __cuda_callable__ ( Index i ) { return ... }; * ``` * * The reduction lambda function takes the positions and values of the two elements which are supposed to be reduced: * * ``` * auto reduction = [] __cuda_callable__ ( Index& aIdx, const Index& bIdx, const Result& a, const Result& b ) { return ... 
}; * ``` * * \par Example * * \include ReductionAndScan/ReductionWithArgument.cpp * * \par Output * * \include ReductionWithArgument.out */ template< typename Index, typename Result, typename ReductionOperation, Loading @@ -52,6 +139,41 @@ struct Reduction< Devices::Host > template<> struct Reduction< Devices::Cuda > { /** * \brief Computes reduction on GPU. * * \tparam Index is a type for indexing. * \tparam Result is a type of the reduction result. * \tparam ReductionOperation is a lambda function performing the reduction. * \tparam DataFetcher is a lambda function for fetching the input data. * * \param size is the number of elements to be reduced. * \param reduction is a lambda function defining the reduction operation. * \param dataFetcher is a lambda function fetching the input data. * \param zero is the identity element for the reduction operation, i.e. the element which * does not change the result of the reduction. * \return result of the reduction * * The dataFetcher lambda function takes one argument which is the index of the element to be fetched: * * ``` * auto dataFetcher1 = [=] __cuda_callable__ ( Index i ) { return ... }; * ``` * * The reduction lambda function takes two variables which are supposed to be reduced: * * ``` * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... }; * ``` * * \par Example * * \include ReductionAndScan/SumExample.cpp * * \par Output * * \include SumExample.out */ template< typename Index, typename Result, typename ReductionOperation, Loading @@ -62,6 +184,46 @@ struct Reduction< Devices::Cuda > DataFetcher& dataFetcher, const Result& zero ); /** * \brief Computes reduction on GPU and returns the position of an element of interest. * * For example, in case of computing the minimal or maximal element in an array/vector, * the position of the element having the given value can be obtained. The use of this method * is, however, more flexible. * * \tparam Index is a type for indexing. 
* \tparam Result is a type of the reduction result. * \tparam ReductionOperation is a lambda function performing the reduction. * \tparam DataFetcher is a lambda function for fetching the input data. * * \param size is the number of elements to be reduced. * \param reduction is a lambda function defining the reduction operation and managing the element positions. * \param dataFetcher is a lambda function fetching the input data. * \param zero is the identity element for the reduction operation, i.e. the element which * does not change the result of the reduction. * \return result of the reduction in the form of a std::pair< Index, Result > structure. `pair.first` * is the element position and `pair.second` is the reduction result. * * The dataFetcher lambda function takes one argument which is the index of the element to be fetched: * * ``` * auto dataFetcher1 = [=] __cuda_callable__ ( Index i ) { return ... }; * ``` * * The reduction lambda function takes the positions and values of the two elements which are supposed to be reduced: * * ``` * auto reduction = [] __cuda_callable__ ( Index& aIdx, const Index& bIdx, const Result& a, const Result& b ) { return ... }; * ``` * * \par Example * * \include ReductionAndScan/ReductionWithArgument.cpp * * \par Output * * \include ReductionWithArgument.out */ template< typename Index, typename Result, typename ReductionOperation, Loading Loading
src/TNL/Containers/Algorithms/Reduction.h +162 −0 Original line number Diff line number Diff line Loading @@ -22,12 +22,59 @@ namespace TNL { namespace Containers { namespace Algorithms { /** * \brief Reduction implements [(parallel) reduction](https://en.wikipedia.org/wiki/Reduce_(parallel_pattern)) for vectors and arrays. * * Reduction can be used for operations taking elements of one or more vectors (or arrays) as input and returning * one number (or element) as output. Some examples of such operations are vectors/arrays comparison, * vector norm, scalar product of two vectors or computing minimum or maximum. If one also needs to know the * position of the smallest or the largest element, reduction with argument can be used. * * \tparam Device this parameter says on what device the reduction is going to be performed. * * See \ref Reduction< Devices::Host > and \ref Reduction< Devices::Cuda >. */ template< typename Device > struct Reduction; template<> struct Reduction< Devices::Host > { /** * \brief Computes reduction on CPU. * * \tparam Index is a type for indexing. * \tparam Result is a type of the reduction result. * \tparam ReductionOperation is a lambda function performing the reduction. * \tparam DataFetcher is a lambda function for fetching the input data. * * \param size is the number of elements to be reduced. * \param reduction is a lambda function defining the reduction operation. * \param dataFetcher is a lambda function fetching the input data. * \param zero is the identity element for the reduction operation, i.e. the element which * does not change the result of the reduction. * \return result of the reduction * * The dataFetcher lambda function takes one argument which is the index of the element to be fetched: * * ``` * auto dataFetcher1 = [=] __cuda_callable__ ( Index i ) { return ... 
}; * ``` * * The reduction lambda function takes two variables which are supposed to be reduced: * * ``` * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... }; * ``` * * \par Example * * \include ReductionAndScan/SumExample.cpp * * \par Output * * \include SumExample.out */ template< typename Index, typename Result, typename ReductionOperation, Loading @@ -38,6 +85,46 @@ struct Reduction< Devices::Host > DataFetcher& dataFetcher, const Result& zero ); /** * \brief Computes reduction on CPU and returns the position of an element of interest. * * For example, in case of computing the minimal or maximal element in an array/vector, * the position of the element having the given value can be obtained. The use of this method * is, however, more flexible. * * \tparam Index is a type for indexing. * \tparam Result is a type of the reduction result. * \tparam ReductionOperation is a lambda function performing the reduction. * \tparam DataFetcher is a lambda function for fetching the input data. * * \param size is the number of elements to be reduced. * \param reduction is a lambda function defining the reduction operation and managing the element positions. * \param dataFetcher is a lambda function fetching the input data. * \param zero is the identity element for the reduction operation, i.e. the element which * does not change the result of the reduction. * \return result of the reduction in the form of a std::pair< Index, Result > structure. `pair.first` * is the element position and `pair.second` is the reduction result. * * The dataFetcher lambda function takes one argument which is the index of the element to be fetched: * * ``` * auto dataFetcher1 = [=] __cuda_callable__ ( Index i ) { return ... }; * ``` * * The reduction lambda function takes the positions and values of the two elements which are supposed to be reduced: * * ``` * auto reduction = [] __cuda_callable__ ( Index& aIdx, const Index& bIdx, const Result& a, const Result& b ) { return ... 
}; * ``` * * \par Example * * \include ReductionAndScan/ReductionWithArgument.cpp * * \par Output * * \include ReductionWithArgument.out */ template< typename Index, typename Result, typename ReductionOperation, Loading @@ -52,6 +139,41 @@ struct Reduction< Devices::Host > template<> struct Reduction< Devices::Cuda > { /** * \brief Computes reduction on GPU. * * \tparam Index is a type for indexing. * \tparam Result is a type of the reduction result. * \tparam ReductionOperation is a lambda function performing the reduction. * \tparam DataFetcher is a lambda function for fetching the input data. * * \param size is the number of elements to be reduced. * \param reduction is a lambda function defining the reduction operation. * \param dataFetcher is a lambda function fetching the input data. * \param zero is the identity element for the reduction operation, i.e. the element which * does not change the result of the reduction. * \return result of the reduction * * The dataFetcher lambda function takes one argument which is the index of the element to be fetched: * * ``` * auto dataFetcher1 = [=] __cuda_callable__ ( Index i ) { return ... }; * ``` * * The reduction lambda function takes two variables which are supposed to be reduced: * * ``` * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... }; * ``` * * \par Example * * \include ReductionAndScan/SumExample.cpp * * \par Output * * \include SumExample.out */ template< typename Index, typename Result, typename ReductionOperation, Loading @@ -62,6 +184,46 @@ struct Reduction< Devices::Cuda > DataFetcher& dataFetcher, const Result& zero ); /** * \brief Computes reduction on GPU and returns the position of an element of interest. * * For example, in case of computing the minimal or maximal element in an array/vector, * the position of the element having the given value can be obtained. The use of this method * is, however, more flexible. * * \tparam Index is a type for indexing. 
* \tparam Result is a type of the reduction result. * \tparam ReductionOperation is a lambda function performing the reduction. * \tparam DataFetcher is a lambda function for fetching the input data. * * \param size is the number of elements to be reduced. * \param reduction is a lambda function defining the reduction operation and managing the element positions. * \param dataFetcher is a lambda function fetching the input data. * \param zero is the identity element for the reduction operation, i.e. the element which * does not change the result of the reduction. * \return result of the reduction in the form of a std::pair< Index, Result > structure. `pair.first` * is the element position and `pair.second` is the reduction result. * * The dataFetcher lambda function takes one argument which is the index of the element to be fetched: * * ``` * auto dataFetcher1 = [=] __cuda_callable__ ( Index i ) { return ... }; * ``` * * The reduction lambda function takes the positions and values of the two elements which are supposed to be reduced: * * ``` * auto reduction = [] __cuda_callable__ ( Index& aIdx, const Index& bIdx, const Result& a, const Result& b ) { return ... }; * ``` * * \par Example * * \include ReductionAndScan/ReductionWithArgument.cpp * * \par Output * * \include ReductionWithArgument.out */ template< typename Index, typename Result, typename ReductionOperation, Loading