Commit 0468914d authored by Tomáš Oberhuber's avatar Tomáš Oberhuber

Writing tutorials on reduction and scan.

parent 92dc4a47
add_subdirectory( Arrays )
add_subdirectory( Vectors )
add_subdirectory( Reduction )
\ No newline at end of file
add_subdirectory( ReductionAndScan )
\ No newline at end of file
@@ -10,16 +10,17 @@ IF( BUILD_CUDA )
CUDA_ADD_EXECUTABLE( ComparisonExample ComparisonExample.cu )
ADD_CUSTOM_COMMAND( COMMAND ComparisonExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ComparisonExample.out OUTPUT ComparisonExample.out )
CUDA_ADD_EXECUTABLE( UpdateAndResidueExample UpdateAndResidueExample.cu )
ADD_CUSTOM_COMMAND( COMMAND UpdateAndResidueExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UpdateAndResidueExample.out OUTPUT UpdateAndResidueExample.out )
# FIXME
# CUDA_ADD_EXECUTABLE( MapReduceExample-1 MapReduceExample-1.cu )
# ADD_CUSTOM_COMMAND( COMMAND MapReduceExample-1 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/MapReduceExample-1.out OUTPUT MapReduceExample-1.out )
# CUDA_ADD_EXECUTABLE( MapReduceExample-2 MapReduceExample-2.cu )
# ADD_CUSTOM_COMMAND( COMMAND MapReduceExample-2 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/MapReduceExample-2.out OUTPUT MapReduceExample-2.out )
# CUDA_ADD_EXECUTABLE( MapReduceExample-3 MapReduceExample-3.cu )
# ADD_CUSTOM_COMMAND( COMMAND MapReduceExample-3 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/MapReduceExample-3.out OUTPUT MapReduceExample-3.out )
# CUDA_ADD_EXECUTABLE( ReductionWithArgument ReductionWithArgument.cu )
# ADD_CUSTOM_COMMAND( COMMAND ReductionWithArgument > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ReductionWithArgument.out OUTPUT ReductionWithArgument.out )
ADD_CUSTOM_COMMAND( COMMAND UpdateAndResidueExample > UpdateAndResidueExample.out OUTPUT UpdateAndResidueExample.out )
CUDA_ADD_EXECUTABLE( MapReduceExample-1 MapReduceExample-1.cu )
ADD_CUSTOM_COMMAND( COMMAND MapReduceExample-1 > MapReduceExample-1.out OUTPUT MapReduceExample-1.out )
CUDA_ADD_EXECUTABLE( MapReduceExample-2 MapReduceExample-2.cu )
ADD_CUSTOM_COMMAND( COMMAND MapReduceExample-2 > MapReduceExample-2.out OUTPUT MapReduceExample-2.out )
CUDA_ADD_EXECUTABLE( MapReduceExample-3 MapReduceExample-3.cu )
ADD_CUSTOM_COMMAND( COMMAND MapReduceExample-3 > MapReduceExample-3.out OUTPUT MapReduceExample-3.out )
CUDA_ADD_EXECUTABLE( ReductionWithArgument ReductionWithArgument.cu )
ADD_CUSTOM_COMMAND( COMMAND ReductionWithArgument > ReductionWithArgument.out OUTPUT ReductionWithArgument.out )
CUDA_ADD_EXECUTABLE( ScanExample ScanExample.cu )
ADD_CUSTOM_COMMAND( COMMAND ScanExample > ScanExample.out OUTPUT ScanExample.out )
ENDIF()
IF( BUILD_CUDA )
@@ -29,10 +30,10 @@ ADD_CUSTOM_TARGET( TutorialsReduction-cuda ALL DEPENDS
ScalarProductExample.out
MaximumNormExample.out
ComparisonExample.out
UpdateAndResidueExample.out )
# FIXME
# MapReduceExample-1.out
# MapReduceExample-2.out
# MapReduceExample-3.out
# ReductionWithArgument.out )
UpdateAndResidueExample.out
MapReduceExample-1.out
MapReduceExample-2.out
MapReduceExample-3.out
ReductionWithArgument.out
ScanExample.out )
ENDIF()
MapReduceExample-1.cpp
\ No newline at end of file
MapReduceExample-2.cpp
\ No newline at end of file
MapReduceExample-3.cpp
\ No newline at end of file
#include <iostream>
#include <cstdlib>
#include <limits>
#include <TNL/Containers/Vector.h>
#include <TNL/Containers/Algorithms/Reduction.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Containers::Algorithms;

template< typename Device >
std::pair< int, double >
maximumNorm( const Vector< double, Device >& v )
{
   /****
    * Get a constant vector view which can be captured by the lambdas below.
    */
   auto view = v.getConstView();

   /****
    * The fetch lambda maps each element to its absolute value.
    */
   auto fetch = [=] __cuda_callable__ ( int i ) { return abs( view[ i ] ); };

   /****
    * The reduction lambda stores the larger of the two values in `a` and its
    * position in `aIdx`; on equal values it prefers the smaller index.
    */
   auto reduction = [] __cuda_callable__ ( int& aIdx, const int& bIdx, double& a, const double& b ) {
      if( a < b ) {
         a = b;
         aIdx = bIdx;
      }
      else if( a == b && bIdx < aIdx )
         aIdx = bIdx;
   };

   /****
    * The identity element of maximum is the lowest representable value.
    */
   return Reduction< Device >::reduceWithArgument( view.getSize(), reduction, fetch, std::numeric_limits< double >::lowest() );
}
int main( int argc, char* argv[] )
{
Vector< double, Devices::Host > host_v( 10 );
host_v.evaluate( [] __cuda_callable__ ( int i )->double { return i - 7; } );
std::cout << "host_v = " << host_v << std::endl;
auto maxNormHost = maximumNorm( host_v );
std::cout << "The maximum norm of the host vector elements is " << maxNormHost.second << " at position " << maxNormHost.first << "." << std::endl;
#ifdef HAVE_CUDA
Vector< double, Devices::Cuda > cuda_v( 10 );
cuda_v.evaluate( [] __cuda_callable__ ( int i )->double { return i - 7; } );
std::cout << "cuda_v = " << cuda_v << std::endl;
auto maxNormCuda = maximumNorm( cuda_v );
std::cout << "The maximum norm of the device vector elements is " << maxNormCuda.second << " at position " << maxNormCuda.first << "." << std::endl;
#endif
return EXIT_SUCCESS;
}
ReductionWithArgument.cpp
\ No newline at end of file
#include <iostream>
#include <cstdlib>
#include <TNL/Containers/Vector.h>
#include <TNL/Containers/Algorithms/Reduction.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Containers::Algorithms;

template< typename Device >
void scan( Vector< double, Device >& v )
{
   /****
    * Get a vector view which can be captured by lambda. The scan is performed
    * in place, so the view must not be constant.
    */
   auto view = v.getView();

   /***
    * The reduction lambda defines the scan operation, here the sum of two numbers.
    */
   auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };

   /***
    * Finally, we call the templated method Scan< Device >::perform and pass the
    * vector view, the range of elements to scan, the reduction lambda defined
    * above and the identity element, zero in this case, which initiates the scan.
    */
   Scan< Device >::perform( view, 0, view.getSize(), reduce, 0.0 );
}
int main( int argc, char* argv[] )
{
/***
 * First, test the prefix sum with a vector allocated on the CPU.
 */
Vector< double, Devices::Host > host_v( 10 );
host_v = 1.0;
std::cout << "host_v = " << host_v << std::endl;
scan( host_v );
std::cout << "The prefix sum of the host vector is " << host_v << "." << std::endl;
/***
 * And then also on the GPU.
 */
#ifdef HAVE_CUDA
Vector< double, Devices::Cuda > cuda_v( 10 );
cuda_v = 1.0;
std::cout << "cuda_v = " << cuda_v << std::endl;
scan( cuda_v );
std::cout << "The prefix sum of the CUDA vector is " << cuda_v << "." << std::endl;
#endif
return EXIT_SUCCESS;
}
ScanExample.cpp
\ No newline at end of file
@@ -14,6 +14,7 @@ This tutorial introduces flexible parallel reduction in TNL. It shows how to eas
6. [Update and Residue](#flexible_parallel_reduction_update_and_residue)
7. [Simple Mask and Reduce](#flexible_parallel_reduction_simple_mask_and_reduce)
8. [Reduction with argument](#flexible_parallel_reduction_with_argument)
2. [Flexible Scan](#flexible_scan)
## Flexible parallel reduction<a name="flexible_parallel_reduction"></a>
@@ -71,7 +72,7 @@ The result is:
\include ScalarProductExample.out
The scalar product of vectors `u` and `v` can be computed in TNL by [`dot(u,v)`](../html/namespaceTNL.html#ab49c6303cbe48c65ca350389460c2e40) or simply as [`(u,v)`](../html/namespaceTNL_1_1Containers.html#a6453777fc16ef91a3c309338cd18dd0c).
The scalar product of vectors `u` and `v` can be computed in TNL by [`dot(u,v)`](../html/namespaceTNL.html#ab49c6303cbe48c65ca350389460c2e40) or simply as [`(u,v)`](@ref Containers_operator_scalar_product_vector_et).
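For illustration, a minimal sketch of both spellings, assuming the same headers and vector setup as in the examples above:

```
#include <iostream>
#include <TNL/Containers/Vector.h>
using namespace TNL;
using namespace TNL::Containers;

int main()
{
   Vector< double, Devices::Host > u( 5 ), v( 5 );
   u = 2.0;                                               // set all elements of u to 2
   v = 3.0;                                               // set all elements of v to 3
   std::cout << "dot( u, v ) = " << dot( u, v ) << "\n";  // 5 * 2 * 3 = 30
   std::cout << "( u, v )    = " << ( u, v ) << "\n";     // the same value via the operator
   return 0;
}
```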
### Maximum norm<a name="flexible_parallel_reduction_maximum_norm"></a>
@@ -83,7 +84,7 @@ The output is:
\include MaximumNormExample.out
The maximum norm is computed in TNL by the function [`maxNorm(v)`](../html/namespaceTNL.html#acea36b20e471c597fb21bc2b996bbb04).
The maximum norm is computed in TNL by the function @ref TNL::maxNorm "TNL::maxNorm".
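A minimal usage sketch under the same assumptions as the examples above:

```
#include <iostream>
#include <TNL/Containers/Vector.h>
using namespace TNL;
using namespace TNL::Containers;

int main()
{
   Vector< double, Devices::Host > v( 5 );
   v.evaluate( [] __cuda_callable__ ( int i ) -> double { return i - 3; } );  // v = -3, -2, -1, 0, 1
   std::cout << "maxNorm( v ) = " << maxNorm( v ) << "\n";                    // expected output: 3
   return 0;
}
```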
### Vectors comparison<a name="flexible_parallel_reduction_vector_comparison"></a>
@@ -159,10 +160,30 @@ The definition of the lambda function `reduction` reads as:
auto reduction = [] __cuda_callable__ ( int& aIdx, const int& bIdx, double& a, const double& b );
```
In addition to the vector elements' values `a` and `b`, it also gets their positions `aIdx` and `bIdx`. The function is responsible for setting `a` to the maximum of the two values and `aIdx` to the position of the larger element. Note that the parameters have the above-mentioned meaning only when computing a minimum or maximum.
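For comparison, a minimum-with-position variant differs only in the comparison and in the identity element; a sketch under the same signature assumptions:

```
// Hypothetical argmin counterpart of the lambda above.
auto reduction = [] __cuda_callable__ ( int& aIdx, const int& bIdx, double& a, const double& b ) {
   if( b < a ) {                      // keep the smaller value...
      a = b;
      aIdx = bIdx;
   }
   else if( a == b && bIdx < aIdx )   // ...and on ties, the smaller index
      aIdx = bIdx;
};
// The identity element passed to reduceWithArgument is then
// std::numeric_limits< double >::max() instead of lowest().
```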
The result looks as follows:
\include ReductionWithArgument.out
## Flexible scan<a name="flexible_scan"></a>
The inclusive scan (or prefix sum) operation turns a sequence \f$a_1, \ldots, a_n\f$ into a sequence \f$s_1, \ldots, s_n\f$ defined as
\f[
s_i = \sum_{j=1}^i a_j.
\f]
The exclusive scan (or prefix sum) is defined as
\f[
\sigma_i = \sum_{j=1}^{i-1} a_j.
\f]
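For example, the sequence \f$(1, 2, 3, 4)\f$ has the inclusive scan \f$(1, 3, 6, 10)\f$ and the exclusive scan \f$(0, 1, 3, 6)\f$.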
Both kinds of [scan](https://en.wikipedia.org/wiki/Prefix_sum) are most often applied to summation, but product or logical operations can be handy as well. In TNL, the scan is implemented in a similar way to the flexible reduction, so it can be easily modified by lambda functions. The following example shows how it works:
\include ScanExample.cpp
The result looks as follows:
\include ScanExample.out
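Beyond summation, any associative operation can be supplied as the reduction lambda. The following is only a hypothetical sketch of a maximum scan (the function name `maxScan` is made up for illustration), assuming the same `Scan< Device >::perform` interface as in the example above; note that the identity element becomes the lowest representable value:

```
#include <limits>
#include <TNL/Containers/Vector.h>
#include <TNL/Containers/Algorithms/Reduction.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Containers::Algorithms;

template< typename Device >
void maxScan( Vector< double, Device >& v )
{
   auto view = v.getView();
   // The scan operation is maximum instead of sum.
   auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { return a > b ? a : b; };
   // Running maximum: s_i = max( a_1, ..., a_i ); the identity element of
   // maximum is the lowest representable double.
   Scan< Device >::perform( view, 0, view.getSize(), reduce, std::numeric_limits< double >::lowest() );
}
```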