Skip to content
Snippets Groups Projects
Commit 362727e5 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber Committed by Tomáš Oberhuber
Browse files

Writing documentation for ParallelFor.

parent f4bbeead
No related branches found
No related tags found
1 merge request!44Tutorials
# Build the ParallelFor example and capture its standard output as a snippet
# for the documentation. With BUILD_CUDA the CUDA source (.cu) is compiled,
# otherwise the host-only source (.cpp) is used.
# NOTE(review): the command redirects output into
# ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}, while OUTPUT names a bare
# ParallelForExample.out - confirm this matches how the build tracks the file.
IF( BUILD_CUDA )
CUDA_ADD_EXECUTABLE(ParallelForExampleCuda ParallelForExample.cu)
ADD_CUSTOM_COMMAND( COMMAND ParallelForExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
ELSE()
ADD_EXECUTABLE(ParallelForExample ParallelForExample.cpp)
ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
ENDIF()
# Targets that are part of ALL and depend on the generated output, so the
# example is run during a regular build.
# NOTE(review): only ParallelForExample.out is listed here; no 2D/3D example
# outputs are generated by this script - verify whether that is intended.
IF( BUILD_CUDA )
ADD_CUSTOM_TARGET( RunAlgorithmsExamples-cuda ALL DEPENDS
ParallelForExample.out
)
ELSE()
ADD_CUSTOM_TARGET( RunAlgorithmsExamples ALL DEPENDS
ParallelForExample.out
)
ENDIF()
\ No newline at end of file
#include <iostream>
#include <cstdlib>
#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/ParallelFor.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
template< typename Device >
void initMeshFunction( const int xSize,
const int ySize,
Vector< double, Device >& v,
const double& c )
{
auto view = v1.getConstView();
auto init = [=] __cuda_callable__ ( int i, int j, const int xSize, const double c ) mutable {
view[ j * xSize + i ] = c; };
ParallelFor2D< Device >::exec( 0, 0, xSize, ySize, init, xSize, c );
}
int main( int argc, char* argv[] )
{
   /***
    * Define dimensions of 2D mesh function.
    */
   const int xSize( 10 ), ySize( 10 );
   const int size = xSize * ySize;

   /***
    * Firstly, test the mesh function initialization on CPU. The vector must
    * be allocated with xSize * ySize elements before it is written to.
    */
   Vector< double, Devices::Host > host_v( size );
   initMeshFunction( xSize, ySize, host_v, 1.0 );

   /***
    * And then also on GPU.
    */
#ifdef HAVE_CUDA
   Vector< double, Devices::Cuda > cuda_v( size );
   initMeshFunction( xSize, ySize, cuda_v, 1.0 );
#endif
   return EXIT_SUCCESS;
}
ParallelForExample-2D.cpp
\ No newline at end of file
#include <iostream>
#include <cstdlib>
#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/ParallelFor.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
template< typename Device >
void initMeshFunction( const int xSize,
const int ySize,
const int zSize,
Vector< double, Device >& v,
const double& c )
{
auto view = v1.getConstView();
auto init = [=] __cuda_callable__ ( int i, int j, int k, const int xSize, const int ySize, const double c ) mutable {
view[ ( k * ySize + j ) * xSize + i ] = c; };
ParallelFor3D< Device >::exec( 0, 0, xSize, ySize, init, xSize, ySize, c );
}
int main( int argc, char* argv[] )
{
   /***
    * Define dimensions of 3D mesh function.
    */
   const int xSize( 10 ), ySize( 10 ), zSize( 10 );
   const int size = xSize * ySize * zSize;

   /***
    * Firstly, test the mesh function initialization on CPU. The vector must
    * be allocated with xSize * ySize * zSize elements before it is written to.
    */
   Vector< double, Devices::Host > host_v( size );
   initMeshFunction( xSize, ySize, zSize, host_v, 1.0 );

   /***
    * And then also on GPU.
    */
#ifdef HAVE_CUDA
   Vector< double, Devices::Cuda > cuda_v( size );
   initMeshFunction( xSize, ySize, zSize, cuda_v, 1.0 );
#endif
   return EXIT_SUCCESS;
}
ParallelForExample-3D.cpp
\ No newline at end of file
#include <iostream>
#include <cstdlib>
#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/ParallelFor.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
/****
 * Set all elements of the vector v to the constant c.
 *
 * \param v is the vector to be initialized.
 * \param c is the constant written to every element of v.
 */
template< typename Device >
void initVector( Vector< double, Device >& v,
                 const double& c )
{
   /****
    * The lambda writes through the view, so a modifiable view of v is needed
    * (not a constant one).
    */
   auto view = v.getView();
   auto init = [=] __cuda_callable__ ( int i, const double c ) mutable {
      view[ i ] = c; };
   ParallelFor< Device >::exec( 0, v.getSize(), init, c );
}
int main( int argc, char* argv[] )
{
   /***
    * Number of elements of the tested vectors and the value they are set to.
    */
   const int size = 10;
   const double value = 1.0;

   /***
    * Run the initialization on the host (CPU) and print the result.
    */
   Vector< double, Devices::Host > host_v( size );
   initVector( host_v, value );
   std::cout << "host_v = " << host_v << std::endl;

   /***
    * Repeat the same on the GPU when CUDA is available.
    */
#ifdef HAVE_CUDA
   Vector< double, Devices::Cuda > cuda_v( size );
   initVector( cuda_v, value );
   std::cout << "cuda_v = " << cuda_v << std::endl;
#endif
   return EXIT_SUCCESS;
}
#include <iostream>
#include <cstdlib>
#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/ParallelFor.h>
using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Algorithms;
/****
 * Compute result[ i ] = v1[ i ] + v2[ i ] + c for all indexes i of v1.
 *
 * \param v1 is the first input vector; its size defines the index range.
 * \param v2 is the second input vector.
 * \param c is the constant added to each element-wise sum.
 * \param result is the vector the sums are written to.
 */
template< typename Device >
void vectorSum( const Vector< double, Device >& v1,
                const Vector< double, Device >& v2,
                const double& c,
                Vector< double, Device >& result )
{
   /****
    * Views of the vectors are what the lambda captures: constant views
    * for the inputs and a modifiable one for the output.
    */
   auto lhs = v1.getConstView();
   auto rhs = v2.getConstView();
   auto out = result.getView();

   /****
    * One iteration of the loop: add both inputs and the offset at index idx.
    */
   auto addAt = [=] __cuda_callable__ ( int idx, const double offset ) mutable {
      out[ idx ] = lhs[ idx ] + rhs[ idx ] + offset;
   };

   ParallelFor< Device >::exec( 0, v1.getSize(), addAt, c );
}
int main( int argc, char* argv[] )
{
   /***
    * Size of all vectors and the constant added to each element-wise sum.
    */
   const int size = 10;
   const double shift = 2.0;

   /***
    * Sum the vectors on the host (CPU) first: v1 is constant one,
    * v2 holds the index of each element.
    */
   Vector< double, Devices::Host > host_v1( size );
   Vector< double, Devices::Host > host_v2( size );
   Vector< double, Devices::Host > host_result( size );
   host_v1 = 1.0;
   host_v2.evaluate( []__cuda_callable__ ( int i )->double { return i; } );
   vectorSum( host_v1, host_v2, shift, host_result );
   std::cout << "host_v1 = " << host_v1 << std::endl;
   std::cout << "host_v2 = " << host_v2 << std::endl;
   std::cout << "The sum of the vectors on CPU is " << host_result << "." << std::endl;

   /***
    * Do the same on the GPU when CUDA is available.
    */
#ifdef HAVE_CUDA
   Vector< double, Devices::Cuda > cuda_v1( size );
   Vector< double, Devices::Cuda > cuda_v2( size );
   Vector< double, Devices::Cuda > cuda_result( size );
   cuda_v1 = 1.0;
   cuda_v2.evaluate( []__cuda_callable__ ( int i )->double { return i; } );
   vectorSum( cuda_v1, cuda_v2, shift, cuda_result );
   std::cout << "cuda_v1 = " << cuda_v1 << std::endl;
   std::cout << "cuda_v2 = " << cuda_v2 << std::endl;
   std::cout << "The sum of the vectors on GPU is " << cuda_result << "." << std::endl;
#endif
   return EXIT_SUCCESS;
}
# Descend into the subdirectories with further examples.
ADD_SUBDIRECTORY( Algorithms )
ADD_SUBDIRECTORY( Containers )
# Example built directly from a source file in this directory.
ADD_EXECUTABLE( FileExample FileExample.cpp )
......
......@@ -33,12 +33,47 @@
namespace TNL {
namespace Algorithms {
// TODO: ParallelForMode should be moved to Device (=Executor)
/**
 * \brief Enum for the parallel processing of the for-loop.
 *
 * Synchronous means that the program control returns to the caller only after the loop has been processed completely.
 * Asynchronous means that the program control returns to the caller immediately, even before the processing of the loop is finished.
 */
enum ParallelForMode { SynchronousMode, AsynchronousMode };
/**
* \brief Parallel for loop for one dimensional interval of indexes.
*
* \tparam Device says on what device the for-loop is going to be executed.
* It can be Devices::Host, Devices::Cuda or Devices::Sequential.
* \tparam Mode defines synchronous/asynchronous mode on parallel devices.
*/
template< typename Device = Devices::Sequential,
ParallelForMode Mode = SynchronousMode >
struct ParallelFor
{
/**
* \brief Static method for execution of the loop.
*
* \tparam Index defines the type of indexes over which the loop iterates.
* \tparam Function is the type of function to be called in each iteration.
* \tparam FunctionArgs is a variadic type of additional parameters which are
* supposed to be passed to the inner Function.
*
* \param start the for-loop iterates over index interval [start, end).
* \param end the for-loop iterates over index interval [start, end).
* \param f is the function to be called in each iteration
* \param args are additional parameters to be passed to the function f.
*
* \par Example
* \include Algorithms/ParallelForExample.cpp
* \par Output
* \include ParallelForExample.out
*
*/
template< typename Index,
typename Function,
typename... FunctionArgs >
......@@ -49,10 +84,44 @@ struct ParallelFor
}
};
/**
* \brief Parallel for loop for two dimensional domain of indexes.
*
* \tparam Device says on what device the for-loop is going to be executed.
* It can be Devices::Host, Devices::Cuda or Devices::Sequential.
* \tparam Mode defines synchronous/asynchronous mode on parallel devices.
*/
template< typename Device = Devices::Sequential,
ParallelForMode Mode = SynchronousMode >
struct ParallelFor2D
{
/**
* \brief Static method for execution of the loop.
*
* \tparam Index defines the type of indexes over which the loop iterates.
* \tparam Function is the type of function to be called in each iteration.
* \tparam FunctionArgs is a variadic type of additional parameters which are
* supposed to be passed to the inner Function.
*
* \param startX the for-loop iterates over index domain [startX,endX)x[startY,endY).
* \param startY the for-loop iterates over index domain [startX,endX)x[startY,endY).
* \param endX the for-loop iterates over index domain [startX,endX)x[startY,endY).
* \param endY the for-loop iterates over index domain [startX,endX)x[startY,endY).
* \param f is the function to be called in each iteration
* \param args are additional parameters to be passed to the function f.
*
* The function f is called for each iteration as
*
* f( i, j, args... )
*
* where the first parameter is changing more often than the second one.
*
* \par Example
* \include Algorithms/ParallelForExample-2D.cpp
* \par Output
* \include ParallelForExample-2D.out
*
*/
template< typename Index,
typename Function,
typename... FunctionArgs >
......@@ -64,10 +133,46 @@ struct ParallelFor2D
}
};
/**
* \brief Parallel for loop for three dimensional domain of indexes.
*
* \tparam Device says on what device the for-loop is going to be executed.
* It can be Devices::Host, Devices::Cuda or Devices::Sequential.
* \tparam Mode defines synchronous/asynchronous mode on parallel devices.
*/
template< typename Device = Devices::Sequential,
ParallelForMode Mode = SynchronousMode >
struct ParallelFor3D
{
/**
* \brief Static method for execution of the loop.
*
* \tparam Index defines the type of indexes over which the loop iterates.
* \tparam Function is the type of function to be called in each iteration.
* \tparam FunctionArgs is a variadic type of additional parameters which are
* supposed to be passed to the inner Function.
*
* \param startX the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
* \param startY the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
* \param startZ the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
* \param endX the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
* \param endY the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
* \param endZ the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
* \param f is the function to be called in each iteration
* \param args are additional parameters to be passed to the function f.
*
* The function f is called for each iteration as
*
* f( i, j, k, args... )
*
* where the first parameter is changing the most often.
*
* \par Example
* \include Algorithms/ParallelForExample-3D.cpp
* \par Output
* \include ParallelForExample-3D.out
*
*/
template< typename Index,
typename Function,
typename... FunctionArgs >
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment