Writing documentation for ParallelFor.

362727e5 · Tomáš Oberhuber · Tomáš Oberhuber · f4bbeead · 362727e5 · 362727e5
Commit 362727e5 authored 5 years ago by Tomáš Oberhuber Committed by Tomáš Oberhuber 5 years ago
--- a/Documentation/Examples/Algorithms/CMakeLists.txt
+++ b/Documentation/Examples/Algorithms/CMakeLists.txt
+IF( BUILD_CUDA )   # With CUDA enabled, the example is built from the .cu source.
+   CUDA_ADD_EXECUTABLE(ParallelForExampleCuda ParallelForExample.cu)
+   ADD_CUSTOM_COMMAND( COMMAND ParallelForExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )   # Run the example and store its standard output in the snippets directory.
+ELSE()   # Without CUDA, the example is built from the .cpp source.
+   ADD_EXECUTABLE(ParallelForExample ParallelForExample.cpp)
+   ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )   # Run the example and store its standard output in the snippets directory.
+ENDIF()
+
+IF( BUILD_CUDA )   # The target name differs between the CUDA and the non-CUDA build.
+ADD_CUSTOM_TARGET( RunAlgorithmsExamples-cuda ALL DEPENDS   # Part of the default build (ALL); triggers the custom command above.
+   ParallelForExample.out
+ )
+ELSE()
+ADD_CUSTOM_TARGET( RunAlgorithmsExamples ALL DEPENDS   # Part of the default build (ALL); triggers the custom command above.
+   ParallelForExample.out
+ )
+ENDIF()
\ No newline at end of file
--- a/Documentation/Examples/Algorithms/ParallelForExample-2D.cpp
+++ b/Documentation/Examples/Algorithms/ParallelForExample-2D.cpp
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/ParallelFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+/***
+ * Set all elements of a 2D mesh function stored in the vector v to the constant c.
+ *
+ * The element with coordinates ( i, j ) is stored at the position j * xSize + i
+ * of the vector v, so v is expected to hold at least xSize * ySize elements.
+ */
+template< typename Device >
+void initMeshFunction( const int xSize,
+                       const int ySize,
+                       Vector< double, Device >& v,
+                       const double& c )
+{
+   /***
+    * The lambda function writes into the vector, so a modifiable view
+    * (not a constant one) of the vector v must be captured.
+    */
+   auto view = v.getView();
+   auto init = [=] __cuda_callable__  ( int i, int j, const int xSize, const double c ) mutable {
+      view[ j * xSize + i ] =  c; };
+
+   /***
+    * Iterate over the index domain [0,xSize)x[0,ySize).
+    */
+   ParallelFor2D< Device >::exec( 0, 0, xSize, ySize, init, xSize, c );
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Define dimensions of 2D mesh function.
+    */
+   const int xSize( 10 ), ySize( 10 );
+   const int size = xSize * ySize;
+
+   /***
+    * Firstly, test the mesh function initiation on CPU. The vector is
+    * allocated with xSize * ySize elements before it is written to.
+    */
+   Vector< double, Devices::Host > host_v( size );
+   initMeshFunction( xSize, ySize, host_v, 1.0 );
+
+   /***
+    * And then also on GPU.
+    */
+#ifdef HAVE_CUDA
+   Vector< double, Devices::Cuda > cuda_v( size );
+   initMeshFunction( xSize, ySize, cuda_v, 1.0 );
+#endif
+   return EXIT_SUCCESS;
+}
+
--- a/Documentation/Examples/Algorithms/ParallelForExample-2D.cu
+++ b/Documentation/Examples/Algorithms/ParallelForExample-2D.cu
+ParallelForExample-2D.cpp
\ No newline at end of file
--- a/Documentation/Examples/Algorithms/ParallelForExample-3D.cpp
+++ b/Documentation/Examples/Algorithms/ParallelForExample-3D.cpp
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/ParallelFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+/***
+ * Set all elements of a 3D mesh function stored in the vector v to the constant c.
+ *
+ * The element with coordinates ( i, j, k ) is stored at the position
+ * ( k * ySize + j ) * xSize + i of the vector v, so v is expected to hold
+ * at least xSize * ySize * zSize elements.
+ */
+template< typename Device >
+void initMeshFunction( const int xSize,
+                       const int ySize,
+                       const int zSize,
+                       Vector< double, Device >& v,
+                       const double& c )
+{
+   /***
+    * The lambda function writes into the vector, so a modifiable view
+    * (not a constant one) of the vector v must be captured.
+    */
+   auto view = v.getView();
+   auto init = [=] __cuda_callable__  ( int i, int j, int k, const int xSize, const int ySize, const double c ) mutable {
+      view[ ( k * ySize + j ) * xSize + i ] =  c; };
+
+   /***
+    * Iterate over the index domain [0,xSize)x[0,ySize)x[0,zSize). All three
+    * begin indexes come first, followed by all three end indexes.
+    */
+   ParallelFor3D< Device >::exec( 0, 0, 0, xSize, ySize, zSize, init, xSize, ySize, c );
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Define dimensions of 3D mesh function.
+    */
+   const int xSize( 10 ), ySize( 10 ), zSize( 10 );
+   const int size = xSize * ySize * zSize;
+
+   /***
+    * Firstly, test the mesh function initiation on CPU. The vector is
+    * allocated with xSize * ySize * zSize elements before it is written to.
+    */
+   Vector< double, Devices::Host > host_v( size );
+   initMeshFunction( xSize, ySize, zSize, host_v, 1.0 );
+
+   /***
+    * And then also on GPU.
+    */
+#ifdef HAVE_CUDA
+   Vector< double, Devices::Cuda > cuda_v( size );
+   initMeshFunction( xSize, ySize, zSize, cuda_v, 1.0 );
+#endif
+   return EXIT_SUCCESS;
+}
+
--- a/Documentation/Examples/Algorithms/ParallelForExample-3D.cu
+++ b/Documentation/Examples/Algorithms/ParallelForExample-3D.cu
+ParallelForExample-3D.cpp
\ No newline at end of file
--- a/Documentation/Examples/Algorithms/ParallelForExample.cpp
+++ b/Documentation/Examples/Algorithms/ParallelForExample.cpp
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/ParallelFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+/****
+ * Set all elements of the vector v to the constant c.
+ */
+template< typename Device >
+void initVector( Vector< double, Device >& v,
+                 const double& c )
+{
+   /****
+    * The lambda function writes into the vector, so a modifiable view
+    * (not a constant one) of the vector v must be captured.
+    */
+   auto view = v.getView();
+   auto init = [=] __cuda_callable__  ( int i, const double c ) mutable {
+      view[ i ] = c; };
+
+   /****
+    * Iterate over the index interval [0, v.getSize()).
+    */
+   ParallelFor< Device >::exec( 0, v.getSize(), init, c );
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Number of elements of the vectors used in this example.
+    */
+   const int size = 10;
+
+   /***
+    * Run the vector initiation on the host (CPU) first and print the result.
+    */
+   Vector< double, Devices::Host > host_v( size );
+   initVector( host_v, 1.0 );
+   std::cout << "host_v = " << host_v << std::endl;
+
+#ifdef HAVE_CUDA
+   /***
+    * With CUDA enabled, repeat the same test on the GPU.
+    */
+   Vector< double, Devices::Cuda > cuda_v( size );
+   initVector( cuda_v, 1.0 );
+   std::cout << "cuda_v = " << cuda_v << std::endl;
+#endif
+
+   return EXIT_SUCCESS;
+}
+
--- a/Documentation/Examples/Algorithms/ParallelForExample.cu
+++ b/Documentation/Examples/Algorithms/ParallelForExample.cu
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/ParallelFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+/****
+ * Compute result[ i ] = v1[ i ] + v2[ i ] + c for each index i
+ * from the interval [0, v1.getSize()).
+ */
+template< typename Device >
+void vectorSum( const Vector< double, Device >& v1,
+                const Vector< double, Device >& v2,
+                const double& c,
+                Vector< double, Device >& result )
+{
+   /****
+    * Get views of the vectors; the views are what the lambda captures.
+    * The input vectors are only read, the result vector is written.
+    */
+   auto out = result.getView();
+   auto in1 = v1.getConstView();
+   auto in2 = v2.getConstView();
+
+   /****
+    * Lambda function evaluating the sum for the i-th element.
+    */
+   auto sum = [=] __cuda_callable__  ( int i, const double c ) mutable {
+      out[ i ] = in1[ i ] + in2[ i ] + c;
+   };
+
+   ParallelFor< Device >::exec( 0, v1.getSize(), sum, c );
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Number of elements of the vectors used in this example.
+    */
+   const int size = 10;
+
+   /***
+    * Run the vectors sum on the host (CPU) first: the first vector is filled
+    * with ones, the i-th element of the second one is set to i.
+    */
+   Vector< double, Devices::Host > host_v1( size );
+   Vector< double, Devices::Host > host_v2( size );
+   Vector< double, Devices::Host > host_result( size );
+   host_v1 = 1.0;
+   host_v2.evaluate( []__cuda_callable__ ( int i )->double { return i; } );
+   vectorSum( host_v1, host_v2, 2.0, host_result );
+   std::cout << "host_v1 = " << host_v1 << std::endl;
+   std::cout << "host_v2 = " << host_v2 << std::endl;
+   std::cout << "The sum of the vectors on CPU is " << host_result << "." << std::endl;
+
+#ifdef HAVE_CUDA
+   /***
+    * With CUDA enabled, repeat the same test on the GPU.
+    */
+   Vector< double, Devices::Cuda > cuda_v1( size );
+   Vector< double, Devices::Cuda > cuda_v2( size );
+   Vector< double, Devices::Cuda > cuda_result( size );
+   cuda_v1 = 1.0;
+   cuda_v2.evaluate( []__cuda_callable__ ( int i )->double { return i; } );
+   vectorSum( cuda_v1, cuda_v2, 2.0, cuda_result );
+   std::cout << "cuda_v1 = " << cuda_v1 << std::endl;
+   std::cout << "cuda_v2 = " << cuda_v2 << std::endl;
+   std::cout << "The sum of the vectors on GPU is " << cuda_result << "." << std::endl;
+#endif
+
+   return EXIT_SUCCESS;
+}
+
--- a/Documentation/Examples/CMakeLists.txt
+++ b/Documentation/Examples/CMakeLists.txt
+ADD_SUBDIRECTORY( Algorithms )
 ADD_SUBDIRECTORY( Containers )

 ADD_EXECUTABLE( FileExample FileExample.cpp )

--- a/src/TNL/Algorithms/ParallelFor.h
+++ b/src/TNL/Algorithms/ParallelFor.h
@@ -33,12 +33,47 @@
 namespace TNL {
 namespace Algorithms {

+// TODO: ParallelForMode should be moved to Device (=Executor)
+
+/**
+ * \brief Enum for the parallel processing of the for-loop. 
+ * 
+ * Synchronous means that the program control returns to the caller when the loop is processed completely.
+ * Asynchronous means that the program control returns to the caller immediately, even before the processing of the loop is finished.
+ */
 enum ParallelForMode { SynchronousMode, AsynchronousMode };

+
+/**
+ * \brief Parallel for loop for one dimensional interval of indexes.
+ * 
+ * \tparam Device specifies the device on which the for-loop is going to be executed.
+ *    It can be Devices::Host, Devices::Cuda or Devices::Sequential. 
+ * \tparam Mode defines synchronous/asynchronous mode on parallel devices.
+ */
 template< typename Device = Devices::Sequential,
          ParallelForMode Mode = SynchronousMode >
 struct ParallelFor
 {
+   /**
+    * \brief Static method for execution of the loop.
+    * 
+    * \tparam Index defines the type of indexes over which the loop iterates.
+    * \tparam Function is the type of function to be called in each iteration.
+    * \tparam FunctionArgs is a variadic type of additional parameters which are 
+    *    supposed to be passed to the inner Function.
+    * 
+    * \param start the for-loop iterates over index interval [start, end).
+    * \param end the for-loop iterates over index interval [start, end).
+    * \param f is the function to be called in each iteration
+    * \param args are additional parameters to be passed to the function f.
+    * 
+    * \par Example
+    * \include Algorithms/ParallelForExample.cpp
+    * \par Output
+    * \include ParallelForExample.out
+    * 
+    */
   template< typename Index,
             typename Function,
             typename... FunctionArgs >
@@ -49,10 +84,44 @@ struct ParallelFor
   }
 };

+/**
+ * \brief Parallel for loop for two dimensional domain of indexes.
+ * 
+ * \tparam Device specifies the device on which the for-loop is going to be executed.
+ *    It can be Devices::Host, Devices::Cuda or Devices::Sequential. 
+ * \tparam Mode defines synchronous/asynchronous mode on parallel devices.
+ */
 template< typename Device = Devices::Sequential,
          ParallelForMode Mode = SynchronousMode >
 struct ParallelFor2D
 {
+   /**
+    * \brief Static method for execution of the loop.
+    * 
+    * \tparam Index defines the type of indexes over which the loop iterates.
+    * \tparam Function is the type of function to be called in each iteration.
+    * \tparam FunctionArgs is a variadic type of additional parameters which are 
+    *    supposed to be passed to the inner Function.
+    * 
+    * \param startX the for-loop iterates over index domain [startX,endX)x[startY,endY).
+    * \param startY the for-loop iterates over index domain [startX,endX)x[startY,endY).
+    * \param endX the for-loop iterates over index domain [startX,endX)x[startY,endY).
+    * \param endY the for-loop iterates over index domain [startX,endX)x[startY,endY).
+    * \param f is the function to be called in each iteration
+    * \param args are additional parameters to be passed to the function f.
+    * 
+    * The function f is called for each iteration as 
+    * 
+    * f( i, j, args... )
+    * 
+    * where the first parameter is changing more often than the second one.
+    *
+    * \par Example
+    * \include Algorithms/ParallelForExample-2D.cpp
+    * \par Output
+    * \include ParallelForExample-2D.out
+    * 
+    */
   template< typename Index,
             typename Function,
             typename... FunctionArgs >
@@ -64,10 +133,46 @@ struct ParallelFor2D
   }
 };

+/**
+ * \brief Parallel for loop for three dimensional domain of indexes.
+ * 
+ * \tparam Device specifies the device on which the for-loop is going to be executed.
+ *    It can be Devices::Host, Devices::Cuda or Devices::Sequential. 
+ * \tparam Mode defines synchronous/asynchronous mode on parallel devices.
+ */
 template< typename Device = Devices::Sequential,
          ParallelForMode Mode = SynchronousMode >
 struct ParallelFor3D
 {
+   /**
+    * \brief Static method for execution of the loop.
+    * 
+    * \tparam Index defines the type of indexes over which the loop iterates.
+    * \tparam Function is the type of function to be called in each iteration.
+    * \tparam FunctionArgs is a variadic type of additional parameters which are 
+    *    supposed to be passed to the inner Function.
+    * 
+    * \param startX the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
+    * \param startY the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
+    * \param startZ the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
+    * \param endX the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
+    * \param endY the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
+    * \param endZ the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
+    * \param f is the function to be called in each iteration
+    * \param args are additional parameters to be passed to the function f.
+    * 
+    * The function f is called for each iteration as 
+    * 
+    * f( i, j, k, args... )
+    * 
+    * where the first parameter is changing the most often.
+    * 
+    * \par Example
+    * \include Algorithms/ParallelForExample-3D.cpp
+    * \par Output
+    * \include ParallelForExample-3D.out
+    * 
+    */
   template< typename Index,
             typename Function,
             typename... FunctionArgs >