Writing SharedPointer tutorial.

fc532769 · Tomáš Oberhuber · Tomáš Oberhuber · f11da7b1 · fc532769 · fc532769
Commit fc532769 authored 5 years ago by Tomáš Oberhuber Committed by Tomáš Oberhuber 5 years ago
--- a/Documentation/Tutorials/Pointers/SharedPointerExample.cpp
+++ b/Documentation/Tutorials/Pointers/SharedPointerExample.cpp
@@ -9,14 +9,26 @@ using ArrayCuda = Containers::Array< int, Devices::Cuda >;

 struct Tuple
 {
+   Tuple( const int size ):
+   a1( size ), a2( size ){};   // passes `size` to the constructors of both shared pointers
+
+   void setSize( const int size )   // calls setSize( size ) on the objects held by a1 and a2
+   {
+      a1->setSize( size );
+      a2->setSize( size );
+   }
+
   Pointers::SharedPointer< ArrayCuda > a1, a2;
 };

-__global__ void checkArray( const Tuple t )
+__global__ void printTuple( const Tuple t )   // prints the size of t.a1, then the elements of t.a1 and t.a2
 {
-   printf( "Array size is: %d\n", ptr->getSize() );
-   for( int i = 0; i < ptr->getSize(); i++ )
-      printf( "a[ %d ] = %d \n", i, ( *ptr )[ i ] );
+   printf( "Tuple size is: %d\n", t.a1->getSize() );
+   for( int i = 0; i < t.a1->getSize(); i++ )   // t.a2 is indexed over the size of t.a1
+   {
+      printf( "a1[ %d ] = %d \n", i, ( *t.a1 )[ i ] );
+      printf( "a2[ %d ] = %d \n", i, ( *t.a2 )[ i ] );
+   }
 }

 int main( int argc, char* argv[] )
@@ -25,23 +37,20 @@ int main( int argc, char* argv[] )
    * Create a tuple of arrays and print them in a CUDA kernel
    */
 #ifdef HAVE_CUDA
-   Tuple t;
-   t.a1.modifyData< Devices::Host >().setSize( 10 );
-   t.a1.modifyData< Devices::Host >() = 1;
-   t.a2.modifyData< Devices::Host >().setSize( 10 );
-   t.a2.modifyData< Devices::Host >() = 2;
+   Tuple t( 3 );
+   *t.a1 = 1;
+   *t.a2 = 2;
   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
-   printkArrays<<< 1, 1 >>>( t );
+   printTuple<<< 1, 1 >>>( t );

   /***
    * Resize the array
    */
-   t.a1.modifyData< Devices::Host >().setSize( 5 );
-   t.a1.modifyData< Devices::Host >() = 3;
-   t.a2.modifyData< Devices::Host >().setSize( 5 );
-   t.a2.modifyData< Devices::Host >() = 4;
+   t.setSize( 5 );
+   *t.a1 = 3;
+   *t.a2 = 4;
   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
-   printArrays<<< 1, 1 >>>( t );
+   printTuple<<< 1, 1 >>>( t );
 #endif
   return EXIT_SUCCESS;


--- a/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-1.cpp
+++ b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-1.cpp
+struct Array
+{
+   double* data;   // pointer to the array elements
+   int size;       // size of the array
+};
+
--- a/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-2.cpp
+++ b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-2.cpp
+Array a;   // a.data is expected to point to memory on the GPU
+cudaKernel<<< gridSize, blockSize >>>( a );   // `a` is passed to the kernel by value
\ No newline at end of file
--- a/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-3.cpp
+++ b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-3.cpp
+// Sets a.data[ threadIdx.x ] to zero; threads whose index is not below a.size do nothing.
+__global__ void cudaKernel( Array a )
+{
+   if( threadIdx.x < a.size )
+      a.data[ threadIdx.x ] = 0;
+}
\ No newline at end of file
--- a/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-4.cpp
+++ b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-4.cpp
+// Holds pointers to two Array instances.
+struct ArrayTuple
+{
+   Array *a1, *a2;
+};
\ No newline at end of file
--- a/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-5.cpp
+++ b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-5.cpp
+// Sets element threadIdx.x of both arrays referenced by the tuple to zero,
+// checking the index against the size of each array separately.
+__global__ void tupleKernel( ArrayTuple tuple )
+{
+   if( threadIdx.x < tuple.a1->size )
+      tuple.a1->data[ threadIdx.x ] = 0;
+   if( threadIdx.x < tuple.a2->size )
+      tuple.a2->data[ threadIdx.x ] = 0;
+}
--- a/Documentation/Tutorials/Pointers/codeSnippetUniquePointer.cpp
+++ b/Documentation/Tutorials/Pointers/codeSnippetUniquePointer.cpp
+template< typename Object, typename Device = typename Object::DeviceType >
+class UniquePointer;   // declaration only; Device defaults to Object::DeviceType
\ No newline at end of file
--- a/Documentation/Tutorials/Pointers/tutorial_Pointers.md
+++ b/Documentation/Tutorials/Pointers/tutorial_Pointers.md
@@ -2,7 +2,7 @@

 ## Introduction

-Smart pointers in TNL are motivated by the smart pointerin the STL library. In addition, they can manage image of the object they hold on different devices which makes objects offloading easier.
+Smart pointers in TNL are motivated by the smart pointers in the STL library. In addition, they can manage images of the object they hold on different devices, which is supposed to make offloading objects easier.

 ## Table of Contents
 1. [Unique pointers](#unique_pointers)
@@ -12,12 +12,9 @@ Smart pointers in TNL are motivated by the smart pointerin the STL library. In a

 ## Unique pointers <a name="unique_pointers"></a>

-Simillar to STL smart pointer `std::unique_ptr` `UniquePointer` is a smart pointer managing certain dynamicaly allocated object. The object is automatically deallocated when the pointer goes out of scope. The definition of `UniquePointer` reads as:
+Similar to the STL unique smart pointer `std::unique_ptr`, `UniquePointer` manages a dynamically allocated object. The object is automatically deallocated when the pointer goes out of scope. The definition of `UniquePointer` reads as:

-```
-template< typename Object, typename Device = typename Object::DeviceType >
-class UniquePointer;
-```
+\include codeSnippetUniquePointer.cpp

 It takes two template parameters:

@@ -33,7 +30,7 @@ The result is:
 \include UniquePointerHostExample.out


-If the device is different, `Devices::Cuda` for example, the unique pointer creates an image if the object even in the host memory. It means, that one can manipulate the object on the host. All smart pointers are registered in a special register using which they can be easily synchronised with the host images before calling a CUDA kernel. This means that all modified images of the objects in the memory are transferred on the GPU. See the following example:
+If the device is different, `Devices::Cuda` for example, the unique pointer creates an image of the object in the host memory as well. This allows one to manipulate the object on the host. All smart pointers are registered in a special register, through which they can all be synchronised with the host images at once before calling a CUDA kernel. This means that all modified images of the objects in the host memory are transferred to the GPU. See the following example:

 \include UniquePointerExample.cpp

@@ -41,58 +38,30 @@ The result looks as:

 \include UniquePointerExample.out

-A disadventage of `UniquePointer` is that it cannot be passed to the CUDA kernel since it requires making a copy of it. This is, however, from the nature of this object, prohibited. Not only this is solved by a `SharedPointer`.
+A disadvantage of `UniquePointer` is that it cannot be passed to a CUDA kernel, since that requires making a copy of the pointer. This is, however, prohibited by the nature of this object. For this reason, among others, TNL also offers `SharedPointer`.

 ## Shared pointers <a name="shared_pointers"></a>

-One of the main goals of the TNL library is to make the development of the HPC code, including GPU kernels as easy and efficient as possible. One way to do this is to profit from the object opriented programming even in CUDA kernels. Let us explain it on arrays. From certain point of view `Array` can be understood as an object consisiting of data and metadata. Data part means elements that we insert into the array. Metadata is a pointer to the data but also size of the array. This information makes use of the class easier. Though it is not necessary in any situations it may help to check array bounds when accessing the array elements for example. It is something that, when it is performed even in CUDA kernels, may help significantly with finding bugs in a code.  To do this, we need to transfer on the GPU not only pointers to the data but also complete metadata. It is simple if the structure which is supposed to be transfered on the GPU does not have pointers to metadata. See the following example:
+One of the main goals of the TNL library is to make the development of HPC code, including GPU kernels, as easy and efficient as possible. One way to do this is to profit from object-oriented programming even in CUDA kernels. Let us explain it on arrays. From a certain point of view, `Array` can be understood as an object consisting of data and metadata. The data part means the elements that we insert into the array. The metadata is a pointer to the data and also the size of the array. This information makes the class easier to use, for example by allowing array bounds to be checked when accessing the array elements. When this is done even in CUDA kernels, it may help significantly with finding bugs in the code. To do this, we need to transfer not only pointers to the data but also the complete metadata to the device. This is simple if the structure which is supposed to be transferred to the GPU does not have pointers to metadata. See the following example:

-```
-struct Array
-{
-   double* data;
-   int size;
-};
-```
+
+\include codeSnippetSharedPointer-1.cpp

 If the pointer `data` points to a memory on GPU, this array can be passed to a kernel like this:

-```
-Array a;
-cudaKernel<<< gridSize, blockSize >>>( a );
-```
+\include codeSnippetSharedPointer-2.cpp

 The kernel `cudaKernel` can access the data as follows:

-```
-__global__ void cudaKernel( Array a )
-{
-   if( thredadIdx.x. < a.size )
-      a.data[ threadIdx.x ] = 0;
-}
-```
+\include codeSnippetSharedPointer-3.cpp

 But what if we have an object like this:

-```
-struct ArrayTuple
-{
-   Array *a1, *a2;
-}
-```
+\include codeSnippetSharedPointer-4.cpp

 Assume that there is an instance of `ArrayTuple`, let us say `tuple`, containing pointers to instances `a1` and `a2` of `Array`. The instances must be allocated on the GPU if one wants to simply pass the `tuple` to the CUDA kernel. Indeed, the CUDA kernel needs the arrays `a1` and `a2` to be on the GPU. See the following example:

-```
-__global__ tupleKernel( ArrayTuple tuple )
-{
-   if( threadIdx.x < tuple.a1->size )
-      tuple.a1->data[ threadIdx.x ] = 0;
-   if( threadIdx.x < tuple.a2->size )
-      tuple.a2->data[ threadIdx.x ] = 0;
-}
-
-```
+\include codeSnippetSharedPointer-5.cpp

 See that the kernel needs to dereference `tuple.a1` and `tuple.a2`. Therefore these pointers must point to the global memory of the GPU, which means that the arrays `a1` and `a2` must be allocated there, using [cudaMalloc](http://developer.download.nvidia.com/compute/cuda/2_3/toolkit/docs/online/group__CUDART__MEMORY_gc63ffd93e344b939d6399199d8b12fef.html) for example. It means, however, that the arrays `a1` and `a2` cannot be managed (for example, resizing them requires changing `a1->size` and `a2->size`) on the host system by the CPU. The only solution to this is to have images of `a1` and `a2` in the host memory and to copy them to the GPU before calling the CUDA kernel. One must not forget to modify the pointers in the `tuple` to point to the array copies on the GPU. To simplify this, TNL offers *cross-device shared smart pointers*. In addition to what common smart pointers do, they can manage images of an object on different devices. Note that [CUDA Unified Memory](https://devblogs.nvidia.com/unified-memory-cuda-beginners/) is an answer to this problem as well. TNL cross-device smart pointers can be more efficient in some situations. (TODO: Prove this with benchmark problem.)

@@ -105,3 +74,5 @@ The result looks as:
 \include SharedPointerExample.out

 ## Device pointers <a name="device_pointers"></a>
+
+The last type of smart pointer implemented in TNL is `DevicePointer`. It works the same way as `SharedPointer`, but it does not create a new object on the host system. `DevicePointer` is therefore useful in situations when there is already an object created in the host memory and we want to create its image on the device as well. Both images are linked with each other, so one can just manipulate the one on the host and then synchronize it to the device.