From f3da6becdd4d32a0d31227e471d039a741472d45 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 21 Nov 2019 20:47:19 +0100
Subject: [PATCH] Writing SharedPointer tutorial.

---
 .../Tutorials/Pointers/CMakeLists.txt         | 17 ++++-
 .../Pointers/SharedPointerExample.cpp         | 49 ++++++++++++
 .../Pointers/SharedPointerExample.cu          |  1 +
 .../Pointers/UniquePointerExample.cpp         | 28 ++++---
 .../Pointers/UniquePointerHostExample.cpp     | 23 ++++++
 .../Tutorials/Pointers/tutorial_Pointers.md   | 76 ++++++++++++++++++-
 6 files changed, 173 insertions(+), 21 deletions(-)
 create mode 100644 Documentation/Tutorials/Pointers/SharedPointerExample.cpp
 create mode 120000 Documentation/Tutorials/Pointers/SharedPointerExample.cu
 create mode 100644 Documentation/Tutorials/Pointers/UniquePointerHostExample.cpp

diff --git a/Documentation/Tutorials/Pointers/CMakeLists.txt b/Documentation/Tutorials/Pointers/CMakeLists.txt
index de824666cc..bf6581c52b 100644
--- a/Documentation/Tutorials/Pointers/CMakeLists.txt
+++ b/Documentation/Tutorials/Pointers/CMakeLists.txt
@@ -1,12 +1,23 @@
 IF( BUILD_CUDA )
    CUDA_ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cu )
    ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out )
+   CUDA_ADD_EXECUTABLE( SharedPointerExample SharedPointerExample.cu )
+   ADD_CUSTOM_COMMAND( COMMAND SharedPointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SharedPointerExample.out OUTPUT SharedPointerExample.out )
 ELSE()
-   ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cu )
+   ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cpp )
    ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out )
 ENDIF()
 
+ADD_EXECUTABLE( UniquePointerHostExample UniquePointerHostExample.cpp )
+ADD_CUSTOM_COMMAND( COMMAND UniquePointerHostExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerHostExample.out OUTPUT UniquePointerHostExample.out )
+
+
 IF( BUILD_CUDA )
-ADD_CUSTOM_TARGET( TutorialsPointers ALL DEPENDS
-   UniquePointerExample.out )
+ADD_CUSTOM_TARGET( TutorialsPointersCuda ALL DEPENDS
+   UniquePointerExample.out
+   SharedPointerExample.out )
 ENDIF()
+
+ADD_CUSTOM_TARGET( TutorialsPointers ALL DEPENDS
+   UniquePointerHostExample.out
+)
\ No newline at end of file
diff --git a/Documentation/Tutorials/Pointers/SharedPointerExample.cpp b/Documentation/Tutorials/Pointers/SharedPointerExample.cpp
new file mode 100644
index 0000000000..8df827b059
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/SharedPointerExample.cpp
@@ -0,0 +1,49 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Array.h>
+#include <TNL/Pointers/SharedPointer.h>
+
+using namespace TNL;
+
+using ArrayCuda = Containers::Array< int, Devices::Cuda >;
+
+struct Tuple
+{
+   Pointers::SharedPointer< ArrayCuda > a1, a2;
+};
+
+__global__ void printArrays( const Tuple t )
+{
+   printf( "Array sizes are: %d and %d\n", t.a1->getSize(), t.a2->getSize() );
+   for( int i = 0; i < t.a1->getSize(); i++ )
+      printf( "a1[ %d ] = %d, a2[ %d ] = %d\n", i, ( *t.a1 )[ i ], i, ( *t.a2 )[ i ] );
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Create a tuple of arrays and print them in a CUDA kernel.
+    */
+#ifdef HAVE_CUDA
+   Tuple t;
+   t.a1.modifyData< Devices::Host >().setSize( 10 );
+   t.a1.modifyData< Devices::Host >() = 1;
+   t.a2.modifyData< Devices::Host >().setSize( 10 );
+   t.a2.modifyData< Devices::Host >() = 2;
+
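+   /***
+    * So far the arrays have been modified via their host images only. The
+    * synchronization below copies the modified images of all registered
+    * smart pointers to the GPU before the kernel is launched.
+    */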
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+   printArrays<<< 1, 1 >>>( t );
+
+   /***
+    * Resize the arrays and print them again.
+    */
+   t.a1.modifyData< Devices::Host >().setSize( 5 );
+   t.a1.modifyData< Devices::Host >() = 3;
+   t.a2.modifyData< Devices::Host >().setSize( 5 );
+   t.a2.modifyData< Devices::Host >() = 4;
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+   printArrays<<< 1, 1 >>>( t );
+#endif
+   return EXIT_SUCCESS;
+
+}
diff --git a/Documentation/Tutorials/Pointers/SharedPointerExample.cu b/Documentation/Tutorials/Pointers/SharedPointerExample.cu
new file mode 120000
index 0000000000..7d10e33126
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/SharedPointerExample.cu
@@ -0,0 +1 @@
+SharedPointerExample.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/Pointers/UniquePointerExample.cpp b/Documentation/Tutorials/Pointers/UniquePointerExample.cpp
index 55eb9e9c3c..6f25305e9a 100644
--- a/Documentation/Tutorials/Pointers/UniquePointerExample.cpp
+++ b/Documentation/Tutorials/Pointers/UniquePointerExample.cpp
@@ -3,13 +3,11 @@
 #include <TNL/Containers/Array.h>
 #include <TNL/Pointers/UniquePointer.h>
 
-
 using namespace TNL;
 
-using ArrayHost = Containers::Array< int, Devices::Host >;
 using ArrayCuda = Containers::Array< int, Devices::Cuda >;
 
-__global__ void checkArray( const ArrayCuda* ptr )
+__global__ void printArray( const ArrayCuda* ptr )
 {
    printf( "Array size is: %d\n", ptr->getSize() );
    for( int i = 0; i < ptr->getSize(); i++ )
@@ -18,24 +16,24 @@ __global__ void checkArray( const ArrayCuda* ptr )
 int main( int argc, char* argv[] )
 {
-   /***
-    * Make unique pointer on array on CPU and manipulate the
-    * array via the pointer.
+   /***
+    * Create an array and print its elements in a CUDA kernel.
     */
-   Pointers::UniquePointer< ArrayHost > array_host_ptr( 10 );
-   *array_host_ptr = 1;
-   std::cout << "Array = " << *array_host_ptr << std::endl;
+#ifdef HAVE_CUDA
+   Pointers::UniquePointer< ArrayCuda > array_ptr( 10 );
+   array_ptr.modifyData< Devices::Host >() = 1;
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+   printArray<<< 1, 1 >>>( &array_ptr.getData< Devices::Cuda >() );
 
    /***
-    * Let's do the same in CUDA
+    * Resize the array and print it again.
     */
-#ifdef HAVE_CUDA
-   Pointers::UniquePointer< ArrayCuda > array_cuda_ptr( 10 );
-   array_cuda_ptr.modifyData< Devices::Host >() = 1;
-   //Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
-   //checkArray<<< 1, 1 >>>( &array_cuda_ptr.getData< Devices::Cuda >() );
+   array_ptr->setSize( 5 );
+   array_ptr.modifyData< Devices::Host >() = 2;
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+   printArray<<< 1, 1 >>>( &array_ptr.getData< Devices::Cuda >() );
 #endif
    return EXIT_SUCCESS;
+
 }
diff --git a/Documentation/Tutorials/Pointers/UniquePointerHostExample.cpp b/Documentation/Tutorials/Pointers/UniquePointerHostExample.cpp
new file mode 100644
index 0000000000..1fd0bba0e6
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/UniquePointerHostExample.cpp
@@ -0,0 +1,23 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Array.h>
+#include <TNL/Pointers/UniquePointer.h>
+
+using namespace TNL;
+
+using ArrayHost = Containers::Array< int, Devices::Host >;
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Make a unique pointer to an array on the CPU and manipulate the
+    * array via the pointer.
+    */
+   Pointers::UniquePointer< ArrayHost > array_ptr( 10 );
+   *array_ptr = 1;
+   std::cout << "Array size is " << array_ptr->getSize() << std::endl;
+   std::cout << "Array = " << *array_ptr << std::endl;
+   return EXIT_SUCCESS;
+}
+
+
diff --git a/Documentation/Tutorials/Pointers/tutorial_Pointers.md b/Documentation/Tutorials/Pointers/tutorial_Pointers.md
index 8e4106ad30..d8b3f907d6 100644
--- a/Documentation/Tutorials/Pointers/tutorial_Pointers.md
+++ b/Documentation/Tutorials/Pointers/tutorial_Pointers.md
@@ -2,7 +2,7 @@
 ## Introduction
 
-Smart pointers in TNL are motivated by the smart pointerin the STL library. In addition, they work across different devices and so they make data management easier.
+Smart pointers in TNL are motivated by the smart pointers in the STL library. In addition, they can manage images of the objects they hold on different devices, which makes offloading objects to these devices easier.
 
 ## Table of Contents
 1. [Unique pointers](#unique_pointers)
@@ -12,7 +12,7 @@ Smart pointers in TNL are motivated by the smart pointerin the STL library. In a
 
 ## Unique pointers <a name="unique_pointers"></a>
 
-Simillar to STL smart pointer `std::unique_ptr` `UniquePointer` is a smart poinetr managing certain dynamicaly allocated object. The object is automatically deallocated when the pointer goes out of scope. The definition of `UniquePointer` reads as:
+Similar to the STL smart pointer `std::unique_ptr`, `UniquePointer` is a smart pointer managing a certain dynamically allocated object. The object is automatically deallocated when the pointer goes out of scope. The definition of `UniquePointer` reads as:
 
 ```
 template< typename Object, typename Device = typename Object::DeviceType >
@@ -24,7 +24,16 @@ It takes two template parameters:
 1. `Object` is a type of object managed by the pointer.
 2. `Device` is a device where the object is to be allocated.
 
-If the device type is `Devices::Host`, `UniquePointer` behaves as usual unique smart pointer. If the device is different, `Devices::Cuda` for example, the unique pointer creates an image if the object even in the host memory. It means, that one can manipulate the object on the host. All smart pointers are registered in a special register using which they can be easily synchronised before calling a CUDA kernel. This means that all modified images of the objects in the memory are transferred on the GPU. See the following example:
+If the device type is `Devices::Host`, `UniquePointer` behaves as a usual unique smart pointer. See the following example:
+
+\include UniquePointerHostExample.cpp
+
+The result is:
+
+\include UniquePointerHostExample.out
+
+
+If the device is different, for example `Devices::Cuda`, the unique pointer creates an image of the object in the host memory as well. This means that one can manipulate the object on the host. All smart pointers are registered in a special register through which they can be easily synchronised with their host images before calling a CUDA kernel. This means that all modified images of the objects in the host memory are transferred to the GPU. See the following example:
 
 \include UniquePointerExample.cpp
 
@@ -32,6 +41,67 @@ The result looks as:
 
 \include UniquePointerExample.out
 
+A disadvantage of `UniquePointer` is that it cannot be passed to a CUDA kernel, since that requires making a copy of it and copying is, by the very nature of a unique pointer, prohibited. This limitation, among others, is solved by `SharedPointer`.
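+
+The following minimal sketch illustrates the limitation and the usual workaround of passing the address of the device image of the managed object (after the smart pointers have been synchronized). The kernel `useArray` is hypothetical and stands for any kernel working with the `ArrayCuda` type from the example above:
+
+```
+__global__ void useArray( const ArrayCuda* ptr );
+
+Pointers::UniquePointer< ArrayCuda > array_ptr( 10 );
+// useArray<<< 1, 1 >>>( array_ptr );  // does not compile, the unique pointer cannot be copied
+useArray<<< 1, 1 >>>( &array_ptr.getData< Devices::Cuda >() );  // pass the device image of the array instead
+```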
+
 ## Shared pointers <a name="shared_pointers"></a>
 
+One of the main goals of the TNL library is to make the development of HPC code, including GPU kernels, as easy and efficient as possible. One way to achieve this is to profit from object-oriented programming even in CUDA kernels. Let us explain it on arrays. From a certain point of view, `Array` can be understood as an object consisting of data and metadata. The data part means the elements that we insert into the array. The metadata is the pointer to the data but also the size of the array. This information makes the class easier to use. Though it is not necessary in all situations, it may help, for example, to check array bounds when accessing the array elements. When this is done even in CUDA kernels, it may help significantly with finding bugs in the code. To do this, we need to transfer to the GPU not only the pointer to the data but the complete metadata. This is simple if the structure which is supposed to be transferred to the GPU does not contain pointers to other structures. See the following example:
+
+```
+struct Array
+{
+   double* data;
+   int size;
+};
+```
+
+If the pointer `data` points to memory on the GPU, this array can be passed to a kernel like this:
+
+```
+Array a;
+cudaKernel<<< gridSize, blockSize >>>( a );
+```
+
+The kernel `cudaKernel` can access the data as follows:
+
+```
+__global__ void cudaKernel( Array a )
+{
+   if( threadIdx.x < a.size )
+      a.data[ threadIdx.x ] = 0;
+}
+```
+
+But what if we have an object like this:
+
+```
+struct ArrayTuple
+{
+   Array *a1, *a2;
+};
+```
+
+Assume that there is an instance of `ArrayTuple`, let us say `tuple`, containing pointers to instances `a1` and `a2` of `Array`. The instances must be allocated on the GPU if one wants to simply pass the `tuple` to a CUDA kernel. Indeed, the CUDA kernel needs the arrays `a1` and `a2` to be on the GPU. See the following example:
+
+```
+__global__ void tupleKernel( ArrayTuple tuple )
+{
+   if( threadIdx.x < tuple.a1->size )
+      tuple.a1->data[ threadIdx.x ] = 0;
+   if( threadIdx.x < tuple.a2->size )
+      tuple.a2->data[ threadIdx.x ] = 0;
+}
+```
+
+Note that the kernel needs to dereference `tuple.a1` and `tuple.a2`. Therefore these pointers must point to the global memory of the GPU, which means that the arrays `a1` and `a2` must be allocated there, for example using [cudaMalloc](http://developer.download.nvidia.com/compute/cuda/2_3/toolkit/docs/online/group__CUDART__MEMORY_gc63ffd93e344b939d6399199d8b12fef.html). This means, however, that the arrays `a1` and `a2` cannot be managed on the host system by the CPU (for example, resizing them requires changing `a1->size` and `a2->size`). The only solution is to keep images of `a1` and `a2` in the host memory and to copy them to the GPU before calling the CUDA kernel. One must not forget to modify the pointers in the `tuple` to point to the array copies on the GPU. To simplify this, TNL offers *cross-device shared smart pointers*. In addition to what common smart pointers do, they can manage images of an object on different devices. Note that [CUDA Unified Memory](https://devblogs.nvidia.com/unified-memory-cuda-beginners/) is an answer to this problem as well. TNL cross-device smart pointers can be more efficient in some situations. (TODO: Prove this with a benchmark problem.)
+
+The previous example could be implemented in TNL as follows:
+
+\include SharedPointerExample.cpp
+
+The result looks as follows:
+
+\include SharedPointerExample.out
+
 ## Device pointers <a name="device_pointers"></a>
-- 
GitLab