From 537407ec9a19e066f2711679c1cb8b5267994d9c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 25 Nov 2019 16:58:41 +0100
Subject: [PATCH] Writing documentation on DevicePointer.

---
 .../Tutorials/Pointers/CMakeLists.txt         |  5 +-
 .../Pointers/DevicePointerExample.cpp         | 54 +++++++++++++++++++
 .../Pointers/DevicePointerExample.cu          |  1 +
 .../Pointers/SharedPointerExample.cpp         |  4 +-
 .../Tutorials/Pointers/tutorial_Pointers.md   |  8 ++-
 src/TNL/Pointers/DevicePointer.h              | 20 +++++++
 6 files changed, 88 insertions(+), 4 deletions(-)
 create mode 100644 Documentation/Tutorials/Pointers/DevicePointerExample.cpp
 create mode 120000 Documentation/Tutorials/Pointers/DevicePointerExample.cu

diff --git a/Documentation/Tutorials/Pointers/CMakeLists.txt b/Documentation/Tutorials/Pointers/CMakeLists.txt
index bf6581c52b..0535e8fd5d 100644
--- a/Documentation/Tutorials/Pointers/CMakeLists.txt
+++ b/Documentation/Tutorials/Pointers/CMakeLists.txt
@@ -3,6 +3,8 @@ IF( BUILD_CUDA )
    ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out )
    CUDA_ADD_EXECUTABLE( SharedPointerExample SharedPointerExample.cu )
    ADD_CUSTOM_COMMAND( COMMAND SharedPointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SharedPointerExample.out OUTPUT SharedPointerExample.out )
+   CUDA_ADD_EXECUTABLE( DevicePointerExample DevicePointerExample.cu )
+   ADD_CUSTOM_COMMAND( COMMAND DevicePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DevicePointerExample.out OUTPUT DevicePointerExample.out )
 ELSE()
    ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cpp )
    ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out )
@@ -15,7 +17,8 @@ ADD_CUSTOM_COMMAND( COMMAND UniquePointerHostExample > ${TNL_DOCUMENTATION_OUTPU
 IF( BUILD_CUDA )
 ADD_CUSTOM_TARGET( TutorialsPointersCuda ALL DEPENDS
    UniquePointerExample.out
-   SharedPointerExample.out )
+   SharedPointerExample.out
+   DevicePointerExample.out )
 ENDIF()
 
 ADD_CUSTOM_TARGET( TutorialsPointers ALL DEPENDS
diff --git a/Documentation/Tutorials/Pointers/DevicePointerExample.cpp b/Documentation/Tutorials/Pointers/DevicePointerExample.cpp
new file mode 100644
index 0000000000..144ae98b0e
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/DevicePointerExample.cpp
@@ -0,0 +1,55 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Array.h>
+#include <TNL/Pointers/DevicePointer.h>
+
+using namespace TNL;
+
+using ArrayCuda = Containers::Array< int, Devices::Cuda >;
+
+struct Tuple
+{
+   Tuple( ArrayCuda& _a1, ArrayCuda& _a2 ):
+   a1( _a1 ), a2( _a2 ){};
+
+   Pointers::DevicePointer< ArrayCuda > a1, a2;
+};
+
+__global__ void printTuple( const Tuple t )
+{
+   printf( "Tuple size is: %d\n", t.a1->getSize() );
+   for( int i = 0; i < t.a1->getSize(); i++ )
+   {
+      printf( "a1[ %d ] = %d \n", i, ( *t.a1 )[ i ] );
+      printf( "a2[ %d ] = %d \n", i, ( *t.a2 )[ i ] );
+   }
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Create a tuple of arrays and print them in a CUDA kernel
+    */
+#ifdef HAVE_CUDA
+   ArrayCuda a1( 3 ), a2( 3 );
+   Tuple t( a1, a2 );
+   a1 = 1;
+   a2 = 2;
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+   printTuple<<< 1, 1 >>>( t );
+   cudaDeviceSynchronize();  // wait for the kernel and flush its printf output
+
+   /***
+    * Resize the arrays
+    */
+   a1.setSize( 5 );
+   a2.setSize( 5 );
+   a1 = 3;
+   a2 = 4;
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+   printTuple<<< 1, 1 >>>( t );
+   cudaDeviceSynchronize();  // ensure all output is flushed before exiting
+#endif
+   return EXIT_SUCCESS;
+}
+
diff --git a/Documentation/Tutorials/Pointers/DevicePointerExample.cu b/Documentation/Tutorials/Pointers/DevicePointerExample.cu
new file mode 120000
index 0000000000..b17ef30da0
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/DevicePointerExample.cu
@@ -0,0 +1 @@
+DevicePointerExample.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/Pointers/SharedPointerExample.cpp b/Documentation/Tutorials/Pointers/SharedPointerExample.cpp
index 5facaf2016..287aae8e8d 100644
--- a/Documentation/Tutorials/Pointers/SharedPointerExample.cpp
+++ b/Documentation/Tutorials/Pointers/SharedPointerExample.cpp
@@ -34,7 +34,7 @@ __global__ void printTuple( const Tuple t )
 int main( int argc, char* argv[] )
 {
    /***
-    * Create a tuple of arrays and print the in CUDA kernel
+    * Create a tuple of arrays and print them in a CUDA kernel
     */
 #ifdef HAVE_CUDA
    Tuple t( 3 );
@@ -44,7 +44,7 @@ int main( int argc, char* argv[] )
    printTuple<<< 1, 1 >>>( t );
 
    /***
-    * Resize the array
+    * Resize the arrays
     */
    t.setSize( 5 );
    *t.a1 = 3;
diff --git a/Documentation/Tutorials/Pointers/tutorial_Pointers.md b/Documentation/Tutorials/Pointers/tutorial_Pointers.md
index d8fbef6eec..f9ef457e4d 100644
--- a/Documentation/Tutorials/Pointers/tutorial_Pointers.md
+++ b/Documentation/Tutorials/Pointers/tutorial_Pointers.md
@@ -77,4 +77,10 @@ One of the differences between `UniquePointer` and `SmartPointer` is that the `S
 
 ## Device pointers <a name="device_pointers"></a>
 
-The last type of the smart pointer implemented in TNL is `DevicePointer`. It works the same way as `SharedPointer` but it does not create new object on the host system. `DevicePointer` is therefore useful in situation when there is already an object created in the host memory and we want to create its image even on the device. Both images are linked one with each other and so one can just manipulate the one on the host and then synchronize it on the device.
+The last type of the smart pointer implemented in TNL is `DevicePointer`. It works the same way as `SharedPointer` but it does not create a new object on the host system. `DevicePointer` is therefore useful in situations when there is already an object created in the host memory and we want to create its image on the device as well. Both images are linked to each other, so one can manipulate the one on the host and then synchronize it on the device. The following listing is a modification of the previous example with the tuple:
+
+\include DevicePointerExample.cpp
+
+The result looks the same:
+
+\include DevicePointerExample.out
diff --git a/src/TNL/Pointers/DevicePointer.h b/src/TNL/Pointers/DevicePointer.h
index 5276c3ed46..b72aaf9b1a 100644
--- a/src/TNL/Pointers/DevicePointer.h
+++ b/src/TNL/Pointers/DevicePointer.h
@@ -267,26 +267,46 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
          pointer.cuda_pointer = nullptr;
       }
 
+      __cuda_callable__
       const Object* operator->() const
       {
+#ifdef __CUDA_ARCH__
+         return this->cuda_pointer;
+#else
          return this->pointer;
+#endif
       }
 
+      __cuda_callable__
       Object* operator->()
       {
+#ifdef __CUDA_ARCH__
+         return this->cuda_pointer;
+#else
          this->pd->maybe_modified = true;
          return this->pointer;
+#endif
       }
 
+      __cuda_callable__
       const Object& operator *() const
       {
+#ifdef __CUDA_ARCH__
+         return *( this->cuda_pointer );
+#else
          return *( this->pointer );
+#endif
       }
 
+      __cuda_callable__
       Object& operator *()
       {
+#ifdef __CUDA_ARCH__
+         return *( this->cuda_pointer );
+#else
          this->pd->maybe_modified = true;
          return *( this->pointer );
+#endif
       }
 
       __cuda_callable__
-- 
GitLab