diff --git a/Documentation/Examples/Algorithms/CMakeLists.txt b/Documentation/Examples/Algorithms/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d0d1eda9b89edfa7d2c752187976f62c8635913b
--- /dev/null
+++ b/Documentation/Examples/Algorithms/CMakeLists.txt
@@ -0,0 +1,17 @@
+IF( BUILD_CUDA )
+   CUDA_ADD_EXECUTABLE(ParallelForExampleCuda ParallelForExample.cu)
+   ADD_CUSTOM_COMMAND( COMMAND ParallelForExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
+ELSE()
+   ADD_EXECUTABLE(ParallelForExample ParallelForExample.cpp)
+   ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
+ENDIF()
+
+IF( BUILD_CUDA )
+ADD_CUSTOM_TARGET( RunAlgorithmsExamples-cuda ALL DEPENDS
+   ParallelForExample.out
+ )
+ELSE()
+ADD_CUSTOM_TARGET( RunAlgorithmsExamples ALL DEPENDS
+   ParallelForExample.out
+ )
+ENDIF()
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/ParallelForExample-2D.cpp b/Documentation/Examples/Algorithms/ParallelForExample-2D.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..aafff2466415ed5f710eb59fd25dc1925b959b1e
--- /dev/null
+++ b/Documentation/Examples/Algorithms/ParallelForExample-2D.cpp
@@ -0,0 +1,45 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/ParallelFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+template< typename Device >
+void initMeshFunction( const int xSize,
+                       const int ySize,
+                       Vector< double, Device >& v,
+                       const double& c )
+{
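+   /****
+    * Get a vector view which can be captured by the lambda below.
+    */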
+   auto view = v.getView();
+   auto init = [=] __cuda_callable__  ( int i, int j, const int xSize, const double c ) mutable {
+      view[ j * xSize + i ] =  c; };
+   ParallelFor2D< Device >::exec( 0, 0, xSize, ySize, init, xSize, c );
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Define dimensions of 2D mesh function.
+    */
+   const int xSize( 10 ), ySize( 10 );
+   const int size = xSize * ySize;
+
+   /***
+    * First, test the mesh function initialization on CPU.
+    */
+   Vector< double, Devices::Host > host_v( size );
+   initMeshFunction( xSize, ySize, host_v, 1.0 );
+
+   /***
+    * And then also on GPU.
+    */
+#ifdef HAVE_CUDA
+   Vector< double, Devices::Cuda > cuda_v( size );
+   initMeshFunction( xSize, ySize, cuda_v, 1.0 );
+#endif
+   return EXIT_SUCCESS;
+}
+
diff --git a/Documentation/Examples/Algorithms/ParallelForExample-2D.cu b/Documentation/Examples/Algorithms/ParallelForExample-2D.cu
new file mode 120000
index 0000000000000000000000000000000000000000..937609f775d70b4124034bb317f59963eeae5424
--- /dev/null
+++ b/Documentation/Examples/Algorithms/ParallelForExample-2D.cu
@@ -0,0 +1 @@
+ParallelForExample-2D.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/ParallelForExample-3D.cpp b/Documentation/Examples/Algorithms/ParallelForExample-3D.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3cb9b5b6482c8270e53b0bca9708705a8b3c28c4
--- /dev/null
+++ b/Documentation/Examples/Algorithms/ParallelForExample-3D.cpp
@@ -0,0 +1,46 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/ParallelFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+template< typename Device >
+void initMeshFunction( const int xSize,
+                       const int ySize,
+                       const int zSize,
+                       Vector< double, Device >& v,
+                       const double& c )
+{
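+   /****
+    * Get a vector view which can be captured by the lambda below.
+    */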
+   auto view = v.getView();
+   auto init = [=] __cuda_callable__  ( int i, int j, int k, const int xSize, const int ySize, const double c ) mutable {
+      view[ ( k * ySize + j ) * xSize + i ] =  c; };
+   ParallelFor3D< Device >::exec( 0, 0, 0, xSize, ySize, zSize, init, xSize, ySize, c );
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Define dimensions of the 3D mesh function.
+    */
+   const int xSize( 10 ), ySize( 10 ), zSize( 10 );
+   const int size = xSize * ySize * zSize;
+
+   /***
+    * First, test the mesh function initialization on CPU.
+    */
+   Vector< double, Devices::Host > host_v( size );
+   initMeshFunction( xSize, ySize, zSize, host_v, 1.0 );
+
+   /***
+    * And then also on GPU.
+    */
+#ifdef HAVE_CUDA
+   Vector< double, Devices::Cuda > cuda_v( size );
+   initMeshFunction( xSize, ySize, zSize, cuda_v, 1.0 );
+#endif
+   return EXIT_SUCCESS;
+}
+
diff --git a/Documentation/Examples/Algorithms/ParallelForExample-3D.cu b/Documentation/Examples/Algorithms/ParallelForExample-3D.cu
new file mode 120000
index 0000000000000000000000000000000000000000..686a94df2249c803caf9facaab7e39d2656b0d47
--- /dev/null
+++ b/Documentation/Examples/Algorithms/ParallelForExample-3D.cu
@@ -0,0 +1 @@
+ParallelForExample-3D.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/ParallelForExample.cpp b/Documentation/Examples/Algorithms/ParallelForExample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9c056fa1d2800e337eba71f78ee8d2c0f6f594cd
--- /dev/null
+++ b/Documentation/Examples/Algorithms/ParallelForExample.cpp
@@ -0,0 +1,42 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/ParallelFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+
+/****
+ * Set all elements of the vector v to the constant c.
+ */
+template< typename Device >
+void initVector( Vector< double, Device >& v,
+                 const double& c )
+{
+   auto view = v.getView();
+   auto init = [=] __cuda_callable__  ( int i, const double c ) mutable {
+      view[ i ] = c; };
+
+   Algorithms::ParallelFor< Device >::exec( 0, v.getSize(), init, c );
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * First, test the vector initialization on CPU.
+    */
+   Vector< double, Devices::Host > host_v( 10 );
+   initVector( host_v, 1.0 );
+   std::cout << "host_v = " << host_v << std::endl;
+
+   /***
+    * And then also on GPU.
+    */
+#ifdef HAVE_CUDA
+   Vector< double, Devices::Cuda > cuda_v( 10 );
+   initVector( cuda_v, 1.0 );
+   std::cout << "cuda_v = " << cuda_v << std::endl;
+#endif
+   return EXIT_SUCCESS;
+}
+
diff --git a/Documentation/Examples/Algorithms/ParallelForExample.cu b/Documentation/Examples/Algorithms/ParallelForExample.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5714df7d3152a8b84f627cc377778793bccff5c4
--- /dev/null
+++ b/Documentation/Examples/Algorithms/ParallelForExample.cu
@@ -0,0 +1,59 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/ParallelFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+template< typename Device >
+void vectorSum( const Vector< double, Device >& v1,
+                const Vector< double, Device >& v2,
+                const double& c,
+                Vector< double, Device >& result )
+{
+   /****
+    * Get vectors view which can be captured by lambda.
+    */
+   auto v1_view = v1.getConstView();
+   auto v2_view = v2.getConstView();
+   auto result_view = result.getView();
+
+   /****
+    * The sum function.
+    */
+   auto sum = [=] __cuda_callable__  ( int i, const double c ) mutable {
+      result_view[ i ] = v1_view[ i ] + v2_view[ i ] + c; };
+
+      ParallelFor< Device >::exec( 0, v1.getSize(), sum, c );
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Firstly, test the vectors sum on CPU.
+    */
+   Vector< double, Devices::Host > host_v1( 10 ), host_v2( 10 ), host_result( 10 );
+   host_v1 = 1.0;
+   host_v2.evaluate( []__cuda_callable__ ( int i )->double { return i; } );
+   vectorSum( host_v1, host_v2, 2.0, host_result );
+   std::cout << "host_v1 = " << host_v1 << std::endl;
+   std::cout << "host_v2 = " << host_v2 << std::endl;
+   std::cout << "The sum of the vectors on CPU is " << host_result << "." << std::endl;
+
+   /***
+    * And then also on GPU.
+    */
+#ifdef HAVE_CUDA
+   Vector< double, Devices::Cuda > cuda_v1( 10 ), cuda_v2( 10 ), cuda_result( 10 );
+   cuda_v1 = 1.0;
+   cuda_v2.evaluate( []__cuda_callable__ ( int i )->double { return i; } );
+   vectorSum( cuda_v1, cuda_v2, 2.0, cuda_result );
+   std::cout << "cuda_v1 = " << cuda_v1 << std::endl;
+   std::cout << "cuda_v2 = " << cuda_v2 << std::endl;
+   std::cout << "The sum of the vectors on GPU is " << cuda_result << "." << std::endl;
+#endif
+   return EXIT_SUCCESS;
+}
+
diff --git a/Documentation/Examples/Algorithms/StaticForExample.cpp b/Documentation/Examples/Algorithms/StaticForExample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..47757458d71505fa585f3624575bcdaa55869602
--- /dev/null
+++ b/Documentation/Examples/Algorithms/StaticForExample.cpp
@@ -0,0 +1,28 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/StaticVector.h>
+#include <TNL/Algorithms/StaticFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+
+int main( int argc, char* argv[] )
+{
+   /****
+    * Create two static vectors
+    */
+   const int Size( 3 );
+   StaticVector< Size, double > a, b;
+   a = 1.0;
+   b = 2.0;
+   double sum( 0.0 );
+
+   /****
+    * Compute an addition of a vector and a constant number.
+    */
+   auto addition = [&]( int i, const double& c ) { a[ i ] = b[ i ] + c; sum += a[ i ]; };
+   Algorithms::StaticFor< 0, Size >::exec( addition, 3.14 );
+   std::cout << "a = " << a << std::endl;
+   std::cout << "sum = " << sum << std::endl;
+}
+
diff --git a/Documentation/Examples/Algorithms/TemplateStaticForExample.cpp b/Documentation/Examples/Algorithms/TemplateStaticForExample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a2fce79ae670bcda93da5fa6c1a3e95d1d260475
--- /dev/null
+++ b/Documentation/Examples/Algorithms/TemplateStaticForExample.cpp
@@ -0,0 +1,31 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/StaticVector.h>
+#include <TNL/Algorithms/TemplateStaticFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+
+const int Size( 5 );
+
+template< int I >
+struct LoopBody
+{
+   static void exec( const StaticVector< Size, double >& v ) {
+      std::cout << "v[ " << I << " ] = " << v[ I ] << std::endl;
+   }
+};
+
+int main( int argc, char* argv[] )
+{
+   /****
+    * Initialize the static vector
+    */
+   StaticVector< Size, double > v{ 1.0, 2.0, 3.0, 4.0, 5.0 };
+
+   /****
+    * Print out the vector using template parameters for indexing.
+    */
+   Algorithms::TemplateStaticFor< int, 0, Size, LoopBody >::exec( v );
+}
+
diff --git a/Documentation/Examples/CMakeLists.txt b/Documentation/Examples/CMakeLists.txt
index 817e04398031009b1485679d6585dc6221ef9d36..45689f9e93ce4284ac5c622715bb9c8cf54961c4 100644
--- a/Documentation/Examples/CMakeLists.txt
+++ b/Documentation/Examples/CMakeLists.txt
@@ -1,4 +1,6 @@
+ADD_SUBDIRECTORY( Algorithms )
 ADD_SUBDIRECTORY( Containers )
+ADD_SUBDIRECTORY( Pointers )
 
 ADD_EXECUTABLE( FileExample FileExample.cpp )
 ADD_CUSTOM_COMMAND( COMMAND FileExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/FileExample.out OUTPUT FileExample.out )
diff --git a/Documentation/Examples/Containers/VectorExample.cpp b/Documentation/Examples/Containers/VectorExample.cpp
index 798774c17c369218e2cd1c886452bbd2471d0a85..be2db767afcdabff51e7d6038674ae011285de4b 100644
--- a/Documentation/Examples/Containers/VectorExample.cpp
+++ b/Documentation/Examples/Containers/VectorExample.cpp
@@ -7,16 +7,15 @@ using namespace std;
 
 int main()
 {
-    Containers::Vector<int> vector1;
-    vector1.setSize(5);
-    vector1.setValue(0);
-    cout << "Does vector contain 1?" << vector1.containsValue(1) << endl;
-    cout << "Does vector contain only zeros?" << vector1.containsOnlyValue(0) << endl;
+    Containers::Vector<int> vector1( 5 );
+    vector1 = 0;
+    cout << "Does vector contain 1?" << vector1.containsValue( 1 ) << endl;
+    cout << "Does vector contain only zeros?" << vector1.containsOnlyValue( 0 ) << endl;
 
-    Containers::Vector<int> vector2(3);
-    vector2.setValue(1);
-    vector2.swap(vector1);
-    vector2.setElement(2,4);
+    Containers::Vector<int> vector2( 3 );
+    vector2 = 1;
+    vector2.swap( vector1 );
+    vector2.setElement( 2, 4 );
 
     cout << "First vector:" << vector1.getData() << endl;
     cout << "Second vector:" << vector2.getData() << endl;
@@ -24,10 +23,11 @@ int main()
     vector2.reset();
     cout << "Second vector after reset:" << vector2.getData() << endl;
 
-    /*Containers::Vector<int> vect = {1, 2, -3, 3};
-    cout << "The smallest element is:" << vect.min() << endl;
-    cout << "The absolute biggest element is:" << vect.absMax() << endl;
-    cout << "Sum of all vector elements:" << vect.sum() << endl;
-    vect.scalarMultiplication(2);*/
+    Containers::Vector<int> vect = { 1, 2, -3, 3 };
+    cout << "The smallest element is:" << min( vect ) << endl;
+    cout << "The absolute biggest element is:" << max( abs( vect ) ) << endl;
+    cout << "Sum of all vector elements:" << sum( vect ) << endl;
+    vect *= 2.0;
+    cout << "Vector multiplied by 2:" << vect << endl;
 }
 
diff --git a/Documentation/Examples/Pointers/CMakeLists.txt b/Documentation/Examples/Pointers/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ef7a5f6150c600bcc016ccad02c099052afd1ec4
--- /dev/null
+++ b/Documentation/Examples/Pointers/CMakeLists.txt
@@ -0,0 +1,15 @@
+IF( BUILD_CUDA )
+   CUDA_ADD_EXECUTABLE(UniquePointerExampleCuda UniquePointerExample.cu)
+   ADD_CUSTOM_COMMAND( COMMAND UniquePointerExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out )
+   CUDA_ADD_EXECUTABLE(SharedPointerExampleCuda SharedPointerExample.cu)
+   ADD_CUSTOM_COMMAND( COMMAND SharedPointerExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SharedPointerExample.out OUTPUT SharedPointerExample.out )
+   CUDA_ADD_EXECUTABLE(DevicePointerExampleCuda DevicePointerExample.cu)
+   ADD_CUSTOM_COMMAND( COMMAND DevicePointerExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DevicePointerExample.out OUTPUT DevicePointerExample.out )
+
+ADD_CUSTOM_TARGET( RunPointersExamples ALL DEPENDS
+   UniquePointerExample.out
+   SharedPointerExample.out
+   DevicePointerExample.out
+ )
+
+ENDIF()
diff --git a/Documentation/Examples/Pointers/DevicePointerExample.cpp b/Documentation/Examples/Pointers/DevicePointerExample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..897b92962ab8e9dd65aa973cd207708287327fed
--- /dev/null
+++ b/Documentation/Examples/Pointers/DevicePointerExample.cpp
@@ -0,0 +1,56 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Array.h>
+#include <TNL/Pointers/DevicePointer.h>
+
+using namespace TNL;
+
+using ArrayCuda = Containers::Array< int, Devices::Cuda >;
+
+struct Tuple
+{
+   Tuple( ArrayCuda& _a1, ArrayCuda& _a2 ):
+   a1( _a1 ), a2( _a2 ){};
+
+   Pointers::DevicePointer< ArrayCuda > a1, a2;
+};
+
+#ifdef HAVE_CUDA
+__global__ void printTuple( const Tuple t )
+{
+   printf( "Tuple size is: %d\n", t.a1->getSize() );
+   for( int i = 0; i < t.a1->getSize(); i++ )
+   {
+      printf( "a1[ %d ] = %d \n", i, ( *t.a1 )[ i ] );
+      printf( "a2[ %d ] = %d \n", i, ( *t.a2 )[ i ] );
+   }
+}
+#endif
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Create a tuple of arrays and print them in CUDA kernel
+    */
+#ifdef HAVE_CUDA
+   ArrayCuda a1( 3 ), a2( 3 );
+   Tuple t( a1, a2 );
+   a1 = 1;
+   a2 = 2;
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+   printTuple<<< 1, 1 >>>( t );
+
+   /***
+    * Resize the arrays
+    */
+   a1.setSize( 5 );
+   a2.setSize( 5 );
+   a1 = 3;
+   a2 = 4;
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+   printTuple<<< 1, 1 >>>( t );
+#endif
+   return EXIT_SUCCESS;
+
+}
+
diff --git a/Documentation/Examples/Pointers/DevicePointerExample.cu b/Documentation/Examples/Pointers/DevicePointerExample.cu
new file mode 120000
index 0000000000000000000000000000000000000000..b17ef30da03880e80daeb03fb1506bdc00e9b832
--- /dev/null
+++ b/Documentation/Examples/Pointers/DevicePointerExample.cu
@@ -0,0 +1 @@
+DevicePointerExample.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Pointers/SharedPointerExample.cpp b/Documentation/Examples/Pointers/SharedPointerExample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..be149518cc04e1c230397015d6733970527293d9
--- /dev/null
+++ b/Documentation/Examples/Pointers/SharedPointerExample.cpp
@@ -0,0 +1,60 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Array.h>
+#include <TNL/Pointers/SharedPointer.h>
+
+using namespace TNL;
+
+using ArrayCuda = Containers::Array< int, Devices::Cuda >;
+
+struct Tuple
+{
+   Tuple( const int size ):
+   a1( size ), a2( size ){};
+
+   void setSize( const int size )
+   {
+      a1->setSize( size );
+      a2->setSize( size );
+   }
+
+   Pointers::SharedPointer< ArrayCuda > a1, a2;
+};
+
+#ifdef HAVE_CUDA
+__global__ void printTuple( const Tuple t )
+{
+   printf( "Tuple size is: %d\n", t.a1->getSize() );
+   for( int i = 0; i < t.a1->getSize(); i++ )
+   {
+      printf( "a1[ %d ] = %d \n", i, ( *t.a1 )[ i ] );
+      printf( "a2[ %d ] = %d \n", i, ( *t.a2 )[ i ] );
+   }
+}
+#endif
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Create a tuple of arrays and print them in CUDA kernel
+    */
+#ifdef HAVE_CUDA
+   Tuple t( 3 );
+   *t.a1 = 1;
+   *t.a2 = 2;
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+   printTuple<<< 1, 1 >>>( t );
+
+   /***
+    * Resize the arrays
+    */
+   t.setSize( 5 );
+   *t.a1 = 3;
+   *t.a2 = 4;
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+   printTuple<<< 1, 1 >>>( t );
+#endif
+   return EXIT_SUCCESS;
+
+}
+
diff --git a/Documentation/Examples/Pointers/SharedPointerExample.cu b/Documentation/Examples/Pointers/SharedPointerExample.cu
new file mode 120000
index 0000000000000000000000000000000000000000..7d10e33126a284a5536999f2ea636e1da1e841b5
--- /dev/null
+++ b/Documentation/Examples/Pointers/SharedPointerExample.cu
@@ -0,0 +1 @@
+SharedPointerExample.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Pointers/UniquePointerExample.cpp b/Documentation/Examples/Pointers/UniquePointerExample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6a42d34b1b16674c76e199d335a6235b71a6c33d
--- /dev/null
+++ b/Documentation/Examples/Pointers/UniquePointerExample.cpp
@@ -0,0 +1,41 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Array.h>
+#include <TNL/Pointers/UniquePointer.h>
+
+using namespace TNL;
+
+using ArrayCuda = Containers::Array< int, Devices::Cuda >;
+
+#ifdef HAVE_CUDA
+__global__ void printArray( const ArrayCuda* ptr )
+{
+   printf( "Array size is: %d\n", ptr->getSize() );
+   for( int i = 0; i < ptr->getSize(); i++ )
+      printf( "a[ %d ] = %d \n", i, ( *ptr )[ i ] );
+}
+#endif
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Create an array and print its elements in CUDA kernel
+    */
+#ifdef HAVE_CUDA
+   Pointers::UniquePointer< ArrayCuda > array_ptr( 10 );
+   array_ptr.modifyData< Devices::Host >() = 1;
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+   printArray<<< 1, 1 >>>( &array_ptr.getData< Devices::Cuda >() );
+
+   /***
+    * Resize the array and print it again
+    */
+   array_ptr.modifyData< Devices::Host >().setSize( 5 );
+   array_ptr.modifyData< Devices::Host >() = 2;
+   std::cout << array_ptr.modifyData< Devices::Host >().getSize() << std::endl;
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+   printArray<<< 1, 1 >>>( &array_ptr.getData< Devices::Cuda >() );
+#endif
+   return EXIT_SUCCESS;
+}
+
diff --git a/Documentation/Examples/Pointers/UniquePointerExample.cu b/Documentation/Examples/Pointers/UniquePointerExample.cu
new file mode 120000
index 0000000000000000000000000000000000000000..a7c9828d5b35a010795073ea20ccf4e54b000d24
--- /dev/null
+++ b/Documentation/Examples/Pointers/UniquePointerExample.cu
@@ -0,0 +1 @@
+UniquePointerExample.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/Arrays/tutorial_01_Arrays.md b/Documentation/Tutorials/Arrays/tutorial_Arrays.md
similarity index 99%
rename from Documentation/Tutorials/Arrays/tutorial_01_Arrays.md
rename to Documentation/Tutorials/Arrays/tutorial_Arrays.md
index cb07521dce8ee4df1c1c41eaf9833abb83a5b212..0d728935e35b5edbdf491b94be27d359c166a249 100644
--- a/Documentation/Tutorials/Arrays/tutorial_01_Arrays.md
+++ b/Documentation/Tutorials/Arrays/tutorial_Arrays.md
@@ -1,4 +1,4 @@
-\page tutorial_01_arrays  Arrays tutorial
+\page tutorial_Arrays  Arrays tutorial
 
 ## Introduction
 
diff --git a/Documentation/Tutorials/BuildWithTNL/CMakeLists.txt b/Documentation/Tutorials/BuildWithTNL/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/Documentation/Tutorials/BuildWithTNL/Makefile b/Documentation/Tutorials/BuildWithTNL/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..50f3297be0dc34f87dc0d3c96cdec65a33ca8ad3
--- /dev/null
+++ b/Documentation/Tutorials/BuildWithTNL/Makefile
@@ -0,0 +1,54 @@
+include Makefile.inc
+
+SUBDIRS = 
+HEADERS =
+SOURCES = tnl-make-example.cpp
+CUDA_SOURCES = tnl-make-example-cuda.cu
+TARGETS = tnl-make-example 
+CUDA_TARGETS = tnl-make-example-cuda
+
+FILES = Makefile \
+        Makefile.inc \
+        $(CUDA_SOURCES) \
+        $(SOURCES) \
+        $(HEADERS)
+
+SUBDIRSCLEAN=$(addsuffix clean,$(SUBDIRS))
+
+all: bin subdirs $(TARGETS) $(CUDA_TARGETS)
+
+.PHONY:	subdirs $(SUBDIRS)
+subdirs:    $(SUBDIRS)
+$(SUBDIRS):
+	$(MAKE) -C $@	
+
+bin:
+	mkdir -p bin
+
+install: all
+	mkdir -p $(INSTALL_DIR)/bin
+	cp bin/* $(INSTALL_DIR)/bin
+
+.PHONY:	clean
+clean:	$(SUBDIRSCLEAN) clean_curdir
+
+clean_curdir:
+	rm -f *.o
+	
+%clean:	%
+	$(MAKE) -C $< clean
+
+dist: clean
+	tar zcvf $(PROJECT_NAME)-src.tgz $(SUBDIRS) $(FILES)
+
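+# Link each host binary from its object file.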
+$(TARGETS): % : %.o
+	$(CXX) $(LDFLAGS) -o $@ $< $(LDLIBS)
+
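+# Link each CUDA binary from its .cu object file.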
+$(CUDA_TARGETS): % : %.cu.o
+	$(CUDA_CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS)
+
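+# Compile C++ sources into object files.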
+$(SOURCES:%.cpp=%.o): %.o: %.cpp
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $<
+
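+# Compile CUDA sources into object files with nvcc.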
+$(CUDA_SOURCES:%.cu=%.cu.o): %.cu.o : %.cu
+	$(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $<
\ No newline at end of file
diff --git a/Documentation/Tutorials/BuildWithTNL/Makefile.inc b/Documentation/Tutorials/BuildWithTNL/Makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..2d05fcdeeb4078e179156a22239e501697dab18c
--- /dev/null
+++ b/Documentation/Tutorials/BuildWithTNL/Makefile.inc
@@ -0,0 +1,48 @@
+# Replace the following with your project name
+PROJECT_NAME = tnl-make-example
+
+# Replace the following with your TNL installation path
+TNL_HEADERS = ${HOME}/.local/include
+INSTALL_DIR = ${HOME}/.local
+WITH_CUDA = yes
+WITH_OPENMP = yes
+WITH_DEBUG = no
+
+# If TNL is installed on your system, CUDA arch can be detected automatically using
+# a tool 'tnl-cuda-arch'. This is done by default if CUDA_ARCH is set to 'auto'. 
+# Otherwise, set it manually to the CUDA architecture number,
+# e.g. 50, 60, etc.
+CUDA_ARCH = auto
+
+# Set-up compilers
+CXX = g++
+CUDA_CXX = nvcc
+
+# Set-up CXX_FLAGS
+CXXFLAGS = -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable -Wno-unknown-pragmas -std=c++14 -I$(TNL_HEADERS)
+ifeq ($(WITH_DEBUG),yes)
+   CXXFLAGS += -O0 -g
+else
+   CXXFLAGS += -DNDEBUG -O3 -funroll-loops
+endif
+
+# Set-up CUDA_CXXFLAGS
+CUDA_CXXFLAGS = -Wno-deprecated-gpu-targets --expt-relaxed-constexpr --expt-extended-lambda --std=c++14 -I$(TNL_HEADERS)
+ifeq ($(WITH_CUDA),yes)
+   CUDA_CXXFLAGS += -DHAVE_CUDA
+   ifeq ($(CUDA_ARCH),auto)
+      CUDA_CXXFLAGS += `tnl-cuda-arch`
+   else
+      CUDA_CXXFLAGS += -gencode arch=compute_$(CUDA_ARCH),code=sm_$(CUDA_ARCH)
+   endif
+endif
+
+# Set-up CPPFLAGS
+CPPFLAGS = -MD -MP
+
+# Set-up LDFLAGS
+LDFLAGS += -lm
+ifeq ($(WITH_OPENMP),yes)
+   LDFLAGS += -lgomp
+endif
+
diff --git a/Documentation/Tutorials/BuildWithTNL/example-cuda-2.cpp b/Documentation/Tutorials/BuildWithTNL/example-cuda-2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7ea4ace2aefcb8c886869c3d1007149d616ac55a
--- /dev/null
+++ b/Documentation/Tutorials/BuildWithTNL/example-cuda-2.cpp
@@ -0,0 +1 @@
+#include "example-cuda-2.h"
\ No newline at end of file
diff --git a/Documentation/Tutorials/BuildWithTNL/example-cuda-2.cu b/Documentation/Tutorials/BuildWithTNL/example-cuda-2.cu
new file mode 120000
index 0000000000000000000000000000000000000000..13435476be445e8e3d0899e3000af5eced0b7413
--- /dev/null
+++ b/Documentation/Tutorials/BuildWithTNL/example-cuda-2.cu
@@ -0,0 +1 @@
+example-cuda-2.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/BuildWithTNL/example-cuda-2.h b/Documentation/Tutorials/BuildWithTNL/example-cuda-2.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d5606c78c789149b8b369a934ada6becfdd5980
--- /dev/null
+++ b/Documentation/Tutorials/BuildWithTNL/example-cuda-2.h
@@ -0,0 +1,27 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Array.h>
+#include <TNL/Devices/Cuda.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+
+int main( int argc, char* argv[] )
+{
+   /****
+    * Create an array on the host and print it on a console.
+    */
+   Array< int > host_array{ 1, 2, 3 };
+   std::cout << "host_array = " << host_array << std::endl;
+
+   /****
+    * Create another array on GPU and print it on a console as well.
+    */
+#ifdef HAVE_CUDA
+   Array< int, Devices::Cuda > device_array{ 4, 5, 6 };
+   std::cout << "device_array = " << device_array << std::endl;
+#endif
+   return EXIT_SUCCESS;
+}
+
+
diff --git a/Documentation/Tutorials/BuildWithTNL/example-cuda.cpp b/Documentation/Tutorials/BuildWithTNL/example-cuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..83ac8ca6c793118389b109f94a428c6fe2d1328b
--- /dev/null
+++ b/Documentation/Tutorials/BuildWithTNL/example-cuda.cpp
@@ -0,0 +1,25 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Array.h>
+#include <TNL/Devices/Cuda.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+
+int main( int argc, char* argv[] )
+{
+   /****
+    * Create an array on the host and print it on a console.
+    */
+   Array< int > host_array{ 1, 2, 3 };
+   std::cout << "host_array = " << host_array << std::endl;
+
+   /****
+    * Create another array on GPU and print it on a console as well.
+    */
+   Array< int, Devices::Cuda > device_array{ 4, 5, 6 };
+   std::cout << "device_array = " << device_array << std::endl;
+   return EXIT_SUCCESS;
+}
+
+
diff --git a/Documentation/Tutorials/BuildWithTNL/example-cuda.cu b/Documentation/Tutorials/BuildWithTNL/example-cuda.cu
new file mode 120000
index 0000000000000000000000000000000000000000..24b9f5933051bfc6748bdabe938dca650c37e94f
--- /dev/null
+++ b/Documentation/Tutorials/BuildWithTNL/example-cuda.cu
@@ -0,0 +1 @@
+example-cuda.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/BuildWithTNL/example-host.cpp b/Documentation/Tutorials/BuildWithTNL/example-host.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1d5d356fdbb561eaeeb9eeaa50acfdd55c48abe1
--- /dev/null
+++ b/Documentation/Tutorials/BuildWithTNL/example-host.cpp
@@ -0,0 +1,19 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Array.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+
+int main( int argc, char* argv[] )
+{
+   /****
+    * Create an array on the host and print it on a console.
+    */
+   Array< int > host_array{ 1, 2, 3 };
+   std::cout << "host_array = " << host_array << std::endl;
+
+   return EXIT_SUCCESS;
+}
+
+
diff --git a/Documentation/Tutorials/BuildWithTNL/tnl-make-example-cuda.cu b/Documentation/Tutorials/BuildWithTNL/tnl-make-example-cuda.cu
new file mode 120000
index 0000000000000000000000000000000000000000..13435476be445e8e3d0899e3000af5eced0b7413
--- /dev/null
+++ b/Documentation/Tutorials/BuildWithTNL/tnl-make-example-cuda.cu
@@ -0,0 +1 @@
+example-cuda-2.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/BuildWithTNL/tnl-make-example.cpp b/Documentation/Tutorials/BuildWithTNL/tnl-make-example.cpp
new file mode 120000
index 0000000000000000000000000000000000000000..13435476be445e8e3d0899e3000af5eced0b7413
--- /dev/null
+++ b/Documentation/Tutorials/BuildWithTNL/tnl-make-example.cpp
@@ -0,0 +1 @@
+example-cuda-2.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/BuildWithTNL/tutorial_building_applications_with_tnl.md b/Documentation/Tutorials/BuildWithTNL/tutorial_building_applications_with_tnl.md
new file mode 100644
index 0000000000000000000000000000000000000000..2d9572b765bb327b935b056d96ba99c431a7c23b
--- /dev/null
+++ b/Documentation/Tutorials/BuildWithTNL/tutorial_building_applications_with_tnl.md
@@ -0,0 +1,109 @@
+\page tutorial_building_applications_with_tnl  Building Applications with TNL
+
+## Introduction
+
+One may find it useful to read this tutorial before any other one to learn how to compile the examples. Here we explain how to build applications with TNL and we provide Makefile templates which can help you when you start developing programs with TNL. Since TNL is a header-only library, no linker setup is required.
+
+## Table of Contents
+1. [Compilation using command-line](#command_line)
+   1. [Compilation with `g++`](#command_line_gcc)
+   2. [Compilation with `nvcc` for CUDA](#command_line_nvcc)
+2. [Build with Makefile](#makefile)
+3. [Build with CMake](#cmake)
+
+## Compilation using command-line  <a name="command_line"></a>
+
+This section explains how to compile with and without CUDA support using different compilers. We start with the following simple example:
+
+\include example-host.cpp
+
+This short program just creates a new array, initializes it with the values 1, 2 and 3, and prints it on the console.
+
+### Compilation with `g++` <a name="command_line_gcc"></a>
+
+We assume that the code above is saved in a file `example-host.cpp`. With the GNU g++ compiler, the program can be compiled as follows:
+
+```
+g++ -std=c++14 -I${HOME}/.local/include/tnl example-host.cpp -o example-host
+```
+
+TNL requires the C++14 standard, which we enforce with the first parameter `-std=c++14`. Next, we need to tell the compiler the folder with the TNL headers, which is done with the flag `-I`. By default, TNL installs into `${HOME}/.local/include/tnl`. You may also replace this with the path where you have downloaded TNL. TNL is a header-only library and so it does not require any installation. Finally, we pass the source file `example-host.cpp` and set the name of the output binary with the parameter `-o`.
+
+### Compilation with `nvcc` for CUDA <a name="command_line_nvcc"></a>
+
+If you want to profit from the great performance of GPUs using CUDA you need to have the CUDA compiler `nvcc`. It can be obtained with the [CUDA toolkit](https://developer.nvidia.com/cuda-downloads). We first modify our program as follows:
+
+\include example-cuda.cpp
+
+We need to include the header `TNL/Devices/Cuda.h` and declare the new `device_array` with the template parameter `Devices::Cuda`. For more details see [the arrays tutorial](tutorial_Arrays.html). To compile the code above, invoke the following command:
+
+```
+nvcc -I${HOME}/.local/include/tnl example-cuda.cu -o example-cuda
+```
+
+After executing the binary `example-cuda`, we surprisingly get an error message:
+
+```
+host_array = [ 1, 2, 3 ]
+terminate called after throwing an instance of 'TNL::Exceptions::CudaSupportMissing'
+  what():  CUDA support is missing, but the program called a function which needs it. Please recompile the program with CUDA support.
+Aborted (core dumped)
+```
+
+The reason is that each piece of CUDA code in TNL is guarded by the macro `HAVE_CUDA`. Therefore we need to pass `-DHAVE_CUDA` to the compiler. The following command does it:
+
+```
+nvcc -DHAVE_CUDA -I${HOME}/.local/include/tnl example-cuda.cu -o example-cuda
+```
+
+Unfortunately, the `nvcc` compiler generates a lot of warnings. When used with TNL, the amount of code processed by `nvcc` is rather large and so you can get a great number of warnings, some of which are treated as errors by default. For this reason we recommend adding the following flags to `nvcc`:
+
+```
+-Wno-deprecated-gpu-targets --expt-relaxed-constexpr --expt-extended-lambda
+```
+
+The overall command looks as follows:
+
+```
+nvcc -Wno-deprecated-gpu-targets --expt-relaxed-constexpr --expt-extended-lambda -DHAVE_CUDA -I${HOME}/.local/include/tnl example-cuda.cu -o example-cuda
+```
+
+We suggest guarding the CUDA code with the macro `HAVE_CUDA` in your own projects as well. Our simple example then turns into the following:
+
+\include example-cuda-2.h
+
+The best way is to store this code in a header file, for example `example-cuda-2.h`, and include it in the files `example-cuda-2.cpp` and `example-cuda-2.cu` like this:
+
+\include example-cuda-2.cpp
+
+This allows you to compile with CUDA like this:
+
+```
+nvcc -Wno-deprecated-gpu-targets --expt-relaxed-constexpr --expt-extended-lambda -DHAVE_CUDA -I${HOME}/.local/include/tnl example-cuda-2.cu -o example-cuda-2
+```
+
+Or you may compile it without CUDA like this:
+
+```
+g++ -std=c++14 -I${HOME}/.local/include/tnl example-cuda-2.cpp -o example-cuda-2
+```
+
+Thus you have a single source code which you may easily compile with or without CUDA, depending on your needs.
+
+## Build with Makefile <a name="makefile"></a>
+
+Larger projects need to be managed by a build tool such as Make. In this section we propose a Makefile template which might help you create more complex applications with TNL. The basic setup is stored in the [Makefile.inc](../../BuildWithTNL/Makefile.inc) file:
+
+\include Makefile.inc
+
+In this file, you may define the name of your project (`PROJECT_NAME`), set the path to the TNL headers (`TNL_HEADERS`), set the installation directory (`INSTALL_DIR`), and turn on or off support for CUDA (`WITH_CUDA`), OpenMP (`WITH_OPENMP`) or debug mode (`WITH_DEBUG`). If you compile with CUDA, you may also set the CUDA architecture (`CUDA_ARCH`) of your system.
+
+The main [Makefile](../../BuildWithTNL/Makefile) looks as follows:
+
+\include Makefile
+
+If your project sources are split into several subdirectories, you may specify them in the variable `SUBDIRS`. Next, in the variables `HEADERS` and `SOURCES` you should list all header and source files in the current folder. The same holds for `CUDA_SOURCES`, which lists all .cu files in the current folder. `TARGETS` and `CUDA_TARGETS` give the names of the binaries to be built in the current folder.
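+
+Once the variables in `Makefile.inc` are adjusted, the project is built simply by running `make`. Any of the variables can also be overridden from the command line, for example (assuming the template above):
+
+```
+make                  # build with the settings from Makefile.inc
+make WITH_DEBUG=yes   # override a variable just for this build
+```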
+
+## Build with CMake <a name="cmake"></a>
+
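+TNL is a header-only library, so from the CMake point of view it is enough to enable C++14 and add the directory with the TNL headers to your targets. The following `CMakeLists.txt` is only a minimal sketch for the examples above (the project name, paths and the `WITH_CUDA` option are illustrative placeholders, not an official TNL CMake module); adjust it to your installation:
+
+```
+cmake_minimum_required( VERSION 3.8 )
+project( tnl-cmake-example CXX )
+
+# TNL requires the C++14 standard.
+set( CMAKE_CXX_STANDARD 14 )
+set( CMAKE_CXX_STANDARD_REQUIRED ON )
+
+# Path to the TNL headers, the same as used with g++ above.
+include_directories( $ENV{HOME}/.local/include/tnl )
+
+# Host-only example.
+add_executable( example-host example-host.cpp )
+
+# Optional CUDA build, guarded by HAVE_CUDA in the same way as in the Makefile template.
+option( WITH_CUDA "Build the CUDA examples" OFF )
+if( WITH_CUDA )
+   enable_language( CUDA )
+   set( CMAKE_CUDA_STANDARD 14 )
+   add_executable( example-cuda-2 example-cuda-2.cu )
+   target_compile_definitions( example-cuda-2 PRIVATE HAVE_CUDA )
+   target_compile_options( example-cuda-2 PRIVATE --expt-relaxed-constexpr --expt-extended-lambda )
+endif()
+```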
+
diff --git a/Documentation/Tutorials/CMakeLists.txt b/Documentation/Tutorials/CMakeLists.txt
index 690cf93a0c1be661d242d68ddd7d54486e5aeaf5..56fbb202c7e72dbae33177b675235ff784db7044 100644
--- a/Documentation/Tutorials/CMakeLists.txt
+++ b/Documentation/Tutorials/CMakeLists.txt
@@ -1,3 +1,6 @@
+add_subdirectory( BuildWithTNL )
 add_subdirectory( Arrays )
 add_subdirectory( Vectors )
-add_subdirectory( ReductionAndScan )
\ No newline at end of file
+add_subdirectory( ReductionAndScan )
+add_subdirectory( ForLoops )
+add_subdirectory( Pointers )
diff --git a/Documentation/Tutorials/ForLoops/CMakeLists.txt b/Documentation/Tutorials/ForLoops/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..738b1002043ddbe32148d539fc6e2b7f2f78dcc2
--- /dev/null
+++ b/Documentation/Tutorials/ForLoops/CMakeLists.txt
@@ -0,0 +1,22 @@
+IF( BUILD_CUDA )
+   CUDA_ADD_EXECUTABLE( ParallelForExample ParallelForExample.cu )
+   ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
+   CUDA_ADD_EXECUTABLE( ParallelForExample-2D ParallelForExample-2D.cu )
+   CUDA_ADD_EXECUTABLE( ParallelForExample-3D ParallelForExample-3D.cu )
+ELSE()
+   ADD_EXECUTABLE( ParallelForExample-2D ParallelForExample-2D.cpp )
+   ADD_EXECUTABLE( ParallelForExample-3D ParallelForExample-3D.cpp )
+ENDIF()
+
+ADD_EXECUTABLE( StaticForExample StaticForExample.cpp )
+ADD_CUSTOM_COMMAND( COMMAND StaticForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/StaticForExample.out OUTPUT StaticForExample.out )
+
+ADD_EXECUTABLE( TemplateStaticForExample TemplateStaticForExample.cpp )
+ADD_CUSTOM_COMMAND( COMMAND TemplateStaticForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/TemplateStaticForExample.out OUTPUT TemplateStaticForExample.out )
+
+IF( BUILD_CUDA )
+ADD_CUSTOM_TARGET( ForLoops-cuda ALL DEPENDS
+   ParallelForExample.out
+   StaticForExample.out
+   TemplateStaticForExample.out )
+ENDIF()
diff --git a/Documentation/Tutorials/ForLoops/ParallelFor2D-snippet.cpp b/Documentation/Tutorials/ForLoops/ParallelFor2D-snippet.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..40f29313a6189b8576ae7f9a80ee48ce5f9eda39
--- /dev/null
+++ b/Documentation/Tutorials/ForLoops/ParallelFor2D-snippet.cpp
@@ -0,0 +1,3 @@
+for( Index j = startY; j < endY; j++ )
+   for( Index i = startX; i < endX; i++ )
+      f( i, j, args... );
diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample-2D.cpp b/Documentation/Tutorials/ForLoops/ParallelForExample-2D.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..388c326ec2085d10ba50d265ee6bd8763310ae5a
--- /dev/null
+++ b/Documentation/Tutorials/ForLoops/ParallelForExample-2D.cpp
@@ -0,0 +1,61 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/ParallelFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+
+template< typename Device >
+void meshFunctionSum( const int xSize,
+                      const int ySize,
+                      const Vector< double, Device >& v1,
+                      const Vector< double, Device >& v2,
+                      const double& c,
+                      Vector< double, Device >& result )
+{
+   /****
+    * Get vectors view which can be captured by lambda.
+    */
+   auto v1_view = v1.getConstView();
+   auto v2_view = v2.getConstView();
+   auto result_view = result.getView();
+
+   /****
+    * The sum function.
+    */
+   auto sum = [=] __cuda_callable__  ( int i, int j, const int xSize, const double c ) mutable {
+      const int idx = j * xSize + i;
+      result_view[ idx ] = v1_view[ idx ] + v2_view[ idx ] + c; };
+
+   Algorithms::ParallelFor2D< Device >::exec( 0, 0, xSize, ySize, sum, xSize, c );
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Define dimensions of 2D mesh function.
+    */
+   const int xSize( 10 ), ySize( 10 );
+   const int size = xSize * ySize;
+
+   /***
+    * Firstly, test the mesh functions sum on CPU.
+    */
+   Vector< double, Devices::Host > host_v1( size ), host_v2( size ), host_result( size );
+   host_v1 = 1.0;
+   host_v2 = 2.0;
+   meshFunctionSum( xSize, ySize, host_v1, host_v2, 2.0, host_result );
+
+   /***
+    * And then also on GPU.
+    */
+#ifdef HAVE_CUDA
+   Vector< double, Devices::Cuda > cuda_v1( size ), cuda_v2( size ), cuda_result( size );
+   cuda_v1 = 1.0;
+   cuda_v2 = 2.0;
+   meshFunctionSum( xSize, ySize, cuda_v1, cuda_v2, 2.0, cuda_result );
+#endif
+   return EXIT_SUCCESS;
+}
+
diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample-2D.cu b/Documentation/Tutorials/ForLoops/ParallelForExample-2D.cu
new file mode 120000
index 0000000000000000000000000000000000000000..937609f775d70b4124034bb317f59963eeae5424
--- /dev/null
+++ b/Documentation/Tutorials/ForLoops/ParallelForExample-2D.cu
@@ -0,0 +1 @@
+ParallelForExample-2D.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample-3D.cpp b/Documentation/Tutorials/ForLoops/ParallelForExample-3D.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..37e07c75ec1857d6792c43b11c1daf022dd1d2e9
--- /dev/null
+++ b/Documentation/Tutorials/ForLoops/ParallelForExample-3D.cpp
@@ -0,0 +1,62 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/ParallelFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+
+template< typename Device >
+void meshFunctionSum( const int xSize,
+                      const int ySize,
+                      const int zSize,
+                      const Vector< double, Device >& v1,
+                      const Vector< double, Device >& v2,
+                      const double& c,
+                      Vector< double, Device >& result )
+{
+   /****
+    * Get vectors view which can be captured by lambda.
+    */
+   auto v1_view = v1.getConstView();
+   auto v2_view = v2.getConstView();
+   auto result_view = result.getView();
+
+   /****
+    * The sum function.
+    */
+   auto sum = [=] __cuda_callable__  ( int i, int j, int k, const int xSize, const int ySize, const double c ) mutable {
+      const int idx = ( k * ySize + j ) * xSize + i;
+      result_view[ idx ] = v1_view[ idx ] + v2_view[ idx ] + c; };
+
+   Algorithms::ParallelFor3D< Device >::exec( 0, 0, 0, xSize, ySize, zSize, sum, xSize, ySize, c );
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Define dimensions of 3D mesh function.
+    */
+   const int xSize( 10 ), ySize( 10 ), zSize( 10 );
+   const int size = xSize * ySize * zSize;
+
+   /***
+    * Firstly, test the mesh functions sum on CPU.
+    */
+   Vector< double, Devices::Host > host_v1( size ), host_v2( size ), host_result( size );
+   host_v1 = 1.0;
+   host_v2 = 2.0;
+   meshFunctionSum( xSize, ySize, zSize, host_v1, host_v2, 2.0, host_result );
+
+   /***
+    * And then also on GPU.
+    */
+#ifdef HAVE_CUDA
+   Vector< double, Devices::Cuda > cuda_v1( size ), cuda_v2( size ), cuda_result( size );
+   cuda_v1 = 1.0;
+   cuda_v2 = 2.0;
+   meshFunctionSum( xSize, ySize, zSize, cuda_v1, cuda_v2, 2.0, cuda_result );
+#endif
+   return EXIT_SUCCESS;
+}
+
diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample-3D.cu b/Documentation/Tutorials/ForLoops/ParallelForExample-3D.cu
new file mode 120000
index 0000000000000000000000000000000000000000..686a94df2249c803caf9facaab7e39d2656b0d47
--- /dev/null
+++ b/Documentation/Tutorials/ForLoops/ParallelForExample-3D.cu
@@ -0,0 +1 @@
+ParallelForExample-3D.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample.cpp b/Documentation/Tutorials/ForLoops/ParallelForExample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8e5f4e8b24f6c4e4464d550dbf6df6f88ebe0706
--- /dev/null
+++ b/Documentation/Tutorials/ForLoops/ParallelForExample.cpp
@@ -0,0 +1,58 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/ParallelFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+
+template< typename Device >
+void vectorSum( const Vector< double, Device >& v1,
+                const Vector< double, Device >& v2,
+                const double& c,
+                Vector< double, Device >& result )
+{
+   /****
+    * Get vectors view which can be captured by lambda.
+    */
+   auto v1_view = v1.getConstView();
+   auto v2_view = v2.getConstView();
+   auto result_view = result.getView();
+
+   /****
+    * The sum function.
+    */
+   auto sum = [=] __cuda_callable__  ( int i, const double c ) mutable {
+      result_view[ i ] = v1_view[ i ] + v2_view[ i ] + c; };
+
+   Algorithms::ParallelFor< Device >::exec( 0, v1.getSize(), sum, c );
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Firstly, test the vectors sum on CPU.
+    */
+   Vector< double, Devices::Host > host_v1( 10 ), host_v2( 10 ), host_result( 10 );
+   host_v1 = 1.0;
+   host_v2.evaluate( []__cuda_callable__ ( int i )->double { return i; } );
+   vectorSum( host_v1, host_v2, 2.0, host_result );
+   std::cout << "host_v1 = " << host_v1 << std::endl;
+   std::cout << "host_v2 = " << host_v2 << std::endl;
+   std::cout << "The sum of the vectors on CPU is " << host_result << "." << std::endl;
+
+   /***
+    * And then also on GPU.
+    */
+#ifdef HAVE_CUDA
+   Vector< double, Devices::Cuda > cuda_v1( 10 ), cuda_v2( 10 ), cuda_result( 10 );
+   cuda_v1 = 1.0;
+   cuda_v2.evaluate( []__cuda_callable__ ( int i )->double { return i; } );
+   vectorSum( cuda_v1, cuda_v2, 2.0, cuda_result );
+   std::cout << "cuda_v1 = " << cuda_v1 << std::endl;
+   std::cout << "cuda_v2 = " << cuda_v2 << std::endl;
+   std::cout << "The sum of the vectors on GPU is " << cuda_result << "." << std::endl;
+#endif
+   return EXIT_SUCCESS;
+}
+
diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample.cu b/Documentation/Tutorials/ForLoops/ParallelForExample.cu
new file mode 120000
index 0000000000000000000000000000000000000000..fba5e081637e867ece8d0675bcd0d2bec92501d4
--- /dev/null
+++ b/Documentation/Tutorials/ForLoops/ParallelForExample.cu
@@ -0,0 +1 @@
+ParallelForExample.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ForLoops/StaticForExample-2.cpp b/Documentation/Tutorials/ForLoops/StaticForExample-2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7ee4afd72c42e2bf0fd8db28bd3f5e7c3c47cc0f
--- /dev/null
+++ b/Documentation/Tutorials/ForLoops/StaticForExample-2.cpp
@@ -0,0 +1,4 @@
+for( int i = 0; i < Size; i++ )
+{
+   a[ i ] = b[ i ] + c; sum += a[ i ];
+};
diff --git a/Documentation/Tutorials/ForLoops/StaticForExample-3.cpp b/Documentation/Tutorials/ForLoops/StaticForExample-3.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5298b00a138b547d4fac56af327a146771eae13e
--- /dev/null
+++ b/Documentation/Tutorials/ForLoops/StaticForExample-3.cpp
@@ -0,0 +1 @@
+Algorithms::StaticFor< 0, Size, true >::exec( addition, 3.14 );
\ No newline at end of file
diff --git a/Documentation/Tutorials/ForLoops/StaticForExample.cpp b/Documentation/Tutorials/ForLoops/StaticForExample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..47757458d71505fa585f3624575bcdaa55869602
--- /dev/null
+++ b/Documentation/Tutorials/ForLoops/StaticForExample.cpp
@@ -0,0 +1,28 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/StaticVector.h>
+#include <TNL/Algorithms/StaticFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+
+int main( int argc, char* argv[] )
+{
+   /****
+    * Create two static vectors
+    */
+   const int Size( 3 );
+   StaticVector< Size, double > a, b;
+   a = 1.0;
+   b = 2.0;
+   double sum( 0.0 );
+
+   /****
+    * Compute an addition of a vector and a constant number.
+    */
+   auto addition = [&]( int i, const double& c ) { a[ i ] = b[ i ] + c; sum += a[ i ]; };
+   Algorithms::StaticFor< 0, Size >::exec( addition, 3.14 );
+   std::cout << "a = " << a << std::endl;
+   std::cout << "sum = " << sum << std::endl;
+}
+
diff --git a/Documentation/Tutorials/ForLoops/TemplateStaticForExample.cpp b/Documentation/Tutorials/ForLoops/TemplateStaticForExample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..eb65fd6ccb90580672699a323269888c31b4415b
--- /dev/null
+++ b/Documentation/Tutorials/ForLoops/TemplateStaticForExample.cpp
@@ -0,0 +1,32 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/StaticVector.h>
+#include <TNL/Algorithms/TemplateStaticFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+
+using Index = int;
+const Index Size( 5 );
+
+template< Index I >
+struct LoopBody
+{
+   static void exec( const StaticVector< Size, double >& v ) {
+      std::cout << "v[ " << I << " ] = " << v[ I ] << std::endl;
+   }
+};
+
+int main( int argc, char* argv[] )
+{
+   /****
+    * Initialize the static vector
+    */
+   StaticVector< Size, double > v{ 1.0, 2.0, 3.0, 4.0, 5.0 };
+
+   /****
+    * Print out the vector using template parameters for indexing.
+    */
+   Algorithms::TemplateStaticFor< Index, 0, Size, LoopBody >::exec( v );
+}
+
diff --git a/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md b/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md
new file mode 100644
index 0000000000000000000000000000000000000000..ba6f3ea2d481c0864961d460fead6a92d5f567e5
--- /dev/null
+++ b/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md
@@ -0,0 +1,100 @@
+\page tutorial_ForLoops For loops
+
+## Introduction
+
+This tutorial shows how to use the different kinds of for loops implemented in TNL. Namely, they are:
+
+* **Parallel For** is a for loop which can be run in parallel, i.e. all iterations of the loop must be independent. Parallel for can run on both multicore CPUs and GPUs.
+* **n-dimensional Parallel For** is an extension of the common parallel for into more dimensions.
+* **Static For** is a for loop which is performed sequentially and is explicitly unrolled by C++ templates. The number of iterations must be static (known at compile time).
+* **Templated Static For** is a for loop whose iterations are performed over template parameters, i.e. the loop is resolved at compile time.
+
+## Table of Contents
+1. [Parallel For](#parallel_for)
+2. [n-dimensional Parallel For](#n_dimensional_parallel_for)
+3. [Static For](#static_for)
+4. [Templated Static For](#templated_static_for)
+
+## Parallel For<a name="parallel_for"></a>
+
+The basic parallel for construction in TNL allows expressing parallel for loops in a hardware-platform-transparent way. The hardware platform is selected by a template parameter. The parallel for is defined as:
+
+```
+ParallelFor< Device >::exec( start, end, function, arguments... );
+```
+
+The `Device` can be either `Devices::Host` or `Devices::Cuda`. The first two parameters define the loop bounds in the C style, which means that there will be iterations for the indexes `start`, ..., `end-1`. The `function` is a lambda function to be performed in each iteration. It is supposed to receive the iteration index and the arguments passed to the parallel for (the trailing `arguments...`). See the following example:
+
+\include ParallelForExample.cpp
+
+The result is:
+
+\include ParallelForExample.out 
+
+## n-dimensional Parallel For<a name="n_dimensional_parallel_for"></a>
+
+Performing for-loops in higher dimensions is similar. In the following example we build a 2D mesh function on top of a TNL vector. The two-dimensional indexes `( i, j )` are mapped to the vector index `idx` as `idx = j * xSize + i`, where the mesh function has dimensions `xSize * ySize`. Of course, in this simple example it does not make much sense to compute a sum of two mesh functions this way; it serves only as an illustration.
+
+\include ParallelForExample-2D.cpp
+
+Notice the parameters of the lambda function `sum`. The first parameter `i` changes more often than `j` and therefore the index mapping has the form `j * xSize + i`, which accesses the vector elements sequentially on CPU and achieves coalesced memory accesses on GPU. The for-loop is executed by calling `ParallelFor2D` with the proper device. The first four parameters are `startX, startY, endX, endY`, and on CPU this is equivalent to the following nested for loops:
+
+\include ParallelFor2D-snippet.cpp
+
+where `args...` stands for the additional arguments passed to the for-loop. After the parameters defining the loop bounds, the lambda function (`sum` in this case) is passed, followed by the additional arguments. One of them, in our example, is `xSize` again, because it must be passed to the lambda function for the index mapping computation.
+
+For completeness, we show a modification of the previous example in 3D:
+
+\include ParallelForExample-3D.cpp
+
+## Static For<a name="static_for"></a>
+
+The static for-loop is designed for short loops with a constant (i.e. known at compile time) number of iterations. It is often used with static arrays and vectors. An advantage of this kind of for loop is that it is explicitly unrolled when the loop is short (up to eight iterations). See the following example:
+
+\include StaticForExample.cpp
+
+Notice that the static for-loop works with a lambda function similarly to the parallel for-loop. The bounds of the loop are passed as template parameters in the statement `Algorithms::StaticFor< 0, Size >`. The parameters of the static method `exec` are the lambda function to be performed in each iteration and the auxiliary data to be passed to it. The function gets the loop index `i` first, followed by the auxiliary data, in this example the constant `c` with the value 3.14.
+
+The result looks as:
+
+\include StaticForExample.out
+
+The effect of `StaticFor` is really the same as that of a usual for-loop. The following code does the same as the previous example:
+
+\include StaticForExample-2.cpp
+
+The benefit of `StaticFor` is mainly the explicit unrolling of short loops, which can improve the performance in some situations. `StaticFor` can be forced to do the loop unrolling in any situation by setting the third template parameter to `true` as follows:
+
+\include StaticForExample-3.cpp
+
+`StaticFor` can also be used in CUDA kernels.
+
+## Templated Static For<a name="templated_static_for"></a>
+
+The templated static for-loop (`TemplateStaticFor`) is a for-loop over template parameters. For example, if the class `LoopBody` is defined as
+
+```
+template< int i >
+struct LoopBody
+{
+   static void exec() { ... };
+};
+```
+
+one might need to execute the following sequence of statements:
+
+```
+LoopBody< 0 >::exec();
+LoopBody< 1 >::exec();
+LoopBody< 2 >::exec();
+...
+LoopBody< N >::exec();
+```
+
+This is exactly what `TemplateStaticFor` can do, in a slightly more general way. See the following example:
+
+\include TemplateStaticForExample.cpp
+
+The output looks as follows:
+
+\include TemplateStaticForExample.out
diff --git a/Documentation/Tutorials/Pointers/CMakeLists.txt b/Documentation/Tutorials/Pointers/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0535e8fd5df0c242c4df984a483ec6a34dd32e46
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/CMakeLists.txt
@@ -0,0 +1,26 @@
+IF( BUILD_CUDA )
+   CUDA_ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cu )
+   ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out )
+   CUDA_ADD_EXECUTABLE( SharedPointerExample SharedPointerExample.cu )
+   ADD_CUSTOM_COMMAND( COMMAND SharedPointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SharedPointerExample.out OUTPUT SharedPointerExample.out )
+   CUDA_ADD_EXECUTABLE( DevicePointerExample DevicePointerExample.cu )
+   ADD_CUSTOM_COMMAND( COMMAND DevicePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DevicePointerExample.out OUTPUT DevicePointerExample.out )
+ELSE()
+   ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cpp )
+   ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out )
+ENDIF()
+
+ADD_EXECUTABLE( UniquePointerHostExample UniquePointerHostExample.cpp )
+ADD_CUSTOM_COMMAND( COMMAND UniquePointerHostExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerHostExample.out OUTPUT UniquePointerHostExample.out )
+
+
+IF( BUILD_CUDA )
+ADD_CUSTOM_TARGET( TutorialsPointersCuda ALL DEPENDS
+   UniquePointerExample.out
+   SharedPointerExample.out
+   DevicePointerExample.out )
+ENDIF()
+
+ADD_CUSTOM_TARGET( TutorialsPointers ALL DEPENDS
+   UniquePointerHostExample.out
+)
\ No newline at end of file
diff --git a/Documentation/Tutorials/Pointers/DevicePointerExample.cpp b/Documentation/Tutorials/Pointers/DevicePointerExample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..144ae98b0e57a4aab79eb6d3e4aa20135dc12ca7
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/DevicePointerExample.cpp
@@ -0,0 +1,54 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Array.h>
+#include <TNL/Pointers/DevicePointer.h>
+
+using namespace TNL;
+
+using ArrayCuda = Containers::Array< int, Devices::Cuda >;
+
+struct Tuple
+{
+   Tuple( ArrayCuda& _a1, ArrayCuda& _a2 ):
+   a1( _a1 ), a2( _a2 ){};
+
+   Pointers::DevicePointer< ArrayCuda > a1, a2;
+};
+
+#ifdef HAVE_CUDA
+__global__ void printTuple( const Tuple t )
+{
+   printf( "Tuple size is: %d\n", t.a1->getSize() );
+   for( int i = 0; i < t.a1->getSize(); i++ )
+   {
+      printf( "a1[ %d ] = %d \n", i, ( *t.a1 )[ i ] );
+      printf( "a2[ %d ] = %d \n", i, ( *t.a2 )[ i ] );
+   }
+}
+#endif
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Create a tuple of arrays and print them in CUDA kernel
+    */
+#ifdef HAVE_CUDA
+   ArrayCuda a1( 3 ), a2( 3 );
+   Tuple t( a1, a2 );
+   a1 = 1;
+   a2 = 2;
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+   printTuple<<< 1, 1 >>>( t );
+
+   /***
+    * Resize the arrays
+    */
+   a1.setSize( 5 );
+   a2.setSize( 5 );
+   a1 = 3;
+   a2 = 4;
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+   printTuple<<< 1, 1 >>>( t );
+#endif
+   return EXIT_SUCCESS;
+
+}
+
diff --git a/Documentation/Tutorials/Pointers/DevicePointerExample.cu b/Documentation/Tutorials/Pointers/DevicePointerExample.cu
new file mode 120000
index 0000000000000000000000000000000000000000..b17ef30da03880e80daeb03fb1506bdc00e9b832
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/DevicePointerExample.cu
@@ -0,0 +1 @@
+DevicePointerExample.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/Pointers/SharedPointerExample.cpp b/Documentation/Tutorials/Pointers/SharedPointerExample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..287aae8e8dd2f9faf3c2ebeb86670f5e77f489a0
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/SharedPointerExample.cpp
@@ -0,0 +1,58 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Array.h>
+#include <TNL/Pointers/SharedPointer.h>
+
+using namespace TNL;
+
+using ArrayCuda = Containers::Array< int, Devices::Cuda >;
+
+struct Tuple
+{
+   Tuple( const int size ):
+   a1( size ), a2( size ){};
+
+   void setSize( const int size )
+   {
+      a1->setSize( size );
+      a2->setSize( size );
+   }
+
+   Pointers::SharedPointer< ArrayCuda > a1, a2;
+};
+
+__global__ void printTuple( const Tuple t )
+{
+   printf( "Tuple size is: %d\n", t.a1->getSize() );
+   for( int i = 0; i < t.a1->getSize(); i++ )
+   {
+      printf( "a1[ %d ] = %d \n", i, ( *t.a1 )[ i ] );
+      printf( "a2[ %d ] = %d \n", i, ( *t.a2 )[ i ] );
+   }
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Create a tuple of arrays and print them in CUDA kernel
+    */
+#ifdef HAVE_CUDA
+   Tuple t( 3 );
+   *t.a1 = 1;
+   *t.a2 = 2;
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+   printTuple<<< 1, 1 >>>( t );
+
+   /***
+    * Resize the arrays
+    */
+   t.setSize( 5 );
+   *t.a1 = 3;
+   *t.a2 = 4;
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+   printTuple<<< 1, 1 >>>( t );
+#endif
+   return EXIT_SUCCESS;
+
+}
+
diff --git a/Documentation/Tutorials/Pointers/SharedPointerExample.cu b/Documentation/Tutorials/Pointers/SharedPointerExample.cu
new file mode 120000
index 0000000000000000000000000000000000000000..7d10e33126a284a5536999f2ea636e1da1e841b5
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/SharedPointerExample.cu
@@ -0,0 +1 @@
+SharedPointerExample.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/Pointers/UniquePointerExample.cpp b/Documentation/Tutorials/Pointers/UniquePointerExample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ad49cbdd63e26c0de7cb5956da71ba25fc8a4410
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/UniquePointerExample.cpp
@@ -0,0 +1,40 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Array.h>
+#include <TNL/Pointers/UniquePointer.h>
+
+using namespace TNL;
+
+using ArrayCuda = Containers::Array< int, Devices::Cuda >;
+
+#ifdef HAVE_CUDA
+__global__ void printArray( const ArrayCuda* ptr )
+{
+   printf( "Array size is: %d\n", ptr->getSize() );
+   for( int i = 0; i < ptr->getSize(); i++ )
+      printf( "a[ %d ] = %d \n", i, ( *ptr )[ i ] );
+}
+#endif
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Create an array and print its elements in CUDA kernel
+    */
+#ifdef HAVE_CUDA
+   Pointers::UniquePointer< ArrayCuda > array_ptr( 10 );
+   array_ptr.modifyData< Devices::Host >() = 1;
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+   printArray<<< 1, 1 >>>( &array_ptr.getData< Devices::Cuda >() );
+
+   /***
+    * Resize the array and print it again
+    */
+   array_ptr->setSize( 5 );
+   array_ptr.modifyData< Devices::Host >() = 2;
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+   printArray<<< 1, 1 >>>( &array_ptr.getData< Devices::Cuda >() );
+#endif
+   return EXIT_SUCCESS;
+}
+
diff --git a/Documentation/Tutorials/Pointers/UniquePointerExample.cu b/Documentation/Tutorials/Pointers/UniquePointerExample.cu
new file mode 120000
index 0000000000000000000000000000000000000000..a7c9828d5b35a010795073ea20ccf4e54b000d24
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/UniquePointerExample.cu
@@ -0,0 +1 @@
+UniquePointerExample.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/Pointers/UniquePointerHostExample.cpp b/Documentation/Tutorials/Pointers/UniquePointerHostExample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1fd0bba0e6823f4287ebe5be92db866215679cad
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/UniquePointerHostExample.cpp
@@ -0,0 +1,23 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Array.h>
+#include <TNL/Pointers/UniquePointer.h>
+
+using namespace TNL;
+
+using ArrayHost = Containers::Array< int, Devices::Host >;
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Make unique pointer on array on CPU and manipulate the
+    * array via the pointer.
+    */
+   Pointers::UniquePointer< ArrayHost > array_ptr( 10 );
+   *array_ptr = 1;
+   std::cout << "Array size is " << array_ptr->getSize() << std::endl;
+   std::cout << "Array = " << *array_ptr << std::endl;
+   return EXIT_SUCCESS;
+}
+
+
diff --git a/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-1.cpp b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-1.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e3753a27950f2b469c84dcbdabd5953044e04573
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-1.cpp
@@ -0,0 +1,6 @@
+struct Array
+{
+   double* data;
+   int size;
+};
+
diff --git a/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-2.cpp b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c46e4cfc14699ab765f13faa8d4c50695948467f
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-2.cpp
@@ -0,0 +1,2 @@
+Array a;
+cudaKernel<<< gridSize, blockSize >>>( a );
\ No newline at end of file
diff --git a/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-3.cpp b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-3.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8d4be832dff0199719b5037d1b0db5c78766cbad
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-3.cpp
@@ -0,0 +1,5 @@
+__global__ void cudaKernel( Array a )
+{
+   if( threadIdx.x < a.size )
+      a.data[ threadIdx.x ] = 0;
+}
\ No newline at end of file
diff --git a/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-4.cpp b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-4.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..eb723a6d8cc7287c1ba9ec4882a5b23e8b951e99
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-4.cpp
@@ -0,0 +1,4 @@
+struct ArrayTuple
+{
+   Array *a1, *a2;
+};
\ No newline at end of file
diff --git a/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-5.cpp b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-5.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0fe942e151a46aa12bba0b8d6cdc1d178c273afa
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-5.cpp
@@ -0,0 +1,7 @@
+__global__ void tupleKernel( ArrayTuple tuple )
+{
+   if( threadIdx.x < tuple.a1->size )
+      tuple.a1->data[ threadIdx.x ] = 0;
+   if( threadIdx.x < tuple.a2->size )
+      tuple.a2->data[ threadIdx.x ] = 0;
+}
diff --git a/Documentation/Tutorials/Pointers/codeSnippetUniquePointer.cpp b/Documentation/Tutorials/Pointers/codeSnippetUniquePointer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bd180fe2707bbeeeb52be54f649889f3111730cb
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/codeSnippetUniquePointer.cpp
@@ -0,0 +1,2 @@
+template< typename Object, typename Device = typename Object::DeviceType >
+class UniquePointer;
\ No newline at end of file
diff --git a/Documentation/Tutorials/Pointers/tutorial_Pointers.md b/Documentation/Tutorials/Pointers/tutorial_Pointers.md
new file mode 100644
index 0000000000000000000000000000000000000000..f9ef457e4d65ff0735f6b66c08e615bb2d281062
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/tutorial_Pointers.md
@@ -0,0 +1,86 @@
+\page tutorial_Pointers  Cross-device pointers tutorial
+
+## Introduction
+
+Smart pointers in TNL are motivated by the smart pointers in the STL library. In addition, they can manage images of the object they hold on different devices, which is meant to make offloading of objects easier.
+
+## Table of Contents
+1. [Unique pointers](#unique_pointers)
+2. [Shared pointers](#shared_pointers)
+3. [Device pointers](#device_pointers)
+
+
+## Unique pointers <a name="unique_pointers"></a>
+
+Similarly to the STL unique smart pointer `std::unique_ptr`, `UniquePointer` manages a dynamically allocated object. The object is automatically deallocated when the pointer goes out of scope. The definition of `UniquePointer` reads as:
+
+\include codeSnippetUniquePointer.cpp
+
+It takes two template parameters:
+
+1. `Object` is a type of object managed by the pointer.
+2. `Device` is a device where the object is to be allocated.
+
+If the device type is `Devices::Host`, `UniquePointer` behaves like a usual unique smart pointer. See the following example:
+
+\include UniquePointerHostExample.cpp
+
+The result is:
+
+\include UniquePointerHostExample.out
+
+
+If the device is different, for example `Devices::Cuda`, the unique pointer creates an image of the object in the host memory as well. This allows one to manipulate the object on the host. All smart pointers are registered in a special register through which their device images can be synchronised with the host images before calling a CUDA kernel - all at once. This means that all modified host images of the objects are transferred to the GPU. See the following example:
+
+\include UniquePointerExample.cpp
+
+The result looks as follows:
+
+\include UniquePointerExample.out
+
+A disadvantage of `UniquePointer` is that it cannot be passed to a CUDA kernel, since this would require making a copy of the pointer, which is, by the very nature of a unique pointer, prohibited. For this reason we have to dereference the pointer on the host. This is done by the method `getData`. Its template parameter tells which object image we want to dereference - the one on the host or the one on the device. When passing the object to the device, we need the device image. The method `getData` returns a constant reference to the object. A non-constant reference is accessible via the method `modifyData`. When this method is used to get a reference to the host image, the pointer is marked as **potentially modified**. Note that we need a non-constant reference even when we change only the data (array elements, for example) but not the metadata (array size, for example). If the metadata does not change, there is no need to synchronize the object image with the one on the device. To distinguish between these two situations, the smart pointer keeps one more object image which stores the metadata state from the last synchronization. Before the device image is synchronised, the host image and the last-synchronization-state image are compared; if they do not differ, no synchronization is performed. One can see that TNL cross-device smart pointers are really meant only for small objects, otherwise their overhead might be significant.
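+
+The following is a minimal sketch of this workflow (an illustration only, not part of the example above), assuming a CUDA kernel `useArray` that takes a `const ArrayCuda*`:
+
+```
+Pointers::UniquePointer< ArrayCuda > array_ptr( 10 );
+
+// modify the host image via a non-constant reference; the pointer is marked as potentially modified
+array_ptr.modifyData< Devices::Host >() = 1;
+
+// transfer all modified host images to the GPU at once
+Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+
+// dereference the device image and pass it to the (hypothetical) kernel
+useArray<<< 1, 1 >>>( &array_ptr.getData< Devices::Cuda >() );
+```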
+
+## Shared pointers <a name="shared_pointers"></a>
+
+One of the main goals of the TNL library is to make the development of HPC code, including GPU kernels, as easy and efficient as possible. One way to achieve this is to profit from object-oriented programming even in CUDA kernels. Let us explain this on arrays. From a certain point of view, `Array` can be understood as an object consisting of data and metadata. The data part means the elements stored in the array. The metadata are the pointer to the data and also the size of the array. This information makes the class easier to use, for example by checking array bounds when accessing the array elements. When such checks are performed even in CUDA kernels, they may help significantly with finding bugs in the code. To do this, we need to transfer not only the pointer to the data but also the complete metadata to the device. It is simple if the structure which is supposed to be transferred to the GPU does not itself contain pointers to metadata. See the following example:
+
+
+\include codeSnippetSharedPointer-1.cpp
+
+If the pointer `data` points to memory on the GPU, this array can be passed to a kernel like this:
+
+\include codeSnippetSharedPointer-2.cpp
+
+The kernel `cudaKernel` can access the data as follows:
+
+\include codeSnippetSharedPointer-3.cpp
+
+But what if we have an object like this:
+
+\include codeSnippetSharedPointer-4.cpp
+
+Assume that there is an instance of `ArrayTuple`, let us say `tuple`, containing pointers to instances `a1` and `a2` of `Array`. The instances must be allocated on the GPU if one wants to simply pass the `tuple` to a CUDA kernel. Indeed, the CUDA kernel needs the arrays `a1` and `a2` to be on the GPU. See the following example:
+
+\include codeSnippetSharedPointer-5.cpp
+
+Note that the kernel needs to dereference `tuple.a1` and `tuple.a2`. Therefore these pointers must point to the global memory of the GPU, which means that the arrays `a1` and `a2` must be allocated there, for example using [cudaMalloc](http://developer.download.nvidia.com/compute/cuda/2_3/toolkit/docs/online/group__CUDART__MEMORY_gc63ffd93e344b939d6399199d8b12fef.html). It means, however, that the arrays `a1` and `a2` cannot be managed on the host system by the CPU (for example, resizing them requires changing `a1->size` and `a2->size`). The only solution is to keep images of `a1` and `a2` in the host memory and to copy them to the GPU before calling the CUDA kernel. One must not forget to modify the pointers in the `tuple` to point to the array copies on the GPU, as sketched below. To simplify this, TNL offers *cross-device shared smart pointers*. In addition to what common smart pointers do, they can manage images of an object on different devices. Note that [CUDA Unified Memory](https://devblogs.nvidia.com/unified-memory-cuda-beginners/) addresses this problem as well, but TNL cross-device smart pointers can be more efficient in some situations. (TODO: Prove this with a benchmark problem.)
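+
+For illustration only, a sketch of this manual approach (using the `Array`, `ArrayTuple` and `tupleKernel` from the snippets above; the names `a1_host`, `a1_gpu` etc. are ours, and we assume the `data` pointers inside the host images already point to GPU buffers) might look like this:
+
+```
+Array a1_host, a2_host;   // host images managed by the CPU
+Array *a1_gpu, *a2_gpu;   // device copies of the metadata
+cudaMalloc( ( void** ) &a1_gpu, sizeof( Array ) );
+cudaMalloc( ( void** ) &a2_gpu, sizeof( Array ) );
+
+// after every change of the host images (resizing, for example), they must be copied to the GPU again
+cudaMemcpy( a1_gpu, &a1_host, sizeof( Array ), cudaMemcpyHostToDevice );
+cudaMemcpy( a2_gpu, &a2_host, sizeof( Array ), cudaMemcpyHostToDevice );
+
+// the tuple must point to the device copies, not to the host images
+ArrayTuple tuple;
+tuple.a1 = a1_gpu;
+tuple.a2 = a2_gpu;
+tupleKernel<<< gridSize, blockSize >>>( tuple );
+```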
+
+The previous example could be implemented in TNL as follows:
+
+\include SharedPointerExample.cpp
+
+The result looks as follows:
+
+\include SharedPointerExample.out
+
+One of the differences between `UniquePointer` and `SharedPointer` is that a `SharedPointer` can be passed to a CUDA kernel. Dereferencing by the operators `*` and `->` can be done in kernels as well, and the result is a reference to the proper object image, i.e. the one on the host or the one on the device. When these operators are used on a constant smart pointer, a constant reference is returned, which is the same as calling the method `getData` with the appropriate explicitly stated `Device` template parameter. In the case of a non-constant `SharedPointer`, a non-constant reference is obtained, which has the same effect as calling the `modifyData` method. On the host system, everything that was mentioned in the section about `UniquePointer` holds for `SharedPointer` as well. In addition, the `modifyData` method call or non-constant dereferencing can be done in a kernel on the device. In this case, the programmer gets a non-constant reference to the object, which is, however, meant to be used only to change the data managed by the object, not the metadata. There is no way to synchronize objects managed by the smart pointers from the device to the host. **It means that the metadata should not be changed on the device!** In fact, it would not make sense. Imagine changing the array size or re-allocating the array within a CUDA kernel. This is something one should never do.
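+
+A minimal sketch of this (an illustration only; the kernel `fillArray` is ours, while `ArrayCuda` is the alias used in the examples above and the by-value kernel argument follows the pattern of the example):
+
+```
+__global__ void fillArray( Pointers::SharedPointer< ArrayCuda > ptr, int value )
+{
+   // non-constant dereferencing inside the kernel yields the device image;
+   // only the data may be changed here, never the metadata
+   if( threadIdx.x < ptr->getSize() )
+      ( *ptr )[ threadIdx.x ] = value;
+}
+
+// on the host: the same operators dereference the host image
+Pointers::SharedPointer< ArrayCuda > ptr( 10 );
+Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+fillArray<<< 1, 32 >>>( ptr, 1 );
+```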
+
+## Device pointers <a name="device_pointers"></a>
+
+The last type of smart pointer implemented in TNL is `DevicePointer`. It works the same way as `SharedPointer`, but it does not create a new object on the host system. `DevicePointer` is therefore useful in situations where an object already exists in the host memory and we want to create its image on the device as well. Both images are linked to each other, so one can simply manipulate the one on the host and then synchronize it on the device. The following listing is a modification of the previous example with the tuple:
+
+\include DevicePointerExample.cpp
+
+The result looks the same:
+
+\include DevicePointerExample.out
diff --git a/Documentation/Tutorials/ReductionAndScan/tutorial_03_ReductionAndScan.md b/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
similarity index 99%
rename from Documentation/Tutorials/ReductionAndScan/tutorial_03_ReductionAndScan.md
rename to Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
index a82dcfa4cd59eb83af3cec7868c9ed44aea4f95d..717f17da5a0b2a21e1b6c48ef770bf760ce4550e 100644
--- a/Documentation/Tutorials/ReductionAndScan/tutorial_03_ReductionAndScan.md
+++ b/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
@@ -1,4 +1,4 @@
-\page tutorial_03_reduction Flexible (parallel) reduction and prefix-sum tutorial
+\page tutorial_ReductionAndScan Flexible (parallel) reduction and prefix-sum tutorial
 
 ## Introduction
 
diff --git a/Documentation/Tutorials/Vectors/tutorial_02_Vectors.md b/Documentation/Tutorials/Vectors/tutorial_Vectors.md
similarity index 98%
rename from Documentation/Tutorials/Vectors/tutorial_02_Vectors.md
rename to Documentation/Tutorials/Vectors/tutorial_Vectors.md
index 301f410a5c3d283a1c503e37f80252df70a9a982..acbe7e7f0fd4f46f5cfefe47db6509fa268560b4 100644
--- a/Documentation/Tutorials/Vectors/tutorial_02_Vectors.md
+++ b/Documentation/Tutorials/Vectors/tutorial_Vectors.md
@@ -1,4 +1,4 @@
-\page tutorial_02_vectors  Vectors tutorial
+\page tutorial_Vectors  Vectors tutorial
 
 ## Introduction
 
diff --git a/Documentation/Tutorials/index.md b/Documentation/Tutorials/index.md
index a5edd055f7cafd5c5bf295283339416119209462..0dd60716f3300b89ba60d6279bae85b7a4cf9bf7 100644
--- a/Documentation/Tutorials/index.md
+++ b/Documentation/Tutorials/index.md
@@ -2,6 +2,9 @@
 
 ## Tutorials
 
-1. [Arrays](tutorial_01_arrays.html)
-2. [Vectors](tutorial_02_vectors.html)
-3. [Flexible parallel reduction and prefix-sum](tutorial_03_reduction.html)
+1. [Building applications with TNL](tutorial_building_applications_with_tnl.html)
+2. [Arrays](tutorial_Arrays.html)
+3. [Vectors](tutorial_Vectors.html)
+4. [Flexible parallel reduction and scan](tutorial_ReductionAndScan.html)
+5. [For loops](tutorial_ForLoops.html)
+6. [Cross-device pointers](tutorial_Pointers.html)
diff --git a/build b/build
index 914c65b1971bd0b895f5c2aa6f093ee83dd132a2..68966f0f423d13fd2cfb69a380fe02ea9bc25e8f 100755
--- a/build
+++ b/build
@@ -202,7 +202,9 @@ if [[ "$make" != "make" ]] && [[ "$VERBOSE" ]]; then
    VERBOSE="-v"
 fi
 
-$make ${VERBOSE} $make_target
+if ! $make ${VERBOSE} $make_target; then
+   exit 1
+fi
 
 if [[ ${WITH_DOC} == "yes" ]]; then
    "$ROOT_DIR/Documentation/build" --prefix="$PREFIX"
diff --git a/install b/install
index fe138dfaa005539a87e7ccbb9a8746143c4cbb0e..9b66bfbeece92c3bdd629b8c56dd49c47fb2b4f3 100755
--- a/install
+++ b/install
@@ -35,7 +35,10 @@ if [[ ${BUILD_DEBUG} == "yes" ]]; then
       mkdir Debug
    fi
    pushd Debug
-   ../build --root-dir=.. --build=Debug --install=yes ${OPTIONS}
+   if ! ../build --root-dir=.. --build=Debug --install=yes ${OPTIONS}; then
+      echo "Debug build failed."
+      exit 1
+   fi
    popd
 fi
 
@@ -44,7 +47,10 @@ if [[ ${BUILD_RELEASE} == "yes" ]]; then
       mkdir Release
    fi
    pushd Release
-   ../build --root-dir=.. --build=Release --install=yes ${OPTIONS};
+   if ! ../build --root-dir=.. --build=Release --install=yes ${OPTIONS}; then
+      echo "Release build failed."
+      exit 1
+   fi
    popd
 fi
 
diff --git a/src/TNL/Algorithms/ParallelFor.h b/src/TNL/Algorithms/ParallelFor.h
index 6d5e917ba4ac07246322a82c7d5edec38a1cb02b..cb096f8798176970b19770f5f9315abc3fdbcee2 100644
--- a/src/TNL/Algorithms/ParallelFor.h
+++ b/src/TNL/Algorithms/ParallelFor.h
@@ -31,14 +31,57 @@
  */
 
 namespace TNL {
+/**
+ * \brief Namespace for fundamental TNL algorithms
+ *
+ * It contains algorithms like for-loops, memory operations, (parallel) reduction,
+ * multireduction, scan etc.
+ */
 namespace Algorithms {
 
+// TODO: ParallelForMode should be moved to Device (=Executor)
+
+/**
+ * \brief Enum for the parallel processing of the for-loop.
+ *
+ * Synchronous means that the program control returns to the caller when the loop is processed completely.
+ * Asynchronous means that the program control may return to the caller immediately, even before the loop processing is finished.
+ *
+ * Only parallel for-loops in CUDA are affected by this mode.
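+ *
+ * A brief usage sketch (an illustration only, where \p f stands for any callable taking an index
+ * and \p n for the number of iterations):
+ *
+ * ParallelFor< Devices::Cuda, SynchronousMode >::exec( 0, n, f );  // returns after the whole loop is processed
+ *
+ * ParallelFor< Devices::Cuda, AsynchronousMode >::exec( 0, n, f ); // may return before the loop processing finishes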
+ */
 enum ParallelForMode { SynchronousMode, AsynchronousMode };
 
+
+/**
+ * \brief Parallel for loop for one dimensional interval of indexes.
+ *
+ * \tparam Device specifies the device on which the for-loop will be executed.
+ *    It can be Devices::Host, Devices::Cuda or Devices::Sequential.
+ * \tparam Mode defines synchronous/asynchronous mode on parallel devices.
+ */
 template< typename Device = Devices::Sequential,
           ParallelForMode Mode = SynchronousMode >
 struct ParallelFor
 {
+   /**
+    * \brief Static method for execution of the loop.
+    *
+    * \tparam Index defines the type of indexes over which the loop iterates.
+    * \tparam Function is the type of function to be called in each iteration.
+    * \tparam FunctionArgs is a variadic type of additional parameters which are
+    *    supposed to be passed to the inner Function.
+    *
+    * \param start the for-loop iterates over index interval [start, end).
+    * \param end the for-loop iterates over index interval [start, end).
+    * \param f is the function to be called in each iteration
+    * \param args are additional parameters to be passed to the function f.
+    *
+    * \par Example
+    * \include Algorithms/ParallelForExample.cpp
+    * \par Output
+    * \include ParallelForExample.out
+    *
+    */
    template< typename Index,
              typename Function,
              typename... FunctionArgs >
@@ -49,10 +92,44 @@ struct ParallelFor
    }
 };
 
+/**
+ * \brief Parallel for loop for two dimensional domain of indexes.
+ *
+ * \tparam Device specifies the device on which the for-loop will be executed.
+ *    It can be Devices::Host, Devices::Cuda or Devices::Sequential.
+ * \tparam Mode defines synchronous/asynchronous mode on parallel devices.
+ */
 template< typename Device = Devices::Sequential,
           ParallelForMode Mode = SynchronousMode >
 struct ParallelFor2D
 {
+   /**
+    * \brief Static method for execution of the loop.
+    *
+    * \tparam Index defines the type of indexes over which the loop iterates.
+    * \tparam Function is the type of function to be called in each iteration.
+    * \tparam FunctionArgs is a variadic type of additional parameters which are
+    *    supposed to be passed to the inner Function.
+    *
+    * \param startX the for-loop iterates over index domain [startX,endX)x[startY,endY).
+    * \param startY the for-loop iterates over index domain [startX,endX)x[startY,endY).
+    * \param endX the for-loop iterates over index domain [startX,endX)x[startY,endY).
+    * \param endY the for-loop iterates over index domain [startX,endX)x[startY,endY).
+    * \param f is the function to be called in each iteration
+    * \param args are additional parameters to be passed to the function f.
+    *
+    * The function f is called for each iteration as
+    *
+    * f( i, j, args... )
+    *
+    * where the first parameter changes faster than the second one.
+    *
+    * \par Example
+    * \include Algorithms/ParallelForExample-2D.cpp
+    * \par Output
+    * \include ParallelForExample-2D.out
+    *
+    */
    template< typename Index,
              typename Function,
              typename... FunctionArgs >
@@ -64,10 +141,46 @@ struct ParallelFor2D
    }
 };
 
+/**
+ * \brief Parallel for loop for three dimensional domain of indexes.
+ *
+ * \tparam Device specifies the device on which the for-loop will be executed.
+ *    It can be Devices::Host, Devices::Cuda or Devices::Sequential.
+ * \tparam Mode defines synchronous/asynchronous mode on parallel devices.
+ */
 template< typename Device = Devices::Sequential,
           ParallelForMode Mode = SynchronousMode >
 struct ParallelFor3D
 {
+   /**
+    * \brief Static method for execution of the loop.
+    *
+    * \tparam Index defines the type of indexes over which the loop iterates.
+    * \tparam Function is the type of function to be called in each iteration.
+    * \tparam FunctionArgs is a variadic type of additional parameters which are
+    *    supposed to be passed to the inner Function.
+    *
+    * \param startX the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
+    * \param startY the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
+    * \param startZ the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
+    * \param endX the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
+    * \param endY the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
+    * \param endZ the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ).
+    * \param f is the function to be called in each iteration
+    * \param args are additional parameters to be passed to the function f.
+    *
+    * The function f is called for each iteration as
+    *
+    * f( i, j, k, args... )
+    *
+    * where the first parameter changes the fastest.
+    *
+    * \par Example
+    * \include Algorithms/ParallelForExample-3D.cpp
+    * \par Output
+    * \include ParallelForExample-3D.out
+    *
+    */
    template< typename Index,
              typename Function,
              typename... FunctionArgs >
diff --git a/src/TNL/Algorithms/StaticFor.h b/src/TNL/Algorithms/StaticFor.h
index c7404545840143bd053ed371c5813a7a0feaa185..6a450638f49ed22bd899af2397df602d98ff74a3 100644
--- a/src/TNL/Algorithms/StaticFor.h
+++ b/src/TNL/Algorithms/StaticFor.h
@@ -15,10 +15,28 @@
 namespace TNL {
 namespace Algorithms {
 
-// Manual unrolling does not make sense for loops with a large iterations
-// count. For a very large iterations count it would trigger the compiler's
-// limit on recursive template instantiation. Also note that the compiler
-// will (at least partially) unroll loops with static bounds anyway.
+/**
+ * \brief StaticFor is a wrapper for a common for-loop with explicit unrolling.
+ *
+ * StaticFor can be used only for for-loops whose bounds are known at
+ * compile time. StaticFor performs explicit loop unrolling for better performance.
+ * This, however, does not make sense for loops with a large iteration
+ * count. For a very large iteration count it could trigger the compiler's
+ * limit on recursive template instantiation. Also note that the compiler
+ * will (at least partially) unroll loops with static bounds anyway. For these
+ * reasons, the explicit loop unrolling can be controlled by the third template
+ * parameter.
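+ *
+ * For instance (a sketch of the semantics only, assuming the index is passed to the
+ * function as its first argument), StaticFor< 0, 3 >::exec( f, args... ) performs
+ *
+ * f( 0, args... ); f( 1, args... ); f( 2, args... );
+ *
+ * with the loop unrolled at compile time.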
+ *
+ * \tparam Begin the loop will iterate over indexes [Begin,End)
+ * \tparam End the loop will iterate over indexes [Begin,End)
+ * \tparam unrolled controls the explicit loop unrolling. If it is true, the
+ *   unrolling is performed.
+ *
+ * \par Example
+ * \include Algorithms/StaticForExample.cpp
+ * \par Output
+ * \include StaticForExample.out
+ */
 template< int Begin, int End, bool unrolled = (End - Begin <= 8) >
 struct StaticFor;
 
@@ -27,6 +45,12 @@ struct StaticFor< Begin, End, true >
 {
    static_assert( Begin < End, "Wrong index interval for StaticFor. Begin must be less than end." );
 
+   /**
+    * \brief Static method for execution of the StaticFor loop.
+    *
+    * \param f is a (lambda) function to be performed in each iteration.
+    * \param args are auxiliary data to be passed to the function f.
+    */
    template< typename Function, typename... Args >
    __cuda_callable__
    static void exec( const Function& f, Args&&... args )
diff --git a/src/TNL/Algorithms/TemplateStaticFor.h b/src/TNL/Algorithms/TemplateStaticFor.h
index 753ad9b2618b2704292517e9b74ffff7192d22b7..c96c816dc990aad8b3cb1fc8da9b5cc67b8f110a 100644
--- a/src/TNL/Algorithms/TemplateStaticFor.h
+++ b/src/TNL/Algorithms/TemplateStaticFor.h
@@ -17,6 +17,36 @@
 
 namespace TNL {
 namespace Algorithms {
+
+/**
+ * \brief TemplateStaticFor serves for coding for-loops over template parameters.
+ *
+ * The result of calling this loop with a templated class \p LoopBody is as follows:
+ *
+ * LoopBody< begin >::exec( ... );
+ *
+ * LoopBody< begin + 1 >::exec( ... );
+ *
+ * ...
+ *
+ * LoopBody< end - 1 >::exec( ... );
+ *
+ * \tparam IndexType is the type of the loop indexes.
+ * \tparam begin the loop iterates over index interval [begin,end).
+ * \tparam end the loop iterates over index interval [begin,end).
+ * \tparam LoopBody is a templated class having one template parameter of type IndexType.
+ *
+ * \par Example
+ * \include Algorithms/TemplateStaticForExample.cpp
+ * \par Output
+ * \include TemplateStaticForExample.out
+ */
+template< typename IndexType,
+          IndexType begin,
+          IndexType end,
+          template< IndexType > class LoopBody >
+struct TemplateStaticFor;
+
 namespace detail {
 
 template< typename IndexType,
@@ -25,6 +55,12 @@ template< typename IndexType,
           template< IndexType > class LoopBody >
 struct TemplateStaticForExecutor
 {
+   /**
+    * \brief Static method initiating the for-loop.
+    *
+    * \tparam Args are the types of user-defined data to be passed to the for-loop.
+    * \param args user-defined data to be passed to the for-loop.
+    */
    template< typename... Args >
    __cuda_callable__
    static void exec( Args&&... args )
diff --git a/src/TNL/Containers/Array.h b/src/TNL/Containers/Array.h
index 45ef1e272e8affa96e6a77b5b9e74cec8a59b447..117cb32ae4a84afb803f84dbf34d54d1948c0f2b 100644
--- a/src/TNL/Containers/Array.h
+++ b/src/TNL/Containers/Array.h
@@ -62,7 +62,7 @@ template< int, typename > class StaticArray;
  * See also \ref ArrayView, \ref Vector, \ref VectorView.
  *
  * \par Example
- * \include ArrayExample.cpp
+ * \include Containers/ArrayExample.cpp
  * \par Output
  * \include ArrayExample.out
  */
diff --git a/src/TNL/Containers/ArrayView.h b/src/TNL/Containers/ArrayView.h
index d51f151f772f3828dc7ad27ca13041d01730ce76..c06ad56dcc113541167b9d012ca4caf836a4f5c5 100644
--- a/src/TNL/Containers/ArrayView.h
+++ b/src/TNL/Containers/ArrayView.h
@@ -55,7 +55,9 @@ namespace Containers {
  * See also \ref Array, \ref Vector, \ref VectorView.
  *
  * \par Example
- * \include ArrayViewExample.cpp
+ * \include Containers/ArrayViewExample.cpp
+ * \par Output
+ * \include ArrayViewExample.out
  */
 template< typename Value,
           typename Device = Devices::Host,
diff --git a/src/TNL/Containers/Vector.h b/src/TNL/Containers/Vector.h
index be08266b61bc42555f9b78cd5471bce7f31f5b43..6bec6932115b03a47696de90a7a560c31762c4a2 100644
--- a/src/TNL/Containers/Vector.h
+++ b/src/TNL/Containers/Vector.h
@@ -32,7 +32,9 @@ namespace Containers {
  *                   is selected with \ref Allocators::Default.
  *
  * \par Example
- * \include VectorExample.cpp
+ * \include Containers/VectorExample.cpp
+ * \par Output
+ * \include VectorExample.out
  */
 template< typename Real = double,
           typename Device = Devices::Host,
diff --git a/src/TNL/Pointers/DevicePointer.h b/src/TNL/Pointers/DevicePointer.h
index 5276c3ed465938e7e7fcdfde2885dc8986cac3b5..df3e1d5c2d569afe0a1d8e6c9055f0fd827e4c6d 100644
--- a/src/TNL/Pointers/DevicePointer.h
+++ b/src/TNL/Pointers/DevicePointer.h
@@ -25,9 +25,26 @@
 namespace TNL {
 namespace Pointers {
 
-/***
- * The DevicePointer is like SharedPointer, except it takes an existing host
+/**
+ * \brief The DevicePointer is like SharedPointer, except it takes an existing host
  * object - there is no call to the ObjectType's constructor nor destructor.
+ *
+ * **NOTE: When using smart pointers to pass objects on GPU, one must call
+ * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >()
+ * before calling a CUDA kernel working with smart pointers.**
+ *
+ * \tparam Object is the type of object to be owned by the pointer.
+ * \tparam Device is the device where the object is to be allocated. The object is
+ * always allocated on the host system as well for easier object manipulation.
+ *
+ * See also \ref UniquePointer and \ref SharedPointer.
+ *
+ * See also \ref DevicePointer< Object, Devices::Host > and \ref DevicePointer< Object, Devices::Cuda >.
+ *
+ * \par Example
+ * \include Pointers/DevicePointerExample.cpp
+ * \par Output
+ * \include DevicePointerExample.out
  */
 template< typename Object,
           typename Device = typename Object::DeviceType >
@@ -36,17 +53,22 @@ class DevicePointer
    static_assert( ! std::is_same< Device, void >::value, "The device cannot be void. You need to specify the device explicitly in your code." );
 };
 
-/****
- * Specialization for Devices::Host
+/**
+ * \brief Specialization of the \ref DevicePointer for the host system.
+ *
+ * \tparam  Object is a type of object to be owned by the pointer.
  */
 template< typename Object >
 class DevicePointer< Object, Devices::Host > : public SmartPointer
 {
    private:
-      // Convenient template alias for controlling the selection of copy- and
-      // move-constructors and assignment operators using SFINAE.
-      // The type Object_ is "enabled" iff Object_ and Object are not the same,
-      // but after removing const and volatile qualifiers they are the same.
+      /**
+       * \typedef Enabler
+       * Convenient template alias for controlling the selection of copy- and
+       * move-constructors and assignment operators using SFINAE.
+       * The type Object_ is "enabled" iff Object_ and Object are not the same,
+       * but after removing const and volatile qualifiers they are the same.
+       */
       template< typename Object_ >
       using Enabler = std::enable_if< ! std::is_same< Object_, Object >::value &&
                                       std::is_same< typename std::remove_cv< Object >::type, Object_ >::value >;
@@ -57,77 +79,161 @@ class DevicePointer< Object, Devices::Host > : public SmartPointer
 
    public:
 
-      typedef Object ObjectType;
-      typedef Devices::Host DeviceType;
+      /**
+       * \typedef ObjectType is the type of object owned by the pointer.
+       */
+      using ObjectType = Object;
+
+      /**
+       * \typedef DeviceType is the type of device where the object is to be
+       * mirrored.
+       */
+      using DeviceType = Devices::Host;
+
+      /**
+       * \brief Constructor of an empty pointer.
+       */
+      DevicePointer( std::nullptr_t )
+      : pointer( nullptr )
+      {}
 
+      /**
+       * \brief Constructor with an object reference.
+       *
+       * \param obj reference to an object to be managed by the pointer.
+       */
       explicit  DevicePointer( ObjectType& obj )
       : pointer( nullptr )
       {
          this->pointer = &obj;
       }
 
-      // this is needed only to avoid the default compiler-generated constructor
-      DevicePointer( const DevicePointer& pointer )
+      /**
+       * \brief Copy constructor.
+       *
+       * \param pointer is the source device pointer.
+       */
+      DevicePointer( const DevicePointer& pointer ) // this is needed only to avoid the default compiler-generated constructor
       : pointer( pointer.pointer )
       {
       }
 
-      // conditional constructor for non-const -> const data
+      /**
+       * \brief Copy constructor.
+       *
+       * This is specialization for compatible object types.
+       *
+       * See \ref Enabler.
+       *
+       * \param pointer is the source device pointer.
+       */
       template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
-      DevicePointer( const DevicePointer< Object_, DeviceType >& pointer )
+      DevicePointer( const DevicePointer< Object_, DeviceType >& pointer ) // conditional constructor for non-const -> const data
       : pointer( pointer.pointer )
       {
       }
 
-      // this is needed only to avoid the default compiler-generated constructor
-      DevicePointer( DevicePointer&& pointer )
+      /**
+       * \brief Move constructor.
+       *
+       * \param pointer is the source device pointer.
+       */
+      DevicePointer( DevicePointer&& pointer ) // this is needed only to avoid the default compiler-generated constructor
       : pointer( pointer.pointer )
       {
          pointer.pointer = nullptr;
       }
 
-      // conditional constructor for non-const -> const data
+      /**
+       * \brief Move constructor.
+       *
+       * This is specialization for compatible object types.
+       *
+       * See \ref Enabler.
+       *
+       * \param pointer is the source device pointer.
+       */
       template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
-      DevicePointer( DevicePointer< Object_, DeviceType >&& pointer )
+      DevicePointer( DevicePointer< Object_, DeviceType >&& pointer ) // conditional constructor for non-const -> const data
       : pointer( pointer.pointer )
       {
          pointer.pointer = nullptr;
       }
 
+      /**
+       * \brief Arrow operator for accessing the object owned by constant smart pointer.
+       *
+       * \return constant pointer to the object owned by this smart pointer.
+       */
       const Object* operator->() const
       {
          return this->pointer;
       }
 
+      /**
+       * \brief Arrow operator for accessing the object owned by non-constant smart pointer.
+       *
+       * \return pointer to the object owned by this smart pointer.
+       */
       Object* operator->()
       {
          return this->pointer;
       }
 
+      /**
+       * \brief Dereferencing operator for accessing the object owned by constant smart pointer.
+       *
+       * \return constant reference to the object owned by this smart pointer.
+       */
       const Object& operator *() const
       {
          return *( this->pointer );
       }
 
+      /**
+       * \brief Dereferencing operator for accessing the object owned by non-constant smart pointer.
+       *
+       * \return reference to the object owned by this smart pointer.
+       */
       Object& operator *()
       {
          return *( this->pointer );
       }
 
+      /**
+       * \brief Conversion to boolean type.
+       *
+       * \return Returns true if the pointer is not empty, false otherwise.
+       */
       __cuda_callable__
       operator bool() const
       {
          return this->pointer;
       }
 
+      /**
+       * \brief Negation operator.
+       *
+       * \return Returns false if the pointer is not empty, true otherwise.
+       */
       __cuda_callable__
       bool operator!() const
       {
          return ! this->pointer;
       }
 
+      /**
+       * \brief Constant object reference getter.
+       *
+       * No synchronization of this pointer will be performed due to calling
+       * this method.
+       *
+       * \tparam Device says which image of the object one wants to dereference. It
+       * can be either \ref DeviceType or Devices::Host.
+       * \return constant reference to the object image on the given device.
+       */
       template< typename Device = Devices::Host >
       __cuda_callable__
       const Object& getData() const
@@ -135,6 +241,16 @@ class DevicePointer< Object, Devices::Host > : public SmartPointer
          return *( this->pointer );
       }
 
+      /**
+       * \brief Non-constant object reference getter.
+       *
+       * No synchronization of this pointer will be performed due to calling
+       * this method.
+       *
+       * \tparam Device says which image of the object one wants to dereference. It
+       * can be either \ref DeviceType or Devices::Host.
+       * \return reference to the object image on the given device.
+       */
       template< typename Device = Devices::Host >
       __cuda_callable__
       Object& modifyData()
@@ -142,45 +258,97 @@ class DevicePointer< Object, Devices::Host > : public SmartPointer
          return *( this->pointer );
       }
 
-      // this is needed only to avoid the default compiler-generated operator
-      const DevicePointer& operator=( const DevicePointer& ptr )
+      /**
+       * \brief Assignment operator.
+       *
+       * It assigns the object owned by the pointer \ref ptr to this pointer.
+       *
+       * \param ptr input pointer
+       * \return constant reference to \ref this
+       */
+      const DevicePointer& operator=( const DevicePointer& ptr ) // this is needed only to avoid the default compiler-generated operator
       {
          this->pointer = ptr.pointer;
          return *this;
       }
 
-      // conditional operator for non-const -> const data
+      /**
+       * \brief Assignment operator for compatible object types.
+       *
+       * It assigns the object owned by the pointer \ref ptr to this pointer.
+       *
+       * See \ref Enabler.
+       *
+       * \param ptr input pointer
+       * \return constant reference to \ref this
+       */
       template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
-      const DevicePointer& operator=( const DevicePointer< Object_, DeviceType >& ptr )
+      const DevicePointer& operator=( const DevicePointer< Object_, DeviceType >& ptr ) // conditional operator for non-const -> const data
       {
          this->pointer = ptr.pointer;
          return *this;
       }
 
-      // this is needed only to avoid the default compiler-generated operator
-      const DevicePointer& operator=( DevicePointer&& ptr )
+      /**
+       * \brief Move operator.
+       *
+       * It assigns the object owned by the pointer \ref ptr to this pointer.
+       *
+       * \param ptr input pointer
+       * \return constant reference to \ref this
+       */
+      const DevicePointer& operator=( DevicePointer&& ptr ) // this is needed only to avoid the default compiler-generated operator
       {
          this->pointer = ptr.pointer;
          ptr.pointer = nullptr;
          return *this;
       }
 
-      // conditional operator for non-const -> const data
+      /**
+       * \brief Move operator.
+       *
+       * It assigns the object owned by the pointer \ref ptr to this pointer.
+       *
+       * See \ref Enabler.
+       *
+       * \param ptr input pointer
+       * \return constant reference to \ref this
+       */
       template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
-      const DevicePointer& operator=( DevicePointer< Object_, DeviceType >&& ptr )
+      const DevicePointer& operator=( DevicePointer< Object_, DeviceType >&& ptr ) // conditional operator for non-const -> const data
       {
          this->pointer = ptr.pointer;
          ptr.pointer = nullptr;
          return *this;
       }
 
+      /**
+       * \brief Cross-device pointer synchronization.
+       *
+       * For the smart pointers on the host, this method does nothing.
+       *
+       * \return true.
+       */
       bool synchronize()
       {
          return true;
       }
 
+      /**
+       * \brief Swap the owned object with another pointer.
+       *
+       * \param ptr2 the other device pointer for swapping.
+       */
+      void swap( DevicePointer& ptr2 )
+      {
+         std::swap( this->pointer, ptr2.pointer );
+      }
+
+      /**
+       * \brief Destructor.
+       */
       ~DevicePointer()
       {
       }
@@ -191,17 +359,23 @@ class DevicePointer< Object, Devices::Host > : public SmartPointer
       Object* pointer;
 };
 
-/****
- * Specialization for CUDA
+/**
+ * \brief Specialization of the \ref DevicePointer for the CUDA device.
+ *
+ * \tparam  Object is a type of object to be owned by the pointer.
  */
 template< typename Object >
 class DevicePointer< Object, Devices::Cuda > : public SmartPointer
 {
    private:
-      // Convenient template alias for controlling the selection of copy- and
-      // move-constructors and assignment operators using SFINAE.
-      // The type Object_ is "enabled" iff Object_ and Object are not the same,
-      // but after removing const and volatile qualifiers they are the same.
+      /**
+       * \typedef Enabler
+       *
+       * Convenient template alias for controlling the selection of copy- and
+       * move-constructors and assignment operators using SFINAE.
+       * The type Object_ is "enabled" iff Object_ and Object are not the same,
+       * but after removing const and volatile qualifiers they are the same.
+       */
       template< typename Object_ >
       using Enabler = std::enable_if< ! std::is_same< Object_, Object >::value &&
                                       std::is_same< typename std::remove_cv< Object >::type, Object_ >::value >;
@@ -212,9 +386,30 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
 
    public:
 
-      typedef Object ObjectType;
-      typedef Devices::Cuda DeviceType;
+      /**
+       * \typedef ObjectType is the type of object owned by the pointer.
+       */
+      using ObjectType = Object;
+
+      /**
+       * \typedef DeviceType is the type of device where the object is to be
+       * mirrored.
+       */
+      using DeviceType = Devices::Cuda;
+
+      /**
+       * \brief Constructor of an empty pointer.
+       */
+      DevicePointer( std::nullptr_t )
+      : pointer( nullptr ),
+        pd( nullptr ),
+        cuda_pointer( nullptr ) {}
 
+      /**
+       * \brief Constructor with an object reference.
+       *
+       * \param obj is a reference to an object to be managed by the pointer.
+       */
       explicit  DevicePointer( ObjectType& obj )
       : pointer( nullptr ),
         pd( nullptr ),
@@ -223,8 +418,12 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
          this->allocate( obj );
       }
 
-      // this is needed only to avoid the default compiler-generated constructor
-      DevicePointer( const DevicePointer& pointer )
+      /**
+       * \brief Copy constructor.
+       *
+       * \param pointer is the source device pointer.
+       */
+      DevicePointer( const DevicePointer& pointer ) // this is needed only to avoid the default compiler-generated constructor
       : pointer( pointer.pointer ),
         pd( (PointerData*) pointer.pd ),
         cuda_pointer( pointer.cuda_pointer )
@@ -232,10 +431,18 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
          this->pd->counter += 1;
       }
 
-      // conditional constructor for non-const -> const data
+      /**
+       * \brief Copy constructor.
+       *
+       * This is specialization for compatible object types.
+       *
+       * See \ref Enabler.
+       *
+       * \param pointer is the source device pointer.
+       */
       template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
-      DevicePointer( const DevicePointer< Object_, DeviceType >& pointer )
+      DevicePointer( const DevicePointer< Object_, DeviceType >& pointer ) // conditional constructor for non-const -> const data
       : pointer( pointer.pointer ),
         pd( (PointerData*) pointer.pd ),
         cuda_pointer( pointer.cuda_pointer )
@@ -243,8 +450,12 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
          this->pd->counter += 1;
       }
 
-      // this is needed only to avoid the default compiler-generated constructor
-      DevicePointer( DevicePointer&& pointer )
+      /**
+       * \brief Move constructor.
+       *
+       * \param pointer is the source device pointer.
+       */
+      DevicePointer( DevicePointer&& pointer ) // this is needed only to avoid the default compiler-generated constructor
       : pointer( pointer.pointer ),
         pd( (PointerData*) pointer.pd ),
         cuda_pointer( pointer.cuda_pointer )
@@ -254,10 +465,18 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
          pointer.cuda_pointer = nullptr;
       }
 
-      // conditional constructor for non-const -> const data
+      /**
+       * \brief Move constructor.
+       *
+       * This is specialization for compatible object types.
+       *
+       * See \ref Enabler.
+       *
+       * \param pointer is the source device pointer.
+       */
       template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
-      DevicePointer( DevicePointer< Object_, DeviceType >&& pointer )
+      DevicePointer( DevicePointer< Object_, DeviceType >&& pointer ) // conditional constructor for non-const -> const data
       : pointer( pointer.pointer ),
         pd( (PointerData*) pointer.pd ),
         cuda_pointer( pointer.cuda_pointer )
@@ -267,40 +486,108 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
          pointer.cuda_pointer = nullptr;
       }
 
+      /**
+       * \brief Arrow operator for accessing the object owned by constant smart pointer.
+       *
+       * \return constant pointer to the object owned by this smart pointer. It
+       * returns a pointer to the object image on the CUDA device if called from a CUDA
+       * kernel and a pointer to the host image otherwise.
+       */
+      __cuda_callable__
       const Object* operator->() const
       {
+#ifdef __CUDA_ARCH__
+         return this->cuda_pointer;
+#else
          return this->pointer;
+#endif
       }
 
+      /**
+       * \brief Arrow operator for accessing the object owned by non-constant smart pointer.
+       *
+       * \return pointer to the object owned by this smart pointer. It
+       * returns a pointer to the object image on the CUDA device if called from a CUDA
+       * kernel and a pointer to the host image otherwise.
+       */
+      __cuda_callable__
       Object* operator->()
       {
+#ifdef __CUDA_ARCH__
+         return this->cuda_pointer;
+#else
          this->pd->maybe_modified = true;
          return this->pointer;
+#endif
       }
 
+      /**
+       * \brief Dereferencing operator for accessing the object owned by constant smart pointer.
+       *
+       * \return constant reference to the object owned by this smart pointer. It
+       * returns a reference to the object image on the CUDA device if called from a CUDA
+       * kernel and a reference to the host image otherwise.
+       */
+      __cuda_callable__
       const Object& operator *() const
       {
+#ifdef __CUDA_ARCH__
+         return *( this->cuda_pointer );
+#else
          return *( this->pointer );
+#endif
       }
 
+      /**
+       * \brief Dereferencing operator for accessing the object owned by non-constant smart pointer.
+       *
+       * \return reference to the object owned by this smart pointer.  It
+       * returns a reference to the object image on the CUDA device if called from a CUDA
+       * kernel and a reference to the host image otherwise.
+       */
+      __cuda_callable__
       Object& operator *()
       {
+#ifdef __CUDA_ARCH__
+         return *( this->cuda_pointer );
+#else
          this->pd->maybe_modified = true;
          return *( this->pointer );
+#endif
       }
 
+      /**
+       * \brief Conversion to boolean type.
+       *
+       * \return Returns true if the pointer is not empty, false otherwise.
+       */
       __cuda_callable__
       operator bool() const
       {
          return this->pd;
       }
 
+      /**
+       * \brief Negation operator.
+       *
+       * \return Returns false if the pointer is not empty, true otherwise.
+       */
       __cuda_callable__
       bool operator!() const
       {
          return ! this->pd;
       }
 
+      /**
+       * \brief Constant object reference getter.
+       *
+       * No synchronization of this pointer will be performed due to calling
+       * this method.
+       *
+       * \tparam Device says which image of the object one wants to dereference. It
+       * can be either \ref DeviceType or Devices::Host.
+       * \return constant reference to the object image on the given device.
+       */
       template< typename Device = Devices::Host >
       __cuda_callable__
       const Object& getData() const
@@ -315,6 +602,18 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
             return *( this->cuda_pointer );
       }
 
+      /**
+       * \brief Non-constant object reference getter.
+       *
+       * After calling this method, the object owned by the pointer might need
+       * to be synchronized. One should not forget to call
+       * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >()
+       * before calling a CUDA kernel using the object from this smart pointer.
+       *
+       * \tparam Device says which image of the object one wants to dereference. It
+       * can be either \ref DeviceType or Devices::Host.
+       * \return reference to the object image on the given device.
+       */
       template< typename Device = Devices::Host >
       __cuda_callable__
       Object& modifyData()
@@ -332,32 +631,57 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
             return *( this->cuda_pointer );
       }
 
-      // this is needed only to avoid the default compiler-generated operator
-      const DevicePointer& operator=( const DevicePointer& ptr )
+      /**
+       * \brief Assignment operator.
+       *
+       * It assigns the object owned by the pointer \ref ptr to this pointer.
+       *
+       * \param ptr input pointer
+       * \return constant reference to \ref this
+       */
+      const DevicePointer& operator=( const DevicePointer& ptr ) // this is needed only to avoid the default compiler-generated operator
       {
          this->free();
          this->pointer = ptr.pointer;
          this->pd = (PointerData*) ptr.pd;
          this->cuda_pointer = ptr.cuda_pointer;
-         this->pd->counter += 1;
+         if( this->pd )
+            this->pd->counter += 1;
          return *this;
       }
 
-      // conditional operator for non-const -> const data
+      /**
+       * \brief Assignment operator for compatible object types.
+       *
+       * It assigns the object owned by the pointer \ref ptr to this pointer.
+       *
+       * See \ref Enabler.
+       *
+       * \param ptr input pointer
+       * \return constant reference to \ref this
+       */
       template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
-      const DevicePointer& operator=( const DevicePointer< Object_, DeviceType >& ptr )
+      const DevicePointer& operator=( const DevicePointer< Object_, DeviceType >& ptr ) // conditional operator for non-const -> const data
       {
          this->free();
          this->pointer = ptr.pointer;
          this->pd = (PointerData*) ptr.pd;
          this->cuda_pointer = ptr.cuda_pointer;
-         this->pd->counter += 1;
+         if( this->pd )
+            this->pd->counter += 1;
          return *this;
       }
 
-      // this is needed only to avoid the default compiler-generated operator
-      const DevicePointer& operator=( DevicePointer&& ptr )
+      /**
+       * \brief Move operator.
+       *
+       * It assigns the object owned by the pointer \ref ptr to this pointer.
+       *
+       * \param ptr input pointer
+       * \return constant reference to \ref this
+       */
+      const DevicePointer& operator=( DevicePointer&& ptr ) // this is needed only to avoid the default compiler-generated operator
       {
          this->free();
          this->pointer = ptr.pointer;
@@ -369,10 +693,19 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
          return *this;
       }
 
-      // conditional operator for non-const -> const data
+      /**
+       * \brief Move operator.
+       *
+       * It assigns the object owned by the pointer \ref ptr to this pointer.
+       *
+       * See \ref Enabler.
+       *
+       * \param ptr input pointer
+       * \return constant reference to \ref this
+       */
       template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
-      const DevicePointer& operator=( DevicePointer< Object_, DeviceType >&& ptr )
+      const DevicePointer& operator=( DevicePointer< Object_, DeviceType >&& ptr ) // conditional operator for non-const -> const data
       {
          this->free();
          this->pointer = ptr.pointer;
@@ -384,6 +717,14 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
          return *this;
       }
 
+      /**
+       * \brief Cross-device pointer synchronization.
+       *
+       * This method is usually called by the smart pointers register when calling
+       * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >()
+       *
+       * \return true if the synchronization was successful, false otherwise.
+       */
       bool synchronize()
       {
          if( ! this->pd )
@@ -404,6 +745,21 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
 #endif
       }
 
+      /**
+       * \brief Swap the owned object with another pointer.
+       *
+       * \param ptr2 the other device pointer for swapping.
+       */
+      void swap( DevicePointer& ptr2 )
+      {
+         std::swap( this->pointer, ptr2.pointer );
+         std::swap( this->pd, ptr2.pd );
+         std::swap( this->cuda_pointer, ptr2.cuda_pointer );
+      }
+
+      /**
+       * \brief Destructor.
+       */
       ~DevicePointer()
       {
          this->free();
diff --git a/src/TNL/Pointers/SharedPointer.h b/src/TNL/Pointers/SharedPointer.h
index 93f63f807c5038795c53cc0c5182571ab2d8a9c4..293434ccd2ea589b48f574e994caabfcfc7d99fd 100644
--- a/src/TNL/Pointers/SharedPointer.h
+++ b/src/TNL/Pointers/SharedPointer.h
@@ -22,6 +22,32 @@
 namespace TNL {
 namespace Pointers {
 
+/**
+ * \brief Cross-device shared smart pointer.
+ * 
+ * This smart pointer is inspired by std::shared_ptr from the STL library. It means
+ * that the object owned by the smart pointer can be shared with other
+ * smart pointers. One can make a copy of this smart pointer. In addition,
+ * the smart pointer is able to work across different devices, which means that the
+ * object owned by the smart pointer is mirrored on both the host and the device.
+ * 
+ * **NOTE: When using smart pointers to pass objects to the GPU, one must call 
+ * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >() 
+ * before calling a CUDA kernel working with smart pointers.**
+ * 
+ * \tparam Object is a type of object to be owned by the pointer.
+ * \tparam Device is the device where the object is to be allocated. The object is
+ * always allocated on the host system as well for easier object manipulation.
+ * 
+ * See also \ref UniquePointer and \ref DevicePointer.
+ * 
+ * See also \ref SharedPointer< Object, Devices::Host > and \ref SharedPointer< Object, Devices::Cuda >.
+ *
+ * \par Example
+ * \include Pointers/SharedPointerExample.cpp
+ * \par Output
+ * \include SharedPointerExample.out
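+ *
+ * \par Synchronization sketch
+ * A minimal, hedged sketch of the synchronization rule above; MyObject, setValue()
+ * and myKernel are hypothetical placeholders, not part of TNL.
+ * \code
+ * SharedPointer< MyObject, Devices::Cuda > p;              // host image + device image
+ * p->setValue( 42 );                                       // modifies the host image only
+ * Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();  // copy the host image to the device
+ * myKernel<<< 1, 1 >>>( &p.getData< Devices::Cuda >() );   // pass the device image to a kernel
+ * \endcode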
+ */
 template< typename Object,
           typename Device = typename Object::DeviceType >
 class SharedPointer
diff --git a/src/TNL/Pointers/SharedPointerCuda.h b/src/TNL/Pointers/SharedPointerCuda.h
index 54dd4ee3c71c7eb461de7d8c906fd42dc96af1c6..f4f73ec39f30ae0455cabdf5fd72886d20fb9b8a 100644
--- a/src/TNL/Pointers/SharedPointerCuda.h
+++ b/src/TNL/Pointers/SharedPointerCuda.h
@@ -28,15 +28,25 @@ namespace Pointers {
 
 //#define HAVE_CUDA_UNIFIED_MEMORY
 
-#ifdef HAVE_CUDA_UNIFIED_MEMORY
+#if ! defined HAVE_CUDA_UNIFIED_MEMORY
+
+/**
+ * \brief Specialization of the \ref SharedPointer for the CUDA device.
+ *
+ * \tparam Object is the type of object to be owned by the pointer.
+ */
 template< typename Object >
 class SharedPointer< Object, Devices::Cuda > : public SmartPointer
 {
    private:
-      // Convenient template alias for controlling the selection of copy- and
-      // move-constructors and assignment operators using SFINAE.
-      // The type Object_ is "enabled" iff Object_ and Object are not the same,
-      // but after removing const and volatile qualifiers they are the same.
+      /**
+       * \typedef Enabler
+       *
+       * Convenient template alias for controlling the selection of copy- and
+       * move-constructors and assignment operators using SFINAE.
+       * The type Object_ is "enabled" iff Object_ and Object are not the same,
+       * but after removing const and volatile qualifiers they are the same.
+       */
       template< typename Object_ >
       using Enabler = std::enable_if< ! std::is_same< Object_, Object >::value &&
                                       std::is_same< typename std::remove_cv< Object >::type, Object_ >::value >;
@@ -47,71 +57,129 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
 
    public:
 
+      /**
+       * \typedef ObjectType is the type of object owned by the pointer.
+       */
       using ObjectType = Object;
-      using DeviceType = Devices::Cuda; 
 
+      /**
+       * \typedef DeviceType is the type of device where the object is to be
+       * mirrored.
+       */
+      using DeviceType = Devices::Cuda;
+
+      /**
+       * \brief Constructor of an empty pointer.
+       */
       SharedPointer( std::nullptr_t )
-      : pd( nullptr )
+      : pd( nullptr ),
+        cuda_pointer( nullptr )
       {}
 
+      /**
+       * \brief Constructor with parameters of the Object constructor.
+       *
+       * \tparam Args is the variadic template type of arguments of the Object constructor.
+       * \param args are the arguments passed to the Object constructor.
+       */
       template< typename... Args >
       explicit  SharedPointer( Args... args )
-      : pd( nullptr )
+      : pd( nullptr ),
+        cuda_pointer( nullptr )
       {
-#ifdef TNL_DEBUG_SHARED_POINTERS
-         std::cerr << "Creating shared pointer to " << getType< ObjectType >() << std::endl;
-#endif
          this->allocate( args... );
       }
 
-      // this is needed only to avoid the default compiler-generated constructor
-      SharedPointer( const SharedPointer& pointer )
-      : pd( (PointerData*) pointer.pd )
+      /**
+       * \brief Copy constructor.
+       *
+       * \param pointer is the source shared pointer.
+       */
+      SharedPointer( const SharedPointer& pointer ) // this is needed only to avoid the default compiler-generated constructor
+      : pd( (PointerData*) pointer.pd ),
+        cuda_pointer( pointer.cuda_pointer )
       {
          this->pd->counter += 1;
       }
 
-      // conditional constructor for non-const -> const data
+      /**
+       * \brief Copy constructor.
+       *
+       * This is specialization for compatible object types.
+       *
+       * See \ref Enabler.
+       *
+       * \param pointer is the source shared pointer.
+       */
       template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
-      SharedPointer( const SharedPointer<  Object_, DeviceType >& pointer )
-      : pd( (PointerData*) pointer.pd )
+      SharedPointer( const SharedPointer<  Object_, DeviceType >& pointer ) // conditional constructor for non-const -> const data
+      : pd( (PointerData*) pointer.pd ),
+        cuda_pointer( pointer.cuda_pointer )
       {
          this->pd->counter += 1;
       }
 
-      // this is needed only to avoid the default compiler-generated constructor
-      SharedPointer( SharedPointer&& pointer )
-      : pd( (PointerData*) pointer.pd )
+      /**
+       * \brief Move constructor.
+       *
+       * \param pointer is the source shared pointer.
+       */
+      SharedPointer( SharedPointer&& pointer ) // this is needed only to avoid the default compiler-generated constructor
+      : pd( (PointerData*) pointer.pd ),
+        cuda_pointer( pointer.cuda_pointer )
       {
          pointer.pd = nullptr;
+         pointer.cuda_pointer = nullptr;
       }
 
-      // conditional constructor for non-const -> const data
+      /**
+       * \brief Move constructor.
+       *
+       * This is specialization for compatible object types.
+       *
+       * See \ref Enabler.
+       *
+       * \param pointer is the source shared pointer.
+       */
       template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
-      SharedPointer( SharedPointer<  Object_, DeviceType >&& pointer )
-      : pd( (PointerData*) pointer.pd )
+      SharedPointer( SharedPointer<  Object_, DeviceType >&& pointer ) // conditional constructor for non-const -> const data
+      : pd( (PointerData*) pointer.pd ),
+        cuda_pointer( pointer.cuda_pointer )
       {
          pointer.pd = nullptr;
+         pointer.cuda_pointer = nullptr;
       }
 
+      /**
+       * \brief Create a new object based on given constructor parameters.
+       *
+       * \tparam Args is the variadic template type of arguments to be passed to the
+       *    object constructor.
+       * \param args are the arguments to be passed to the object constructor.
+       * \return true if recreation was successful, false otherwise.
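+       *
+       * \par Sketch
+       * A hedged sketch; MyObject is a hypothetical type taking an int constructor
+       * argument.
+       * \code
+       * SharedPointer< MyObject, Devices::Cuda > p( 10 );
+       * p.recreate( 20 );  // rebuilds the owned object; in-place if the pointer is not shared
+       * \endcode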
+       */
       template< typename... Args >
       bool recreate( Args... args )
       {
 #ifdef TNL_DEBUG_SHARED_POINTERS
          std::cerr << "Recreating shared pointer to " << getType< ObjectType >() << std::endl;
 #endif
-         if( ! this->counter )
+         if( ! this->pd )
             return this->allocate( args... );
 
-         if( *this->pd->counter == 1 )
+         if( this->pd->counter == 1 )
          {
             /****
              * The object is not shared -> recreate it in-place, without reallocation
              */
-            this->pd->data.~ObjectType();
-            new ( this->pd->data ) ObjectType( args... );
+            this->pd->data.~Object();
+            new ( &this->pd->data ) Object( args... );
+#ifdef HAVE_CUDA
+            cudaMemcpy( (void*) this->cuda_pointer, (void*) &this->pd->data, sizeof( Object ), cudaMemcpyHostToDevice );
+#endif
+            this->set_last_sync_state();
             return true;
          }
 
@@ -121,167 +189,381 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
          return this->allocate( args... );
       }
 
+      /**
+       * \brief Arrow operator for accessing the object owned by constant smart pointer.
+       *
+       * \return constant pointer to the object owned by this smart pointer. It
+       * returns a pointer to the object image on the CUDA device if it is called from a CUDA
+       * kernel and a pointer to the host image otherwise.
+       */
+      __cuda_callable__
       const Object* operator->() const
       {
          TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" );
+#ifdef __CUDA_ARCH__
+         return this->cuda_pointer;
+#else
          return &this->pd->data;
+#endif
       }
 
+      /**
+       * \brief Arrow operator for accessing the object owned by non-constant smart pointer.
+       *
+       * \return pointer to the object owned by this smart pointer. It
+       * returns a pointer to the object image on the CUDA device if it is called from a CUDA
+       * kernel and a pointer to the host image otherwise.
+       */
+      __cuda_callable__
       Object* operator->()
       {
          TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" );
+#ifdef __CUDA_ARCH__
+         return this->cuda_pointer;
+#else
+         this->pd->maybe_modified = true;
          return &this->pd->data;
+#endif
       }
 
+      /**
+       * \brief Dereferencing operator for accessing the object owned by constant smart pointer.
+       *
+       * \return constant reference to the object owned by this smart pointer. It
+       * returns a reference to the object image on the CUDA device if it is called from a CUDA
+       * kernel and a reference to the host image otherwise.
+       */
+      __cuda_callable__
       const Object& operator *() const
       {
          TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" );
+#ifdef __CUDA_ARCH__
+         return *( this->cuda_pointer );
+#else
          return this->pd->data;
+#endif
       }
 
+      /**
+       * \brief Dereferencing operator for accessing the object owned by non-constant smart pointer.
+       *
+       * \return reference to the object owned by this smart pointer.  It
+       * returns a reference to the object image on the CUDA device if it is called from a CUDA
+       * kernel and a reference to the host image otherwise.
+       */
+      __cuda_callable__
       Object& operator *()
       {
          TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" );
+#ifdef __CUDA_ARCH__
+         return *( this->cuda_pointer );
+#else
+         this->pd->maybe_modified = true;
          return this->pd->data;
+#endif
       }
 
+      /**
+       * \brief Conversion to boolean type.
+       *
+       * \return Returns true if the pointer is not empty, false otherwise.
+       */
       __cuda_callable__
       operator bool() const
       {
          return this->pd;
       }
 
+      /**
+       * \brief Negation operator.
+       *
+       * \return Returns false if the pointer is not empty, true otherwise.
+       */
       __cuda_callable__
       bool operator!() const
       {
          return ! this->pd;
       }
 
+      /**
+       * \brief Constant object reference getter.
+       *
+       * No synchronization of this pointer will be performed due to calling
+       * this method.
+       *
+       * \tparam Device says what image of the object one wants to dereference. It
+       * can be either \ref DeviceType or Devices::Host.
+       * \return constant reference to the object image on the given device.
+       */
       template< typename Device = Devices::Host >
       __cuda_callable__
       const Object& getData() const
       {
+         static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value, "Only Devices::Host or Devices::Cuda devices are accepted here." );
          TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" );
-         return this->pd->data;
+         TNL_ASSERT_TRUE( this->cuda_pointer, "Attempt to dereference a null pointer" );
+         if( std::is_same< Device, Devices::Host >::value )
+            return this->pd->data;
+         if( std::is_same< Device, Devices::Cuda >::value )
+            return *( this->cuda_pointer );
       }
 
+      /**
+       * \brief Non-constant object reference getter.
+       *
+       * After calling this method, the object owned by the pointer might need
+       * to be synchronized. One should not forget to call
+       * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >()
+       * before calling a CUDA kernel that uses the object from this smart pointer.
+       *
+       * \tparam Device says what image of the object one wants to dereference. It
+       * can be either \ref DeviceType or Devices::Host.
+       * \return reference to the object image on the given device.
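+       *
+       * \par Sketch
+       * A hedged sketch of host-side modification followed by synchronization;
+       * MyObject and update() are hypothetical placeholders.
+       * \code
+       * SharedPointer< MyObject, Devices::Cuda > p;
+       * p.modifyData< Devices::Host >().update();   // change the host image
+       * Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+       * // a CUDA kernel may now read the device image via p.getData< Devices::Cuda >()
+       * \endcode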
+       */
       template< typename Device = Devices::Host >
       __cuda_callable__
       Object& modifyData()
       {
+         static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value, "Only Devices::Host or Devices::Cuda devices are accepted here." );
          TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" );
-         return this->pd->data;
+         TNL_ASSERT_TRUE( this->cuda_pointer, "Attempt to dereference a null pointer" );
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            this->pd->maybe_modified = true;
+            return this->pd->data;
+         }
+         if( std::is_same< Device, Devices::Cuda >::value )
+            return *( this->cuda_pointer );
       }
 
-      // this is needed only to avoid the default compiler-generated operator
-      const SharedPointer& operator=( const SharedPointer& ptr )
+      /**
+       * \brief Assignment operator.
+       *
+       * It assigns the object owned by the pointer \ref ptr to \ref this pointer.
+       *
+       * \param ptr input pointer
+       * \return constant reference to \ref this
+       */
+      const SharedPointer& operator=( const SharedPointer& ptr ) // this is needed only to avoid the default compiler-generated operator
       {
          this->free();
          this->pd = (PointerData*) ptr.pd;
-         if( this->pd != nullptr ) 
+         this->cuda_pointer = ptr.cuda_pointer;
+         if( this->pd != nullptr )
             this->pd->counter += 1;
+#ifdef TNL_DEBUG_SHARED_POINTERS
+         std::cerr << "Copy-assigned shared pointer: counter = " << this->pd->counter << ", type: " << getType< ObjectType >() << std::endl;
+#endif
          return *this;
       }
 
-      // conditional operator for non-const -> const data
+      /**
+       * \brief Assignment operator for compatible object types.
+       *
+       * It assigns the object owned by the pointer \ref ptr to \ref this pointer.
+       *
+       * See \ref Enabler.
+       *
+       * \param ptr input pointer
+       * \return constant reference to \ref this
+       */
       template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
-      const SharedPointer& operator=( const SharedPointer<  Object_, DeviceType >& ptr )
+      const SharedPointer& operator=( const SharedPointer<  Object_, DeviceType >& ptr ) // conditional operator for non-const -> const data
       {
          this->free();
          this->pd = (PointerData*) ptr.pd;
+         this->cuda_pointer = ptr.cuda_pointer;
          if( this->pd != nullptr )
             this->pd->counter += 1;
+#ifdef TNL_DEBUG_SHARED_POINTERS
+         std::cerr << "Copy-assigned shared pointer: counter = " << this->pd->counter << ", type: " << getType< ObjectType >() << std::endl;
+#endif
          return *this;
       }
 
-      // this is needed only to avoid the default compiler-generated operator
-      const SharedPointer& operator=( SharedPointer&& ptr )
+      /**
+       * \brief Move operator.
+       *
+       * It assigns the object owned by the pointer \ref ptr to \ref this pointer.
+       *
+       * \param ptr input pointer
+       * \return constant reference to \ref this
+       */
+      const SharedPointer& operator=( SharedPointer&& ptr ) // this is needed only to avoid the default compiler-generated operator
       {
          this->free();
          this->pd = (PointerData*) ptr.pd;
+         this->cuda_pointer = ptr.cuda_pointer;
          ptr.pd = nullptr;
+         ptr.cuda_pointer = nullptr;
+#ifdef TNL_DEBUG_SHARED_POINTERS
+         std::cerr << "Move-assigned shared pointer: counter = " << this->pd->counter << ", type: " << getType< ObjectType >() << std::endl;
+#endif
          return *this;
       }
 
-      // conditional operator for non-const -> const data
+      /**
+       * \brief Move operator.
+       *
+       * It assigns the object owned by the pointer \ref ptr to \ref this pointer.
+       *
+       * See \ref Enabler.
+       *
+       * \param ptr input pointer
+       * \return constant reference to \ref this
+       */
       template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
-      const SharedPointer& operator=( SharedPointer<  Object_, DeviceType >&& ptr )
+      const SharedPointer& operator=( SharedPointer<  Object_, DeviceType >&& ptr ) // conditional operator for non-const -> const data
       {
          this->free();
          this->pd = (PointerData*) ptr.pd;
+         this->cuda_pointer = ptr.cuda_pointer;
          ptr.pd = nullptr;
+         ptr.cuda_pointer = nullptr;
+#ifdef TNL_DEBUG_SHARED_POINTERS
+         std::cerr << "Move-assigned shared pointer: counter = " << this->pd->counter << ", type: " << getType< ObjectType >() << std::endl;
+#endif
          return *this;
       }
 
+      /**
+       * \brief Cross-device pointer synchronization.
+       *
+       * This method is usually called by the smart pointers register when calling
+       * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >()
+       *
+       * \return true if the synchronization was successful, false otherwise.
+       */
       bool synchronize()
       {
+         if( ! this->pd )
+            return true;
+#ifdef HAVE_CUDA
+         if( this->modified() )
+         {
+#ifdef TNL_DEBUG_SHARED_POINTERS
+            std::cerr << "Synchronizing shared pointer: counter = " << this->pd->counter << ", type: " << getType< ObjectType >() << std::endl;
+            std::cerr << "   ( " << sizeof( Object ) << " bytes, CUDA adress " << this->cuda_pointer << " )" << std::endl;
+#endif
+            TNL_ASSERT( this->cuda_pointer, );
+            cudaMemcpy( (void*) this->cuda_pointer, (void*) &this->pd->data, sizeof( Object ), cudaMemcpyHostToDevice );
+            TNL_CHECK_CUDA_DEVICE;
+            this->set_last_sync_state();
+            return true;
+         }
          return true;
+#else
+         return false;
+#endif
       }
 
+      /**
+       * \brief Reset the pointer to an empty state.
+       */
       void clear()
       {
          this->free();
       }
 
+      /**
+       * \brief Swap the owned object with another pointer.
+       *
+       * \param ptr2 the other shared pointer for swapping.
+       */
       void swap( SharedPointer& ptr2 )
       {
          std::swap( this->pd, ptr2.pd );
+         std::swap( this->cuda_pointer, ptr2.cuda_pointer );
       }
 
+      /**
+       * \brief Destructor.
+       */
       ~SharedPointer()
       {
          this->free();
+         getSmartPointersRegister< DeviceType >().remove( this );
       }
 
-
    protected:
 
       struct PointerData
       {
          Object data;
+         char data_image[ sizeof(Object) ];
          int counter;
+         bool maybe_modified;
 
          template< typename... Args >
          explicit PointerData( Args... args )
          : data( args... ),
-           counter( 1 )
+           counter( 1 ),
+           maybe_modified( false )
          {}
       };
 
       template< typename... Args >
       bool allocate( Args... args )
       {
-#ifdef HAVE_CUDA
-         if( cudaMallocManaged( ( void** ) &this->pd, sizeof( PointerData ) != cudaSuccess ) )
-            return false;
-         new ( this->pd ) PointerData( args... );
-         return true;
-#else
-         return false;
+         this->pd = new PointerData( args... );
+         // pass to device
+         this->cuda_pointer = Cuda::passToDevice( this->pd->data );
+         // set last-sync state
+         this->set_last_sync_state();
+#ifdef TNL_DEBUG_SHARED_POINTERS
+         std::cerr << "Created shared pointer to " << getType< ObjectType >() << " (cuda_pointer = " << this->cuda_pointer << ")" << std::endl;
 #endif
+         getSmartPointersRegister< DeviceType >().insert( this );
+         return true;
+      }
+
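+      // Record the current host image as the last-synchronized state: take a bitwise
+      // snapshot of pd->data into pd->data_image and clear the maybe_modified flag.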
+      void set_last_sync_state()
+      {
+         TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" );
+         std::memcpy( (void*) &this->pd->data_image, (void*) &this->pd->data, sizeof( Object ) );
+         this->pd->maybe_modified = false;
+      }
+
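+      // Decide whether the host image has changed since the last synchronization:
+      // the maybe_modified flag (set by the non-const accessors) gates a bitwise
+      // comparison of pd->data against the pd->data_image snapshot.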
+      bool modified()
+      {
+         TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" );
+         // optimization: skip bitwise comparison if we're sure that the data is the same
+         if( ! this->pd->maybe_modified )
+            return false;
+         return std::memcmp( (void*) &this->pd->data_image, (void*) &this->pd->data, sizeof( Object ) ) != 0;
       }
 
       void free()
       {
          if( this->pd )
          {
+#ifdef TNL_DEBUG_SHARED_POINTERS
+            std::cerr << "Freeing shared pointer: counter = " << this->pd->counter << ", cuda_pointer = " << this->cuda_pointer << ", type: " << getType< ObjectType >() << std::endl;
+#endif
             if( ! --this->pd->counter )
             {
-#ifdef HAVE_CUDA
-               cudaFree( this->pd );
-#endif
+               delete this->pd;
                this->pd = nullptr;
+               if( this->cuda_pointer )
+                  Cuda::freeFromDevice( this->cuda_pointer );
+#ifdef TNL_DEBUG_SHARED_POINTERS
+               std::cerr << "...deleted data." << std::endl;
+#endif
             }
          }
       }
 
       PointerData* pd;
-};
 
-#else // HAVE_CUDA_UNIFIED_MEMORY
+      // cuda_pointer can't be part of PointerData structure, since we would be
+      // unable to dereference this->pd on the device
+      Object* cuda_pointer;
+};
 
+#else
+// Implementation with CUDA unified memory. It is very slow; we keep it only for experimental reasons.
 template< typename Object >
 class SharedPointer< Object, Devices::Cuda > : public SmartPointer
 {
@@ -301,25 +583,25 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
    public:
 
       using ObjectType = Object;
-      using DeviceType = Devices::Cuda; 
+      using DeviceType = Devices::Cuda;
 
       SharedPointer( std::nullptr_t )
-      : pd( nullptr ),
-        cuda_pointer( nullptr )
+      : pd( nullptr )
       {}
 
       template< typename... Args >
       explicit  SharedPointer( Args... args )
-      : pd( nullptr ),
-        cuda_pointer( nullptr )
+      : pd( nullptr )
       {
+#ifdef TNL_DEBUG_SHARED_POINTERS
+         std::cerr << "Creating shared pointer to " << getType< ObjectType >() << std::endl;
+#endif
          this->allocate( args... );
       }
 
       // this is needed only to avoid the default compiler-generated constructor
       SharedPointer( const SharedPointer& pointer )
-      : pd( (PointerData*) pointer.pd ),
-        cuda_pointer( pointer.cuda_pointer )
+      : pd( (PointerData*) pointer.pd )
       {
          this->pd->counter += 1;
       }
@@ -328,30 +610,25 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
       template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
       SharedPointer( const SharedPointer<  Object_, DeviceType >& pointer )
-      : pd( (PointerData*) pointer.pd ),
-        cuda_pointer( pointer.cuda_pointer )
+      : pd( (PointerData*) pointer.pd )
       {
          this->pd->counter += 1;
       }
 
       // this is needed only to avoid the default compiler-generated constructor
       SharedPointer( SharedPointer&& pointer )
-      : pd( (PointerData*) pointer.pd ),
-        cuda_pointer( pointer.cuda_pointer )
+      : pd( (PointerData*) pointer.pd )
       {
          pointer.pd = nullptr;
-         pointer.cuda_pointer = nullptr;
       }
 
       // conditional constructor for non-const -> const data
       template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
       SharedPointer( SharedPointer<  Object_, DeviceType >&& pointer )
-      : pd( (PointerData*) pointer.pd ),
-        cuda_pointer( pointer.cuda_pointer )
+      : pd( (PointerData*) pointer.pd )
       {
          pointer.pd = nullptr;
-         pointer.cuda_pointer = nullptr;
       }
 
       template< typename... Args >
@@ -360,20 +637,16 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
 #ifdef TNL_DEBUG_SHARED_POINTERS
          std::cerr << "Recreating shared pointer to " << getType< ObjectType >() << std::endl;
 #endif
-         if( ! this->pd )
+         if( ! this->pd )
             return this->allocate( args... );
 
-         if( this->pd->counter == 1 )
+         if( this->pd->counter == 1 )
          {
             /****
              * The object is not shared -> recreate it in-place, without reallocation
              */
-            this->pd->data.~Object();
-            new ( &this->pd->data ) Object( args... );
-#ifdef HAVE_CUDA
-            cudaMemcpy( (void*) this->cuda_pointer, (void*) &this->pd->data, sizeof( Object ), cudaMemcpyHostToDevice );
-#endif
-            this->set_last_sync_state();
+            this->pd->data.~ObjectType();
+            new ( &this->pd->data ) ObjectType( args... );
             return true;
          }
 
@@ -392,7 +665,6 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
       Object* operator->()
       {
          TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" );
-         this->pd->maybe_modified = true;
          return &this->pd->data;
       }
 
@@ -405,7 +677,6 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
       Object& operator *()
       {
          TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" );
-         this->pd->maybe_modified = true;
          return this->pd->data;
       }
 
@@ -425,29 +696,16 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
       __cuda_callable__
       const Object& getData() const
       {
-         static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value, "Only Devices::Host or Devices::Cuda devices are accepted here." );
          TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" );
-         TNL_ASSERT_TRUE( this->cuda_pointer, "Attempt to dereference a null pointer" );
-         if( std::is_same< Device, Devices::Host >::value )
-            return this->pd->data;
-         if( std::is_same< Device, Devices::Cuda >::value )
-            return *( this->cuda_pointer );
+         return this->pd->data;
       }
 
       template< typename Device = Devices::Host >
       __cuda_callable__
       Object& modifyData()
       {
-         static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value, "Only Devices::Host or Devices::Cuda devices are accepted here." );
          TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" );
-         TNL_ASSERT_TRUE( this->cuda_pointer, "Attempt to dereference a null pointer" );
-         if( std::is_same< Device, Devices::Host >::value )
-         {
-            this->pd->maybe_modified = true;
-            return this->pd->data;
-         }
-         if( std::is_same< Device, Devices::Cuda >::value )
-            return *( this->cuda_pointer );
+         return this->pd->data;
       }
 
       // this is needed only to avoid the default compiler-generated operator
@@ -455,12 +713,8 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
       {
          this->free();
          this->pd = (PointerData*) ptr.pd;
-         this->cuda_pointer = ptr.cuda_pointer;
          if( this->pd != nullptr )
             this->pd->counter += 1;
-#ifdef TNL_DEBUG_SHARED_POINTERS
-         std::cerr << "Copy-assigned shared pointer: counter = " << this->pd->counter << ", type: " << getType< ObjectType >() << std::endl;
-#endif
          return *this;
       }
 
@@ -471,12 +725,8 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
       {
          this->free();
          this->pd = (PointerData*) ptr.pd;
-         this->cuda_pointer = ptr.cuda_pointer;
          if( this->pd != nullptr )
             this->pd->counter += 1;
-#ifdef TNL_DEBUG_SHARED_POINTERS
-         std::cerr << "Copy-assigned shared pointer: counter = " << this->pd->counter << ", type: " << getType< ObjectType >() << std::endl;
-#endif
          return *this;
       }
 
@@ -485,12 +735,7 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
       {
          this->free();
          this->pd = (PointerData*) ptr.pd;
-         this->cuda_pointer = ptr.cuda_pointer;
          ptr.pd = nullptr;
-         ptr.cuda_pointer = nullptr;
-#ifdef TNL_DEBUG_SHARED_POINTERS
-         std::cerr << "Move-assigned shared pointer: counter = " << this->pd->counter << ", type: " << getType< ObjectType >() << std::endl;
-#endif
          return *this;
       }
 
@@ -501,36 +746,13 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
       {
          this->free();
          this->pd = (PointerData*) ptr.pd;
-         this->cuda_pointer = ptr.cuda_pointer;
          ptr.pd = nullptr;
-         ptr.cuda_pointer = nullptr;
-#ifdef TNL_DEBUG_SHARED_POINTERS
-         std::cerr << "Move-assigned shared pointer: counter = " << this->pd->counter << ", type: " << getType< ObjectType >() << std::endl;
-#endif
          return *this;
       }
 
       bool synchronize()
       {
-         if( ! this->pd )
-            return true;
-#ifdef HAVE_CUDA
-         if( this->modified() )
-         {
-#ifdef TNL_DEBUG_SHARED_POINTERS
-            std::cerr << "Synchronizing shared pointer: counter = " << this->pd->counter << ", type: " << getType< ObjectType >() << std::endl;
-            std::cerr << "   ( " << sizeof( Object ) << " bytes, CUDA adress " << this->cuda_pointer << " )" << std::endl;
-#endif
-            TNL_ASSERT( this->cuda_pointer, );
-            cudaMemcpy( (void*) this->cuda_pointer, (void*) &this->pd->data, sizeof( Object ), cudaMemcpyHostToDevice );
-            TNL_CHECK_CUDA_DEVICE;
-            this->set_last_sync_state();
-            return true;
-         }
          return true;
-#else
-         return false;
-#endif
       }
 
       void clear()
@@ -541,90 +763,59 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
       void swap( SharedPointer& ptr2 )
       {
          std::swap( this->pd, ptr2.pd );
-         std::swap( this->cuda_pointer, ptr2.cuda_pointer );
       }
 
       ~SharedPointer()
       {
          this->free();
-         getSmartPointersRegister< DeviceType >().remove( this );
       }
 
+
    protected:
 
       struct PointerData
       {
          Object data;
-         char data_image[ sizeof(Object) ];
          int counter;
-         bool maybe_modified;
 
          template< typename... Args >
          explicit PointerData( Args... args )
          : data( args... ),
-           counter( 1 ),
-           maybe_modified( false )
+           counter( 1 )
          {}
       };
 
       template< typename... Args >
       bool allocate( Args... args )
       {
-         this->pd = new PointerData( args... );
-         // pass to device
-         this->cuda_pointer = Cuda::passToDevice( this->pd->data );
-         // set last-sync state
-         this->set_last_sync_state();
-#ifdef TNL_DEBUG_SHARED_POINTERS
-         std::cerr << "Created shared pointer to " << getType< ObjectType >() << " (cuda_pointer = " << this->cuda_pointer << ")" << std::endl;
-#endif
-         getSmartPointersRegister< DeviceType >().insert( this );
-         return true;
-      }
-
-      void set_last_sync_state()
-      {
-         TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" );
-         std::memcpy( (void*) &this->pd->data_image, (void*) &this->pd->data, sizeof( Object ) );
-         this->pd->maybe_modified = false;
-      }
-
-      bool modified()
-      {
-         TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" );
-         // optimization: skip bitwise comparison if we're sure that the data is the same
-         if( ! this->pd->maybe_modified )
+#ifdef HAVE_CUDA
+         if( cudaMallocManaged( ( void** ) &this->pd, sizeof( PointerData ) ) != cudaSuccess )
             return false;
-         return std::memcmp( (void*) &this->pd->data_image, (void*) &this->pd->data, sizeof( Object ) ) != 0;
+         new ( this->pd ) PointerData( args... );
+         return true;
+#else
+         return false;
+#endif
       }
 
       void free()
       {
          if( this->pd )
          {
-#ifdef TNL_DEBUG_SHARED_POINTERS
-            std::cerr << "Freeing shared pointer: counter = " << this->pd->counter << ", cuda_pointer = " << this->cuda_pointer << ", type: " << getType< ObjectType >() << std::endl;
-#endif
             if( ! --this->pd->counter )
             {
-               delete this->pd;
-               this->pd = nullptr;
-               if( this->cuda_pointer )
-                  Cuda::freeFromDevice( this->cuda_pointer );
-#ifdef TNL_DEBUG_SHARED_POINTERS
-               std::cerr << "...deleted data." << std::endl;
+#ifdef HAVE_CUDA
+               cudaFree( this->pd );
 #endif
+               this->pd = nullptr;
             }
          }
       }
 
       PointerData* pd;
-
-      // cuda_pointer can't be part of PointerData structure, since we would be
-      // unable to dereference this-pd on the device
-      Object* cuda_pointer;
 };
-#endif // HAVE_CUDA_UNIFIED_MEMORY
+
+#endif // ! HAVE_CUDA_UNIFIED_MEMORY
 
 } // namespace Pointers
 } // namespace TNL
diff --git a/src/TNL/Pointers/SharedPointerHost.h b/src/TNL/Pointers/SharedPointerHost.h
index 39a6d4da4a2b8ab8b964110173d1716fade1ac71..ea8654d16ee8c443b1073f9890d6081c497367ab 100644
--- a/src/TNL/Pointers/SharedPointerHost.h
+++ b/src/TNL/Pointers/SharedPointerHost.h
@@ -24,14 +24,23 @@
 namespace TNL {
 namespace Pointers {
 
+/**
+ * \brief Specialization of the \ref SharedPointer for the host system.
+ *
+ * \tparam Object is the type of object to be owned by the pointer.
+ */
 template< typename Object >
 class SharedPointer< Object, Devices::Host > : public SmartPointer
 {
    private:
-      // Convenient template alias for controlling the selection of copy- and
-      // move-constructors and assignment operators using SFINAE.
-      // The type Object_ is "enabled" iff Object_ and Object are not the same,
-      // but after removing const and volatile qualifiers they are the same.
+
+      /**
+       * \typedef Enabler
+       * Convenient template alias for controlling the selection of copy- and
+       * move-constructors and assignment operators using SFINAE.
+       * The type Object_ is "enabled" iff Object_ and Object are not the same,
+       * but after removing const and volatile qualifiers they are the same.
+       */
       template< typename Object_ >
       using Enabler = std::enable_if< ! std::is_same< Object_, Object >::value &&
                                       std::is_same< typename std::remove_cv< Object >::type, Object_ >::value >;
@@ -42,13 +51,30 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer
 
    public:
 
+      /**
+       * \typedef ObjectType is the type of object owned by the pointer.
+       */
       using ObjectType = Object;
-      using DeviceType = Devices::Host; 
 
+      /**
+       * \typedef DeviceType is the type of device where the object is to be
+       * mirrored.
+       */
+      using DeviceType = Devices::Host;
+
+      /**
+       * \brief Constructor of an empty pointer.
+       */
       SharedPointer( std::nullptr_t )
       : pd( nullptr )
       {}
 
+      /**
+       * \brief Constructor with parameters of the Object constructor.
+       *
+       * \tparam Args is the variadic template type of arguments of the Object constructor.
+       * \param args are the arguments passed to the Object constructor.
+       */
       template< typename... Args >
       explicit  SharedPointer( Args... args )
       : pd( nullptr )
@@ -59,38 +85,70 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer
          this->allocate( args... );
       }
 
-      // this is needed only to avoid the default compiler-generated constructor
-      SharedPointer( const SharedPointer& pointer )
+      /**
+       * \brief Copy constructor.
+       *
+       * \param pointer is the source shared pointer.
+       */
+      SharedPointer( const SharedPointer& pointer ) // this is needed only to avoid the default compiler-generated constructor
       : pd( (PointerData*) pointer.pd )
       {
          this->pd->counter += 1;
       }
 
-      // conditional constructor for non-const -> const data
+      /**
+       * \brief Copy constructor.
+       *
+       * This is specialization for compatible object types.
+       *
+       * See \ref Enabler.
+       *
+       * \param pointer is the source shared pointer.
+       */
       template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
-      SharedPointer( const SharedPointer<  Object_, DeviceType >& pointer )
+      SharedPointer( const SharedPointer<  Object_, DeviceType >& pointer ) // conditional constructor for non-const -> const data
       : pd( (PointerData*) pointer.pd )
       {
          this->pd->counter += 1;
       }
 
-      // this is needed only to avoid the default compiler-generated constructor
-      SharedPointer( SharedPointer&& pointer )
+      /**
+       * \brief Move constructor.
+       *
+       * \param pointer is the source shared pointer.
+       */
+      SharedPointer( SharedPointer&& pointer ) // this is needed only to avoid the default compiler-generated constructor
       : pd( (PointerData*) pointer.pd )
       {
          pointer.pd = nullptr;
       }
 
-      // conditional constructor for non-const -> const data
+      /**
+       * \brief Move constructor.
+       *
+       * This is specialization for compatible object types.
+       *
+       * See \ref Enabler.
+       *
+       * \param pointer is the source shared pointer.
+       */
       template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
-      SharedPointer( SharedPointer<  Object_, DeviceType >&& pointer )
+      SharedPointer( SharedPointer<  Object_, DeviceType >&& pointer ) // conditional constructor for non-const -> const data
       : pd( (PointerData*) pointer.pd )
       {
          pointer.pd = nullptr;
       }
 
+      /**
+       * \brief Create a new object based on given constructor parameters.
+       *
+       * \tparam Args is the variadic template type of arguments to be passed to the
+       *    object constructor.
+       * \param args are the arguments to be passed to the object constructor.
+       * \return true if recreation was successful, false otherwise.
+       */
       template< typename... Args >
       bool recreate( Args... args )
       {
@@ -116,42 +174,80 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer
          return this->allocate( args... );
       }
 
+      /**
+       * \brief Arrow operator for accessing the object owned by constant smart pointer.
+       *
+       * \return constant pointer to the object owned by this smart pointer.
+       */
       const Object* operator->() const
       {
          TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" );
          return &this->pd->data;
       }
 
+      /**
+       * \brief Arrow operator for accessing the object owned by non-constant smart pointer.
+       *
+       * \return pointer to the object owned by this smart pointer.
+       */
       Object* operator->()
       {
          TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" );
          return &this->pd->data;
       }
 
+      /**
+       * \brief Dereferencing operator for accessing the object owned by constant smart pointer.
+       *
+       * \return constant reference to the object owned by this smart pointer.
+       */
       const Object& operator *() const
       {
          TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" );
          return this->pd->data;
       }
 
+      /**
+       * \brief Dereferencing operator for accessing the object owned by non-constant smart pointer.
+       *
+       * \return reference to the object owned by this smart pointer.
+       */
       Object& operator *()
       {
          TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" );
          return this->pd->data;
       }
 
-      __cuda_callable__
+      /**
+       * \brief Conversion to boolean type.
+       *
+       * \return Returns true if the pointer is not empty, false otherwise.
+       */
       operator bool() const
       {
          return this->pd;
       }
 
-      __cuda_callable__
+      /**
+       * \brief Negation operator.
+       *
+       * \return Returns false if the pointer is not empty, true otherwise.
+       */
       bool operator!() const
       {
          return ! this->pd;
       }
 
+      /**
+       * \brief Constant object reference getter.
+       *
+       * No synchronization of this pointer will be performed due to calling
+       * this method.
+       *
+       * \tparam Device says what image of the object one wants to dereference. It
+       * can be either \ref DeviceType or Devices::Host.
+       * \return constant reference to the object image on the given device.
+       */
       template< typename Device = Devices::Host >
       __cuda_callable__
       const Object& getData() const
@@ -160,6 +256,16 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer
          return this->pd->data;
       }
 
+      /**
+       * \brief Non-constant object reference getter.
+       *
+       * No synchronization of this pointer will be performed due to calling
+       * this method.
+       *
+       * \tparam Device says what image of the object one wants to dereference. It
+       * can be either \ref DeviceType or Devices::Host.
+       * \return reference to the object image on the given device.
+       */
       template< typename Device = Devices::Host >
       __cuda_callable__
       Object& modifyData()
@@ -168,8 +274,15 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer
          return this->pd->data;
       }
 
-      // this is needed only to avoid the default compiler-generated operator
-      const SharedPointer& operator=( const SharedPointer& ptr )
+      /**
+       * \brief Assignment operator.
+       *
+       * It assigns the object owned by the pointer \ref ptr to \ref this pointer.
+       *
+       * \param ptr input pointer
+       * \return constant reference to \ref this
+       */
+      const SharedPointer& operator=( const SharedPointer& ptr ) // this is needed only to avoid the default compiler-generated operator
       {
          this->free();
          this->pd = (PointerData*) ptr.pd;
@@ -178,10 +291,19 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer
          return *this;
       }
 
-      // conditional operator for non-const -> const data
+      /**
+       * \brief Assignment operator for compatible object types.
+       *
+       * It assigns the object owned by the pointer \ref ptr to \ref this pointer.
+       *
+       * See \ref Enabler.
+       *
+       * \param ptr input pointer
+       * \return constant reference to \ref this
+       */
       template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
-      const SharedPointer& operator=( const SharedPointer<  Object_, DeviceType >& ptr )
+      const SharedPointer& operator=( const SharedPointer<  Object_, DeviceType >& ptr ) // conditional operator for non-const -> const data
       {
          this->free();
          this->pd = (PointerData*) ptr.pd;
@@ -190,8 +312,15 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer
          return *this;
       }
 
-      // this is needed only to avoid the default compiler-generated operator
-      const SharedPointer& operator=( SharedPointer&& ptr )
+      /**
+       * \brief Move operator.
+       *
+       * It assigns the object owned by the pointer \ref ptr to \ref this pointer.
+       *
+       * \param ptr input pointer
+       * \return constant reference to \ref this
+       */
+      const SharedPointer& operator=( SharedPointer&& ptr ) // this is needed only to avoid the default compiler-generated operator
       {
          this->free();
          this->pd = (PointerData*) ptr.pd;
@@ -199,10 +328,19 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer
          return *this;
       }
 
-      // conditional operator for non-const -> const data
+      /**
+       * \brief Move operator.
+       *
+       * It assigns the object owned by the pointer \ref ptr to \ref this pointer.
+       *
+       * See \ref Enabler.
+       *
+       * \param ptr input pointer
+       * \return constant reference to \ref this
+       */
       template< typename Object_,
                 typename = typename Enabler< Object_ >::type >
-      const SharedPointer& operator=( SharedPointer<  Object_, DeviceType >&& ptr )
+      const SharedPointer& operator=( SharedPointer<  Object_, DeviceType >&& ptr ) // conditional operator for non-const -> const data
       {
          this->free();
          this->pd = (PointerData*) ptr.pd;
@@ -210,21 +348,39 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer
          return *this;
       }
 
+      /**
+       * \brief Cross-device pointer synchronization.
+       *
+       * For the smart pointers on the host, this method does nothing.
+       *
+       * \return true.
+       */
       bool synchronize()
       {
          return true;
       }
 
+      /**
+       * \brief Reset the pointer to an empty state.
+       */
       void clear()
       {
          this->free();
       }
 
+      /**
+       * \brief Swap the owned object with another pointer.
+       *
+       * \param ptr2 the other shared pointer for swapping.
+       */
       void swap( SharedPointer& ptr2 )
       {
          std::swap( this->pd, ptr2.pd );
       }
 
+      /**
+       * \brief Destructor.
+       */
       ~SharedPointer()
       {
          this->free();
diff --git a/src/TNL/Pointers/SmartPointer.h b/src/TNL/Pointers/SmartPointer.h
index e6700781c3b44dcd898c4e7ee976cdffcb11bbe6..66516af70362bfb62a41b6717ee748b9bfdbf04c 100644
--- a/src/TNL/Pointers/SmartPointer.h
+++ b/src/TNL/Pointers/SmartPointer.h
@@ -11,6 +11,12 @@
 #pragma once
 
 namespace TNL {
+
+/**
+ * \brief Namespace for TNL pointers.
+ *
+ * Pointers in TNL are similar to STL smart pointers, but they work across different devices.
+ */
 namespace Pointers {
 
 class SmartPointer
diff --git a/src/TNL/Pointers/UniquePointer.h b/src/TNL/Pointers/UniquePointer.h
index 071de4d51132fa6b71e0e6b86ab16acd3c8269c8..66bc4a33c3869ef0e076ad1f60e08a92a4735135 100644
--- a/src/TNL/Pointers/UniquePointer.h
+++ b/src/TNL/Pointers/UniquePointer.h
@@ -25,65 +25,153 @@
 namespace TNL {
 namespace Pointers {
 
+/**
+ * \brief Cross-device unique smart pointer.
+ *
+ * This smart pointer is inspired by std::unique_ptr from the STL library. It means
+ * that the object owned by the smart pointer is accessible only through this
+ * smart pointer. One cannot make any copy of this smart pointer. In addition,
+ * the smart pointer is able to work across different devices, which means that the
+ * object owned by the smart pointer is mirrored on both the host and the device.
+ *
+ * **NOTE: When using smart pointers to pass objects to the GPU, one must call
+ * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >()
+ * before calling a CUDA kernel working with smart pointers.**
+ *
+ * \tparam Object is a type of object to be owned by the pointer.
+ * \tparam Device is the device where the object is to be allocated. The object is
+ * always allocated on the host system as well for easier object manipulation.
+ *
+ * See also \ref SharedPointer and \ref DevicePointer.
+ *
+ * See also \ref UniquePointer< Object, Devices::Host > and \ref UniquePointer< Object, Devices::Cuda >.
+ *
+ * \par Example
+ * \include Pointers/UniquePointerExample.cpp
+ * \par Output
+ * \include UniquePointerExample.out
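+ *
+ * \par Ownership transfer sketch
+ * A hedged sketch; MyObject is a hypothetical object type.
+ * \code
+ * UniquePointer< MyObject, Devices::Host > a;
+ * UniquePointer< MyObject, Devices::Host > b( nullptr );
+ * b = a;   // ownership moves to b; a is reset to an empty state
+ * // bool( a ) == false, bool( b ) == true after the transfer
+ * \endcode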
+ */
 template< typename Object, typename Device = typename Object::DeviceType >
 class UniquePointer
 {
 };
 
+/**
+ * \brief Specialization of the \ref UniquePointer for the host system.
+ *
+ * \tparam Object is the type of object to be owned by the pointer.
+ */
 template< typename Object >
 class UniquePointer< Object, Devices::Host > : public SmartPointer
 {
    public:
 
-      typedef Object ObjectType;
-      typedef Devices::Host DeviceType;
+      /**
+       * \typedef ObjectType is the type of object owned by the pointer.
+       */
+      using ObjectType = Object;
 
+      /**
+       * \typedef DeviceType is the type of device where the object is to be
+       * mirrored.
+       */
+      using DeviceType = Devices::Host;
+
+      /**
+       * \brief Constructor of an empty pointer.
+       */
       UniquePointer( std::nullptr_t )
       : pointer( nullptr )
       {}
 
+      /**
+       * \brief Constructor with parameters of the Object constructor.
+       *
+       * \tparam Args is the variadic template type of arguments of the Object constructor.
+       * \param args are the arguments passed to the Object constructor.
+       */
       template< typename... Args >
       explicit  UniquePointer( const Args... args )
       {
          this->pointer = new Object( args... );
       }
 
+      /**
+       * \brief Arrow operator for accessing the object owned by constant smart pointer.
+       *
+       * \return constant pointer to the object owned by this smart pointer.
+       */
       const Object* operator->() const
       {
          TNL_ASSERT_TRUE( this->pointer, "Attempt to dereference a null pointer" );
          return this->pointer;
       }
 
+      /**
+       * \brief Arrow operator for accessing the object owned by non-constant smart pointer.
+       *
+       * \return pointer to the object owned by this smart pointer.
+       */
       Object* operator->()
       {
          TNL_ASSERT_TRUE( this->pointer, "Attempt to dereference a null pointer" );
          return this->pointer;
       }
 
+      /**
+       * \brief Dereferencing operator for accessing the object owned by constant smart pointer.
+       *
+       * \return constant reference to the object owned by this smart pointer.
+       */
       const Object& operator *() const
       {
          TNL_ASSERT_TRUE( this->pointer, "Attempt to dereference a null pointer" );
          return *( this->pointer );
       }
 
+      /**
+       * \brief Dereferencing operator for accessing the object owned by non-constant smart pointer.
+       *
+       * \return reference to the object owned by this smart pointer.
+       */
       Object& operator *()
       {
          TNL_ASSERT_TRUE( this->pointer, "Attempt to dereference a null pointer" );
          return *( this->pointer );
       }
 
+      /**
+       * \brief Conversion to boolean type.
+       *
+       * \return Returns true if the pointer is not empty, false otherwise.
+       */
       __cuda_callable__
       operator bool() const
       {
          return this->pointer;
       }
 
+      /**
+       * \brief Negation operator.
+       *
+       * \return Returns false if the pointer is not empty, true otherwise.
+       */
       __cuda_callable__
       bool operator!() const
       {
          return ! this->pointer;
       }
 
+      /**
+       * \brief Constant object reference getter.
+       *
+       * No synchronization of this pointer will be performed due to calling
+       * this method.
+       *
+       * \tparam Device says what image of the object one wants to dereference. It
+       * can be either \ref DeviceType or Devices::Host.
+       * \return constant reference to the object image on the given device.
+       */
       template< typename Device = Devices::Host >
       const Object& getData() const
       {
@@ -91,6 +179,18 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer
          return *( this->pointer );
       }
 
+      /**
+       * \brief Non-constant object reference getter.
+       *
+       * No synchronization of this pointer will be performed due to calling
+       * this method.
+       *
+       * \tparam Device says what image of the object one wants to dereference. It
+       * can be either \ref DeviceType or Devices::Host.
+       * \return reference to the object image on the given device.
+       */
       template< typename Device = Devices::Host >
       Object& modifyData()
       {
@@ -98,6 +198,15 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer
          return *( this->pointer );
       }
 
+      /**
+       * \brief Assignment operator.
+       *
+       * It assigns the object owned by the pointer \ref ptr to \ref this pointer.
+       * The original pointer \ref ptr is reset to an empty state.
+       *
+       * \param ptr input pointer
+       * \return constant reference to \ref this
+       */
       const UniquePointer& operator=( UniquePointer& ptr )
       {
          if( this->pointer )
@@ -107,16 +216,35 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer
          return *this;
       }
 
+      /**
+       * \brief Move operator.
+       *
+       * It assigns the object owned by the pointer \ref ptr to \ref this pointer.
+       * The original pointer \ref ptr is reset to an empty state.
+       *
+       * \param ptr input pointer
+       * \return constant reference to \ref this
+       */
       const UniquePointer& operator=( UniquePointer&& ptr )
       {
          return this->operator=( ptr );
       }
 
+      /**
+       * \brief Cross-device pointer synchronization.
+       *
+       * For smart pointers on the host, this method does nothing.
+       *
+       * \return true.
+       */
       bool synchronize()
       {
          return true;
       }
 
+      /**
+       * \brief Destructor.
+       */
       ~UniquePointer()
       {
          if( this->pointer )
@@ -129,19 +257,41 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer
       Object* pointer;
 };
 
+/**
+ * \brief Specialization of the \ref UniquePointer for the CUDA device.
+ *
+ * \tparam Object is the type of the object owned by the pointer.
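+ *
+ * \par Example
+ * A minimal usage sketch (illustrative only; \e myKernel stands for a user-defined
+ * __global__ function that takes a pointer to the array's device image):
+ * \code
+ * // assuming: using namespace TNL;
+ * using ArrayType = Containers::Array< int, Devices::Cuda >;
+ * Pointers::UniquePointer< ArrayType, Devices::Cuda > ptr;
+ * ptr->setSize( 2 );                                              // modify the host image of the array
+ * ptr->setElement( 0, 1 );
+ * Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();  // copy the host image to the device
+ * myKernel<<< 1, 2 >>>( &ptr.getData< Devices::Cuda >() );        // pass the device image to a CUDA kernel
+ * \endcode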
+ */
 template< typename Object >
 class UniquePointer< Object, Devices::Cuda > : public SmartPointer
 {
    public:
 
-      typedef Object ObjectType;
-      typedef Devices::Cuda DeviceType;
+      /**
+       * \typedef ObjectType is the type of object owned by the pointer.
+       */
+      using ObjectType = Object;
+
+      /**
+       * \typedef DeviceType is the type of device where the object is to be
+       * mirrored.
+       */
+      using DeviceType = Devices::Cuda;
 
+      /**
+       * \brief Constructor of empty pointer.
+       */
       UniquePointer( std::nullptr_t )
       : pd( nullptr ),
         cuda_pointer( nullptr )
       {}
 
+      /**
+       * \brief Constructor with parameters of the Object constructor.
+       *
+       * \tparam Args is a variadic template type of the arguments of the Object constructor.
+       * \param args are the arguments passed to the Object constructor.
+       */
       template< typename... Args >
       explicit  UniquePointer( const Args... args )
       : pd( nullptr ),
@@ -150,12 +300,22 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer
          this->allocate( args... );
       }
 
+      /**
+       * \brief Arrow operator for accessing the object owned by constant smart pointer.
+       *
+       * \return constant pointer to the object owned by this smart pointer.
+       */
       const Object* operator->() const
       {
          TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" );
          return &this->pd->data;
       }
 
+      /**
+       * \brief Arrow operator for accessing the object owned by non-constant smart pointer.
+       *
+       * \return pointer to the object owned by this smart pointer.
+       */
       Object* operator->()
       {
          TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" );
@@ -163,12 +323,22 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer
          return &this->pd->data;
       }
 
+      /**
+       * \brief Dereferencing operator for accessing the object owned by constant smart pointer.
+       *
+       * \return constant reference to the object owned by this smart pointer.
+       */
       const Object& operator *() const
       {
          TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" );
          return this->pd->data;
       }
 
+      /**
+       * \brief Dereferencing operator for accessing the object owned by non-constant smart pointer.
+       *
+       * \return reference to the object owned by this smart pointer.
+       */
       Object& operator *()
       {
          TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" );
@@ -176,18 +346,38 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer
          return this->pd->data;
       }
 
+      /**
+       * \brief Conversion to boolean type.
+       *
+       * \return true if the pointer is not empty, false otherwise.
+       */
       __cuda_callable__
       operator bool() const
       {
          return this->pd;
       }
 
+      /**
+       * \brief Negation operator.
+       *
+       * \return false if the pointer is not empty, true otherwise.
+       */
       __cuda_callable__
       bool operator!() const
       {
          return ! this->pd;
       }
 
+      /**
+       * \brief Constant object reference getter.
+       *
+       * Calling this method does not trigger any synchronization of the pointer.
+       *
+       * \tparam Device specifies which image of the object is to be dereferenced. It
+       * can be either \ref DeviceType or Devices::Host.
+       * \return constant reference to the object image on the given device.
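+       *
+       * \par Example
+       * One way to inspect the device image from the host (a sketch following the
+       * unit tests; \e ptr is assumed to own a Containers::StaticArray< 2, int >):
+       * \code
+       * Containers::StaticArray< 2, int > aux;
+       * cudaMemcpy( (void*) &aux, &ptr.getData< Devices::Cuda >(),
+       *             sizeof( aux ), cudaMemcpyDeviceToHost );
+       * \endcode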
+       */
       template< typename Device = Devices::Host >
       const Object& getData() const
       {
@@ -200,6 +390,18 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer
             return *( this->cuda_pointer );
       }
 
+      /**
+       * \brief Non-constant object reference getter.
+       *
+       * After calling this method, the object owned by the pointer might need
+       * to be synchronized. One should not forget to call
+       * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >()
+       * before calling a CUDA kernel that uses the object from this smart pointer.
+       *
+       * \tparam Device specifies which image of the object is to be dereferenced. It
+       * can be either \ref DeviceType or Devices::Host.
+       * \return reference to the object image on the given device.
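+       *
+       * \par Example
+       * A sketch of the intended workflow, assuming \e ptr owns a
+       * Containers::Array< int, Devices::Cuda > and \e myKernel is a user-defined
+       * __global__ function:
+       * \code
+       * ptr.modifyData< Devices::Host >().setSize( 2 );                 // change the host image of the array
+       * Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();  // push the change to the device image
+       * myKernel<<< 1, 2 >>>( &ptr.getData< Devices::Cuda >() );
+       * \endcode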
+       */
       template< typename Device = Devices::Host >
       Object& modifyData()
       {
@@ -215,6 +417,15 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer
             return *( this->cuda_pointer );
       }
 
+      /**
+       * \brief Assignment operator.
+       *
+       * It assigns the object owned by the pointer \e ptr to this pointer.
+       * The original pointer \e ptr is reset to an empty state.
+       *
+       * \param ptr input pointer
+       * \return constant reference to this pointer
+       */
       const UniquePointer& operator=( UniquePointer& ptr )
       {
          this->free();
@@ -225,11 +436,28 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer
          return *this;
       }
 
+      /**
+       * \brief Move assignment operator.
+       *
+       * It assigns the object owned by the pointer \e ptr to this pointer.
+       * The original pointer \e ptr is reset to an empty state.
+       *
+       * \param ptr input pointer
+       * \return constant reference to this pointer
+       */
       const UniquePointer& operator=( UniquePointer&& ptr )
       {
          return this->operator=( ptr );
       }
 
+      /**
+       * \brief Cross-device pointer synchronization.
+       *
+       * This method is usually invoked by the smart pointers register when
+       * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >() is called.
+       *
+       * \return true if the synchronization was successful, false otherwise.
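+       *
+       * \par Example
+       * Explicit synchronization of a single pointer (a sketch; typically all
+       * registered smart pointers are synchronized at once instead):
+       * \code
+       * ptr.synchronize();                                               // copy this pointer's host image to the device
+       * // or synchronize all registered smart pointers at once:
+       * Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+       * \endcode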
+       */
       bool synchronize()
       {
          if( ! this->pd )
@@ -248,6 +476,9 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer
 #endif
       }
 
+      /**
+       * \brief Destructor.
+       */
       ~UniquePointer()
       {
          this->free();
diff --git a/src/UnitTests/Pointers/CMakeLists.txt b/src/UnitTests/Pointers/CMakeLists.txt
index 0f5e9865610e99d48699b35b8b9fd96fd8352877..8845cdfbdffd7c0723698d97b9f0d68715ea71df 100644
--- a/src/UnitTests/Pointers/CMakeLists.txt
+++ b/src/UnitTests/Pointers/CMakeLists.txt
@@ -14,4 +14,8 @@ if( BUILD_CUDA )
                         OPTIONS ${CXX_TESTS_FLAGS} )
    TARGET_LINK_LIBRARIES( SharedPointerCudaTest ${GTEST_BOTH_LIBRARIES} )
    ADD_TEST( SharedPointerCudaTest ${EXECUTABLE_OUTPUT_PATH}/SharedPointerCudaTest${CMAKE_EXECUTABLE_SUFFIX} )
+   CUDA_ADD_EXECUTABLE( DevicePointerCudaTest DevicePointerCudaTest.cu 
+                        OPTIONS ${CXX_TESTS_FLAGS} )
+   TARGET_LINK_LIBRARIES( DevicePointerCudaTest ${GTEST_BOTH_LIBRARIES} )
+   ADD_TEST( DevicePointerCudaTest ${EXECUTABLE_OUTPUT_PATH}/DevicePointerCudaTest${CMAKE_EXECUTABLE_SUFFIX} )
 endif( BUILD_CUDA )
diff --git a/src/UnitTests/Pointers/DevicePointerCudaTest.cu b/src/UnitTests/Pointers/DevicePointerCudaTest.cu
new file mode 100644
index 0000000000000000000000000000000000000000..76320904a469aeed7a16b1e889b5e323c97cd461
--- /dev/null
+++ b/src/UnitTests/Pointers/DevicePointerCudaTest.cu
@@ -0,0 +1,153 @@
+/***************************************************************************
+                          DevicePointerCudaTest.cu  -  description
+                             -------------------
+    begin                : Nov 26, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <cstdlib>
+#include <TNL/Devices/Host.h>
+#include <TNL/Pointers/DevicePointer.h>
+#include <TNL/Containers/StaticArray.h>
+#include <TNL/Containers/Array.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+#endif
+
+#include <TNL/Devices/Cuda.h>
+
+using namespace TNL;
+
+#ifdef HAVE_GTEST
+TEST( DevicePointerCudaTest, ConstructorTest )
+{
+#ifdef HAVE_CUDA
+   using TestType =  TNL::Containers::StaticArray< 2, int  >;
+   TestType obj1;
+   Pointers::DevicePointer< TestType, Devices::Cuda > ptr1( obj1 );
+
+   ptr1->x() = 0;
+   ptr1->y() = 0;
+   ASSERT_EQ( ptr1->x(), 0 );
+   ASSERT_EQ( ptr1->y(), 0 );
+
+   TestType obj2( 1,2 );
+   Pointers::DevicePointer< TestType, Devices::Cuda > ptr2( obj2 );
+   ASSERT_EQ( ptr2->x(), 1 );
+   ASSERT_EQ( ptr2->y(), 2 );
+
+   ptr1 = ptr2;
+   ASSERT_EQ( ptr1->x(), 1 );
+   ASSERT_EQ( ptr1->y(), 2 );
+#endif
+};
+
+TEST( DevicePointerCudaTest, getDataTest )
+{
+#ifdef HAVE_CUDA
+   using TestType = TNL::Containers::StaticArray< 2, int  >;
+   TestType obj1( 1, 2 );
+   Pointers::DevicePointer< TestType, Devices::Cuda > ptr1( obj1 );
+
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+
+   TestType aux;
+
+   cudaMemcpy( ( void*) &aux, &ptr1.getData< Devices::Cuda >(), sizeof( TestType ), cudaMemcpyDeviceToHost );
+
+   ASSERT_EQ( aux[ 0 ], 1 );
+   ASSERT_EQ( aux[ 1 ], 2 );
+#endif  // HAVE_CUDA
+};
+
+#ifdef HAVE_CUDA
+__global__ void copyArrayKernel( const TNL::Containers::Array< int, Devices::Cuda >* inArray,
+                                 int* outArray )
+{
+   if( threadIdx.x < 2 )
+   {
+      outArray[ threadIdx.x ] = ( *inArray )[ threadIdx.x ];
+   }
+}
+
+__global__ void copyArrayKernel2( const Pointers::DevicePointer< TNL::Containers::Array< int, Devices::Cuda > > inArray,
+                                  int* outArray )
+{
+   if( threadIdx.x < 2 )
+   {
+      outArray[ threadIdx.x ] = ( *inArray )[ threadIdx.x ];
+   }
+}
+#endif
+
+TEST( DevicePointerCudaTest, getDataArrayTest )
+{
+#ifdef HAVE_CUDA
+   using TestType = TNL::Containers::Array< int, Devices::Cuda  >;
+   TestType obj;
+   Pointers::DevicePointer< TestType > ptr( obj );
+
+   ptr->setSize( 2 );
+   ptr->setElement( 0, 1 );
+   ptr->setElement( 1, 2 );
+
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+
+   int *testArray_device, *testArray_host;
+   cudaMalloc( ( void** ) &testArray_device, 2 * sizeof( int ) );
+   copyArrayKernel<<< 1, 2 >>>( &ptr.getData< Devices::Cuda >(), testArray_device );
+   testArray_host = new int [ 2 ];
+   cudaMemcpy( testArray_host, testArray_device, 2 * sizeof( int ), cudaMemcpyDeviceToHost );
+
+   ASSERT_EQ( testArray_host[ 0 ], 1 );
+   ASSERT_EQ( testArray_host[ 1 ], 2 );
+
+   copyArrayKernel2<<< 1, 2 >>>( ptr, testArray_device );
+   cudaMemcpy( testArray_host, testArray_device, 2 * sizeof( int ), cudaMemcpyDeviceToHost );
+
+   ASSERT_EQ( testArray_host[ 0 ], 1 );
+   ASSERT_EQ( testArray_host[ 1 ], 2 );
+
+   delete[] testArray_host;
+   cudaFree( testArray_device );
+
+#endif
+};
+
+TEST( DevicePointerCudaTest, nullptrAssignment )
+{
+#ifdef HAVE_CUDA
+   using TestType = Pointers::DevicePointer< double, Devices::Cuda >;
+   double o1 = 5;
+   TestType p1( o1 ), p2( nullptr );
+
+   // This should not crash
+   p1 = p2;
+
+   ASSERT_FALSE( p1 );
+   ASSERT_FALSE( p2 );
+#endif
+}
+
+TEST( DevicePointerCudaTest, swap )
+{
+#ifdef HAVE_CUDA
+   using TestType = Pointers::DevicePointer< double, Devices::Cuda >;
+   double o1( 1 ), o2( 2 );
+   TestType p1( o1 ), p2( o2 );
+
+   p1.swap( p2 );
+
+   ASSERT_EQ( *p1, 2 );
+   ASSERT_EQ( *p2, 1 );
+#endif
+}
+
+#endif
+
+
+#include "../main.h"
diff --git a/src/UnitTests/Pointers/SharedPointerCudaTest.cu b/src/UnitTests/Pointers/SharedPointerCudaTest.cu
index 83b6b4793bf6d5e17f6587b71c60261e8b80cea0..37cfc56b727da6ac2e283496df4f045f404999f2 100644
--- a/src/UnitTests/Pointers/SharedPointerCudaTest.cu
+++ b/src/UnitTests/Pointers/SharedPointerCudaTest.cu
@@ -50,11 +50,6 @@ TEST( SharedPointerCudaTest, getDataTest )
    typedef TNL::Containers::StaticArray< 2, int  > TestType;
    Pointers::SharedPointer< TestType, Devices::Cuda > ptr1( 1, 2 );
 
-#ifdef HAVE_CUDA_UNIFIED_MEMORY
-   ASSERT_EQ( ptr1->x(), 1 );
-   ASSERT_EQ( ptr1->y(), 2 );
-#else
-
    Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
 
    TestType aux;
@@ -63,7 +58,6 @@ TEST( SharedPointerCudaTest, getDataTest )
 
    ASSERT_EQ( aux[ 0 ], 1 );
    ASSERT_EQ( aux[ 1 ], 2 );
-#endif  // HAVE_CUDA_UNIFIED_MEMORY
 #endif  // HAVE_CUDA
 };
 
@@ -77,6 +71,14 @@ __global__ void copyArrayKernel( const TNL::Containers::Array< int, Devices::Cud
    }
 }
 
+__global__ void copyArrayKernel2( const Pointers::SharedPointer< TNL::Containers::Array< int, Devices::Cuda > > inArray,
+                                  int* outArray )
+{
+   if( threadIdx.x < 2 )
+   {
+      outArray[ threadIdx.x ] = ( *inArray )[ threadIdx.x ];
+   }
+}
 #endif
 
 TEST( SharedPointerCudaTest, getDataArrayTest )
@@ -100,6 +102,12 @@ TEST( SharedPointerCudaTest, getDataArrayTest )
    ASSERT_EQ( testArray_host[ 0 ], 1 );
    ASSERT_EQ( testArray_host[ 1 ], 2 );
 
+   copyArrayKernel2<<< 1, 2 >>>( ptr, testArray_device );
+   cudaMemcpy( testArray_host, testArray_device, 2 * sizeof( int ), cudaMemcpyDeviceToHost );
+
+   ASSERT_EQ( testArray_host[ 0 ], 1 );
+   ASSERT_EQ( testArray_host[ 1 ], 2 );
+
    delete[] testArray_host;
    cudaFree( testArray_device );