From 8c001dd64ce132ce952c9c9882fba025c07cf2b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 2 Sep 2019 21:25:51 +0200 Subject: [PATCH 01/43] Added tutorial on for loops. --- Documentation/Tutorials/CMakeLists.txt | 3 +- .../ForLoops/.tutorial_04_ForLoops.md.swp | Bin 0 -> 20480 bytes .../Tutorials/ForLoops/CMakeLists.txt | 9 +++ .../Tutorials/ForLoops/ParallelForExample.cpp | 58 ++++++++++++++++++ .../Tutorials/ForLoops/ParallelForExample.cu | 1 + .../ForLoops/tutorial_04_ForLoops.md | 37 +++++++++++ Documentation/Tutorials/index.md | 1 + 7 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 Documentation/Tutorials/ForLoops/.tutorial_04_ForLoops.md.swp create mode 100644 Documentation/Tutorials/ForLoops/CMakeLists.txt create mode 100644 Documentation/Tutorials/ForLoops/ParallelForExample.cpp create mode 120000 Documentation/Tutorials/ForLoops/ParallelForExample.cu create mode 100644 Documentation/Tutorials/ForLoops/tutorial_04_ForLoops.md diff --git a/Documentation/Tutorials/CMakeLists.txt b/Documentation/Tutorials/CMakeLists.txt index 690cf93a0..dadf70b79 100644 --- a/Documentation/Tutorials/CMakeLists.txt +++ b/Documentation/Tutorials/CMakeLists.txt @@ -1,3 +1,4 @@ add_subdirectory( Arrays ) add_subdirectory( Vectors ) -add_subdirectory( ReductionAndScan ) \ No newline at end of file +add_subdirectory( ReductionAndScan ) +add_subdirectory( ForLoops ) diff --git a/Documentation/Tutorials/ForLoops/.tutorial_04_ForLoops.md.swp b/Documentation/Tutorials/ForLoops/.tutorial_04_ForLoops.md.swp new file mode 100644 index 0000000000000000000000000000000000000000..45aefd2ee4fd7c7876016a6933928a1f76f578dc GIT binary patch literal 20480 zcmYc?2=nw+u+TGLU|?VnU|+9VqnNmN-fGLO-d~S$>G7tiA6=3sk(`I zd8v3*F)-A@4b(5sFUl@1NK8)EFUiZ%cgasK%}vcKNi50C&(jYnEy*v+Ow1|PcgrvG z$+|J(cw47d0h7*6psFr4IPV94NSU`Xd@VA#jUz>v(x zz_5dtfnf$O1H*J)28JeH28Kjl1_l*g1_pLs28L@q3=CI!7#LRZFfgp-VPLq;&A@Pr zn}OjPHv_|DZU%;aZU%-vZU%;boD2+qIT;x4axyU7;bdSq%gMlShLeF|9VY|BT22Or zG)@Kvdrk%h7EXx!JlPo-KC>|}Y-eL&@MmLS;AdlC;A3N8xX#MJaE+CLVFN1z!+KT* zhK0-w3=5bU7+RSb7+RPa7#=e*Fg#*nU|7S%z_6N$f#De=1H)6;zy%G$ls-l(D=RAm z6y>LsCYRVGD&!^RrrIi{<)l_*Cgr5Y7bF%X=H#U2#22NelqNGUkf0c}1|z-zVz82( zjeepX14D3Wu0npALRo5ZNq&(+YECL>wMMZ*a$=rBQmR5?PI10Meo{$dW?pKFLSnH( zbV6}yu11+=Lac_Ko_Lx+%#>Rme#!E>TDa&hX!DS@pP0;TtXu2`@*7$l?+lA2qP zlUS0Pq7V#Pg_sP=r0`m`1ST0@43>;f%P)df4$8{dG+Im8LP>eP0;tWAn_7}uRIHGanwFUd zDfM#l^9vM`@=NnliWM^R6hNi9vqEu6Wlk!%NX<=6%qs@f1|=Y=qEvH-)43OR|nNhyg6 zX)wW(e2{qssYPk|MY*XdAXlU&CTA$XZ2&tYvsj_Hw4flrI5kC~BwwK@H90l24B`d2 zLItoZ6cY1N;5Ae+xC(>l0r?FS2%v@+*wGpw5tIt8SW{0SI2B?nq-4*`OIJvRRF75+ z3<(Jd3`miMT+O9cq$X=9fW5DykeZjG1NWg0!Zn~^(X<8|!{C%yoS6)c6}W?v^Ye;J zib|n@m8THm=c7=ZT2z(_3FM5#qLlK)qErP?UI2xbLP=3#UU5NU5x6F;C@4xT1|?un zwT$9XP$U;a(gUU$nZ*jJ5Ur^x3Q3g;i3+d`1$G=b$w2g?n3)MnMxZ986*!l|LIx$* zz%(J{N(K`>g=lP9I#xp&W_JowX4Ygd)>DW^OgR>;UN z2NgA?#UM|mrKN&%S$1Y#3dj+#s4P~<1T~#O9WF@O1?oU4_$B71=2Yr{%FIfI#G+J4 z{_xBzf%Mxl^Ya)OVhR#LEl(^<=Rx@g)WR$%W&rj7&v7#_r13!d|FHS_B7O#jLVgB@ zTz&=yPksglSAGTt7k&nYYkUk0SNRwiF7h!jtmb22SkA}5u#As^p^}e*p@NTrp@fft zA%Ks8!H185!JChP!IhVR!G)KB!GV{7L4lWnfq|ES;SUc3!yz69hJ!o|4D)#y80PUX zFm&=TFm&)h+>r`$3w=7#qqdBOz-S1JhQMeDjE2By2#kinXb6mkz-S0iJp>TbIH~E7 zo=k9ZVjgTL3Op--G#nwSC_`$itY3dW`pJcgyE1)5N>Qh=*aCeI_7?fVn z0tGTngV2E0apkEB>8T}f*BR@Wf?NkqNg!vL#_B|)BqmS*F<_?CM1|b^l+3iuWbot> zsPm7Mg22IF0vbI?PAmombt*iWfK=vzM`pl7s~I3)gHlaqS!N1&te_-c0qX8z1$ab3 zW~@>nLyqA7{{uz_1`E&vegOsse`wx+z|X*NkDq~|i=Tm^lb?Yhl9z!Yf|r58l$U|Q zgqMNg2M+_ocOC|Y&pZqa6L}aIdU+TadUzNZ{J9wz{J0qyEV&sNEVvmMM7bFlM7S9k 
zo^vrUJmX?un8n4wAkM|WAjZYOAjHMM@Qagy;TtCd!&goQh7+6&497Vc7!GqXFf8I^ zV3^0rz%ZASfuVwvfuWp}fuWd_fgzETfgzTYfgy&IfgzHUfgysEfgyyGfx&^3fx(88 zfx()Sfq{#Yfq|2gfq|8if#DMe1H(HG28OpB3=FH;85mZvGcZhKXJDAX&cL9?%D|w? z%D^DN%D}+S%E0iAg@NHK3j>1>3j>2U3j>2Y3j@P-*TV$2K-x0x6iZZR=1Tw`Kj z*vG`cu#1U-VJ8yQFVjERHhE)o(F6c|8O!^XgM z6hOR?%7RoYE7uChOelEL4U6|c;-C>@Wwcs>Sick0OHe^-Uda#*SZHZ%XJrK)JJwJr z(*f0XWqRqUCBd0hsT!I(3Xn;69R&kD0|nTG13c0pH8A*$4{$;$PRs=r#+BgOFbgzn z467203sRFa(<(v5d{QMuZEA8pD1AZa+`yy2;3)#cn09e0xKvL-Ne>WbLnoz@L4&&a zX%GWJrF%(6Y92h5Ax(^djKI|lfla|%F@Vz{Voo14V4IMZT9TZR0H1?VNJ>pk1WABu zT2Ow15067bF*OBTzrxngfTxBab7!FOXs8dN3nEJL;VuU+JSYYase{(`lvIKS^}!2n zGBVRa^K_uRlvo65V1Oe9G!X`xPAM%`$V^iJwID!&T@0!RQ}Pvx^K(-X3l$W~GfOhS zcA+orNlHxv4P(Q60~%@0gfI9>0hx+4a{#V*N+45rphXXfX=$m+C17)*zE8|cPE{z* zOwR*Xb9p5>C6#&#{*dWFNP7cRdx308g(M!t>|J7ZDyVuZNi8Z%%mGz-AUB~qw>%@Y z2wFLUqC2x#0ctU1K~iQ(u|hE4Y5|-3FY)e=IzzwPR;?i7A&@6#MX>n;{4ruZ~)+#~ERLWCxa==ZM5I-Lslm;Be1V(XY zZe~tSVv$05B3LUt)q)$M#rdGZ02HiAsS2ry#hE#k(B=_jW(sX~7dq<*sW(84TWG@u zX^JJYL;+kBL8n(CEmTlz)J(_7RL9s1sm+LJj3PBbVdV&@5QDYQ6*7zA%{~((vvf=> zp~fL80Ch0TL5d7@42_}EphZAXr|KZJOHr(4V2DYJWr!)xOwUb>&s4Bgh$$}3jjztK zHLQ)R&eS!mRY;7_)MJQAi)LUzD|J~1yP9)q6=QHSTUgY1ksDBvc#Oy)Z!F{gv11JVoyj)0C%U- zQ%i~=%P31SQZtJb;GHjpghbDj3YcOL6H;2iipOG5%oe2<7v$#^gBE^*N_kNA2(kb) zVVYQxnG0FF2`Xnmt296X2{ja2&A=>!m9066Md_*VRi1hZpzbxOWKM*Z=!r#%;OeRv z)W?Jrx`|2oWvL3Fo_%IMXbmW6RR<_D=I7;9f~H)-U6=eca5Yw10+Pwi%!9bBNFf*E zNIgh~N-PDhz5p$2&CJhJuvLhTRfvyIE=@^{2i5AJnezB}1r1PwQd0m0j*bF&RRcsI z$up$_wEC_zDJNA;ArU5*k`EC`g11#b(Faa!;7%^W>!?dd65wT1f&#Wt3GC?uTv~hP zDS+$R;>?m#aM@Z68VblwtW*G1$Dnx1$xlu!NrkQ*ge@wBE|GyG97rb=T1-F}pTJ6^ zT+sU7;u25+4T@TDq!q)LJL)MQoQ|A!6F}(&X>AO64I_9IAq7nx5 z0ZUrtnR$7sMc7vVfCh!Y^Y_XM@bm;}n8B98!ize1X$5L!fYwGp)^~%NeDR>BTs%w% Yv?NnNbmx2%3&5tQLYup~#^7=c0ABU4p#T5? literal 0 HcmV?d00001 diff --git a/Documentation/Tutorials/ForLoops/CMakeLists.txt b/Documentation/Tutorials/ForLoops/CMakeLists.txt new file mode 100644 index 000000000..af47d80b7 --- /dev/null +++ b/Documentation/Tutorials/ForLoops/CMakeLists.txt @@ -0,0 +1,9 @@ +IF( BUILD_CUDA ) + CUDA_ADD_EXECUTABLE( ParallelForExample ParallelForExample.cu ) + ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out ) +ENDIF() + +IF( BUILD_CUDA ) +ADD_CUSTOM_TARGET( ForLoops-cuda ALL DEPENDS + ParallelForExample.out ) +ENDIF() diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample.cpp b/Documentation/Tutorials/ForLoops/ParallelForExample.cpp new file mode 100644 index 000000000..cd6ece928 --- /dev/null +++ b/Documentation/Tutorials/ForLoops/ParallelForExample.cpp @@ -0,0 +1,58 @@ +#include +#include +#include + +using namespace TNL; +using namespace TNL::Containers; +using namespace TNL::Containers::Algorithms; + +template< typename Device > +void vectorSum( const Vector< double, Device >& v1, + const Vector< double, Device >& v2, + const double& c, + Vector< double, Device >& result ) +{ + /**** + * Get vectors view which can be captured by lambda. + */ + auto v1_view = v1.getConstView(); + auto v2_view = v2.getConstView(); + auto result_view = result.getView(); + + /**** + * The sum function. + */ + auto sum = [=] __cuda_callable__ ( int i, const double c ) mutable { + result_view[ i ] = v1_view[ i ] + v2_view[ i ] + c; }; + + ParallelFor< Device >::exec( 0, v1.getSize(), sum, c ); +} + +int main( int argc, char* argv[] ) +{ + /*** + * Firstly, test the vectors sum on CPU. 
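+    * The second input vector is filled below by evaluate(), which evaluates the given lambda for every element index.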
+    */
+   Vector< double, Devices::Host > host_v1( 10 ), host_v2( 10 ), host_result( 10 );
+   host_v1 = 1.0;
+   host_v2.evaluate( []__cuda_callable__ ( int i )->double { return i; } );
+   vectorSum( host_v1, host_v2, 2.0, host_result );
+   std::cout << "host_v1 = " << host_v1 << std::endl;
+   std::cout << "host_v2 = " << host_v2 << std::endl;
+   std::cout << "The sum of the vectors on CPU is " << host_result << "." << std::endl;
+
+   /***
+    * And then also on GPU.
+    */
+#ifdef HAVE_CUDA
+   Vector< double, Devices::Cuda > cuda_v1( 10 ), cuda_v2( 10 ), cuda_result( 10 );
+   cuda_v1 = 1.0;
+   cuda_v2.evaluate( []__cuda_callable__ ( int i )->double { return i; } );
+   vectorSum( cuda_v1, cuda_v2, 2.0, cuda_result );
+   std::cout << "cuda_v1 = " << cuda_v1 << std::endl;
+   std::cout << "cuda_v2 = " << cuda_v2 << std::endl;
+   std::cout << "The sum of the vectors on GPU is " << cuda_result << "." << std::endl;
+#endif
+   return EXIT_SUCCESS;
+}
+
diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample.cu b/Documentation/Tutorials/ForLoops/ParallelForExample.cu
new file mode 120000
index 000000000..fba5e0816
--- /dev/null
+++ b/Documentation/Tutorials/ForLoops/ParallelForExample.cu
@@ -0,0 +1 @@
+ParallelForExample.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ForLoops/tutorial_04_ForLoops.md b/Documentation/Tutorials/ForLoops/tutorial_04_ForLoops.md
new file mode 100644
index 000000000..0a9f9bd4d
--- /dev/null
+++ b/Documentation/Tutorials/ForLoops/tutorial_04_ForLoops.md
@@ -0,0 +1,37 @@
+\page tutorial_04_ForLoops For loops
+
+## Introduction
+
+This tutorial shows how to use the different kinds of for loops implemented in TNL. Namely, they are:
+
+* **Parallel For** is a for loop which can be run in parallel, i.e. all iterations of the loop must be independent. Parallel For can run on both multicore CPUs and GPUs.
+* **n-dimensional Parallel For** is an extension of the common parallel for into more dimensions.
+* **Static For** is a for loop which is performed sequentially and is explicitly unrolled by C++ templates. The number of iterations must be static (known at compile time).
+* **Templated Static For** ....
+
+## Table of Contents
+1. [Parallel For](#parallel_for)
+2. [n-dimensional Parallel For](#n_dimensional_parallel_for)
+3. [Static For](#static_for)
+4. [Templated Static For](#templated_static_for)
+
+## Parallel For
+
+The basic parallel for construction in TNL serves for a hardware-platform-transparent expression of parallel for loops. The hardware platform is expressed by a template parameter. The parallel for is defined as:
+
+```
+ParallelFor< Device >::exec( start, end, function, arguments... );
+```
+
+The `Device` can be either `Devices::Host` or `Devices::Cuda`. The first two parameters define the loop bounds in the C style, i.e. the loop iterates over the indexes `start` ... `end-1`. `function` is a lambda function to be performed in each iteration. It receives the iteration index followed by the additional arguments passed to the parallel for (the last arguments). See the following example:
+
+\include ParallelForExample.cpp
+
+The result is:
+
+\include ParallelExample.out
+
+## n-dimensional Parallel For
+## Static For
+## Templated Static For
+
diff --git a/Documentation/Tutorials/index.md b/Documentation/Tutorials/index.md
index a5edd055f..95f300b1e 100644
--- a/Documentation/Tutorials/index.md
+++ b/Documentation/Tutorials/index.md
@@ -5,3 +5,4 @@
 1. [Arrays](tutorial_01_arrays.html)
 2. [Vectors](tutorial_02_vectors.html)
 3. [Flexible parallel reduction and prefix-sum](tutorial_03_reduction.html)
+4. [For loops](tutorial_04_ForLoops.html)
-- 
GitLab


From 322d0ae9a7ce0086c8458115847e0398d2c5c23e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?=
Date: Mon, 2 Sep 2019 21:35:56 +0200
Subject: [PATCH 02/43] Fixing for loops tutorial output snippet including.

---
 .../ForLoops/.tutorial_04_ForLoops.md.swp        | Bin 20480 -> 0 bytes
 .../Tutorials/ForLoops/tutorial_04_ForLoops.md   |   2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)
 delete mode 100644 Documentation/Tutorials/ForLoops/.tutorial_04_ForLoops.md.swp

diff --git a/Documentation/Tutorials/ForLoops/.tutorial_04_ForLoops.md.swp b/Documentation/Tutorials/ForLoops/.tutorial_04_ForLoops.md.swp
deleted file mode 100644
index 45aefd2ee4fd7c7876016a6933928a1f76f578dc..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 20480
[... base85 data of the deleted vim swap file omitted ...]

diff --git a/Documentation/Tutorials/ForLoops/tutorial_04_ForLoops.md b/Documentation/Tutorials/ForLoops/tutorial_04_ForLoops.md
index 0a9f9bd4d..f00fc2852 100644
--- a/Documentation/Tutorials/ForLoops/tutorial_04_ForLoops.md
+++ b/Documentation/Tutorials/ForLoops/tutorial_04_ForLoops.md
@@ -29,7 +29,7 @@ The `Device` can be either `Devices::Host` or `Devices::Cuda`. The first two par
 
 The result is:
 
-\include ParallelExample.out
+\include ParallelForExample.out
 
 ## n-dimensional Parallel For
 ## Static For
-- 
GitLab


From 483c176460d550a46cace148c3f1c98e0554a4f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?=
Date: Thu, 14 Nov 2019 20:42:24 +0100
Subject: [PATCH 03/43] Writing tutorial on building applications with TNL.
---
 ...l_00_building_applications_with_tnl.md.swp | Bin 0 -> 12288 bytes
 .../Tutorials/BuildWithTNL/CMakeLists.txt     |   0
 .../Tutorials/BuildWithTNL/example-cuda-2.cpp |   1 +
 .../Tutorials/BuildWithTNL/example-cuda-2.cu  |   1 +
 .../Tutorials/BuildWithTNL/example-cuda-2.h   |  27 +++++
 .../Tutorials/BuildWithTNL/example-cuda.cpp   |  25 +++++
 .../Tutorials/BuildWithTNL/example-cuda.cu    |   1 +
 .../Tutorials/BuildWithTNL/example-host.cpp   |  19 ++++
 ...orial_00_building_applications_with_tnl.md |  97 ++++++++++++++++++
 Documentation/Tutorials/CMakeLists.txt        |   1 +
 .../Tutorials/ForLoops/ParallelForExample.cpp |   3 +-
 Documentation/Tutorials/index.md              |   9 +-
 12 files changed, 179 insertions(+), 5 deletions(-)
 create mode 100644 Documentation/Tutorials/BuildWithTNL/.tutorial_00_building_applications_with_tnl.md.swp
 create mode 100644 Documentation/Tutorials/BuildWithTNL/CMakeLists.txt
 create mode 100644 Documentation/Tutorials/BuildWithTNL/example-cuda-2.cpp
 create mode 120000 Documentation/Tutorials/BuildWithTNL/example-cuda-2.cu
 create mode 100644 Documentation/Tutorials/BuildWithTNL/example-cuda-2.h
 create mode 100644 Documentation/Tutorials/BuildWithTNL/example-cuda.cpp
 create mode 120000 Documentation/Tutorials/BuildWithTNL/example-cuda.cu
 create mode 100644 Documentation/Tutorials/BuildWithTNL/example-host.cpp
 create mode 100644 Documentation/Tutorials/BuildWithTNL/tutorial_00_building_applications_with_tnl.md

diff --git a/Documentation/Tutorials/BuildWithTNL/.tutorial_00_building_applications_with_tnl.md.swp b/Documentation/Tutorials/BuildWithTNL/.tutorial_00_building_applications_with_tnl.md.swp
new file mode 100644
index 0000000000000000000000000000000000000000..98470580bc7d09c65b2ec9d0a3609223640e4bf9
GIT binary patch
literal 12288
[... base85 data of an accidentally committed vim swap file omitted ...]

literal 0
HcmV?d00001

diff --git a/Documentation/Tutorials/BuildWithTNL/CMakeLists.txt b/Documentation/Tutorials/BuildWithTNL/CMakeLists.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/Documentation/Tutorials/BuildWithTNL/example-cuda-2.cpp b/Documentation/Tutorials/BuildWithTNL/example-cuda-2.cpp
new file mode 100644
index 000000000..7ea4ace2a
--- /dev/null
+++ b/Documentation/Tutorials/BuildWithTNL/example-cuda-2.cpp
@@ -0,0 +1 @@
+#include "example-cuda-2.h"
\ No newline at end of file
diff --git a/Documentation/Tutorials/BuildWithTNL/example-cuda-2.cu b/Documentation/Tutorials/BuildWithTNL/example-cuda-2.cu
new file mode 120000
index 000000000..13435476b
--- /dev/null
+++ b/Documentation/Tutorials/BuildWithTNL/example-cuda-2.cu
@@ -0,0 +1 @@
+example-cuda-2.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/BuildWithTNL/example-cuda-2.h b/Documentation/Tutorials/BuildWithTNL/example-cuda-2.h
new file mode 100644
index 000000000..1d5606c78
--- /dev/null
+++ b/Documentation/Tutorials/BuildWithTNL/example-cuda-2.h
@@ -0,0 +1,27 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Array.h>
+#include <TNL/Devices/Cuda.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+
+int main( int argc, char* argv[] )
+{
+   /****
+    * Create an array on the host and print it on the console.
+    */
+   Array< int > host_array{ 1, 2, 3 };
+   std::cout << "host_array = " << host_array << std::endl;
+
+   /****
+    * Create another array on the GPU and print it on the console as well.
+    */
+#ifdef HAVE_CUDA
+   Array< int, Devices::Cuda > device_array{ 4, 5, 6 };
+   std::cout << "device_array = " << device_array << std::endl;
+#endif
+   return EXIT_SUCCESS;
+}
+
+
diff --git a/Documentation/Tutorials/BuildWithTNL/example-cuda.cpp b/Documentation/Tutorials/BuildWithTNL/example-cuda.cpp
new file mode 100644
index 000000000..83ac8ca6c
--- /dev/null
+++ b/Documentation/Tutorials/BuildWithTNL/example-cuda.cpp
@@ -0,0 +1,25 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Array.h>
+#include <TNL/Devices/Cuda.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+
+int main( int argc, char* argv[] )
+{
+   /****
+    * Create an array on the host and print it on the console.
+    */
+   Array< int > host_array{ 1, 2, 3 };
+   std::cout << "host_array = " << host_array << std::endl;
+
+   /****
+    * Create another array on the GPU and print it on the console as well.
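+    * Without CUDA support compiled in (-DHAVE_CUDA), this constructor throws TNL::Exceptions::CudaSupportMissing at runtime.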
+    */
+   Array< int, Devices::Cuda > device_array{ 4, 5, 6 };
+   std::cout << "device_array = " << device_array << std::endl;
+   return EXIT_SUCCESS;
+}
+
+
diff --git a/Documentation/Tutorials/BuildWithTNL/example-cuda.cu b/Documentation/Tutorials/BuildWithTNL/example-cuda.cu
new file mode 120000
index 000000000..24b9f5933
--- /dev/null
+++ b/Documentation/Tutorials/BuildWithTNL/example-cuda.cu
@@ -0,0 +1 @@
+example-cuda.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/BuildWithTNL/example-host.cpp b/Documentation/Tutorials/BuildWithTNL/example-host.cpp
new file mode 100644
index 000000000..1d5d356fd
--- /dev/null
+++ b/Documentation/Tutorials/BuildWithTNL/example-host.cpp
@@ -0,0 +1,19 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Array.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+
+int main( int argc, char* argv[] )
+{
+   /****
+    * Create an array on the host and print it on the console.
+    */
+   Array< int > host_array{ 1, 2, 3 };
+   std::cout << "host_array = " << host_array << std::endl;
+
+   return EXIT_SUCCESS;
+}
+
+
diff --git a/Documentation/Tutorials/BuildWithTNL/tutorial_00_building_applications_with_tnl.md b/Documentation/Tutorials/BuildWithTNL/tutorial_00_building_applications_with_tnl.md
new file mode 100644
index 000000000..0aab78a2f
--- /dev/null
+++ b/Documentation/Tutorials/BuildWithTNL/tutorial_00_building_applications_with_tnl.md
@@ -0,0 +1,97 @@
+\page tutorial_00_building_applications_with_tnl Building Applications with TNL
+
+## Introduction
+
+You may find it useful to read this tutorial before any other, to learn how to compile the examples. Here we explain how to build applications with TNL and we provide templates of Makefiles which can help you when you start developing programs with TNL. Since TNL is a header-only library, no linker setup is required.
+
+## Table of Contents
+1. [Compilation using command-line](#command_line)
+   1. [Compilation with `g++`](#command_line_gcc)
+   2. [Compilation with `nvcc` for CUDA](#command_line_nvcc)
+2. [Build with Makefile](#makefile)
+3. [Build with Cmake](#cmake)
+
+## Compilation using command-line
+
+This section explains how to compile with and without the support of CUDA using different compilers. We start with the following simple example:
+
+\include example-host.cpp
+
+This short program just creates a new array, initializes it with the values 1, 2 and 3, and prints it on the console.
+
+### Compilation with `g++`
+
+We assume that the code above is saved in a file `example-host.cpp`. With the GNU `g++` compiler, the program can be compiled as follows:
+
+```
+g++ -std=c++14 -I${HOME}/.local/include/tnl example-host.cpp -o example-host
+```
+
+TNL requires standard C++14, which we enforce with the first parameter `-std=c++14`. Next, we need to tell the compiler the folder with the TNL headers. This is done with the flag `-I`. By default, TNL installs into `${HOME}/.local/include/tnl`. You may also replace it with the path where you have downloaded TNL. TNL is a header-only library and so it does not require any installation. Finally, we pass the source code file `example-host.cpp` and name the output binary with the command-line parameter `-o`.
+
+### Compilation with `nvcc` for CUDA
+
+If you want to profit from the great performance of GPUs using CUDA, you need the CUDA compiler `nvcc`. It can be obtained with the [CUDA toolkit](https://developer.nvidia.com/cuda-downloads).
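+You can quickly check that the toolkit is installed and that `nvcc` is visible in your `PATH`:
+
+```
+nvcc --version
+```
+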
+We first modify our program as follows:
+
+\include example-cuda.cpp
+
+We need to include the header `TNL/Devices/Cuda.h` and declare the new `device_array` using the template parameter `Devices::Cuda`. For more details see [the arrays tutorial](tutorial_01_arrays.html). To compile the code above, invoke the following command:
+
+```
+nvcc -I${HOME}/.local/include/tnl example-cuda.cu -o example-cuda
+```
+
+Surprisingly, executing the binary `example-cuda` ends with an error message:
+
+```
+host_array = [ 1, 2, 3 ]
+terminate called after throwing an instance of 'TNL::Exceptions::CudaSupportMissing'
+  what():  CUDA support is missing, but the program called a function which needs it. Please recompile the program with CUDA support.
+Aborted (core dumped)
+```
+
+The reason is that each piece of CUDA code in TNL is guarded by the macro `HAVE_CUDA`. Therefore we need to pass `-DHAVE_CUDA` to the compiler. The following command does it:
+
+```
+nvcc -DHAVE_CUDA -I${HOME}/.local/include/tnl example-cuda.cu -o example-cuda
+```
+
+Unfortunately, the `nvcc` compiler generates a lot of warnings. When used with TNL, the amount of code processed by `nvcc` is rather large, so you may get a large number of warnings, and some of them are treated as errors by default. For this reason we recommend adding the following flags to `nvcc`:
+
+```
+-Wno-deprecated-gpu-targets --expt-relaxed-constexpr --expt-extended-lambda
+```
+
+The overall command then looks as follows:
+
+```
+nvcc -Wno-deprecated-gpu-targets --expt-relaxed-constexpr --expt-extended-lambda -DHAVE_CUDA -I${HOME}/.local/include/tnl example-cuda.cu -o example-cuda
+```
+
+We suggest guarding the CUDA code by the macro `HAVE_CUDA` in your projects as well. Our simple example then turns into the following:
+
+\include example-cuda-2.h
+
+The best way is to store this code in a header file, for example `example-cuda-2.h`, and to include this header in the files `example-cuda-2.cpp` and `example-cuda-2.cu` like this:
+
+\include example-cuda-2.cpp
+
+This allows you to compile with CUDA like this:
+
+```
+nvcc -Wno-deprecated-gpu-targets --expt-relaxed-constexpr --expt-extended-lambda -DHAVE_CUDA -I${HOME}/.local/include/tnl example-cuda-2.cu -o example-cuda-2
+```
+
+Or you may compile it without CUDA like this:
+
+```
+g++ -std=c++14 -I${HOME}/.local/include/tnl example-cuda-2.cpp -o example-cuda-2
+```
+
+Thus you have one code which you may easily compile with or without CUDA, depending on your needs.
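+The choice between the two compilers is easy to script. The following is only a minimal sketch of such a wrapper (the script name `compile.sh` and its interface are just an illustration; it reuses the paths and flags shown above):
+
+```
+#!/bin/bash
+# Sketch: build a TNL example with or without CUDA support.
+# Usage: ./compile.sh example-cuda-2 [cuda]
+TNL_INCLUDE=${HOME}/.local/include/tnl
+if [ "$2" = "cuda" ]; then
+   # The .cu file is a symlink to the .cpp file, see above.
+   nvcc -Wno-deprecated-gpu-targets --expt-relaxed-constexpr --expt-extended-lambda \
+        -DHAVE_CUDA -I${TNL_INCLUDE} "$1".cu -o "$1"
+else
+   g++ -std=c++14 -I${TNL_INCLUDE} "$1".cpp -o "$1"
+fi
+```
+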
+ +## Build with Makefile + +## Build with Cmake + + diff --git a/Documentation/Tutorials/CMakeLists.txt b/Documentation/Tutorials/CMakeLists.txt index dadf70b79..8f9971d47 100644 --- a/Documentation/Tutorials/CMakeLists.txt +++ b/Documentation/Tutorials/CMakeLists.txt @@ -1,3 +1,4 @@ +add_subdirectory( BuildWithTNL ) add_subdirectory( Arrays ) add_subdirectory( Vectors ) add_subdirectory( ReductionAndScan ) diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample.cpp b/Documentation/Tutorials/ForLoops/ParallelForExample.cpp index cd6ece928..5714df7d3 100644 --- a/Documentation/Tutorials/ForLoops/ParallelForExample.cpp +++ b/Documentation/Tutorials/ForLoops/ParallelForExample.cpp @@ -1,10 +1,11 @@ #include #include #include +#include using namespace TNL; using namespace TNL::Containers; -using namespace TNL::Containers::Algorithms; +using namespace TNL::Algorithms; template< typename Device > void vectorSum( const Vector< double, Device >& v1, diff --git a/Documentation/Tutorials/index.md b/Documentation/Tutorials/index.md index 95f300b1e..8146a6bd8 100644 --- a/Documentation/Tutorials/index.md +++ b/Documentation/Tutorials/index.md @@ -2,7 +2,8 @@ ## Tutorials -1. [Arrays](tutorial_01_arrays.html) -2. [Vectors](tutorial_02_vectors.html) -3. [Flexible parallel reduction and prefix-sum](tutorial_03_reduction.html) -4. [For loops](tutorial_04_ForLoops.html) +1. [Building applications with TNL](tutorial_00_building_applications_with_tnl.html) +2. [Arrays](tutorial_01_arrays.html) +3. [Vectors](tutorial_02_vectors.html) +4. [Flexible parallel reduction and prefix-sum](tutorial_03_reduction.html) +5. [For loops](tutorial_04_ForLoops.html) -- GitLab From d14d7d91a29d85323403cc0f2072a0afb165b1a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Thu, 14 Nov 2019 21:29:01 +0100 Subject: [PATCH 04/43] Makefiles added to build tutorial. --- ...l_00_building_applications_with_tnl.md.swp | Bin 12288 -> 16384 bytes Documentation/Tutorials/BuildWithTNL/Makefile | 48 ++++++++++++++++++ .../Tutorials/BuildWithTNL/Makefile.inc | 14 +++++ ...orial_00_building_applications_with_tnl.md | 4 ++ 4 files changed, 66 insertions(+) create mode 100644 Documentation/Tutorials/BuildWithTNL/Makefile create mode 100644 Documentation/Tutorials/BuildWithTNL/Makefile.inc diff --git a/Documentation/Tutorials/BuildWithTNL/.tutorial_00_building_applications_with_tnl.md.swp b/Documentation/Tutorials/BuildWithTNL/.tutorial_00_building_applications_with_tnl.md.swp index 98470580bc7d09c65b2ec9d0a3609223640e4bf9..c41656b4c76cf46233272ec0159468486efea11a 100644 GIT binary patch literal 16384 zcmYc?2=nw+u+TGLU|?VnU|=xWcsBNfToS|ed?(%jU%lEjkC{5<`T(vtk5%*32xeW%jQ zoRsj)l8g{PAN>-jaJ+#*d=f|^GcP?pv7jI)Ga0O}IKDiyBqP2gFGnvo1+P0sB}YSG z#DzdAvmimD=8`n3x$rjU>NoAXb6mkz-S1JhQMeDjE2By2#kinXb6nt z5GW~NVyI_eU|@m%38i7`pxjYvGz3ONU^E0qLtr!nMnhmU1V%$( zGz3ONU^E0qLtr!nMnhl_hCpHp0|O@m1A{B5BQLz{bF! 
z%*Mc=#KyoN$Hu_G%ErL3ft7(liKhJ}G4f`x&Bi-m!K zjfH`Mm4$)fJ`)4OJthW*TTBcLeM}4tT}%uNolFc2*-Q)!SxgKJX-o_ZXBZh6PB1bs z9A{);aASmol>;LKLp37Czhn9=%yEx>XsxHrKgq@E9mN`Ruq)z7NzDSR-~rrCgo2<$qBkc?D?{Ib-d#GD+3K*RVYdYg>PzJ3Me@yra%%SJhdpK7+>ywp^%Ckqmbixm=((rW_9f1o^=oSzGhYp_NI zL?D1d2ox>lnK?NMxry1S3YjI)kZ?@OFDgk*QP4;R*^*M4TacQf$)KQ6o{?Chp=qT6 z$=AiD1qJyG`=B4W>B$bwclSn~PetJ=2u0nERPEKlyLZU)iX5e!w71b9z#iLQEnzEIm69PD@g?LhDR`EE%eACbh5V$F#LPTcnF$JsXt0)&{QR8k%#v7*jFOUqVk>?9l+?1+ocw~+ zBE7t_%#_STz2y8{eNg?Oo04Cimy@5EQmm<`5T2@#mRVF>qL7=Pl9^Vi0Ies%nJcjv znw>$lhqAJ=f-^X)gBJrrifl;vrjV9j1j({Ci3)j%xv92F(99p7lbM$q4@!+nb~gHn zb_@(5em)9CsfDGPMXALKplp-^s>_|VwGB<6S+_h@Ar(}qB&R~`0C^GOM5J`0TU?T2 zn+(#Gpr_!MT2Z0{${V0u4@#^hsW~~|Xn=8oGs2*$ShGv$xkg-$jb*+f8an?NX)BL$OO9=9B+CGZkc(BpsKn&72<}1L{O_L zBUPa|zqBYh71Eqa%gjkd6h5Hh3zY0(c^4F#&=}SQ#V~RX)=f^(gS3{?wY3!x@uWah z`vztcasvk>1}P}RQxy`6i%WAsIVBNN5g=#gVuj+wveXn%doNJ|yASjd6hO;e72N$o z71Fh}Vd(?X?m$#3c?wCOfB;u)DWGNnq;$ZY57M=@6BMwg{`BNzc;W}OB8oFWl^V=0 za56{+l@^e)5?rk5C}iekmSlp;#!P4;DND>LO)Uo39N;q67)%xvW#*NDqCG!PAyEO; z+|18O)l;UN2W22=xIoe;C~<)@ zO@3*K0<@Y0wVuGG4x}tk$xKU2ElSNRft9mGpw?X~D1R40-49AKkU#|0aHwr>D+UI% z!UbY1N)p2!y0DNn)>DWEFJ^`~*%{Q{h}BR|2GN=fMyRsBiP@>3fi)ZIDMVvx2BiyhE$PY0nhfa5F@21tO%oJGA&E&jkak^uUP)?R zNihS1e_krMd`Zj9OHn8-PE9KXm3^S1CK1#u1vLwy>$(+^Agwx3iI5L&JD21u4-50QLV{xfvKlc_8cmVe|XX_!$_U@-r~l@-r~l@G~$d@G~&T^D{7f z}&;$;ZH;z{kKK&&R+Z&BwrSikE@m7%v0EQCgK&Ig)JdZuFd71*Aut*OqaiRF z0;3@?8UmvsFaknASsBuCfOHhV{Z*tME~r}y>)L`Qz`%o$u#S^nCTK>?2Q=@2n9~9e zBo>4E?n$8jU|wQ6WP~54NdYuIsHfnW2bsz!hW7s9ZYnv)9(AX6GB^DrRSl_n?<VIMC8!(LtnhAF%Z43nXzOyp%?n83T)QGk`FURhZo zEH$|#zo=NjCQ%_TF*ns#sSF~ZWM`wFXve@{tfvs|l383*l$lgolA59bm5$X=PC?@F-bWmRUto3 z!8tzblq&r>WUBePhcq_iZzC^IofA+@3) dCowaxc=AO(4erbmg@U5|vdom!;>pbVe*tdtnYRD{ diff --git a/Documentation/Tutorials/BuildWithTNL/Makefile b/Documentation/Tutorials/BuildWithTNL/Makefile new file mode 100644 index 000000000..6f3a7f2ed --- /dev/null +++ b/Documentation/Tutorials/BuildWithTNL/Makefile @@ -0,0 +1,48 @@ +include Makefile.inc + +FILES = Makefile \ + Makefile.inc \ + $(SOURCES) \ + $(HEADERS) + +SUBDIRS = + +SUBDIRSCLEAN=$(addsuffix clean,$(SUBDIRS)) + +all: bin subdirs + +.PHONY: subdirs $(SUBDIRS) +subdirs: $(SUBDIRS) +$(SUBDIRS): + $(MAKE) -C $@ + +bin: + mkdir -p bin + +install: all + mkdir -p $(INSTALL_DIR)/bin + cp bin/* $(INSTALL_DIR)/bin + +.PHONY: clean +clean: $(SUBDIRSCLEAN) clean_curdir + +clean_curdir: + rm -f *.o + +%clean: % + $(MAKE) -C $< clean + +dist: clean + tar zcvf fjfi-num1-src.tgz $(SUBDIRS) $(FILES) + +$(TARGETS): % : %.o + $(CXX) $(LDFLAGS) -o $@ $< $(LDLIBS) + +$(CUDA_TARGETS): % : %.cu.o + $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) + +$(SOURCES:%.cpp=%.o): %.o: %.cpp + $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $< + +$(CUDA_SOURCES:%.cu=%.cu.o): %.cu.o : %.cu + $(CUDA_COMPILER) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $< \ No newline at end of file diff --git a/Documentation/Tutorials/BuildWithTNL/Makefile.inc b/Documentation/Tutorials/BuildWithTNL/Makefile.inc new file mode 100644 index 000000000..08884187f --- /dev/null +++ b/Documentation/Tutorials/BuildWithTNL/Makefile.inc @@ -0,0 +1,14 @@ +# Replace the following with your TNL installation path +TNL_HEADERS = ${HOME}/.local/include/tnl +INSTALL_DIR = ${HOME}/.local +WITH_CUDA = yes +WITH_OPENMP = yes +WITH_DEBUG = no + +CXX = g++ +CUDA_CXX = nvcc +#CXXFLAGS = -DNDEBUG -O3 -funroll-loops -g -std=c++11 -Dlinux +CXXFLAGS = -DNDEBUG -O0 -g -Dlinux -std=c++11 -fPIC +CPPFLAGS = -MD -MP + +LDFLAGS += -lm #-lgomp diff --git 
a/Documentation/Tutorials/BuildWithTNL/tutorial_00_building_applications_with_tnl.md b/Documentation/Tutorials/BuildWithTNL/tutorial_00_building_applications_with_tnl.md
index 0aab78a2f..c76f80360 100644
--- a/Documentation/Tutorials/BuildWithTNL/tutorial_00_building_applications_with_tnl.md
+++ b/Documentation/Tutorials/BuildWithTNL/tutorial_00_building_applications_with_tnl.md
@@ -92,6 +92,10 @@ Thus you have one code which you may easily compile with or without CUDA dependi
 
 ## Build with Makefile
 
+Larger projects needs to be managed by Makefile tool. In this section we propose a Makefile template which might help you to create more complex applications with TNL. The basic setup is stored in `Makefile.inc` file:
+
+\include Makefile.inc
+
 ## Build with Cmake
 
-- 
GitLab


From 187ddc4272eb9a222f7a068119c8f5b28fe03c83 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?=
Date: Mon, 18 Nov 2019 15:56:10 +0100
Subject: [PATCH 05/43] Added tutorial on Makefiles.

---
 ...l_00_building_applications_with_tnl.md.swp | Bin 16384 -> 0 bytes
 Documentation/Tutorials/BuildWithTNL/Makefile |  20 ++++---
 .../Tutorials/BuildWithTNL/Makefile.inc       |  42 ++++++++++++++--
 .../BuildWithTNL/tnl-make-example-cuda.cu     |   1 +
 .../BuildWithTNL/tnl-make-example.cpp         |   1 +
 ...orial_00_building_applications_with_tnl.md |  10 ++++-
 6 files changed, 62 insertions(+), 12 deletions(-)
 delete mode 100644 Documentation/Tutorials/BuildWithTNL/.tutorial_00_building_applications_with_tnl.md.swp
 create mode 120000 Documentation/Tutorials/BuildWithTNL/tnl-make-example-cuda.cu
 create mode 120000 Documentation/Tutorials/BuildWithTNL/tnl-make-example.cpp

diff --git a/Documentation/Tutorials/BuildWithTNL/.tutorial_00_building_applications_with_tnl.md.swp b/Documentation/Tutorials/BuildWithTNL/.tutorial_00_building_applications_with_tnl.md.swp
deleted file mode 100644
index c41656b4c76cf46233272ec0159468486efea11a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 16384
z%*Mc=#KyoN$Hu_G%ErL3ft7(liKhJ}G4f`x&Bi-m!K zjfH`Mm4$)fJ`)4OJthW*TTBcLeM}4tT}%uNolFc2*-Q)!SxgKJX-o_ZXBZh6PB1bs z9A{);aASmol>;LKLp37Czhn9=%yEx>XsxHrKgq@E9mN`Ruq)z7NzDSR-~rrCgo2<$qBkc?D?{Ib-d#GD+3K*RVYdYg>PzJ3Me@yra%%SJhdpK7+>ywp^%Ckqmbixm=((rW_9f1o^=oSzGhYp_NI zL?D1d2ox>lnK?NMxry1S3YjI)kZ?@OFDgk*QP4;R*^*M4TacQf$)KQ6o{?Chp=qT6 z$=AiD1qJyG`=B4W>B$bwclSn~PetJ=2u0nERPEKlyLZU)iX5e!w71b9z#iLQEnzEIm69PD@g?LhDR`EE%eACbh5V$F#LPTcnF$JsXt0)&{QR8k%#v7*jFOUqVk>?9l+?1+ocw~+ zBE7t_%#_STz2y8{eNg?Oo04Cimy@5EQmm<`5T2@#mRVF>qL7=Pl9^Vi0Ies%nJcjv znw>$lhqAJ=f-^X)gBJrrifl;vrjV9j1j({Ci3)j%xv92F(99p7lbM$q4@!+nb~gHn zb_@(5em)9CsfDGPMXALKplp-^s>_|VwGB<6S+_h@Ar(}qB&R~`0C^GOM5J`0TU?T2 zn+(#Gpr_!MT2Z0{${V0u4@#^hsW~~|Xn=8oGs2*$ShGv$xkg-$jb*+f8an?NX)BL$OO9=9B+CGZkc(BpsKn&72<}1L{O_L zBUPa|zqBYh71Eqa%gjkd6h5Hh3zY0(c^4F#&=}SQ#V~RX)=f^(gS3{?wY3!x@uWah z`vztcasvk>1}P}RQxy`6i%WAsIVBNN5g=#gVuj+wveXn%doNJ|yASjd6hO;e72N$o z71Fh}Vd(?X?m$#3c?wCOfB;u)DWGNnq;$ZY57M=@6BMwg{`BNzc;W}OB8oFWl^V=0 za56{+l@^e)5?rk5C}iekmSlp;#!P4;DND>LO)Uo39N;q67)%xvW#*NDqCG!PAyEO; z+|18O)l;UN2W22=xIoe;C~<)@ zO@3*K0<@Y0wVuGG4x}tk$xKU2ElSNRft9mGpw?X~D1R40-49AKkU#|0aHwr>D+UI% z!UbY1N)p2!y0DNn)>DWEFJ^`~*%{Q{h}BR|2GN=fMyRsBiP@>3fi)ZIDMVvx2BiyhE$PY0nhfa5F@21tO%oJGA&E&jkak^uUP)?R zNihS1e_krMd`Zj9OHn8-PE9KXm3^S1CK1#u1vLwy>$(+^Agwx3iI5L&JD21u4-50QLV{xfvKlc_8cmVe|XX_!$_U@-r~l@-r~l@G~$d@G~&T^D{7f z}&;$;ZH;z{kKK&&R+Z&BwrSikE@m7%v0EQCgK&Ig)JdZuFd71*Aut*OqaiRF z0;3@?8UmvsFaknASsBuCfOHhV{Z*tME~r}y>)L`Qz`%o$u#S^nCTK>?2Q=@2n9~9e zBo>4E?n$8jU|wQ6WP~54NdYuIsHfnW2bsz!hW7s9ZYnv)9(AX6GB^DrRSl_n?< -Larger projects needs to be managed by Makefile tool. In this section we propose a Makefile template which might help you to create more complex applications with TNL. The basic setup is stored in `Makefile.inc` file: +Larger projects needs to be managed by Makefile tool. In this section we propose a Makefile template which might help you to create more complex applications with TNL. The basic setup is stored in [Makefile.inc](../../BuildWithTNL/Makefile.inc) file: \include Makefile.inc +In this file, you may define a name of your project (`PROJECT_NAME`), set the path to TNL headers (`TNL_HEADERS`), set the installation directory (`INSTALL_DIR`), turn on and off support of CUDA (`WITH_CUDA`), OpenMP (`WITH_OPEMP`) or debug mode (`WITH_DEBUG`). If you compile with CUDA you may set the CUDA architecture of your system. + +The main [Makefile](../../BuildWithTNL/Makefile) looks as follows: + +\include Makefile + +If your project source codes are splitted into several subdirectories you may specify them in variable `SUBDIRS`. Next, in variables `HEADERS` and `SOURCES` you should tell all source files in the current folder. The same holds for `CUDA_SOURCES` which are all .cu files in the current folder. `TARGETS` and `CUDA_TRGETS` tell the names of binaries to be build in the current folder. + ## Build with Cmake -- GitLab From f4bbeead378f424c24c43dda641cab918492cc74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 18 Nov 2019 18:46:38 +0100 Subject: [PATCH 06/43] Writing tutorial on parallel for. 
---
 .../Tutorials/ForLoops/CMakeLists.txt         |  5 ++
 .../ForLoops/ParallelFor2D-snippet.cpp        |  3 +
 .../ForLoops/ParallelForExample-2D.cpp        | 62 ++++++++++++++++++
 .../ForLoops/ParallelForExample-2D.cu         |  1 +
 .../ForLoops/ParallelForExample-3D.cpp        | 63 +++++++++++++++++++
 .../ForLoops/ParallelForExample-3D.cu         |  1 +
 .../ForLoops/tutorial_04_ForLoops.md          | 16 +++++
 7 files changed, 151 insertions(+)
 create mode 100644 Documentation/Tutorials/ForLoops/ParallelFor2D-snippet.cpp
 create mode 100644 Documentation/Tutorials/ForLoops/ParallelForExample-2D.cpp
 create mode 120000 Documentation/Tutorials/ForLoops/ParallelForExample-2D.cu
 create mode 100644 Documentation/Tutorials/ForLoops/ParallelForExample-3D.cpp
 create mode 120000 Documentation/Tutorials/ForLoops/ParallelForExample-3D.cu

diff --git a/Documentation/Tutorials/ForLoops/CMakeLists.txt b/Documentation/Tutorials/ForLoops/CMakeLists.txt
index af47d80b7..522b8fb88 100644
--- a/Documentation/Tutorials/ForLoops/CMakeLists.txt
+++ b/Documentation/Tutorials/ForLoops/CMakeLists.txt
@@ -1,6 +1,11 @@
 IF( BUILD_CUDA )
    CUDA_ADD_EXECUTABLE( ParallelForExample ParallelForExample.cu )
    ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
+   CUDA_ADD_EXECUTABLE( ParallelForExample-2D ParallelForExample-2D.cu )
+   CUDA_ADD_EXECUTABLE( ParallelForExample-3D ParallelForExample-3D.cu )
+ELSE()
+   ADD_EXECUTABLE( ParallelForExample-2D ParallelForExample-2D.cpp )
+   ADD_EXECUTABLE( ParallelForExample-3D ParallelForExample-3D.cpp )
 ENDIF()
 
 IF( BUILD_CUDA )
diff --git a/Documentation/Tutorials/ForLoops/ParallelFor2D-snippet.cpp b/Documentation/Tutorials/ForLoops/ParallelFor2D-snippet.cpp
new file mode 100644
index 000000000..40f29313a
--- /dev/null
+++ b/Documentation/Tutorials/ForLoops/ParallelFor2D-snippet.cpp
@@ -0,0 +1,3 @@
+for( Index j = startY; j < endY; j++ )
+   for( Index i = startX; i < endX; i++ )
+      f( i, j, args... );
diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample-2D.cpp b/Documentation/Tutorials/ForLoops/ParallelForExample-2D.cpp
new file mode 100644
index 000000000..642ff9692
--- /dev/null
+++ b/Documentation/Tutorials/ForLoops/ParallelForExample-2D.cpp
@@ -0,0 +1,62 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/ParallelFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+template< typename Device >
+void meshFunctionSum( const int xSize,
+                      const int ySize,
+                      const Vector< double, Device >& v1,
+                      const Vector< double, Device >& v2,
+                      const double& c,
+                      Vector< double, Device >& result )
+{
+   /****
+    * Get vector views which can be captured by a lambda.
+    */
+   auto v1_view = v1.getConstView();
+   auto v2_view = v2.getConstView();
+   auto result_view = result.getView();
+
+   /****
+    * The sum function.
+    */
+   auto sum = [=] __cuda_callable__ ( int i, int j, const int xSize, const double c ) mutable {
+      const int idx = j * xSize + i;
+      result_view[ idx ] = v1_view[ idx ] + v2_view[ idx ] + c; };
+
+   ParallelFor2D< Device >::exec( 0, 0, xSize, ySize, sum, xSize, c );
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Define dimensions of the 2D mesh function.
+    */
+   const int xSize( 10 ), ySize( 10 );
+   const int size = xSize * ySize;
+
+   /***
+    * Firstly, test the mesh function sum on CPU.
+    */
+   Vector< double, Devices::Host > host_v1( size ), host_v2( size ), host_result( size );
+   host_v1 = 1.0;
+   host_v2 = 2.0;
+   meshFunctionSum( xSize, ySize, host_v1, host_v2, 2.0, host_result );
+
+   /***
+    * And then also on GPU.
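+    * The same templated function is reused; only the device template parameter changes.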
+    */
+#ifdef HAVE_CUDA
+   Vector< double, Devices::Cuda > cuda_v1( size ), cuda_v2( size ), cuda_result( size );
+   cuda_v1 = 1.0;
+   cuda_v2 = 2.0;
+   meshFunctionSum( xSize, ySize, cuda_v1, cuda_v2, 2.0, cuda_result );
+#endif
+   return EXIT_SUCCESS;
+}
+
diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample-2D.cu b/Documentation/Tutorials/ForLoops/ParallelForExample-2D.cu
new file mode 120000
index 000000000..937609f77
--- /dev/null
+++ b/Documentation/Tutorials/ForLoops/ParallelForExample-2D.cu
@@ -0,0 +1 @@
+ParallelForExample-2D.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample-3D.cpp b/Documentation/Tutorials/ForLoops/ParallelForExample-3D.cpp
new file mode 100644
index 000000000..94479dd9b
--- /dev/null
+++ b/Documentation/Tutorials/ForLoops/ParallelForExample-3D.cpp
@@ -0,0 +1,63 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/ParallelFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+template< typename Device >
+void meshFunctionSum( const int xSize,
+                      const int ySize,
+                      const int zSize,
+                      const Vector< double, Device >& v1,
+                      const Vector< double, Device >& v2,
+                      const double& c,
+                      Vector< double, Device >& result )
+{
+   /****
+    * Get vector views which can be captured by a lambda.
+    */
+   auto v1_view = v1.getConstView();
+   auto v2_view = v2.getConstView();
+   auto result_view = result.getView();
+
+   /****
+    * The sum function.
+    */
+   auto sum = [=] __cuda_callable__ ( int i, int j, int k, const int xSize, const int ySize, const double c ) mutable {
+      const int idx = ( k * ySize + j ) * xSize + i;
+      result_view[ idx ] = v1_view[ idx ] + v2_view[ idx ] + c; };
+
+   ParallelFor3D< Device >::exec( 0, 0, 0, xSize, ySize, zSize, sum, xSize, ySize, c );
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Define dimensions of the 3D mesh function.
+    */
+   const int xSize( 10 ), ySize( 10 ), zSize( 10 );
+   const int size = xSize * ySize * zSize;
+
+   /***
+    * Firstly, test the mesh function sum on CPU.
+    */
+   Vector< double, Devices::Host > host_v1( size ), host_v2( size ), host_result( size );
+   host_v1 = 1.0;
+   host_v2 = 2.0;
+   meshFunctionSum( xSize, ySize, zSize, host_v1, host_v2, 2.0, host_result );
+
+   /***
+    * And then also on GPU.
+    */
+#ifdef HAVE_CUDA
+   Vector< double, Devices::Cuda > cuda_v1( size ), cuda_v2( size ), cuda_result( size );
+   cuda_v1 = 1.0;
+   cuda_v2 = 2.0;
+   meshFunctionSum( xSize, ySize, zSize, cuda_v1, cuda_v2, 2.0, cuda_result );
+#endif
+   return EXIT_SUCCESS;
+}
+
diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample-3D.cu b/Documentation/Tutorials/ForLoops/ParallelForExample-3D.cu
new file mode 120000
index 000000000..686a94df2
--- /dev/null
+++ b/Documentation/Tutorials/ForLoops/ParallelForExample-3D.cu
@@ -0,0 +1 @@
+ParallelForExample-3D.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ForLoops/tutorial_04_ForLoops.md b/Documentation/Tutorials/ForLoops/tutorial_04_ForLoops.md
index f00fc2852..e8d220aa7 100644
--- a/Documentation/Tutorials/ForLoops/tutorial_04_ForLoops.md
+++ b/Documentation/Tutorials/ForLoops/tutorial_04_ForLoops.md
@@ -32,6 +32,22 @@ The result is:
 \include ParallelForExample.out
 
 ## n-dimensional Parallel For
+
+Performing for-loops in higher dimensions is similar. In the following example we build a 2D mesh function on top of a TNL vector. Two-dimensional indexes `( i, j )` are mapped to the vector index `idx` as `idx = j * xSize + i`, where the mesh function has dimensions `xSize * ySize`.
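+For instance, with `xSize = 4`, the element at `( i, j ) = ( 2, 1 )` is stored at `idx = 1 * 4 + 2 = 6`.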
+Of course, in this simple example it does not make any sense to compute a sum of two mesh functions this way; it is only an example.
+
+\include ParallelForExample-2D.cpp
+
+Notice the parameters of the lambda function `sum`. The first parameter `i` changes more often than `j`, and therefore the index mapping has the form `j * xSize + i`, to access the vector elements sequentially on CPU and to fulfill coalesced memory accesses on GPU. The for-loop is executed by calling `ParallelFor2D` with the proper device. The first four parameters are `startX, startY, endX, endY`, and on CPU this is equivalent to the following embedded for loops:
+
+\include ParallelFor2D-snippet.cpp
+
+where `args...` stands for the additional arguments passed to the for-loop. After the parameters defining the loop bounds, the lambda function (`sum` in this case) is passed, followed by the additional arguments. One of them, in our example, is `xSize` again, because it must be passed to the lambda function for the index mapping computation.
+
+For completeness, we show the modification of the previous example into 3D:
+
+\include ParallelForExample-3D.cpp
+
+
 ## Static For
 ## Templated Static For
 
-- 
GitLab


From 456c722e820a03946542af14b4f1df584ea00afb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?=
Date: Mon, 18 Nov 2019 22:39:51 +0100
Subject: [PATCH 07/43] Writing documentation for ParallelFor.

---
 .../Examples/Algorithms/CMakeLists.txt        |  17 +++
 .../Algorithms/ParallelForExample-2D.cpp      |  45 ++++++++
 .../Algorithms/ParallelForExample-2D.cu       |   1 +
 .../Algorithms/ParallelForExample-3D.cpp      |  46 ++++++++
 .../Algorithms/ParallelForExample-3D.cu       |   1 +
 .../Algorithms/ParallelForExample.cpp         |  43 +++++++
 .../Examples/Algorithms/ParallelForExample.cu |  59 ++++++++++
 Documentation/Examples/CMakeLists.txt         |   1 +
 src/TNL/Algorithms/ParallelFor.h              | 105 ++++++++++++++++++
 9 files changed, 318 insertions(+)
 create mode 100644 Documentation/Examples/Algorithms/CMakeLists.txt
 create mode 100644 Documentation/Examples/Algorithms/ParallelForExample-2D.cpp
 create mode 120000 Documentation/Examples/Algorithms/ParallelForExample-2D.cu
 create mode 100644 Documentation/Examples/Algorithms/ParallelForExample-3D.cpp
 create mode 120000 Documentation/Examples/Algorithms/ParallelForExample-3D.cu
 create mode 100644 Documentation/Examples/Algorithms/ParallelForExample.cpp
 create mode 100644 Documentation/Examples/Algorithms/ParallelForExample.cu

diff --git a/Documentation/Examples/Algorithms/CMakeLists.txt b/Documentation/Examples/Algorithms/CMakeLists.txt
new file mode 100644
index 000000000..d0d1eda9b
--- /dev/null
+++ b/Documentation/Examples/Algorithms/CMakeLists.txt
@@ -0,0 +1,17 @@
+IF( BUILD_CUDA )
+   CUDA_ADD_EXECUTABLE(ParallelForExampleCuda ParallelForExample.cu)
+   ADD_CUSTOM_COMMAND( COMMAND ParallelForExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
+ELSE()
+   ADD_EXECUTABLE(ParallelForExample ParallelForExample.cpp)
+   ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
+ENDIF()
+
+IF( BUILD_CUDA )
+ADD_CUSTOM_TARGET( RunAlgorithmsExamples-cuda ALL DEPENDS
+   ParallelForExample.out
+ )
+ELSE()
+ADD_CUSTOM_TARGET( RunAlgorithmsExamples ALL DEPENDS
+   ParallelForExample.out
+ )
+ENDIF()
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/ParallelForExample-2D.cpp b/Documentation/Examples/Algorithms/ParallelForExample-2D.cpp
new file mode 100644
index 000000000..aafff2466
--- /dev/null
+++ b/Documentation/Examples/Algorithms/ParallelForExample-2D.cpp
@@ -0,0 +1,45 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/ParallelFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+template< typename Device >
+void initMeshFunction( const int xSize,
+                       const int ySize,
+                       Vector< double, Device >& v,
+                       const double& c )
+{
+   auto view = v.getView();
+   auto init = [=] __cuda_callable__ ( int i, int j, const int xSize, const double c ) mutable {
+      view[ j * xSize + i ] = c; };
+   ParallelFor2D< Device >::exec( 0, 0, xSize, ySize, init, xSize, c );
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Define dimensions of the 2D mesh function.
+    */
+   const int xSize( 10 ), ySize( 10 );
+   const int size = xSize * ySize;
+
+   /***
+    * Firstly, test the mesh function initialization on CPU.
+    */
+   Vector< double, Devices::Host > host_v( size );
+   initMeshFunction( xSize, ySize, host_v, 1.0 );
+
+   /***
+    * And then also on GPU.
+    */
+#ifdef HAVE_CUDA
+   Vector< double, Devices::Cuda > cuda_v( size );
+   initMeshFunction( xSize, ySize, cuda_v, 1.0 );
+#endif
+   return EXIT_SUCCESS;
+}
+
diff --git a/Documentation/Examples/Algorithms/ParallelForExample-2D.cu b/Documentation/Examples/Algorithms/ParallelForExample-2D.cu
new file mode 120000
index 000000000..937609f77
--- /dev/null
+++ b/Documentation/Examples/Algorithms/ParallelForExample-2D.cu
@@ -0,0 +1 @@
+ParallelForExample-2D.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/ParallelForExample-3D.cpp b/Documentation/Examples/Algorithms/ParallelForExample-3D.cpp
new file mode 100644
index 000000000..3cb9b5b64
--- /dev/null
+++ b/Documentation/Examples/Algorithms/ParallelForExample-3D.cpp
@@ -0,0 +1,46 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/ParallelFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+template< typename Device >
+void initMeshFunction( const int xSize,
+                       const int ySize,
+                       const int zSize,
+                       Vector< double, Device >& v,
+                       const double& c )
+{
+   auto view = v.getView();
+   auto init = [=] __cuda_callable__ ( int i, int j, int k, const int xSize, const int ySize, const double c ) mutable {
+      view[ ( k * ySize + j ) * xSize + i ] = c; };
+   ParallelFor3D< Device >::exec( 0, 0, 0, xSize, ySize, zSize, init, xSize, ySize, c );
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Define dimensions of the 3D mesh function.
+    */
+   const int xSize( 10 ), ySize( 10 ), zSize( 10 );
+   const int size = xSize * ySize * zSize;
+
+   /***
+    * Firstly, test the mesh function initialization on CPU.
+    */
+   Vector< double, Devices::Host > host_v( size );
+   initMeshFunction( xSize, ySize, zSize, host_v, 1.0 );
+
+   /***
+    * And then also on GPU.
+    */
+#ifdef HAVE_CUDA
+   Vector< double, Devices::Cuda > cuda_v( size );
+   initMeshFunction( xSize, ySize, zSize, cuda_v, 1.0 );
+#endif
+   return EXIT_SUCCESS;
+}
+
diff --git a/Documentation/Examples/Algorithms/ParallelForExample-3D.cu b/Documentation/Examples/Algorithms/ParallelForExample-3D.cu
new file mode 120000
index 000000000..686a94df2
--- /dev/null
+++ b/Documentation/Examples/Algorithms/ParallelForExample-3D.cu
@@ -0,0 +1 @@
+ParallelForExample-3D.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/ParallelForExample.cpp b/Documentation/Examples/Algorithms/ParallelForExample.cpp
new file mode 100644
index 000000000..46d23f58a
--- /dev/null
+++ b/Documentation/Examples/Algorithms/ParallelForExample.cpp
@@ -0,0 +1,43 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/ParallelFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+/****
+ * Set all elements of the vector v to the constant c.
+ */
+template< typename Device >
+void initVector( Vector< double, Device >& v,
+                 const double& c )
+{
+   auto view = v.getView();
+   auto init = [=] __cuda_callable__ ( int i, const double c ) mutable {
+      view[ i ] = c; };
+
+   ParallelFor< Device >::exec( 0, v.getSize(), init, c );
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Firstly, test the vector initialization on CPU.
+    */
+   Vector< double, Devices::Host > host_v( 10 );
+   initVector( host_v, 1.0 );
+   std::cout << "host_v = " << host_v << std::endl;
+
+   /***
+    * And then also on GPU.
+    */
+#ifdef HAVE_CUDA
+   Vector< double, Devices::Cuda > cuda_v( 10 );
+   initVector( cuda_v, 1.0 );
+   std::cout << "cuda_v = " << cuda_v << std::endl;
+#endif
+   return EXIT_SUCCESS;
+}
+
diff --git a/Documentation/Examples/Algorithms/ParallelForExample.cu b/Documentation/Examples/Algorithms/ParallelForExample.cu
new file mode 100644
index 000000000..5714df7d3
--- /dev/null
+++ b/Documentation/Examples/Algorithms/ParallelForExample.cu
@@ -0,0 +1,59 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/ParallelFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+template< typename Device >
+void vectorSum( const Vector< double, Device >& v1,
+                const Vector< double, Device >& v2,
+                const double& c,
+                Vector< double, Device >& result )
+{
+   /****
+    * Get vector views which can be captured by a lambda.
+    */
+   auto v1_view = v1.getConstView();
+   auto v2_view = v2.getConstView();
+   auto result_view = result.getView();
+
+   /****
+    * The sum function.
+    */
+   auto sum = [=] __cuda_callable__ ( int i, const double c ) mutable {
+      result_view[ i ] = v1_view[ i ] + v2_view[ i ] + c; };
+
+   ParallelFor< Device >::exec( 0, v1.getSize(), sum, c );
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Firstly, test the vectors sum on CPU.
+    */
+   Vector< double, Devices::Host > host_v1( 10 ), host_v2( 10 ), host_result( 10 );
+   host_v1 = 1.0;
+   host_v2.evaluate( []__cuda_callable__ ( int i )->double { return i; } );
+   vectorSum( host_v1, host_v2, 2.0, host_result );
+   std::cout << "host_v1 = " << host_v1 << std::endl;
+   std::cout << "host_v2 = " << host_v2 << std::endl;
+   std::cout << "The sum of the vectors on CPU is " << host_result << "." << std::endl;
+
+   /***
+    * And then also on GPU.
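+    * The lambda is marked __cuda_callable__, so the very same code can be executed inside a CUDA kernel.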
+    */
+#ifdef HAVE_CUDA
+   Vector< double, Devices::Cuda > cuda_v1( 10 ), cuda_v2( 10 ), cuda_result( 10 );
+   cuda_v1 = 1.0;
+   cuda_v2.evaluate( []__cuda_callable__ ( int i )->double { return i; } );
+   vectorSum( cuda_v1, cuda_v2, 2.0, cuda_result );
+   std::cout << "cuda_v1 = " << cuda_v1 << std::endl;
+   std::cout << "cuda_v2 = " << cuda_v2 << std::endl;
+   std::cout << "The sum of the vectors on GPU is " << cuda_result << "." << std::endl;
+#endif
+   return EXIT_SUCCESS;
+}
+
diff --git a/Documentation/Examples/CMakeLists.txt b/Documentation/Examples/CMakeLists.txt
index 817e04398..29b9a9657 100644
--- a/Documentation/Examples/CMakeLists.txt
+++ b/Documentation/Examples/CMakeLists.txt
@@ -1,3 +1,4 @@
+ADD_SUBDIRECTORY( Algorithms )
 ADD_SUBDIRECTORY( Containers )
 
 ADD_EXECUTABLE( FileExample FileExample.cpp )
diff --git a/src/TNL/Algorithms/ParallelFor.h b/src/TNL/Algorithms/ParallelFor.h
index 6d5e917ba..20e6bf796 100644
--- a/src/TNL/Algorithms/ParallelFor.h
+++ b/src/TNL/Algorithms/ParallelFor.h
@@ -33,12 +33,47 @@
 namespace TNL {
 namespace Algorithms {
 
+// TODO: ParallelForMode should be moved to Device (=Executor)
+
+/**
+ * \brief Enum for the parallel processing mode of the for-loop.
+ *
+ * Synchronous means that the program control returns to the caller when the loop is processed completely.
+ * Asynchronous means that the program control returns to the caller immediately, even before the loop processing is finished.
+ */
 enum ParallelForMode { SynchronousMode, AsynchronousMode };
 
+
+/**
+ * \brief Parallel for loop over a one-dimensional interval of indexes.
+ *
+ * \tparam Device specifies the device on which the for-loop is going to be executed.
+ *    It can be Devices::Host, Devices::Cuda or Devices::Sequential.
+ * \tparam Mode defines the synchronous/asynchronous mode on parallel devices.
+ */
 template< typename Device = Devices::Sequential,
          ParallelForMode Mode = SynchronousMode >
struct ParallelFor
{
+   /**
+    * \brief Static method for the execution of the loop.
+    *
+    * \tparam Index defines the type of indexes over which the loop iterates.
+    * \tparam Function is the type of the function to be called in each iteration.
+    * \tparam FunctionArgs is a variadic type of additional parameters which are
+    *    supposed to be passed to the inner Function.
+    *
+    * \param start the for-loop iterates over the index interval [start, end).
+    * \param end the for-loop iterates over the index interval [start, end).
+    * \param f is the function to be called in each iteration.
+    * \param args are additional parameters to be passed to the function f.
+    *
+    * \par Example
+    * \include Algorithms/ParallelForExample.cpp
+    * \par Output
+    * \include ParallelForExample.out
+    *
+    */
   template< typename Index,
             typename Function,
             typename... FunctionArgs >
@@ -49,10 +84,44 @@ struct ParallelFor
    }
 };
 
+/**
+ * \brief Parallel for loop over a two-dimensional domain of indexes.
+ *
+ * \tparam Device specifies the device on which the for-loop is going to be executed.
+ *    It can be Devices::Host, Devices::Cuda or Devices::Sequential.
+ * \tparam Mode defines the synchronous/asynchronous mode on parallel devices.
+ */
 template< typename Device = Devices::Sequential,
          ParallelForMode Mode = SynchronousMode >
struct ParallelFor2D
{
+   /**
+    * \brief Static method for the execution of the loop.
+    *
+    * \tparam Index defines the type of indexes over which the loop iterates.
+    * \tparam Function is the type of the function to be called in each iteration.
+    * \tparam FunctionArgs is a variadic type of additional parameters which are
+    *    supposed to be passed to the inner Function.
+ * + * \param startX the for-loop iterates over index domain [startX,endX)x[startY,endY). + * \param startY the for-loop iterates over index domain [startX,endX)x[startY,endY). + * \param endX the for-loop iterates over index domain [startX,endX)x[startY,endY). + * \param endY the for-loop iterates over index domain [startX,endX)x[startY,endY). + * \param f is the function to be called in each iteration + * \param args are additional parameters to be passed to the function f. + * + * The function f is called for each iteration as + * + * f( i, j, args... ) + * + * where the first parameter is changing more often than the second one. + * + * \par Example + * \include Algorithms/ParallelForExample-2D.cpp + * \par Output + * \include ParallelForExample-2D.out + * + */ template< typename Index, typename Function, typename... FunctionArgs > @@ -64,10 +133,46 @@ struct ParallelFor2D } }; +/** + * \brief Parallel for loop for three dimensional domain of indexes. + * + * \tparam Device says on what device the for-loop is gonna be executed. + * It can be Devices::Host, Devices::Cuda or Devices::Sequential. + * \tparam Mode defines synchronous/asynchronous mode on parallel devices. + */ template< typename Device = Devices::Sequential, ParallelForMode Mode = SynchronousMode > struct ParallelFor3D { + /** + * \brief Static method for execution of the loop. + * + * \tparam Index defines the type of indexes over which the loop iterates. + * \tparam Function is the type of function to be called in each iteration. + * \tparam FunctionArgs is a variadic type of additional parameters which are + * supposed to be passed to the inner Function. + * + * \param startX the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ). + * \param startY the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ). + * \param startZ the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ). + * \param endX the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ). + * \param endY the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ). + * \param endZ the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ). + * \param f is the function to be called in each iteration + * \param args are additional parameters to be passed to the function f. + * + * The function f is called for each iteration as + * + * f( i, j, k, args... ) + * + * where the first parameter is changing the most often. + * + * \par Example + * \include Algorithms/ParallelForExample-3D.cpp + * \par Output + * \include ParallelForExample-3D.out + * + */ template< typename Index, typename Function, typename... FunctionArgs > -- GitLab From 456c722e820a03946542af14b4f1df584ea00afb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 18 Nov 2019 22:40:22 +0100 Subject: [PATCH 08/43] Fixing Vector documentation. --- .../Examples/Containers/VectorExample.cpp | 28 +++++++++---------- src/TNL/Containers/Vector.h | 4 ++- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/Documentation/Examples/Containers/VectorExample.cpp b/Documentation/Examples/Containers/VectorExample.cpp index 798774c17..be2db767a 100644 --- a/Documentation/Examples/Containers/VectorExample.cpp +++ b/Documentation/Examples/Containers/VectorExample.cpp @@ -7,16 +7,15 @@ using namespace std; int main() { - Containers::Vector vector1; - vector1.setSize(5); - vector1.setValue(0); - cout << "Does vector contain 1?" 
<< vector1.containsValue(1) << endl; - cout << "Does vector contain only zeros?" << vector1.containsOnlyValue(0) << endl; + Containers::Vector vector1( 5 ); + vector1 = 0; + cout << "Does vector contain 1?" << vector1.containsValue( 1 ) << endl; + cout << "Does vector contain only zeros?" << vector1.containsOnlyValue( 0 ) << endl; - Containers::Vector vector2(3); - vector2.setValue(1); - vector2.swap(vector1); - vector2.setElement(2,4); + Containers::Vector vector2( 3 ); + vector2 = 1; + vector2.swap( vector1 ); + vector2.setElement( 2, 4 ); cout << "First vector:" << vector1.getData() << endl; cout << "Second vector:" << vector2.getData() << endl; @@ -24,10 +23,11 @@ int main() vector2.reset(); cout << "Second vector after reset:" << vector2.getData() << endl; - /*Containers::Vector vect = {1, 2, -3, 3}; - cout << "The smallest element is:" << vect.min() << endl; - cout << "The absolute biggest element is:" << vect.absMax() << endl; - cout << "Sum of all vector elements:" << vect.sum() << endl; - vect.scalarMultiplication(2);*/ + Containers::Vector vect = { 1, 2, -3, 3 }; + cout << "The smallest element is:" << min( vect ) << endl; + cout << "The absolute biggest element is:" << max( abs( vect ) ) << endl; + cout << "Sum of all vector elements:" << sum( vect ) << endl; + vect *= 2.0; + cout << "Vector multiplied by 2:" << vect << endl; } diff --git a/src/TNL/Containers/Vector.h b/src/TNL/Containers/Vector.h index be08266b6..6bec69321 100644 --- a/src/TNL/Containers/Vector.h +++ b/src/TNL/Containers/Vector.h @@ -32,7 +32,9 @@ namespace Containers { * is selected with \ref Allocators::Default. * * \par Example - * \include VectorExample.cpp + * \include Containers/VectorExample.cpp + * \par Output + * \include VectorExample.out */ template< typename Real = double, typename Device = Devices::Host, -- GitLab From f166b7cee8851f4cea1f97ef84cc38b10ceaf736 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 18 Nov 2019 22:40:51 +0100 Subject: [PATCH 09/43] Fixing Array and ArrayView documentation. --- src/TNL/Containers/Array.h | 2 +- src/TNL/Containers/ArrayView.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/TNL/Containers/Array.h b/src/TNL/Containers/Array.h index 45ef1e272..117cb32ae 100644 --- a/src/TNL/Containers/Array.h +++ b/src/TNL/Containers/Array.h @@ -62,7 +62,7 @@ template< int, typename > class StaticArray; * See also \ref ArrayView, \ref Vector, \ref VectorView. * * \par Example - * \include ArrayExample.cpp + * \include Containers/ArrayExample.cpp * \par Output * \include ArrayExample.out */ diff --git a/src/TNL/Containers/ArrayView.h b/src/TNL/Containers/ArrayView.h index d51f151f7..c06ad56dc 100644 --- a/src/TNL/Containers/ArrayView.h +++ b/src/TNL/Containers/ArrayView.h @@ -55,7 +55,9 @@ namespace Containers { * See also \ref Array, \ref Vector, \ref VectorView. * * \par Example - * \include ArrayViewExample.cpp + * \include Containers/ArrayViewExample.cpp + * \par Output + * \include ArrayViewExample.out */ template< typename Value, typename Device = Devices::Host, -- GitLab From 06b0992eacae608cea40fcf2c253ddfd5158fc31 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Tue, 19 Nov 2019 15:36:49 +0100 Subject: [PATCH 10/43] Writing tutorial on StaticFor. 
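
The tutorial introduces Algorithms::StaticFor, a for-loop with bounds fixed
at compile time. The core pattern, condensed from the new example (the names
`a`, `b`, `Size` and the constant come from the example, not from the
library API):

    auto addition = [&]( int i, const double& c ) { a[ i ] = b[ i ] + c; };
    Algorithms::StaticFor< 0, Size >::exec( addition, 3.14 );
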
---
 .../Tutorials/ForLoops/StaticForExample-2.cpp |  4 +++
 .../Tutorials/ForLoops/StaticForExample.cpp   | 28 +++++++++++++++++++
 .../ForLoops/tutorial_04_ForLoops.md          | 18 +++++++++++-
 3 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/Tutorials/ForLoops/StaticForExample-2.cpp
 create mode 100644 Documentation/Tutorials/ForLoops/StaticForExample.cpp

diff --git a/Documentation/Tutorials/ForLoops/StaticForExample-2.cpp b/Documentation/Tutorials/ForLoops/StaticForExample-2.cpp
new file mode 100644
index 000000000..7ee4afd72
--- /dev/null
+++ b/Documentation/Tutorials/ForLoops/StaticForExample-2.cpp
@@ -0,0 +1,4 @@
+for( int i = 0; i < Size; i++ )
+{
+   a[ i ] = b[ i ] + c; sum += a[ i ];
+};
diff --git a/Documentation/Tutorials/ForLoops/StaticForExample.cpp b/Documentation/Tutorials/ForLoops/StaticForExample.cpp
new file mode 100644
index 000000000..47757458d
--- /dev/null
+++ b/Documentation/Tutorials/ForLoops/StaticForExample.cpp
@@ -0,0 +1,28 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/StaticVector.h>
+#include <TNL/Algorithms/StaticFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+
+int main( int argc, char* argv[] )
+{
+   /****
+    * Create two static vectors
+    */
+   const int Size( 3 );
+   StaticVector< Size, double > a, b;
+   a = 1.0;
+   b = 2.0;
+   double sum( 0.0 );
+
+   /****
+    * Compute an addition of a vector and a constant number.
+    */
+   auto addition = [&]( int i, const double& c ) { a[ i ] = b[ i ] + c; sum += a[ i ]; };
+   Algorithms::StaticFor< 0, Size >::exec( addition, 3.14 );
+   std::cout << "a = " << a << std::endl;
+   std::cout << "sum = " << sum << std::endl;
+}
+
diff --git a/Documentation/Tutorials/ForLoops/tutorial_04_ForLoops.md b/Documentation/Tutorials/ForLoops/tutorial_04_ForLoops.md
index e8d220aa7..e389329a8 100644
--- a/Documentation/Tutorials/ForLoops/tutorial_04_ForLoops.md
+++ b/Documentation/Tutorials/ForLoops/tutorial_04_ForLoops.md
@@ -47,7 +47,23 @@ For the completeness, we show a modification of the previous example into 3D:
 
 \include ParallelForExample-3D.cpp
 
-
 ## Static For
+
+The static for-loop is designed for short loops with a constant (i.e. known at compile time) number of iterations. It is often used with static arrays and vectors. An advantage of this kind of for-loop is that it is explicitly unrolled when the loop is short (up to eight iterations). See the following example:
+
+\include StaticForExample.cpp
+
+Notice that the static for-loop works with a lambda function similarly to the parallel for-loop. The bounds of the loop are passed as template parameters in the statement `Algorithms::StaticFor< 0, Size >`. The parameters of the static method `exec` are the lambda function to be performed in each iteration and auxiliary data to be passed to it. The function gets the loop index `i` first, followed by the auxiliary data (the constant `3.14` bound to the parameter `c` in this example).
+
+The result looks as follows:
+
+\include StaticForExample.out
+
+The effect of `StaticFor` is really the same as usual for-loop. The following code does the same as the previous example:
+
+\include StaticForExample-2.cpp
+
+The benefit of `StaticFor` is mainly in the explicit unrolling of short loops which can improve the performance in some situations. `StaticFor` can be used also in CUDA kernels.
+
 
 ## Templated Static For
 
-- 
GitLab

From 52f727e0c4dd9eb0f0a1cb8b0ddf5fe91188da6e Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber
Date: Tue, 19 Nov 2019 15:37:15 +0100
Subject: [PATCH 11/43] Adding StaticFor tutorial examples to CMakeLists.txt.
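
The examples are executed during the build and their standard output is
captured into ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/StaticForExample.out,
so the tutorial page can pull the result in with Doxygen's \include directive.
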
--- Documentation/Tutorials/ForLoops/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Documentation/Tutorials/ForLoops/CMakeLists.txt b/Documentation/Tutorials/ForLoops/CMakeLists.txt index 522b8fb88..1e21dc237 100644 --- a/Documentation/Tutorials/ForLoops/CMakeLists.txt +++ b/Documentation/Tutorials/ForLoops/CMakeLists.txt @@ -8,7 +8,11 @@ ELSE() ADD_EXECUTABLE( ParallelForExample-3D ParallelForExample-3D.cpp ) ENDIF() +ADD_EXECUTABLE( StaticForExample StaticForExample.cpp ) +ADD_CUSTOM_COMMAND( COMMAND StaticForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/StaticForExample.out OUTPUT StaticForExample.out ) + IF( BUILD_CUDA ) ADD_CUSTOM_TARGET( ForLoops-cuda ALL DEPENDS - ParallelForExample.out ) + ParallelForExample.out + StaticForExample.out ) ENDIF() -- GitLab From 9967b10bb7db1f27969ba9e2bb90e3cde9b69d94 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Tue, 19 Nov 2019 15:38:05 +0100 Subject: [PATCH 12/43] Modifying namespaces in ParallelFor tutorial examples. --- Documentation/Tutorials/ForLoops/ParallelForExample-2D.cpp | 3 +-- Documentation/Tutorials/ForLoops/ParallelForExample-3D.cpp | 3 +-- Documentation/Tutorials/ForLoops/ParallelForExample.cpp | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample-2D.cpp b/Documentation/Tutorials/ForLoops/ParallelForExample-2D.cpp index 642ff9692..388c326ec 100644 --- a/Documentation/Tutorials/ForLoops/ParallelForExample-2D.cpp +++ b/Documentation/Tutorials/ForLoops/ParallelForExample-2D.cpp @@ -5,7 +5,6 @@ using namespace TNL; using namespace TNL::Containers; -using namespace TNL::Algorithms; template< typename Device > void meshFunctionSum( const int xSize, @@ -29,7 +28,7 @@ void meshFunctionSum( const int xSize, const int idx = j * xSize + i; result_view[ idx ] = v1_view[ idx ] + v2_view[ idx ] + c; }; - ParallelFor2D< Device >::exec( 0, 0, xSize, ySize, sum, xSize, c ); + Algorithms::ParallelFor2D< Device >::exec( 0, 0, xSize, ySize, sum, xSize, c ); } int main( int argc, char* argv[] ) diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample-3D.cpp b/Documentation/Tutorials/ForLoops/ParallelForExample-3D.cpp index 94479dd9b..37e07c75e 100644 --- a/Documentation/Tutorials/ForLoops/ParallelForExample-3D.cpp +++ b/Documentation/Tutorials/ForLoops/ParallelForExample-3D.cpp @@ -5,7 +5,6 @@ using namespace TNL; using namespace TNL::Containers; -using namespace TNL::Algorithms; template< typename Device > void meshFunctionSum( const int xSize, @@ -30,7 +29,7 @@ void meshFunctionSum( const int xSize, const int idx = ( k * ySize + j ) * xSize + i; result_view[ idx ] = v1_view[ idx ] + v2_view[ idx ] + c; }; - ParallelFor3D< Device >::exec( 0, 0, 0, xSize, ySize,zSize, sum, xSize, ySize, c ); + Algorithms::ParallelFor3D< Device >::exec( 0, 0, 0, xSize, ySize,zSize, sum, xSize, ySize, c ); } int main( int argc, char* argv[] ) diff --git a/Documentation/Tutorials/ForLoops/ParallelForExample.cpp b/Documentation/Tutorials/ForLoops/ParallelForExample.cpp index 5714df7d3..8e5f4e8b2 100644 --- a/Documentation/Tutorials/ForLoops/ParallelForExample.cpp +++ b/Documentation/Tutorials/ForLoops/ParallelForExample.cpp @@ -5,7 +5,6 @@ using namespace TNL; using namespace TNL::Containers; -using namespace TNL::Algorithms; template< typename Device > void vectorSum( const Vector< double, Device >& v1, @@ -26,7 +25,7 @@ void vectorSum( const Vector< double, Device >& v1, auto sum = [=] __cuda_callable__ ( int i, const double c ) mutable { 
      result_view[ i ] = v1_view[ i ] + v2_view[ i ] + c; };
 
-   ParallelFor< Device >::exec( 0, v1.getSize(), sum, c );
+   Algorithms::ParallelFor< Device >::exec( 0, v1.getSize(), sum, c );
 }
 
 int main( int argc, char* argv[] )
-- 
GitLab

From 06a9d8bdc2a850e94150600f2724db2837424f66 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber
Date: Tue, 19 Nov 2019 16:31:40 +0100
Subject: [PATCH 13/43] Fixed ParallelFor example.

---
 Documentation/Examples/Algorithms/ParallelForExample.cpp | 2 +-
 Documentation/Tutorials/ForLoops/StaticForExample-3.cpp  | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/Tutorials/ForLoops/StaticForExample-3.cpp

diff --git a/Documentation/Examples/Algorithms/ParallelForExample.cpp b/Documentation/Examples/Algorithms/ParallelForExample.cpp
index 46d23f58a..87ea0d0cf 100644
--- a/Documentation/Examples/Algorithms/ParallelForExample.cpp
+++ b/Documentation/Examples/Algorithms/ParallelForExample.cpp
@@ -16,7 +16,7 @@ void initVector( Vector< double, Device >& v,
 {
    auto view = v.getConstView();
    auto init = [=] __cuda_callable__ ( int i, const double c ) mutable {
-      view[ i ] = c;
+      view[ i ] = c; }
 
    ParallelFor< Device >::exec( 0, v.getSize(), init, c );
 }
diff --git a/Documentation/Tutorials/ForLoops/StaticForExample-3.cpp b/Documentation/Tutorials/ForLoops/StaticForExample-3.cpp
new file mode 100644
index 000000000..7ee4afd72
--- /dev/null
+++ b/Documentation/Tutorials/ForLoops/StaticForExample-3.cpp
@@ -0,0 +1,4 @@
+for( int i = 0; i < Size; i++ )
+{
+   a[ i ] = b[ i ] + c; sum += a[ i ];
+};
-- 
GitLab

From 6eb4798451ffdf9d69d7101f2f77a307e92e3e9c Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber
Date: Tue, 19 Nov 2019 16:32:12 +0100
Subject: [PATCH 14/43] Writing documentation for StaticFor.

---
 src/TNL/Algorithms/StaticFor.h | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/src/TNL/Algorithms/StaticFor.h b/src/TNL/Algorithms/StaticFor.h
index c74045458..e9bcb43e2 100644
--- a/src/TNL/Algorithms/StaticFor.h
+++ b/src/TNL/Algorithms/StaticFor.h
@@ -15,10 +15,28 @@
 namespace TNL {
 namespace Algorithms {
 
-// Manual unrolling does not make sense for loops with a large iterations
-// count. For a very large iterations count it would trigger the compiler's
-// limit on recursive template instantiation. Also note that the compiler
-// will (at least partially) unroll loops with static bounds anyway.
+/***
+ * \brief StaticFor is a wrapper for common for-loop with explicit unrolling.
+ *
+ * StaticFor can be used only for for-loops whose bounds are known at
+ * compile time. StaticFor performs explicit loop unrolling for better performance.
+ * This, however, does not make sense for loops with a large iterations
+ * count. For a very large iterations count it could trigger the compiler's
+ * limit on recursive template instantiation. Also note that the compiler
+ * will (at least partially) unroll loops with static bounds anyway. For these
+ * reasons, the explicit loop unrolling can be controlled by the third template
+ * parameter.
+ *
+ * \tparam Begin the loop will iterate over indexes [Begin,End)
+ * \tparam End the loop will iterate over indexes [Begin,End)
+ * \tparam unrolled controls the explicit loop unrolling. If it is true, the
+ * unrolling is performed.
+ *
+ * \par Example
+ * \include Algorithms/StaticForExample.cpp
+ * \par Output
+ * \include StaticForExample.out
+ */
 template< int Begin, int End, bool unrolled = (End - Begin <= 8) >
 struct StaticFor;
 
@@ -27,6 +45,12 @@ struct StaticFor< Begin, End, true >
 {
    static_assert( Begin < End, "Wrong index interval for StaticFor. Begin must be less than end." );
 
+   /**
+    * \brief Static method for execution of the StaticFor.
+    *
+    * @param f is a (lambda) function to be performed in each iteration.
+    * @param args are auxiliary data to be passed to the function f.
+    */
    template< typename Function, typename... Args >
    __cuda_callable__
    static void exec( const Function& f, Args&&... args )
-- 
GitLab

From 44d08f19a74a33e0034f737b04ee0284f6f10069 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber
Date: Tue, 19 Nov 2019 16:32:56 +0100
Subject: [PATCH 15/43] Writing documentation on StaticFor.

---
 .../Examples/Algorithms/StaticForExample.cpp  | 28 +++++++++++++++++++
 .../Tutorials/ForLoops/StaticForExample-3.cpp |  5 +---
 2 files changed, 29 insertions(+), 4 deletions(-)
 create mode 100644 Documentation/Examples/Algorithms/StaticForExample.cpp

diff --git a/Documentation/Examples/Algorithms/StaticForExample.cpp b/Documentation/Examples/Algorithms/StaticForExample.cpp
new file mode 100644
index 000000000..47757458d
--- /dev/null
+++ b/Documentation/Examples/Algorithms/StaticForExample.cpp
@@ -0,0 +1,28 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/StaticVector.h>
+#include <TNL/Algorithms/StaticFor.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+
+int main( int argc, char* argv[] )
+{
+   /****
+    * Create two static vectors
+    */
+   const int Size( 3 );
+   StaticVector< Size, double > a, b;
+   a = 1.0;
+   b = 2.0;
+   double sum( 0.0 );
+
+   /****
+    * Compute an addition of a vector and a constant number.
+    */
+   auto addition = [&]( int i, const double& c ) { a[ i ] = b[ i ] + c; sum += a[ i ]; };
+   Algorithms::StaticFor< 0, Size >::exec( addition, 3.14 );
+   std::cout << "a = " << a << std::endl;
+   std::cout << "sum = " << sum << std::endl;
+}
+
diff --git a/Documentation/Tutorials/ForLoops/StaticForExample-3.cpp b/Documentation/Tutorials/ForLoops/StaticForExample-3.cpp
index 7ee4afd72..5298b00a1 100644
--- a/Documentation/Tutorials/ForLoops/StaticForExample-3.cpp
+++ b/Documentation/Tutorials/ForLoops/StaticForExample-3.cpp
@@ -1,4 +1 @@
-for( int i = 0; i < Size; i++ )
-{
-   a[ i ] = b[ i ] + c; sum += a[ i ];
-};
+Algorithms::StaticFor< 0, Size, true >::exec( addition, 3.14 );
\ No newline at end of file
-- 
GitLab

From f683b869de17d7e914d28e447c9afdb6307e87b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?=
Date: Thu, 19 Nov 2019 17:16:04 +0100
Subject: [PATCH 16/43] Writing tutorial on TemplateStaticFor.
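
TemplateStaticFor calls a templated class for every index of a compile-time
interval. Condensed from the new example (`LoopBody`, `Index`, `Size` and the
vector `v` are defined in the example itself):

    template< Index I >
    struct LoopBody
    {
       static void exec( const StaticVector< Size, double >& v )
       { std::cout << "v[ " << I << " ] = " << v[ I ] << std::endl; }
    };

    Algorithms::TemplateStaticFor< Index, 0, Size, LoopBody >::exec( v );
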
--- .../Algorithms/TemplateStaticForExample.cpp | 28 ++++++++++++++++ .../Tutorials/ForLoops/CMakeLists.txt | 6 +++- .../ForLoops/TemplateStaticForExample.cpp | 32 ++++++++++++++++++ .../ForLoops/tutorial_04_ForLoops.md | 33 ++++++++++++++++++- 4 files changed, 97 insertions(+), 2 deletions(-) create mode 100644 Documentation/Examples/Algorithms/TemplateStaticForExample.cpp create mode 100644 Documentation/Tutorials/ForLoops/TemplateStaticForExample.cpp diff --git a/Documentation/Examples/Algorithms/TemplateStaticForExample.cpp b/Documentation/Examples/Algorithms/TemplateStaticForExample.cpp new file mode 100644 index 000000000..47757458d --- /dev/null +++ b/Documentation/Examples/Algorithms/TemplateStaticForExample.cpp @@ -0,0 +1,28 @@ +#include +#include +#include +#include + +using namespace TNL; +using namespace TNL::Containers; + +int main( int argc, char* argv[] ) +{ + /**** + * Create two static vectors + */ + const int Size( 3 ); + StaticVector< Size, double > a, b; + a = 1.0; + b = 2.0; + double sum( 0.0 ); + + /**** + * Compute an addition of a vector and a constant number. + */ + auto addition = [&]( int i, const double& c ) { a[ i ] = b[ i ] + c; sum += a[ i ]; }; + Algorithms::StaticFor< 0, Size >::exec( addition, 3.14 ); + std::cout << "a = " << a << std::endl; + std::cout << "sum = " << sum << std::endl; +} + diff --git a/Documentation/Tutorials/ForLoops/CMakeLists.txt b/Documentation/Tutorials/ForLoops/CMakeLists.txt index 1e21dc237..738b10020 100644 --- a/Documentation/Tutorials/ForLoops/CMakeLists.txt +++ b/Documentation/Tutorials/ForLoops/CMakeLists.txt @@ -11,8 +11,12 @@ ENDIF() ADD_EXECUTABLE( StaticForExample StaticForExample.cpp ) ADD_CUSTOM_COMMAND( COMMAND StaticForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/StaticForExample.out OUTPUT StaticForExample.out ) +ADD_EXECUTABLE( TemplateStaticForExample TemplateStaticForExample.cpp ) +ADD_CUSTOM_COMMAND( COMMAND TemplateStaticForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/TemplateStaticForExample.out OUTPUT TemplateStaticForExample.out ) + IF( BUILD_CUDA ) ADD_CUSTOM_TARGET( ForLoops-cuda ALL DEPENDS ParallelForExample.out - StaticForExample.out ) + StaticForExample.out + TemplateStaticForExample.out ) ENDIF() diff --git a/Documentation/Tutorials/ForLoops/TemplateStaticForExample.cpp b/Documentation/Tutorials/ForLoops/TemplateStaticForExample.cpp new file mode 100644 index 000000000..eb65fd6cc --- /dev/null +++ b/Documentation/Tutorials/ForLoops/TemplateStaticForExample.cpp @@ -0,0 +1,32 @@ +#include +#include +#include +#include + +using namespace TNL; +using namespace TNL::Containers; + +using Index = int; +const Index Size( 5 ); + +template< Index I > +struct LoopBody +{ + static void exec( const StaticVector< Size, double >& v ) { + std::cout << "v[ " << I << " ] = " << v[ I ] << std::endl; + } +}; + +int main( int argc, char* argv[] ) +{ + /**** + * Initiate static vector + */ + StaticVector< Size, double > v{ 1.0, 2.0, 3.0, 4.0, 5.0 }; + + /**** + * Print out the vector using template parameters for indexing. + */ + Algorithms::TemplateStaticFor< Index, 0, Size, LoopBody >::exec( v ); +} + diff --git a/Documentation/Tutorials/ForLoops/tutorial_04_ForLoops.md b/Documentation/Tutorials/ForLoops/tutorial_04_ForLoops.md index e389329a8..4ca415c69 100644 --- a/Documentation/Tutorials/ForLoops/tutorial_04_ForLoops.md +++ b/Documentation/Tutorials/ForLoops/tutorial_04_ForLoops.md @@ -63,7 +63,38 @@ The effect of `StaticFor` is really the same as usual for-loop. 
The following code does the same as the previous example:
 
 \include StaticForExample-2.cpp
 
-The benefit of `StaticFor` is mainly in the explicit unrolling of short loops which can improve the performance in some situations. `StaticFor` can be used also in CUDA kernels.
+The benefit of `StaticFor` is mainly in the explicit unrolling of short loops which can improve the performance in some situations. `StaticFor` can be forced to do the loop-unrolling in any situation using the third template parameter as follows:
+
+\include StaticForExample-3.cpp
+
+`StaticFor` can be used also in CUDA kernels.
 
 ## Templated Static For
 
+The templated static for-loop (`TemplateStaticFor`) is a for-loop over template parameters. For example, if a class `LoopBody` is defined as
+
+```
+template< int i >
+struct LoopBody
+{
+   static void exec() { ... };
+};
+```
+
+one might need to execute the following sequence of statements:
+
+```
+LoopBody< 0 >::exec();
+LoopBody< 1 >::exec();
+LoopBody< 2 >::exec();
+...
+LoopBody< N >::exec();
+```
+
+This is exactly what `TemplateStaticFor` can do - in a slightly more general way. See the following example:
+
+\include TemplateStaticForExample.cpp
+
+The output looks as follows:
+
+\include TemplateStaticForExample.out
-- 
GitLab

From 4c64b1fa112a206b64d31c488a935ec97913ac7c Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber
Date: Tue, 19 Nov 2019 17:42:20 +0100
Subject: [PATCH 17/43] Small fixes in StaticFor documentation.

---
 src/TNL/Algorithms/StaticFor.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/TNL/Algorithms/StaticFor.h b/src/TNL/Algorithms/StaticFor.h
index e9bcb43e2..6a450638f 100644
--- a/src/TNL/Algorithms/StaticFor.h
+++ b/src/TNL/Algorithms/StaticFor.h
@@ -15,7 +15,7 @@
 namespace TNL {
 namespace Algorithms {
 
-/***
+/**
  * \brief StaticFor is a wrapper for common for-loop with explicit unrolling.
  *
  * StaticFor can be used only for for-loops whose bounds are known at
@@ -48,8 +48,8 @@ struct StaticFor< Begin, End, true >
    /**
    * \brief Static method for execution of the StaticFor.
    *
-    * @param f is a (lambda) function to be performed in each iteration.
-    * @param args are auxiliary data to be passed to the function f.
+    * \param f is a (lambda) function to be performed in each iteration.
+    * \param args are auxiliary data to be passed to the function f.
    */
    template< typename Function, typename... Args >
    __cuda_callable__
-- 
GitLab

From 0a92695429dddee7af45f93213180f18ca9cc9de Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber
Date: Tue, 19 Nov 2019 17:42:43 +0100
Subject: [PATCH 18/43] Writing TemplateStaticFor documentation.
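
Unlike a run-time loop, the loop index is a template parameter here, so it
can be used where a constant expression is required. A hypothetical
illustration, not part of TNL (assumes <tuple> and <iostream>):

    template< int I >
    struct PrintElement
    {
       template< typename Tuple >
       static void exec( const Tuple& t )
       {
          // I is a compile-time constant, so it may index a std::tuple
          std::cout << std::get< I >( t ) << std::endl;
       }
    };
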
---
 .../Algorithms/TemplateStaticForExample.cpp   | 27 ++++++++++---------
 src/TNL/Algorithms/TemplateStaticFor.h        | 25 +++++++++++++++++
 2 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/Documentation/Examples/Algorithms/TemplateStaticForExample.cpp b/Documentation/Examples/Algorithms/TemplateStaticForExample.cpp
index 47757458d..a2fce79ae 100644
--- a/Documentation/Examples/Algorithms/TemplateStaticForExample.cpp
+++ b/Documentation/Examples/Algorithms/TemplateStaticForExample.cpp
@@ -1,28 +1,31 @@
 #include
 #include
 #include
-#include
+#include
 
 using namespace TNL;
 using namespace TNL::Containers;
 
+const int Size( 5 );
+
+template< int I >
+struct LoopBody
+{
+   static void exec( const StaticVector< Size, double >& v ) {
+      std::cout << "v[ " << I << " ] = " << v[ I ] << std::endl;
+   }
+};
+
 int main( int argc, char* argv[] )
 {
    /****
-    * Create two static vectors
+    * Initialize a static vector
     */
-   const int Size( 3 );
-   StaticVector< Size, double > a, b;
-   a = 1.0;
-   b = 2.0;
-   double sum( 0.0 );
+   StaticVector< Size, double > v{ 1.0, 2.0, 3.0, 4.0, 5.0 };
 
    /****
-    * Compute an addition of a vector and a constant number.
+    * Print out the vector using template parameters for indexing.
     */
-   auto addition = [&]( int i, const double& c ) { a[ i ] = b[ i ] + c; sum += a[ i ]; };
-   Algorithms::StaticFor< 0, Size >::exec( addition, 3.14 );
-   std::cout << "a = " << a << std::endl;
-   std::cout << "sum = " << sum << std::endl;
+   Algorithms::TemplateStaticFor< int, 0, Size, LoopBody >::exec( v );
 }
 
diff --git a/src/TNL/Algorithms/TemplateStaticFor.h b/src/TNL/Algorithms/TemplateStaticFor.h
index 753ad9b26..de6eebbee 100644
--- a/src/TNL/Algorithms/TemplateStaticFor.h
+++ b/src/TNL/Algorithms/TemplateStaticFor.h
@@ -17,6 +17,31 @@
 
 namespace TNL {
 namespace Algorithms {
+
+/**
+ * \brief TemplateStaticFor serves for coding for-loops over template parameters.
+ *
+ * The result of calling this loop with a templated class \p LoopBody is as follows:
+ *
+ * LoopBody< begin >::exec( ... );
+ *
+ * LoopBody< begin + 1 >::exec( ... );
+ *
+ * ...
+ *
+ * LoopBody< end - 1 >::exec( ... );
+ *
+ * \tparam IndexType is the type of the loop indexes.
+ * \tparam begin the loop iterates over index interval [begin,end).
+ * \tparam end the loop iterates over index interval [begin,end).
+ * \tparam LoopBody is a templated class having one template parameter of IndexType.
+ */
+template< typename IndexType,
+          IndexType begin,
+          IndexType end,
+          template< IndexType > class LoopBody >
+struct TemplateStaticFor;
+
 namespace detail {
 
 template< typename IndexType,
-- 
GitLab

From 69de1fb05c13c742a5ec369ed074841654b74856 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber
Date: Wed, 20 Nov 2019 17:00:20 +0100
Subject: [PATCH 19/43] Scripts install and build end with an error code when
 the build fails.

---
 build   |  4 +++-
 install | 10 ++++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/build b/build
index 914c65b19..68966f0f4 100755
--- a/build
+++ b/build
@@ -202,7 +202,9 @@ if [[ "$make" != "make" ]] && [[ "$VERBOSE" ]]; then
    VERBOSE="-v"
 fi
 
-$make ${VERBOSE} $make_target
+if ! $make ${VERBOSE} $make_target; then
+   exit 1
+fi
 
 if [[ ${WITH_DOC} == "yes" ]]; then
    "$ROOT_DIR/Documentation/build" --prefix="$PREFIX"
diff --git a/install b/install
index fe138dfaa..9b66bfbee 100755
--- a/install
+++ b/install
@@ -35,7 +35,10 @@ if [[ ${BUILD_DEBUG} == "yes" ]]; then
      mkdir Debug
   fi
   pushd Debug
-   ../build --root-dir=.. --build=Debug --install=yes ${OPTIONS}
+   if ! ../build --root-dir=.. --build=Debug --install=yes ${OPTIONS}; then
+      echo "Debug build failed."
+      exit 1
+   fi
   popd
fi
@@ -44,7 +47,10 @@ if [[ ${BUILD_RELEASE} == "yes" ]]; then
      mkdir Release
   fi
   pushd Release
-   ../build --root-dir=.. --build=Release --install=yes ${OPTIONS};
+   if ! ../build --root-dir=.. --build=Release --install=yes ${OPTIONS}; then
+      echo "Release build failed."
+      exit 1
+   fi
   popd
fi
-- 
GitLab

From 8c10f04fcb83d4644a7272a8ea560dc9833f7bbd Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber
Date: Wed, 20 Nov 2019 17:02:19 +0100
Subject: [PATCH 20/43] Added description of the Algorithms namespace.

---
 src/TNL/Algorithms/ParallelFor.h | 68 +++++++++++++++++---------------
 1 file changed, 37 insertions(+), 31 deletions(-)

diff --git a/src/TNL/Algorithms/ParallelFor.h b/src/TNL/Algorithms/ParallelFor.h
index 20e6bf796..b8fec8488 100644
--- a/src/TNL/Algorithms/ParallelFor.h
+++ b/src/TNL/Algorithms/ParallelFor.h
@@ -31,13 +31,19 @@
  */
 
 namespace TNL {
-namespace Algorithms {
+/**
+ * \brief Namespace for fundamental TNL algorithms
+ *
+ * It contains algorithms like for-loops, memory operations, (parallel) reduction,
+ * multireduction, scan etc.
+ */
+namespace Algorithms
 
 // TODO: ParallelForMode should be moved to Device (=Executor)
 
 /**
- * \brief Enum for the parallel processing of the for-loop. 
- * 
+ * \brief Enum for the parallel processing of the for-loop.
+ *
 * Synchronous means that the program control returns to the caller when the loop is processed completely.
 * Asynchronous means that the program control returns to the caller immediately even before the loop is processing is finished.
 */
@@ -44,11 +50,12 @@
 
 enum ParallelForMode { SynchronousMode, AsynchronousMode };
 
+
 /**
 * \brief Parallel for loop for one dimensional interval of indexes.
- * 
+ *
 * \tparam Device says on what device the for-loop is gonna be executed.
- * It can be Devices::Host, Devices::Cuda or Devices::Sequential. 
+ * It can be Devices::Host, Devices::Cuda or Devices::Sequential.
 * \tparam Mode defines synchronous/asynchronous mode on parallel devices.
 */
 template< typename Device = Devices::Sequential,
@@ -57,22 +63,22 @@ struct ParallelFor
 {
   /**
    * \brief Static method for execution of the loop.
- * + * * \tparam Index defines the type of indexes over which the loop iterates. * \tparam Function is the type of function to be called in each iteration. - * \tparam FunctionArgs is a variadic type of additional parameters which are + * \tparam FunctionArgs is a variadic type of additional parameters which are * supposed to be passed to the inner Function. - * + * * \param startX the for-loop iterates over index domain [startX,endX)x[startY,endY). * \param startY the for-loop iterates over index domain [startX,endX)x[startY,endY). * \param endX the for-loop iterates over index domain [startX,endX)x[startY,endY). * \param endY the for-loop iterates over index domain [startX,endX)x[startY,endY). * \param f is the function to be called in each iteration * \param args are additional parameters to be passed to the function f. - * - * The function f is called for each iteration as - * + * + * The function f is called for each iteration as + * * f( i, j, args... ) - * + * * where the first parameter is changing more often than the second one. * * \par Example * \include Algorithms/ParallelForExample-2D.cpp * \par Output * \include ParallelForExample-2D.out - * + * */ template< typename Index, typename Function, @@ -135,9 +141,9 @@ struct ParallelFor2D /** * \brief Parallel for loop for three dimensional domain of indexes. - * + * * \tparam Device says on what device the for-loop is gonna be executed. - * It can be Devices::Host, Devices::Cuda or Devices::Sequential. + * It can be Devices::Host, Devices::Cuda or Devices::Sequential. * \tparam Mode defines synchronous/asynchronous mode on parallel devices. */ template< typename Device = Devices::Sequential, @@ -146,12 +152,12 @@ struct ParallelFor3D { /** * \brief Static method for execution of the loop. - * + * * \tparam Index defines the type of indexes over which the loop iterates. * \tparam Function is the type of function to be called in each iteration. - * \tparam FunctionArgs is a variadic type of additional parameters which are + * \tparam FunctionArgs is a variadic type of additional parameters which are * supposed to be passed to the inner Function. - * + * * \param startX the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ). * \param startY the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ). * \param startZ the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ). @@ -160,18 +166,18 @@ struct ParallelFor3D * \param endZ the for-loop iterates over index domain [startX,endX)x[startY,endY)x[startZ,endZ). * \param f is the function to be called in each iteration * \param args are additional parameters to be passed to the function f. - * - * The function f is called for each iteration as - * + * + * The function f is called for each iteration as + * * f( i, j, k, args... ) - * + * * where the first parameter is changing the most often. - * + * * \par Example * \include Algorithms/ParallelForExample-3D.cpp * \par Output * \include ParallelForExample-3D.out - * + * */ template< typename Index, typename Function, -- GitLab From 280cf82dd47e6094f553f6fa72d75b363a947365 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Wed, 20 Nov 2019 17:02:53 +0100 Subject: [PATCH 21/43] Fixing ParallelFor example. 
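
The lambda in ParallelForExample.cpp was not closed properly. The corrected
statement, as applied by this patch, reads:

    auto init = [=] __cuda_callable__ ( int i, const double c ) mutable {
       view[ i ] = c; };
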
---
 Documentation/Examples/Algorithms/ParallelForExample.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/Documentation/Examples/Algorithms/ParallelForExample.cpp b/Documentation/Examples/Algorithms/ParallelForExample.cpp
index 87ea0d0cf..41ea2f1a2 100644
--- a/Documentation/Examples/Algorithms/ParallelForExample.cpp
+++ b/Documentation/Examples/Algorithms/ParallelForExample.cpp
@@ -5,7 +5,6 @@
 
 using namespace TNL;
 using namespace TNL::Containers;
-using namespace TNL::Algorithms;
 
 /****
  * Set all elements of the vector v to the constant c.
@@ -16,9 +15,9 @@ void initVector( Vector< double, Device >& v,
 {
    auto view = v.getConstView();
    auto init = [=] __cuda_callable__ ( int i, const double c ) mutable {
-      view[ i ] = c; }
+      view[ i ] = c; };
 
-   ParallelFor< Device >::exec( 0, v.getSize(), init, c );
+   Algorithms::ParallelFor< Device >::exec( 0, v.getSize(), init, c );
 }
 
 int main( int argc, char* argv[] )
-- 
GitLab

From c1a71beb74ef6fe1f28fb9d79caaaf5871c12f0f Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber
Date: Wed, 20 Nov 2019 17:03:13 +0100
Subject: [PATCH 22/43] Added documentation for template static for-loop.

---
 src/TNL/Algorithms/TemplateStaticFor.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/TNL/Algorithms/TemplateStaticFor.h b/src/TNL/Algorithms/TemplateStaticFor.h
index de6eebbee..c96c816dc 100644
--- a/src/TNL/Algorithms/TemplateStaticFor.h
+++ b/src/TNL/Algorithms/TemplateStaticFor.h
@@ -35,6 +35,11 @@
 * \tparam begin the loop iterates over index interval [begin,end).
 * \tparam end the loop iterates over index interval [begin,end).
 * \tparam LoopBody is a templated class having one template parameter of IndexType.
+ *
+ * \par Example
+ * \include Algorithms/TemplateStaticForExample.cpp
+ * \par Output
+ * \include TemplateStaticForExample.out
 */
 template< typename IndexType,
           IndexType begin,
           IndexType end,
           template< IndexType > class LoopBody >
 struct TemplateStaticFor;
 
 namespace detail {
 
 template< typename IndexType,
@@ -50,6 +55,12 @@ template< typename IndexType,
           template< IndexType > class LoopBody >
 struct TemplateStaticForExecutor
 {
+   /**
+    * \brief Static method initiating the for-loop.
+    *
+    * \tparam Args type of user-defined data to be passed to for-loop.
+    * \param args user-defined data to be passed to for-loop.
+    */
   template< typename... Args >
   __cuda_callable__
   static void exec( Args&&... args )
-- 
GitLab

From 73b38abf030e1b46fedff02a5265edeae53c1edd Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber
Date: Wed, 20 Nov 2019 17:26:07 +0100
Subject: [PATCH 23/43] Fixed namespace in ParallelFor.h.

---
 src/TNL/Algorithms/ParallelFor.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Algorithms/ParallelFor.h b/src/TNL/Algorithms/ParallelFor.h
index b8fec8488..471e81ad9 100644
--- a/src/TNL/Algorithms/ParallelFor.h
+++ b/src/TNL/Algorithms/ParallelFor.h
@@ -37,7 +37,7 @@ namespace TNL {
 * It contains algorithms like for-loops, memory operations, (parallel) reduction,
 * multireduction, scan etc.
 */
-namespace Algorithms
+namespace Algorithms {
 
 // TODO: ParallelForMode should be moved to Device (=Executor)
 
-- 
GitLab

From e1d3086e1594ec39844bf9a631ace2f193ccea99 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber
Date: Wed, 20 Nov 2019 17:29:03 +0100
Subject: [PATCH 24/43] Fixed documentation of mode in ParallelFor.
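
A sketch of selecting the asynchronous mode (assuming a CUDA build; `n` and
the function `f` are illustrative):

    using AsyncFor = Algorithms::ParallelFor< Devices::Cuda, Algorithms::AsynchronousMode >;
    AsyncFor::exec( 0, n, f );  // may return before the loop finishes on the device
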
--- src/TNL/Algorithms/ParallelFor.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/TNL/Algorithms/ParallelFor.h b/src/TNL/Algorithms/ParallelFor.h index 471e81ad9..cb096f879 100644 --- a/src/TNL/Algorithms/ParallelFor.h +++ b/src/TNL/Algorithms/ParallelFor.h @@ -46,6 +46,8 @@ namespace Algorithms { * * Synchronous means that the program control returns to the caller when the loop is processed completely. * Asynchronous means that the program control returns to the caller immediately even before the loop is processing is finished. + * + * Only parallel for-loops in CUDA are affected by this mode. */ enum ParallelForMode { SynchronousMode, AsynchronousMode }; -- GitLab From ff50a94031597b8eaa5657b5ba1ecbce75f66cb2 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Wed, 20 Nov 2019 17:33:55 +0100 Subject: [PATCH 25/43] Added comment for namespace Pointers. --- src/TNL/Pointers/SmartPointer.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/TNL/Pointers/SmartPointer.h b/src/TNL/Pointers/SmartPointer.h index e6700781c..66516af70 100644 --- a/src/TNL/Pointers/SmartPointer.h +++ b/src/TNL/Pointers/SmartPointer.h @@ -11,6 +11,12 @@ #pragma once namespace TNL { + +/** + * \brief Namespace for TNL pointers. + * + * Pointers in TNL are similar to STL pointers but they work across different device. + */ namespace Pointers { class SmartPointer -- GitLab From f683b869de17d7e914d28e447c9afdb6307e87b2 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Wed, 20 Nov 2019 17:42:58 +0100 Subject: [PATCH 26/43] Avoided numbering of tutorial files. --- .../{tutorial_01_Arrays.md => tutorial_Arrays.md} | 2 +- ...l.md => tutorial_building_applications_with_tnl.md} | 2 +- .../{tutorial_04_ForLoops.md => tutorial_ForLoops.md} | 2 +- ...eductionAndScan.md => tutorial_ReductionAndScan.md} | 2 +- .../{tutorial_02_Vectors.md => tutorial_Vectors.md} | 2 +- Documentation/Tutorials/index.md | 10 +++++----- 6 files changed, 10 insertions(+), 10 deletions(-) rename Documentation/Tutorials/Arrays/{tutorial_01_Arrays.md => tutorial_Arrays.md} (99%) rename Documentation/Tutorials/BuildWithTNL/{tutorial_00_building_applications_with_tnl.md => tutorial_building_applications_with_tnl.md} (98%) rename Documentation/Tutorials/ForLoops/{tutorial_04_ForLoops.md => tutorial_ForLoops.md} (99%) rename Documentation/Tutorials/ReductionAndScan/{tutorial_03_ReductionAndScan.md => tutorial_ReductionAndScan.md} (99%) rename Documentation/Tutorials/Vectors/{tutorial_02_Vectors.md => tutorial_Vectors.md} (98%) diff --git a/Documentation/Tutorials/Arrays/tutorial_01_Arrays.md b/Documentation/Tutorials/Arrays/tutorial_Arrays.md similarity index 99% rename from Documentation/Tutorials/Arrays/tutorial_01_Arrays.md rename to Documentation/Tutorials/Arrays/tutorial_Arrays.md index cb07521dc..0d728935e 100644 --- a/Documentation/Tutorials/Arrays/tutorial_01_Arrays.md +++ b/Documentation/Tutorials/Arrays/tutorial_Arrays.md @@ -1,4 +1,4 @@ -\page tutorial_01_arrays Arrays tutorial +\page tutorial_Arrays Arrays tutorial ## Introduction diff --git a/Documentation/Tutorials/BuildWithTNL/tutorial_00_building_applications_with_tnl.md b/Documentation/Tutorials/BuildWithTNL/tutorial_building_applications_with_tnl.md similarity index 98% rename from Documentation/Tutorials/BuildWithTNL/tutorial_00_building_applications_with_tnl.md rename to Documentation/Tutorials/BuildWithTNL/tutorial_building_applications_with_tnl.md index 6d1d1bc7b..2d9572b76 100644 --- 
a/Documentation/Tutorials/BuildWithTNL/tutorial_00_building_applications_with_tnl.md +++ b/Documentation/Tutorials/BuildWithTNL/tutorial_building_applications_with_tnl.md @@ -1,4 +1,4 @@ -\page tutorial_00_building_applications_with_tnl Building Applications with TNL +\page tutorial_building_applications_with_tnl Building Applications with TNL ## Introduction diff --git a/Documentation/Tutorials/ForLoops/tutorial_04_ForLoops.md b/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md similarity index 99% rename from Documentation/Tutorials/ForLoops/tutorial_04_ForLoops.md rename to Documentation/Tutorials/ForLoops/tutorial_ForLoops.md index 4ca415c69..ba6f3ea2d 100644 --- a/Documentation/Tutorials/ForLoops/tutorial_04_ForLoops.md +++ b/Documentation/Tutorials/ForLoops/tutorial_ForLoops.md @@ -1,4 +1,4 @@ -\page tutorial_04_ForLoops For loops +\page tutorial_ForLoops For loops ## Introduction diff --git a/Documentation/Tutorials/ReductionAndScan/tutorial_03_ReductionAndScan.md b/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md similarity index 99% rename from Documentation/Tutorials/ReductionAndScan/tutorial_03_ReductionAndScan.md rename to Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md index a82dcfa4c..717f17da5 100644 --- a/Documentation/Tutorials/ReductionAndScan/tutorial_03_ReductionAndScan.md +++ b/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md @@ -1,4 +1,4 @@ -\page tutorial_03_reduction Flexible (parallel) reduction and prefix-sum tutorial +\page tutorial_ReductionAndScan Flexible (parallel) reduction and prefix-sum tutorial ## Introduction diff --git a/Documentation/Tutorials/Vectors/tutorial_02_Vectors.md b/Documentation/Tutorials/Vectors/tutorial_Vectors.md similarity index 98% rename from Documentation/Tutorials/Vectors/tutorial_02_Vectors.md rename to Documentation/Tutorials/Vectors/tutorial_Vectors.md index 301f410a5..acbe7e7f0 100644 --- a/Documentation/Tutorials/Vectors/tutorial_02_Vectors.md +++ b/Documentation/Tutorials/Vectors/tutorial_Vectors.md @@ -1,4 +1,4 @@ -\page tutorial_02_vectors Vectors tutorial +\page tutorial_Vectors Vectors tutorial ## Introduction diff --git a/Documentation/Tutorials/index.md b/Documentation/Tutorials/index.md index 8146a6bd8..f14b79c3c 100644 --- a/Documentation/Tutorials/index.md +++ b/Documentation/Tutorials/index.md @@ -2,8 +2,8 @@ ## Tutorials -1. [Building applications with TNL](tutorial_00_building_applications_with_tnl.html) -2. [Arrays](tutorial_01_arrays.html) -3. [Vectors](tutorial_02_vectors.html) -4. [Flexible parallel reduction and prefix-sum](tutorial_03_reduction.html) -5. [For loops](tutorial_04_ForLoops.html) +1. [Building applications with TNL](tutorial_building_applications_with_tnl.html) +2. [Arrays](tutorial_Arrays.html) +3. [Vectors](tutorial_Vectors.html) +4. [Flexible parallel reduction and scan](tutorial_ReductionAndScan.html) +5. [For loops](tutorial_ForLoops.html) -- GitLab From 13972fe39044788b22a8effe9cc2cbd7b74645f3 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Thu, 21 Nov 2019 14:03:26 +0100 Subject: [PATCH 27/43] Writing tutorial on UniquePointer. 
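
The core pattern from the new tutorial (`ArrayHost` is an alias defined in
the example):

    Pointers::UniquePointer< ArrayHost > array_host_ptr( 10 );
    *array_host_ptr = 1;  // dereference directly on the host
    std::cout << "Array = " << *array_host_ptr << std::endl;
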
---
 Documentation/Tutorials/CMakeLists.txt          |  1 +
 Documentation/Tutorials/Pointers/CMakeLists.txt | 12 ++++++++++++
 Documentation/Tutorials/index.md                |  1 +
 3 files changed, 14 insertions(+)
 create mode 100644 Documentation/Tutorials/Pointers/CMakeLists.txt

diff --git a/Documentation/Tutorials/CMakeLists.txt b/Documentation/Tutorials/CMakeLists.txt
index 8f9971d47..56fbb202c 100644
--- a/Documentation/Tutorials/CMakeLists.txt
+++ b/Documentation/Tutorials/CMakeLists.txt
@@ -3,3 +3,4 @@ add_subdirectory( Arrays )
 add_subdirectory( Vectors )
 add_subdirectory( ReductionAndScan )
 add_subdirectory( ForLoops )
+add_subdirectory( Pointers )
diff --git a/Documentation/Tutorials/Pointers/CMakeLists.txt b/Documentation/Tutorials/Pointers/CMakeLists.txt
new file mode 100644
index 000000000..de824666c
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/CMakeLists.txt
@@ -0,0 +1,12 @@
+IF( BUILD_CUDA )
+   CUDA_ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cu )
+   ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out )
+ELSE()
+   ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cu )
+   ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out )
+ENDIF()
+
+IF( BUILD_CUDA )
+ADD_CUSTOM_TARGET( TutorialsPointers ALL DEPENDS
+   UniquePointerExample.out )
+ENDIF()
diff --git a/Documentation/Tutorials/index.md b/Documentation/Tutorials/index.md
index f14b79c3c..0dd60716f 100644
--- a/Documentation/Tutorials/index.md
+++ b/Documentation/Tutorials/index.md
@@ -7,3 +7,4 @@
 3. [Vectors](tutorial_Vectors.html)
 4. [Flexible parallel reduction and scan](tutorial_ReductionAndScan.html)
 5. [For loops](tutorial_ForLoops.html)
+6. [Cross-device pointers](tutorial_Pointers.html)
-- 
GitLab

From 02753058b0a2e15055b2b32497a8ee7e8d3a5181 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber
Date: Thu, 21 Nov 2019 14:13:05 +0100
Subject: [PATCH 28/43] Added example on UniquePointer.

---
 .../Pointers/UniquePointerExample.cpp         | 41 +++++++++++++++++++
 .../Pointers/UniquePointerExample.cu          |  1 +
 .../Tutorials/Pointers/tutorial_Pointers.md   | 37 +++++++++++++++++
 3 files changed, 79 insertions(+)
 create mode 100644 Documentation/Tutorials/Pointers/UniquePointerExample.cpp
 create mode 120000 Documentation/Tutorials/Pointers/UniquePointerExample.cu
 create mode 100644 Documentation/Tutorials/Pointers/tutorial_Pointers.md

diff --git a/Documentation/Tutorials/Pointers/UniquePointerExample.cpp b/Documentation/Tutorials/Pointers/UniquePointerExample.cpp
new file mode 100644
index 000000000..55eb9e9c3
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/UniquePointerExample.cpp
@@ -0,0 +1,41 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Array.h>
+#include <TNL/Pointers/UniquePointer.h>
+
+
+using namespace TNL;
+
+using ArrayHost = Containers::Array< int, Devices::Host >;
+using ArrayCuda = Containers::Array< int, Devices::Cuda >;
+
+__global__ void checkArray( const ArrayCuda* ptr )
+{
+   printf( "Array size is: %d\n", ptr->getSize() );
+   for( int i = 0; i < ptr->getSize(); i++ )
+      printf( "a[ %d ] = %d \n", i, ( *ptr )[ i ] );
+}
+
+int main( int argc, char* argv[] )
+{
+
+   /***
+    * Make a unique pointer to an array on the CPU and manipulate the
+    * array via the pointer.
+    */
+   Pointers::UniquePointer< ArrayHost > array_host_ptr( 10 );
+   *array_host_ptr = 1;
+   std::cout << "Array = " << *array_host_ptr << std::endl;
+
+   /***
+    * Let's do the same in CUDA
+    */
+#ifdef HAVE_CUDA
+   Pointers::UniquePointer< ArrayCuda > array_cuda_ptr( 10 );
+   array_cuda_ptr.modifyData< Devices::Host >() = 1;
+   //Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+   //checkArray<<< 1, 1 >>>( &array_cuda_ptr.getData< Devices::Cuda >() );
+#endif
+   return EXIT_SUCCESS;
+}
+
diff --git a/Documentation/Tutorials/Pointers/UniquePointerExample.cu b/Documentation/Tutorials/Pointers/UniquePointerExample.cu
new file mode 120000
index 000000000..a7c9828d5
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/UniquePointerExample.cu
@@ -0,0 +1 @@
+UniquePointerExample.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/Pointers/tutorial_Pointers.md b/Documentation/Tutorials/Pointers/tutorial_Pointers.md
new file mode 100644
index 000000000..8e4106ad3
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/tutorial_Pointers.md
@@ -0,0 +1,37 @@
+\page tutorial_Pointers Cross-device pointers tutorial
+
+## Introduction
+
+Smart pointers in TNL are motivated by the smart pointers in the STL library. In addition, they work across different devices and so they make data management easier.
+
+## Table of Contents
+1. [Unique pointers](#unique_pointers)
+2. [Shared pointers](#shared_pointers)
+3. [Device pointers](#device_pointers)
+
+
+## Unique pointers
+
+Similar to the STL smart pointer `std::unique_ptr`, `UniquePointer` is a smart poinetr managing a certain dynamically allocated object. The object is automatically deallocated when the pointer goes out of scope. The definition of `UniquePointer` reads as:
+
+```
+template< typename Object, typename Device = typename Object::DeviceType >
+class UniquePointer;
+```
+
+It takes two template parameters:
+
+1. `Object` is a type of object managed by the pointer.
+2. `Device` is a device where the object is to be allocated.
+
+If the device type is `Devices::Host`, `UniquePointer` behaves as a usual unique smart pointer. If the device is different, `Devices::Cuda` for example, the unique pointer creates an image of the object also in the host memory. It means that one can manipulate the object on the host. All smart pointers are registered in a special register using which they can be easily synchronised before calling a CUDA kernel. This means that all modified images of the objects in the memory are transferred to the GPU. See the following example:
+
+\include UniquePointerExample.cpp
+
+The result looks as follows:
+
+\include UniquePointerExample.out
+
+## Shared pointers
+
+## Device pointers
-- 
GitLab

From f11da7b1849f932e24cafc144df9337e21e4274c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?=
Date: Thu, 21 Nov 2019 20:47:19 +0100
Subject: [PATCH 29/43] Writing SharedPointer tutorial.
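
The pattern the tutorial builds on, condensed from the new
SharedPointerExample (`Tuple`, `ArrayCuda` and the kernel printArrays are
defined in the example):

    Tuple t;                                           // holds two SharedPointer< ArrayCuda >
    t.a1.modifyData< Devices::Host >().setSize( 10 );  // manipulate the host image
    t.a1.modifyData< Devices::Host >() = 1;
    Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
    printArrays<<< 1, 1 >>>( t );                      // the tuple is passed by value
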
---
 .../Tutorials/Pointers/CMakeLists.txt         | 17 ++++--
 .../Pointers/SharedPointerExample.cpp         | 49 ++++++++++++++
 .../Pointers/SharedPointerExample.cu          |  1 +
 .../Pointers/UniquePointerExample.cpp         | 28 ++++---
 .../Pointers/UniquePointerHostExample.cpp     | 23 ++++++
 .../Tutorials/Pointers/tutorial_Pointers.md   | 76 ++++++++++++++++++-
 6 files changed, 173 insertions(+), 21 deletions(-)
 create mode 100644 Documentation/Tutorials/Pointers/SharedPointerExample.cpp
 create mode 120000 Documentation/Tutorials/Pointers/SharedPointerExample.cu
 create mode 100644 Documentation/Tutorials/Pointers/UniquePointerHostExample.cpp

diff --git a/Documentation/Tutorials/Pointers/CMakeLists.txt b/Documentation/Tutorials/Pointers/CMakeLists.txt
index de824666c..bf6581c52 100644
--- a/Documentation/Tutorials/Pointers/CMakeLists.txt
+++ b/Documentation/Tutorials/Pointers/CMakeLists.txt
@@ -1,12 +1,23 @@
 IF( BUILD_CUDA )
    CUDA_ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cu )
    ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out )
+   CUDA_ADD_EXECUTABLE( SharedPointerExample SharedPointerExample.cu )
+   ADD_CUSTOM_COMMAND( COMMAND SharedPointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SharedPointerExample.out OUTPUT SharedPointerExample.out )
 ELSE()
-   ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cu )
+   ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cpp )
    ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out )
 ENDIF()
 
+ADD_EXECUTABLE( UniquePointerHostExample UniquePointerHostExample.cpp )
+ADD_CUSTOM_COMMAND( COMMAND UniquePointerHostExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerHostExample.out OUTPUT UniquePointerHostExample.out )
+
+
 IF( BUILD_CUDA )
-ADD_CUSTOM_TARGET( TutorialsPointers ALL DEPENDS
-   UniquePointerExample.out )
+ADD_CUSTOM_TARGET( TutorialsPointersCuda ALL DEPENDS
+   UniquePointerExample.out
+   SharedPointerExample.out )
 ENDIF()
+
+ADD_CUSTOM_TARGET( TutorialsPointers ALL DEPENDS
+   UniquePointerHostExample.out
+)
\ No newline at end of file
diff --git a/Documentation/Tutorials/Pointers/SharedPointerExample.cpp b/Documentation/Tutorials/Pointers/SharedPointerExample.cpp
new file mode 100644
index 000000000..8df827b05
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/SharedPointerExample.cpp
@@ -0,0 +1,49 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Array.h>
+#include <TNL/Pointers/SharedPointer.h>
+
+using namespace TNL;
+
+using ArrayCuda = Containers::Array< int, Devices::Cuda >;
+
+struct Tuple
+{
+   Pointers::SharedPointer< ArrayCuda > a1, a2;
+};
+
+__global__ void printArrays( const Tuple t )
+{
+   printf( "Array size is: %d\n", t.a1->getSize() );
+   for( int i = 0; i < t.a1->getSize(); i++ )
+      printf( "a1[ %d ] = %d \n", i, ( *t.a1 )[ i ] );
+   for( int i = 0; i < t.a2->getSize(); i++ )
+      printf( "a2[ %d ] = %d \n", i, ( *t.a2 )[ i ] );
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Create a tuple of arrays and print them in a CUDA kernel
+    */
+#ifdef HAVE_CUDA
+   Tuple t;
+   t.a1.modifyData< Devices::Host >().setSize( 10 );
+   t.a1.modifyData< Devices::Host >() = 1;
+   t.a2.modifyData< Devices::Host >().setSize( 10 );
+   t.a2.modifyData< Devices::Host >() = 2;
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+   printArrays<<< 1, 1 >>>( t );
+
+   /***
+    * Resize the arrays
+    */
+   t.a1.modifyData< Devices::Host >().setSize( 5 );
+   t.a1.modifyData< Devices::Host >() = 3;
+   t.a2.modifyData< Devices::Host >().setSize( 5 );
+   t.a2.modifyData< Devices::Host >() = 4;
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+   printArrays<<< 1, 1 >>>( t );
+#endif
+   return EXIT_SUCCESS;
+
+}
+
diff --git a/Documentation/Tutorials/Pointers/SharedPointerExample.cu b/Documentation/Tutorials/Pointers/SharedPointerExample.cu
new file mode 120000
index 000000000..7d10e3312
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/SharedPointerExample.cu
@@ -0,0 +1 @@
+SharedPointerExample.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/Pointers/UniquePointerExample.cpp b/Documentation/Tutorials/Pointers/UniquePointerExample.cpp
index 55eb9e9c3..6f25305e9 100644
--- a/Documentation/Tutorials/Pointers/UniquePointerExample.cpp
+++ b/Documentation/Tutorials/Pointers/UniquePointerExample.cpp
@@ -3,13 +3,11 @@
 #include <TNL/Containers/Array.h>
 #include <TNL/Pointers/UniquePointer.h>
-
 
 using namespace TNL;
 
-using ArrayHost = Containers::Array< int, Devices::Host >;
 using ArrayCuda = Containers::Array< int, Devices::Cuda >;
 
-__global__ void checkArray( const ArrayCuda* ptr )
+__global__ void printArray( const ArrayCuda* ptr )
 {
    printf( "Array size is: %d\n", ptr->getSize() );
    for( int i = 0; i < ptr->getSize(); i++ )
@@ -18,24 +16,24 @@ __global__ void printArray( const ArrayCuda* ptr )
 
 int main( int argc, char* argv[] )
 {
-
    /***
-    * Make a unique pointer to an array on the CPU and manipulate the
-    * array via the pointer.
+    * Create an array and print its elements in a CUDA kernel
     */
-   Pointers::UniquePointer< ArrayHost > array_host_ptr( 10 );
-   *array_host_ptr = 1;
-   std::cout << "Array = " << *array_host_ptr << std::endl;
+#ifdef HAVE_CUDA
+   Pointers::UniquePointer< ArrayCuda > array_ptr( 10 );
+   array_ptr.modifyData< Devices::Host >() = 1;
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+   printArray<<< 1, 1 >>>( &array_ptr.getData< Devices::Cuda >() );
 
    /***
-    * Let's do the same in CUDA
+    * Resize the array and print it again
     */
-#ifdef HAVE_CUDA
-   Pointers::UniquePointer< ArrayCuda > array_cuda_ptr( 10 );
-   array_cuda_ptr.modifyData< Devices::Host >() = 1;
-   //Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
-   //checkArray<<< 1, 1 >>>( &array_cuda_ptr.getData< Devices::Cuda >() );
+   array_ptr->setSize( 5 );
+   array_ptr.modifyData< Devices::Host >() = 2;
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+   printArray<<< 1, 1 >>>( &array_ptr.getData< Devices::Cuda >() );
 #endif
    return EXIT_SUCCESS;
+
 }
diff --git a/Documentation/Tutorials/Pointers/UniquePointerHostExample.cpp b/Documentation/Tutorials/Pointers/UniquePointerHostExample.cpp
new file mode 100644
index 000000000..1fd0bba0e
--- /dev/null
+++ b/Documentation/Tutorials/Pointers/UniquePointerHostExample.cpp
@@ -0,0 +1,23 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Array.h>
+#include <TNL/Pointers/UniquePointer.h>
+
+using namespace TNL;
+
+using ArrayHost = Containers::Array< int, Devices::Host >;
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Make a unique pointer to an array on the CPU and manipulate the
+    * array via the pointer.
+    */
+   Pointers::UniquePointer< ArrayHost > array_ptr( 10 );
+   *array_ptr = 1;
+   std::cout << "Array size is " << array_ptr->getSize() << std::endl;
+   std::cout << "Array = " << *array_ptr << std::endl;
+   return EXIT_SUCCESS;
+}
+
+
diff --git a/Documentation/Tutorials/Pointers/tutorial_Pointers.md b/Documentation/Tutorials/Pointers/tutorial_Pointers.md
index 8e4106ad3..d8b3f907d 100644
--- a/Documentation/Tutorials/Pointers/tutorial_Pointers.md
+++ b/Documentation/Tutorials/Pointers/tutorial_Pointers.md
@@ -2,7 +2,7 @@
 
 ## Introduction
 
-Smart pointers in TNL are motivated by the smart pointers in the STL library. In addition, they work across different devices and so they make data management easier.
In addition, they work across different devices and so they make data management easier. +Smart pointers in TNL are motivated by the smart pointerin the STL library. In addition, they can manage image of the object they hold on different devices which makes objects offloading easier. ## Table of Contents 1. [Unique pointers](#unique_pointers) @@ -12,7 +12,7 @@ Smart pointers in TNL are motivated by the smart pointerin the STL library. In a ## Unique pointers -Simillar to STL smart pointer `std::unique_ptr` `UniquePointer` is a smart poinetr managing certain dynamicaly allocated object. The object is automatically deallocated when the pointer goes out of scope. The definition of `UniquePointer` reads as: +Simillar to STL smart pointer `std::unique_ptr` `UniquePointer` is a smart pointer managing certain dynamicaly allocated object. The object is automatically deallocated when the pointer goes out of scope. The definition of `UniquePointer` reads as: ``` template< typename Object, typename Device = typename Object::DeviceType > @@ -24,7 +24,16 @@ It takes two template parameters: 1. `Object` is a type of object managed by the pointer. 2. `Device` is a device where the object is to be allocated. -If the device type is `Devices::Host`, `UniquePointer` behaves as usual unique smart pointer. If the device is different, `Devices::Cuda` for example, the unique pointer creates an image if the object even in the host memory. It means, that one can manipulate the object on the host. All smart pointers are registered in a special register using which they can be easily synchronised before calling a CUDA kernel. This means that all modified images of the objects in the memory are transferred on the GPU. See the following example: +If the device type is `Devices::Host`, `UniquePointer` behaves as usual unique smart pointer. See the following example: + +\include UniquePointerHostExample.cpp + +The result is: + +\include UniquePointerHostExample.out + + +If the device is different, `Devices::Cuda` for example, the unique pointer creates an image if the object even in the host memory. It means, that one can manipulate the object on the host. All smart pointers are registered in a special register using which they can be easily synchronised with the host images before calling a CUDA kernel. This means that all modified images of the objects in the memory are transferred on the GPU. See the following example: \include UniquePointerExample.cpp @@ -32,6 +41,67 @@ The result looks as: \include UniquePointerExample.out +A disadventage of `UniquePointer` is that it cannot be passed to the CUDA kernel since it requires making a copy of it. This is, however, from the nature of this object, prohibited. Not only this is solved by a `SharedPointer`. + ## Shared pointers +One of the main goals of the TNL library is to make the development of the HPC code, including GPU kernels as easy and efficient as possible. One way to do this is to profit from the object opriented programming even in CUDA kernels. Let us explain it on arrays. From certain point of view `Array` can be understood as an object consisiting of data and metadata. Data part means elements that we insert into the array. Metadata is a pointer to the data but also size of the array. This information makes use of the class easier. Though it is not necessary in any situations it may help to check array bounds when accessing the array elements for example. 
It is something that, when it is performed even in CUDA kernels, may help significantly with finding bugs in the code. To do this, we need to transfer on the GPU not only pointers to the data but also complete metadata. It is simple if the structure which is supposed to be transferred on the GPU does not have pointers to metadata. See the following example: + +``` +struct Array +{ + double* data; + int size; +}; +``` + +If the pointer `data` points to memory on the GPU, this array can be passed to a kernel like this: + +``` +Array a; +cudaKernel<<< gridSize, blockSize >>>( a ); +``` + +The kernel `cudaKernel` can access the data as follows: + +``` +__global__ void cudaKernel( Array a ) +{ + if( threadIdx.x < a.size ) + a.data[ threadIdx.x ] = 0; +} +``` + +But what if we have an object like this: + +``` +struct ArrayTuple +{ + Array *a1, *a2; +}; +``` + +Assume that there is an instance of `ArrayTuple`, let us say `tuple`, containing pointers to instances `a1` and `a2` of `Array`. The instances must be allocated on the GPU if one wants to simply pass the `tuple` to the CUDA kernel. Indeed, the CUDA kernel needs the arrays `a1` and `a2` to be on the GPU. See the following example: + +``` +__global__ void tupleKernel( ArrayTuple tuple ) +{ + if( threadIdx.x < tuple.a1->size ) + tuple.a1->data[ threadIdx.x ] = 0; + if( threadIdx.x < tuple.a2->size ) + tuple.a2->data[ threadIdx.x ] = 0; +} + +``` + +See that the kernel needs to dereference `tuple.a1` and `tuple.a2`. Therefore these pointers must point to the global memory of the GPU, which means that the arrays `a1` and `a2` must be allocated there using, let us say, [cudaMalloc](http://developer.download.nvidia.com/compute/cuda/2_3/toolkit/docs/online/group__CUDART__MEMORY_gc63ffd93e344b939d6399199d8b12fef.html). It means, however, that the arrays `a1` and `a2` cannot be managed (for example resizing them requires changing `a1->size` and `a2->size`) on the host system by the CPU. The only solution to this is to have images of `a1` and `a2` in the host memory and to copy them to the GPU before calling the CUDA kernel. One must not forget to modify the pointers in the `tuple` to point to the array copies on the GPU. To simplify this, TNL offers *cross-device shared smart pointers*. In addition to common smart pointers, they can manage images of an object on different devices. Note that [CUDA Unified Memory](https://devblogs.nvidia.com/unified-memory-cuda-beginners/) is an answer to this problem as well. TNL cross-device smart pointers can be more efficient in some situations. (TODO: Prove this with a benchmark problem.) + +The previous example could be implemented in TNL as follows: + +\include SharedPointerExample.cpp + +The result looks as: + +\include SharedPointerExample.out + ## Device pointers -- GitLab From f11da7b1849f932e24cafc144df9337e21e4274c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 25 Nov 2019 14:41:46 +0100 Subject: [PATCH 30/43] SharedPointer dereferencing is device sensitive. --- src/TNL/Pointers/SharedPointerCuda.h | 20 +++++++++++++++ .../Pointers/SharedPointerCudaTest.cu | 14 +++++++++++++ 2 files changed, 34 insertions(+) diff --git a/src/TNL/Pointers/SharedPointerCuda.h b/src/TNL/Pointers/SharedPointerCuda.h index 54dd4ee3c..510f172d8 100644 --- a/src/TNL/Pointers/SharedPointerCuda.h +++ b/src/TNL/Pointers/SharedPointerCuda.h @@ -383,30 +383,50 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer return this->allocate( args...
); } + __cuda_callable__ const Object* operator->() const { TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); +#ifdef __CUDA_ARCH__ + return this->cuda_pointer; +#else return &this->pd->data; +#endif } + __cuda_callable__ Object* operator->() { TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); +#ifdef __CUDA_ARCH__ + return this->cuda_pointer; +#else this->pd->maybe_modified = true; return &this->pd->data; +#endif } + __cuda_callable__ const Object& operator *() const { TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); +#ifdef __CUDA_ARCH__ + return *( this->cuda_pointer ); +#else return this->pd->data; +#endif } + __cuda_callable__ Object& operator *() { TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); +#ifdef __CUDA_ARCH__ + return *( this->cuda_pointer ); +#else this->pd->maybe_modified = true; return this->pd->data; +#endif } __cuda_callable__ diff --git a/src/UnitTests/Pointers/SharedPointerCudaTest.cu b/src/UnitTests/Pointers/SharedPointerCudaTest.cu index 83b6b4793..d21f4319c 100644 --- a/src/UnitTests/Pointers/SharedPointerCudaTest.cu +++ b/src/UnitTests/Pointers/SharedPointerCudaTest.cu @@ -77,6 +77,14 @@ __global__ void copyArrayKernel( const TNL::Containers::Array< int, Devices::Cud } } +__global__ void copyArrayKernel2( const Pointers::SharedPointer< TNL::Containers::Array< int, Devices::Cuda > > inArray, + int* outArray ) +{ + if( threadIdx.x < 2 ) + { + outArray[ threadIdx.x ] = ( *inArray )[ threadIdx.x ]; + } +} #endif TEST( SharedPointerCudaTest, getDataArrayTest ) @@ -100,6 +108,12 @@ TEST( SharedPointerCudaTest, getDataArrayTest ) ASSERT_EQ( testArray_host[ 0 ], 1 ); ASSERT_EQ( testArray_host[ 1 ], 2 ); + copyArrayKernel2<<< 1, 2 >>>( ptr, testArray_device ); + cudaMemcpy( testArray_host, testArray_device, 2 * sizeof( int ), cudaMemcpyDeviceToHost ); + + ASSERT_EQ( testArray_host[ 0 ], 1 ); + ASSERT_EQ( testArray_host[ 1 ], 2 ); + delete[] testArray_host; cudaFree( testArray_device ); -- GitLab From fc532769f78f726f4ccc84159f7b12dff84d8333 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 25 Nov 2019 14:42:54 +0100 Subject: [PATCH 31/43] Writing SharePointer tutorial. 
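
The tutorial builds on the device-sensitive dereferencing introduced in the
previous commit: a SharedPointer (or a structure holding several of them) can be
passed to a CUDA kernel by value, and operator* / operator-> then resolve to the
device image inside the kernel. A minimal sketch of this pattern (`ArrayCuda` is
the alias used in the examples; `kernel` is an illustrative __global__ function
taking the pointer by value):

    Pointers::SharedPointer< ArrayCuda > ptr( 10 );
    *ptr = 1;                                                     // modifies the host image
    Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
    kernel<<< 1, 1 >>>( ptr );                                    // dereferences the device image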
--- .../Pointers/SharedPointerExample.cpp | 39 ++++++++----- .../Pointers/codeSnippetSharedPointer-1.cpp | 6 ++ .../Pointers/codeSnippetSharedPointer-2.cpp | 2 + .../Pointers/codeSnippetSharedPointer-3.cpp | 5 ++ .../Pointers/codeSnippetSharedPointer-4.cpp | 4 ++ .../Pointers/codeSnippetSharedPointer-5.cpp | 7 +++ .../Pointers/codeSnippetUniquePointer.cpp | 2 + .../Tutorials/Pointers/tutorial_Pointers.md | 57 +++++-------------- 8 files changed, 64 insertions(+), 58 deletions(-) create mode 100644 Documentation/Tutorials/Pointers/codeSnippetSharedPointer-1.cpp create mode 100644 Documentation/Tutorials/Pointers/codeSnippetSharedPointer-2.cpp create mode 100644 Documentation/Tutorials/Pointers/codeSnippetSharedPointer-3.cpp create mode 100644 Documentation/Tutorials/Pointers/codeSnippetSharedPointer-4.cpp create mode 100644 Documentation/Tutorials/Pointers/codeSnippetSharedPointer-5.cpp create mode 100644 Documentation/Tutorials/Pointers/codeSnippetUniquePointer.cpp diff --git a/Documentation/Tutorials/Pointers/SharedPointerExample.cpp b/Documentation/Tutorials/Pointers/SharedPointerExample.cpp index 8df827b05..5facaf201 100644 --- a/Documentation/Tutorials/Pointers/SharedPointerExample.cpp +++ b/Documentation/Tutorials/Pointers/SharedPointerExample.cpp @@ -9,14 +9,26 @@ using ArrayCuda = Containers::Array< int, Devices::Cuda >; struct Tuple { + Tuple( const int size ): + a1( size ), a2( size ){}; + + void setSize( const int size ) + { + a1->setSize( size ); + a2->setSize( size ); + } + Pointers::SharedPointer< ArrayCuda > a1, a2; }; -__global__ void checkArray( const Tuple t ) +__global__ void printTuple( const Tuple t ) { - printf( "Array size is: %d\n", ptr->getSize() ); - for( int i = 0; i < ptr->getSize(); i++ ) - printf( "a[ %d ] = %d \n", i, ( *ptr )[ i ] ); + printf( "Tuple size is: %d\n", t.a1->getSize() ); + for( int i = 0; i < t.a1->getSize(); i++ ) + { + printf( "a1[ %d ] = %d \n", i, ( *t.a1 )[ i ] ); + printf( "a2[ %d ] = %d \n", i, ( *t.a2 )[ i ] ); + } } int main( int argc, char* argv[] ) @@ -25,23 +37,20 @@ int main( int argc, char* argv[] ) * Create a tuple of arrays and print the in CUDA kernel */ #ifdef HAVE_CUDA - Tuple t; - t.a1.modifyData< Devices::Host >().setSize( 10 ); - t.a1.modifyData< Devices::Host >() = 1; - t.a2.modifyData< Devices::Host >().setSize( 10 ); - t.a2.modifyData< Devices::Host >() = 2; + Tuple t( 3 ); + *t.a1 = 1; + *t.a2 = 2; Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); - printkArrays<<< 1, 1 >>>( t ); + printTuple<<< 1, 1 >>>( t ); /*** * Resize the array */ - t.a1.modifyData< Devices::Host >().setSize( 5 ); - t.a1.modifyData< Devices::Host >() = 3; - t.a2.modifyData< Devices::Host >().setSize( 5 ); - t.a2.modifyData< Devices::Host >() = 4; + t.setSize( 5 ); + *t.a1 = 3; + *t.a2 = 4; Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); - printArrays<<< 1, 1 >>>( t ); + printTuple<<< 1, 1 >>>( t ); #endif return EXIT_SUCCESS; diff --git a/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-1.cpp b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-1.cpp new file mode 100644 index 000000000..e3753a279 --- /dev/null +++ b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-1.cpp @@ -0,0 +1,6 @@ +struct Array +{ + double* data; + int size; +}; + diff --git a/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-2.cpp b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-2.cpp new file mode 100644 index 000000000..c46e4cfc1 --- /dev/null +++ 
b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-2.cpp @@ -0,0 +1,2 @@ +Array a; +cudaKernel<<< gridSize, blockSize >>>( a ); \ No newline at end of file diff --git a/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-3.cpp b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-3.cpp new file mode 100644 index 000000000..8d4be832d --- /dev/null +++ b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-3.cpp @@ -0,0 +1,5 @@ +__global__ void cudaKernel( Array a ) +{ + if( threadIdx.x < a.size ) + a.data[ threadIdx.x ] = 0; +} \ No newline at end of file diff --git a/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-4.cpp b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-4.cpp new file mode 100644 index 000000000..eb723a6d8 --- /dev/null +++ b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-4.cpp @@ -0,0 +1,4 @@ +struct ArrayTuple +{ + Array *a1, *a2; +}; \ No newline at end of file diff --git a/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-5.cpp b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-5.cpp new file mode 100644 index 000000000..0fe942e15 --- /dev/null +++ b/Documentation/Tutorials/Pointers/codeSnippetSharedPointer-5.cpp @@ -0,0 +1,7 @@ +__global__ void tupleKernel( ArrayTuple tuple ) +{ + if( threadIdx.x < tuple.a1->size ) + tuple.a1->data[ threadIdx.x ] = 0; + if( threadIdx.x < tuple.a2->size ) + tuple.a2->data[ threadIdx.x ] = 0; +} diff --git a/Documentation/Tutorials/Pointers/codeSnippetUniquePointer.cpp b/Documentation/Tutorials/Pointers/codeSnippetUniquePointer.cpp new file mode 100644 index 000000000..bd180fe27 --- /dev/null +++ b/Documentation/Tutorials/Pointers/codeSnippetUniquePointer.cpp @@ -0,0 +1,2 @@ +template< typename Object, typename Device = typename Object::DeviceType > +class UniquePointer; \ No newline at end of file diff --git a/Documentation/Tutorials/Pointers/tutorial_Pointers.md b/Documentation/Tutorials/Pointers/tutorial_Pointers.md index d8b3f907d..25072ec00 100644 --- a/Documentation/Tutorials/Pointers/tutorial_Pointers.md +++ b/Documentation/Tutorials/Pointers/tutorial_Pointers.md @@ -2,7 +2,7 @@ ## Introduction -Smart pointers in TNL are motivated by the smart pointerin the STL library. In addition, they can manage image of the object they hold on different devices which makes objects offloading easier. +Smart pointers in TNL are motivated by the smart pointers in the STL library. In addition, they can manage an image of the object they hold on different devices, which is supposed to make object offloading easier. ## Table of Contents 1. [Unique pointers](#unique_pointers) @@ -12,12 +12,9 @@ Smart pointers in TNL are motivated by the smart pointerin the STL library. In a ## Unique pointers -Simillar to STL smart pointer `std::unique_ptr` `UniquePointer` is a smart pointer managing certain dynamicaly allocated object. The object is automatically deallocated when the pointer goes out of scope. The definition of `UniquePointer` reads as: +Similar to the STL unique smart pointer `std::unique_ptr`, `UniquePointer` manages a certain dynamically allocated object. The object is automatically deallocated when the pointer goes out of scope.
The definition of `UniquePointer` reads as: -``` -template< typename Object, typename Device = typename Object::DeviceType > -class UniquePointer; -``` +\include codeSnippetUniquePointer.cpp It takes two template parameters: @@ -33,7 +30,7 @@ The result is: \include UniquePointerHostExample.out -If the device is different, `Devices::Cuda` for example, the unique pointer creates an image of the object even in the host memory. It allows one to manipulate the object on the host. All smart pointers are registered in a special register using which they can be easily synchronised with the host images before calling a CUDA kernel. This means that all modified images of the objects in the memory are transferred on the GPU. See the following example: +If the device is different, `Devices::Cuda` for example, the unique pointer creates an image of the object even in the host memory. It allows one to manipulate the object on the host. All smart pointers are registered in a special register through which they can be synchronised with the host images before calling a CUDA kernel - all at once. This means that all modified images of the objects in the host memory are transferred to the GPU. See the following example: @@ -41,58 +38,30 @@ The result looks as: \include UniquePointerExample.out -A disadventage of `UniquePointer` is that it cannot be passed to the CUDA kernel since it requires making a copy of it. This is, however, from the nature of this object, prohibited. Not only this is solved by a `SharedPointer`. +A disadvantage of `UniquePointer` is that it cannot be passed to the CUDA kernel since it requires making a copy of itself. This is, however, from the nature of this object, prohibited. Not only for this reason, TNL also offers a `SharedPointer`. ## Shared pointers -One of the main goals of the TNL library is to make the development of the HPC code, including GPU kernels as easy and efficient as possible. One way to do this is to profit from the object opriented programming even in CUDA kernels. Let us explain it on arrays. From certain point of view `Array` can be understood as an object consisiting of data and metadata. Data part means elements that we insert into the array. Metadata is a pointer to the data but also size of the array. This information makes use of the class easier. Though it is not necessary in any situations it may help to check array bounds when accessing the array elements for example. It is something that, when it is performed even in CUDA kernels, may help significantly with finding bugs in the code. To do this, we need to transfer on the GPU not only pointers to the data but also complete metadata. It is simple if the structure which is supposed to be transferred on the GPU does not have pointers to metadata. See the following example: +One of the main goals of the TNL library is to make the development of HPC code, including GPU kernels, as easy and efficient as possible. One way to do this is to profit from object-oriented programming even in CUDA kernels. Let us explain it on arrays. From a certain point of view, `Array` can be understood as an object consisting of data and metadata. The data part means the elements that we insert into the array. The metadata consist of a pointer to the data but also the size of the array. This information makes the class easier to use, for example by checking array bounds when accessing the array elements.
It is something that, when it is performed even in CUDA kernels, may help significantly with finding bugs in the code. To do this, we need to transfer not only the pointers to the data but also the complete metadata to the device. It is simple if the structure which is supposed to be transferred to the GPU does not have pointers to metadata. See the following example: + + +\include codeSnippetSharedPointer-1.cpp If the pointer `data` points to memory on the GPU, this array can be passed to a kernel like this: -``` -Array a; -cudaKernel<<< gridSize, blockSize >>>( a ); -``` +\include codeSnippetSharedPointer-2.cpp The kernel `cudaKernel` can access the data as follows: -``` -__global__ void cudaKernel( Array a ) -{ - if( threadIdx.x < a.size ) - a.data[ threadIdx.x ] = 0; -} -``` +\include codeSnippetSharedPointer-3.cpp But what if we have an object like this: -``` -struct ArrayTuple -{ - Array *a1, *a2; -}; -``` +\include codeSnippetSharedPointer-4.cpp Assume that there is an instance of `ArrayTuple`, let us say `tuple`, containing pointers to instances `a1` and `a2` of `Array`. The instances must be allocated on the GPU if one wants to simply pass the `tuple` to the CUDA kernel. Indeed, the CUDA kernel needs the arrays `a1` and `a2` to be on the GPU. See the following example: -``` -__global__ void tupleKernel( ArrayTuple tuple ) -{ - if( threadIdx.x < tuple.a1->size ) - tuple.a1->data[ threadIdx.x ] = 0; - if( threadIdx.x < tuple.a2->size ) - tuple.a2->data[ threadIdx.x ] = 0; -} - -``` +\include codeSnippetSharedPointer-5.cpp See that the kernel needs to dereference `tuple.a1` and `tuple.a2`. Therefore these pointers must point to the global memory of the GPU, which means that the arrays `a1` and `a2` must be allocated there using, let us say, [cudaMalloc](http://developer.download.nvidia.com/compute/cuda/2_3/toolkit/docs/online/group__CUDART__MEMORY_gc63ffd93e344b939d6399199d8b12fef.html). It means, however, that the arrays `a1` and `a2` cannot be managed (for example resizing them requires changing `a1->size` and `a2->size`) on the host system by the CPU. The only solution to this is to have images of `a1` and `a2` in the host memory and to copy them to the GPU before calling the CUDA kernel. One must not forget to modify the pointers in the `tuple` to point to the array copies on the GPU. To simplify this, TNL offers *cross-device shared smart pointers*. In addition to common smart pointers, they can manage images of an object on different devices. Note that [CUDA Unified Memory](https://devblogs.nvidia.com/unified-memory-cuda-beginners/) is an answer to this problem as well. TNL cross-device smart pointers can be more efficient in some situations. (TODO: Prove this with a benchmark problem.) @@ -105,3 +74,5 @@ The result looks as: \include SharedPointerExample.out ## Device pointers + +The last type of the smart pointer implemented in TNL is `DevicePointer`. It works the same way as `SharedPointer` but it does not create new object on the host system. `DevicePointer` is therefore useful in situation when there is already an object created in the host memory and we want to create its image even on the device. Both images are linked one with each other and so one can just manipulate the one on the host and then synchronize it on the device. -- GitLab From 809e8337552664fe8ea7c50a47c2bc2542b9695e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 25 Nov 2019 16:20:05 +0100 Subject: [PATCH 32/43] Fixed ParallelFor example.
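
The lambda in `initVector` writes through the view, so it needs a non-const
view; `getConstView()` returns a read-only view. A minimal sketch of the
corrected pattern, using the names from the diff below:

    auto view = v.getView();   // writable view, safe to capture by value in the lambda
    auto init = [=] __cuda_callable__ ( int i, const double c ) mutable { view[ i ] = c; };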
--- Documentation/Examples/Algorithms/ParallelForExample.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/Examples/Algorithms/ParallelForExample.cpp b/Documentation/Examples/Algorithms/ParallelForExample.cpp index 41ea2f1a2..9c056fa1d 100644 --- a/Documentation/Examples/Algorithms/ParallelForExample.cpp +++ b/Documentation/Examples/Algorithms/ParallelForExample.cpp @@ -13,7 +13,7 @@ template< typename Device > void initVector( Vector< double, Device >& v, const double& c ) { - auto view = v.getConstView(); + auto view = v.getView(); auto init = [=] __cuda_callable__ ( int i, const double c ) mutable { view[ i ] = c; }; -- GitLab From 4805fad4740906462bc8c7e88f18bd75ec7b4458 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 25 Nov 2019 16:20:27 +0100 Subject: [PATCH 33/43] Extending UniquePointer and SharedPointer tutorials. --- Documentation/Tutorials/Pointers/tutorial_Pointers.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/Tutorials/Pointers/tutorial_Pointers.md b/Documentation/Tutorials/Pointers/tutorial_Pointers.md index 25072ec00..d8fbef6ee 100644 --- a/Documentation/Tutorials/Pointers/tutorial_Pointers.md +++ b/Documentation/Tutorials/Pointers/tutorial_Pointers.md @@ -38,7 +38,7 @@ The result looks as: \include UniquePointerExample.out -A disadvantage of `UniquePointer` is that it cannot be passed to the CUDA kernel since it requires making a copy of itself. This is, however, from the nature of this object, prohibited. Not only for this reason, TNL also offers a `SharedPointer`. +A disadvantage of `UniquePointer` is that it cannot be passed to the CUDA kernel since it requires making a copy of itself. This is, however, from the nature of this object, prohibited. For this reason, we have to dereference the pointer on the host. This is done by the method `getData`. Its template parameter tells which object image we want to dereference - the one on the host or the one on the device. When passing the object to the device, we need to get the device image. The method `getData` returns a constant reference to the object. A non-constant reference is accessible via the method `modifyData`. When this method is used to get a reference to the host image, the pointer is marked as **potentially modified**. Note that we need a non-constant reference even when we want to change only the data (the array elements, for example) but not the metadata (the array size, for example). If the metadata do not change, there is no need to synchronize the object image with the one on the device. To distinguish between these two situations, the smart pointer keeps one more object image which stores the metadata state since the last synchronization. Before the device image is synchronised, the host image and the last-synchronization-state image are compared, and if they do not differ, no synchronization is required. One can see that TNL cross-device smart pointers are really meant only for small objects, otherwise the smart pointer overhead might be significant. ## Shared pointers @@ -73,6 +73,8 @@ The result looks as: \include SharedPointerExample.out +One of the differences between `UniquePointer` and `SharedPointer` is that the `SharedPointer` can be passed to a CUDA kernel. Dereferencing by the operators `*` and `->` can be done in kernels as well, and the result is a reference to the proper object image, i.e. on the host or on the device.
When these operators are used on a constant smart pointer, a constant reference is returned, which is the same as calling the method `getData` with an explicitly stated `Device` template parameter. In the case of a non-constant `SharedPointer`, a non-constant reference is obtained, which has the same effect as calling the `modifyData` method. On the host system, everything that was mentioned in the section about `UniquePointer` holds for the `SharedPointer` as well. In addition, a `modifyData` method call or non-constant dereferencing can be done in a kernel on the device. In this case, the programmer gets a non-constant reference to the object, which is meant to be used to change the data managed by the object but not the metadata. There is no way to synchronize objects managed by the smart pointers from the device to the host. **It means that the metadata should not be changed on the device!** In fact, it would not make sense. Imagine changing the array size or re-allocating the array within a CUDA kernel. This is something one should never do. + ## Device pointers -- GitLab From 537407ec9a19e066f2711679c1cb8b5267994d9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 25 Nov 2019 16:58:41 +0100 Subject: [PATCH 34/43] Writing documentation on DevicePointer. --- .../Tutorials/Pointers/CMakeLists.txt | 5 +- .../Pointers/DevicePointerExample.cpp | 54 +++++++++++++++++++ .../Pointers/DevicePointerExample.cu | 1 + .../Pointers/SharedPointerExample.cpp | 4 +- .../Tutorials/Pointers/tutorial_Pointers.md | 8 ++- src/TNL/Pointers/DevicePointer.h | 20 +++++++ 6 files changed, 88 insertions(+), 4 deletions(-) create mode 100644 Documentation/Tutorials/Pointers/DevicePointerExample.cpp create mode 120000 Documentation/Tutorials/Pointers/DevicePointerExample.cu diff --git a/Documentation/Tutorials/Pointers/CMakeLists.txt b/Documentation/Tutorials/Pointers/CMakeLists.txt index bf6581c52..0535e8fd5 100644 --- a/Documentation/Tutorials/Pointers/CMakeLists.txt +++ b/Documentation/Tutorials/Pointers/CMakeLists.txt @@ -3,6 +3,8 @@ IF( BUILD_CUDA ) ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out ) CUDA_ADD_EXECUTABLE( SharedPointerExample SharedPointerExample.cu ) ADD_CUSTOM_COMMAND( COMMAND SharedPointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SharedPointerExample.out OUTPUT SharedPointerExample.out ) + CUDA_ADD_EXECUTABLE( DevicePointerExample DevicePointerExample.cu ) + ADD_CUSTOM_COMMAND( COMMAND DevicePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DevicePointerExample.out OUTPUT DevicePointerExample.out ) ELSE() ADD_EXECUTABLE( UniquePointerExample UniquePointerExample.cpp ) ADD_CUSTOM_COMMAND( COMMAND UniquePointerExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out ) @@ -15,7 +17,8 @@ ADD_CUSTOM_COMMAND( COMMAND UniquePointerHostExample > ${TNL_DOCUMENTATION_OUTPU IF( BUILD_CUDA ) ADD_CUSTOM_TARGET( TutorialsPointersCuda ALL DEPENDS
UniquePointerExample.out - SharedPointerExample.out ) + SharedPointerExample.out + DevicePointerExample.out ) ENDIF() ADD_CUSTOM_TARGET( TutorialsPointers ALL DEPENDS diff --git a/Documentation/Tutorials/Pointers/DevicePointerExample.cpp b/Documentation/Tutorials/Pointers/DevicePointerExample.cpp new file mode 100644 index 000000000..144ae98b0 --- /dev/null +++ b/Documentation/Tutorials/Pointers/DevicePointerExample.cpp @@ -0,0 +1,54 @@ +#include +#include +#include +#include + +using namespace TNL; + +using ArrayCuda = Containers::Array< int, Devices::Cuda >; + +struct Tuple +{ + Tuple( ArrayCuda& _a1, ArrayCuda& _a2 ): + a1( _a1 ), a2( _a2 ){}; + + Pointers::DevicePointer< ArrayCuda > a1, a2; +}; + +__global__ void printTuple( const Tuple t ) +{ + printf( "Tuple size is: %d\n", t.a1->getSize() ); + for( int i = 0; i < t.a1->getSize(); i++ ) + { + printf( "a1[ %d ] = %d \n", i, ( *t.a1 )[ i ] ); + printf( "a2[ %d ] = %d \n", i, ( *t.a2 )[ i ] ); + } +} + +int main( int argc, char* argv[] ) +{ + /*** + * Create a tuple of arrays and print them in CUDA kernel + */ +#ifdef HAVE_CUDA + ArrayCuda a1( 3 ), a2( 3 ); + Tuple t( a1, a2 ); + a1 = 1; + a2 = 2; + Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); + printTuple<<< 1, 1 >>>( t ); + + /*** + * Resize the arrays + */ + a1.setSize( 5 ); + a2.setSize( 5 ); + a1 = 3; + a2 = 4; + Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); + printTuple<<< 1, 1 >>>( t ); +#endif + return EXIT_SUCCESS; + +} + diff --git a/Documentation/Tutorials/Pointers/DevicePointerExample.cu b/Documentation/Tutorials/Pointers/DevicePointerExample.cu new file mode 120000 index 000000000..b17ef30da --- /dev/null +++ b/Documentation/Tutorials/Pointers/DevicePointerExample.cu @@ -0,0 +1 @@ +DevicePointerExample.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Pointers/SharedPointerExample.cpp b/Documentation/Tutorials/Pointers/SharedPointerExample.cpp index 5facaf201..287aae8e8 100644 --- a/Documentation/Tutorials/Pointers/SharedPointerExample.cpp +++ b/Documentation/Tutorials/Pointers/SharedPointerExample.cpp @@ -34,7 +34,7 @@ __global__ void printTuple( const Tuple t ) int main( int argc, char* argv[] ) { /*** - * Create a tuple of arrays and print the in CUDA kernel + * Create a tuple of arrays and print them in CUDA kernel */ #ifdef HAVE_CUDA Tuple t( 3 ); @@ -44,7 +44,7 @@ int main( int argc, char* argv[] ) printTuple<<< 1, 1 >>>( t ); /*** - * Resize the array + * Resize the arrays */ t.setSize( 5 ); *t.a1 = 3; diff --git a/Documentation/Tutorials/Pointers/tutorial_Pointers.md b/Documentation/Tutorials/Pointers/tutorial_Pointers.md index d8fbef6ee..f9ef457e4 100644 --- a/Documentation/Tutorials/Pointers/tutorial_Pointers.md +++ b/Documentation/Tutorials/Pointers/tutorial_Pointers.md @@ -77,4 +77,10 @@ One of the differences between `UniquePointer` and `SmartPointer` is that the `S ## Device pointers -The last type of the smart pointer implemented in TNL is `DevicePointer`. It works the same way as `SharedPointer` but it does not create new object on the host system. `DevicePointer` is therefore useful in situation when there is already an object created in the host memory and we want to create its image even on the device. Both images are linked one with each other and so one can just manipulate the one on the host and then synchronize it on the device. +The last type of the smart pointer implemented in TNL is `DevicePointer`. It works the same way as `SharedPointer` but it does not create new object on the host system. 
`DevicePointer` is therefore useful in situation when there is already an object created in the host memory and we want to create its image even on the device. Both images are linked one with each other and so one can just manipulate the one on the host and then synchronize it on the device. +The last type of the smart pointer implemented in TNL is `DevicePointer`. It works the same way as `SharedPointer` but it does not create a new object on the host system. `DevicePointer` is therefore useful in situations where an object already exists in the host memory and we want to create its image on the device as well. Both images are linked to each other, so one can manipulate the one on the host and then synchronize it to the device. The following listing is a modification of the previous example with the tuple: + +\include DevicePointerExample.cpp + +The result looks the same: + +\include DevicePointerExample.out diff --git a/src/TNL/Pointers/DevicePointer.h b/src/TNL/Pointers/DevicePointer.h index 5276c3ed4..b72aaf9b1 100644 --- a/src/TNL/Pointers/DevicePointer.h +++ b/src/TNL/Pointers/DevicePointer.h @@ -267,26 +267,46 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer pointer.cuda_pointer = nullptr; } + __cuda_callable__ const Object* operator->() const { +#ifdef __CUDA_ARCH__ + return this->cuda_pointer; +#else return this->pointer; +#endif } + __cuda_callable__ Object* operator->() { +#ifdef __CUDA_ARCH__ + return this->cuda_pointer; +#else this->pd->maybe_modified = true; return this->pointer; +#endif } + __cuda_callable__ const Object& operator *() const { +#ifdef __CUDA_ARCH__ + return *( this->cuda_pointer ); +#else return *( this->pointer ); +#endif } + __cuda_callable__ Object& operator *() { +#ifdef __CUDA_ARCH__ + return *( this->cuda_pointer ); +#else this->pd->maybe_modified = true; return *( this->pointer ); +#endif } __cuda_callable__ -- GitLab From fa91036bf24524823461abe9046fca86d9fd17d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 25 Nov 2019 19:25:22 +0100 Subject: [PATCH 35/43] Writing documentation on UniquePointer. --- Documentation/Examples/CMakeLists.txt | 1 + .../Examples/Pointers/CMakeLists.txt | 14 + .../Pointers/DevicePointerExample.cpp | 54 ++++ .../Examples/Pointers/DevicePointerExample.cu | 1 + .../Pointers/SharedPointerExample.cpp | 58 +++++ .../Examples/Pointers/SharedPointerExample.cu | 1 + .../Pointers/UniquePointerExample.cpp | 41 +++ .../Examples/Pointers/UniquePointerExample.cu | 1 + .../Pointers/UniquePointerExample.cpp | 3 +- src/TNL/Pointers/UniquePointer.h | 240 +++++++++++++++++- 10 files changed, 409 insertions(+), 5 deletions(-) create mode 100644 Documentation/Examples/Pointers/CMakeLists.txt create mode 100644 Documentation/Examples/Pointers/DevicePointerExample.cpp create mode 120000 Documentation/Examples/Pointers/DevicePointerExample.cu create mode 100644 Documentation/Examples/Pointers/SharedPointerExample.cpp create mode 120000 Documentation/Examples/Pointers/SharedPointerExample.cu create mode 100644 Documentation/Examples/Pointers/UniquePointerExample.cpp create mode 120000 Documentation/Examples/Pointers/UniquePointerExample.cu diff --git a/Documentation/Examples/CMakeLists.txt b/Documentation/Examples/CMakeLists.txt index 29b9a9657..45689f9e9 100644 --- a/Documentation/Examples/CMakeLists.txt +++ b/Documentation/Examples/CMakeLists.txt @@ -1,5 +1,6 @@ ADD_SUBDIRECTORY( Algorithms ) ADD_SUBDIRECTORY( Containers ) +ADD_SUBDIRECTORY( Pointers ) ADD_EXECUTABLE( FileExample FileExample.cpp ) ADD_CUSTOM_COMMAND( COMMAND FileExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/FileExample.out OUTPUT FileExample.out ) diff --git a/Documentation/Examples/Pointers/CMakeLists.txt b/Documentation/Examples/Pointers/CMakeLists.txt new file mode 100644 index 000000000..e33b34f28 --- /dev/null +++ b/Documentation/Examples/Pointers/CMakeLists.txt @@ -0,0 +1,14 @@ +IF( BUILD_CUDA ) + CUDA_ADD_EXECUTABLE(UniquePointerExampleCuda
UniquePointerExample.cu) + ADD_CUSTOM_COMMAND( COMMAND UniquePointerExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UniquePointerExample.out OUTPUT UniquePointerExample.out ) + CUDA_ADD_EXECUTABLE(SharedPointerExampleCuda SharedPointerExample.cu) + ADD_CUSTOM_COMMAND( COMMAND SharedPointerExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SharedPointerExample.out OUTPUT SharedPointerExample.out ) + CUDA_ADD_EXECUTABLE(DevicePointerExampleCuda DevicePointerExample.cu) + ADD_CUSTOM_COMMAND( COMMAND DevicePointerExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DevicePointerExample.out OUTPUT DevicePointerExample.out ) +ENDIF() + +ADD_CUSTOM_TARGET( RunPointersExamples ALL DEPENDS + UniquePointerExample.out + SharedPointerExample.out + DevicePointerExample.out + ) diff --git a/Documentation/Examples/Pointers/DevicePointerExample.cpp b/Documentation/Examples/Pointers/DevicePointerExample.cpp new file mode 100644 index 000000000..144ae98b0 --- /dev/null +++ b/Documentation/Examples/Pointers/DevicePointerExample.cpp @@ -0,0 +1,54 @@ +#include +#include +#include +#include + +using namespace TNL; + +using ArrayCuda = Containers::Array< int, Devices::Cuda >; + +struct Tuple +{ + Tuple( ArrayCuda& _a1, ArrayCuda& _a2 ): + a1( _a1 ), a2( _a2 ){}; + + Pointers::DevicePointer< ArrayCuda > a1, a2; +}; + +__global__ void printTuple( const Tuple t ) +{ + printf( "Tuple size is: %d\n", t.a1->getSize() ); + for( int i = 0; i < t.a1->getSize(); i++ ) + { + printf( "a1[ %d ] = %d \n", i, ( *t.a1 )[ i ] ); + printf( "a2[ %d ] = %d \n", i, ( *t.a2 )[ i ] ); + } +} + +int main( int argc, char* argv[] ) +{ + /*** + * Create a tuple of arrays and print them in CUDA kernel + */ +#ifdef HAVE_CUDA + ArrayCuda a1( 3 ), a2( 3 ); + Tuple t( a1, a2 ); + a1 = 1; + a2 = 2; + Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); + printTuple<<< 1, 1 >>>( t ); + + /*** + * Resize the arrays + */ + a1.setSize( 5 ); + a2.setSize( 5 ); + a1 = 3; + a2 = 4; + Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); + printTuple<<< 1, 1 >>>( t ); +#endif + return EXIT_SUCCESS; + +} + diff --git a/Documentation/Examples/Pointers/DevicePointerExample.cu b/Documentation/Examples/Pointers/DevicePointerExample.cu new file mode 120000 index 000000000..b17ef30da --- /dev/null +++ b/Documentation/Examples/Pointers/DevicePointerExample.cu @@ -0,0 +1 @@ +DevicePointerExample.cpp \ No newline at end of file diff --git a/Documentation/Examples/Pointers/SharedPointerExample.cpp b/Documentation/Examples/Pointers/SharedPointerExample.cpp new file mode 100644 index 000000000..287aae8e8 --- /dev/null +++ b/Documentation/Examples/Pointers/SharedPointerExample.cpp @@ -0,0 +1,58 @@ +#include +#include +#include +#include + +using namespace TNL; + +using ArrayCuda = Containers::Array< int, Devices::Cuda >; + +struct Tuple +{ + Tuple( const int size ): + a1( size ), a2( size ){}; + + void setSize( const int size ) + { + a1->setSize( size ); + a2->setSize( size ); + } + + Pointers::SharedPointer< ArrayCuda > a1, a2; +}; + +__global__ void printTuple( const Tuple t ) +{ + printf( "Tuple size is: %d\n", t.a1->getSize() ); + for( int i = 0; i < t.a1->getSize(); i++ ) + { + printf( "a1[ %d ] = %d \n", i, ( *t.a1 )[ i ] ); + printf( "a2[ %d ] = %d \n", i, ( *t.a2 )[ i ] ); + } +} + +int main( int argc, char* argv[] ) +{ + /*** + * Create a tuple of arrays and print them in CUDA kernel + */ +#ifdef HAVE_CUDA + Tuple t( 3 ); + *t.a1 = 1; + *t.a2 = 2; + Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); + 
printTuple<<< 1, 1 >>>( t ); + + /*** + * Resize the arrays + */ + t.setSize( 5 ); + *t.a1 = 3; + *t.a2 = 4; + Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); + printTuple<<< 1, 1 >>>( t ); +#endif + return EXIT_SUCCESS; + +} + diff --git a/Documentation/Examples/Pointers/SharedPointerExample.cu b/Documentation/Examples/Pointers/SharedPointerExample.cu new file mode 120000 index 000000000..7d10e3312 --- /dev/null +++ b/Documentation/Examples/Pointers/SharedPointerExample.cu @@ -0,0 +1 @@ +SharedPointerExample.cpp \ No newline at end of file diff --git a/Documentation/Examples/Pointers/UniquePointerExample.cpp b/Documentation/Examples/Pointers/UniquePointerExample.cpp new file mode 100644 index 000000000..6a42d34b1 --- /dev/null +++ b/Documentation/Examples/Pointers/UniquePointerExample.cpp @@ -0,0 +1,41 @@ +#include +#include +#include +#include + +using namespace TNL; + +using ArrayCuda = Containers::Array< int, Devices::Cuda >; + +#ifdef HAVE_CUDA +__global__ void printArray( const ArrayCuda* ptr ) +{ + printf( "Array size is: %d\n", ptr->getSize() ); + for( int i = 0; i < ptr->getSize(); i++ ) + printf( "a[ %d ] = %d \n", i, ( *ptr )[ i ] ); +} +#endif + +int main( int argc, char* argv[] ) +{ + /*** + * Create an array and print its elements in CUDA kernel + */ +#ifdef HAVE_CUDA + Pointers::UniquePointer< ArrayCuda > array_ptr( 10 ); + array_ptr.modifyData< Devices::Host >() = 1; + Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); + printArray<<< 1, 1 >>>( &array_ptr.getData< Devices::Cuda >() ); + + /*** + * Resize the array and print it again + */ + array_ptr.modifyData< Devices::Host >().setSize( 5 ); + array_ptr.modifyData< Devices::Host >() = 2; + std::cout << array_ptr.modifyData< Devices::Host >().getSize() << std::endl; + Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); + printArray<<< 1, 1 >>>( &array_ptr.getData< Devices::Cuda >() ); +#endif + return EXIT_SUCCESS; +} + diff --git a/Documentation/Examples/Pointers/UniquePointerExample.cu b/Documentation/Examples/Pointers/UniquePointerExample.cu new file mode 120000 index 000000000..a7c9828d5 --- /dev/null +++ b/Documentation/Examples/Pointers/UniquePointerExample.cu @@ -0,0 +1 @@ +UniquePointerExample.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Pointers/UniquePointerExample.cpp b/Documentation/Tutorials/Pointers/UniquePointerExample.cpp index 6f25305e9..ad49cbdd6 100644 --- a/Documentation/Tutorials/Pointers/UniquePointerExample.cpp +++ b/Documentation/Tutorials/Pointers/UniquePointerExample.cpp @@ -7,12 +7,14 @@ using namespace TNL; using ArrayCuda = Containers::Array< int, Devices::Cuda >; +#ifdef HAVE_CUDA __global__ void printArray( const ArrayCuda* ptr ) { printf( "Array size is: %d\n", ptr->getSize() ); for( int i = 0; i < ptr->getSize(); i++ ) printf( "a[ %d ] = %d \n", i, ( *ptr )[ i ] ); } +#endif int main( int argc, char* argv[] ) { @@ -34,6 +36,5 @@ int main( int argc, char* argv[] ) printArray<<< 1, 1 >>>( &array_ptr.getData< Devices::Cuda >() ); #endif return EXIT_SUCCESS; - } diff --git a/src/TNL/Pointers/UniquePointer.h b/src/TNL/Pointers/UniquePointer.h index 071de4d51..baa93e589 100644 --- a/src/TNL/Pointers/UniquePointer.h +++ b/src/TNL/Pointers/UniquePointer.h @@ -25,65 +25,153 @@ namespace TNL { namespace Pointers { +/** + * \brief Cross-device unique smart pointer. + * + * This smart pointer is inspired by std::unique_ptr from STL library. 
It means + * that the object owned by the smart pointer is accessible only through this + * smart pointer. One cannot make any copy of this smart pointer. In addition, + * the smart pointer is able to work across different devices, which means that the + * object owned by the smart pointer is mirrored on both the host and the device. + * + * **NOTE: When using smart pointers to pass objects to the GPU, one must call + * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >() + * before calling a CUDA kernel working with smart pointers.** + * + * \tparam Object is a type of object to be owned by the pointer. + * \tparam Device is the device where the object is to be allocated. The object is + * always allocated on the host system as well for easier object manipulation. + * + * See also \ref SharedPointer and \ref DevicePointer. + * + * See also \ref UniquePointer< Object, Devices::Host > and \ref UniquePointer< Object, Devices::Cuda >. + * + * \par Example + * \include Pointers/UniquePointerExample.cpp + * \par Output + * \include UniquePointerExample.out + */ template< typename Object, typename Device = typename Object::DeviceType > class UniquePointer { }; +/** + * \brief Specialization of the unique smart pointer for the host system. + * + * \tparam Object is a type of object to be owned by the pointer. + */ template< typename Object > class UniquePointer< Object, Devices::Host > : public SmartPointer { public: - typedef Object ObjectType; - typedef Devices::Host DeviceType; + /** + * \typedef ObjectType is the type of object owned by the pointer. + */ + using ObjectType = Object; + /** + * \typedef DeviceType is the type of device where the object is to be + * mirrored. + */ + using DeviceType = Devices::Host; + + /** + * \brief Constructor of empty pointer. + */ UniquePointer( std::nullptr_t ) : pointer( nullptr ) {} + /** + * \brief Constructor with parameters of the Object constructor. + * + * \tparam Args is the variadic template type of the arguments of the Object constructor. + * \param args are the arguments passed to the Object constructor. + */ template< typename... Args > explicit UniquePointer( const Args... args ) { this->pointer = new Object( args... ); } + /** + * \brief Arrow operator for accessing the object owned by a constant smart pointer. + * + * \return constant pointer to the object owned by this smart pointer. + */ const Object* operator->() const { TNL_ASSERT_TRUE( this->pointer, "Attempt to dereference a null pointer" ); return this->pointer; } + /** + * \brief Arrow operator for accessing the object owned by a non-constant smart pointer. + * + * \return pointer to the object owned by this smart pointer. + */ Object* operator->() { TNL_ASSERT_TRUE( this->pointer, "Attempt to dereference a null pointer" ); return this->pointer; } + /** + * \brief Dereferencing operator for accessing the object owned by a constant smart pointer. + * + * \return constant reference to the object owned by this smart pointer. + */ const Object& operator *() const { TNL_ASSERT_TRUE( this->pointer, "Attempt to dereference a null pointer" ); return *( this->pointer ); } + /** + * \brief Dereferencing operator for accessing the object owned by a non-constant smart pointer. + * + * \return reference to the object owned by this smart pointer. + */ Object& operator *() { TNL_ASSERT_TRUE( this->pointer, "Attempt to dereference a null pointer" ); return *( this->pointer ); } + /** + * \brief Conversion to boolean type. + * + * \return Returns true if the pointer is not empty, false otherwise. 
+ */ __cuda_callable__ operator bool() const { return this->pointer; } + /** + * \brief Negation operator. + * + * \return Returns false if the pointer is not empty, true otherwise. + */ __cuda_callable__ bool operator!() const { return ! this->pointer; } + /** + * \brief Constant object reference getter. + * + * No synchronization of this pointer will be performed due to calling + * this method. + * + * \tparam Device says which image of the object one wants to dereference. It + * can be either \ref DeviceType or Devices::Host. + * \return constant reference to the object image on the given device. + */ template< typename Device = Devices::Host > const Object& getData() const { @@ -91,6 +179,18 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer return *( this->pointer ); } + /** + * \brief Non-constant object reference getter. + * + * After calling this method, the object owned by the pointer might need + * to be synchronized. One should not forget to call + * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >() + * before calling a CUDA kernel using the object from this smart pointer. + * + * \tparam Device says which image of the object one wants to dereference. It + * can be either \ref DeviceType or Devices::Host. + * \return reference to the object image on the given device. + */ template< typename Device = Devices::Host > Object& modifyData() { @@ -98,6 +198,15 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer return *( this->pointer ); } + /** + * \brief Assignment operator. + * + * It assigns the object owned by the pointer \ref ptr to \ref this pointer. + * The original pointer \ref ptr is reset to the empty state. + * + * \param ptr input pointer + * \return constant reference to \ref this + */ const UniquePointer& operator=( UniquePointer& ptr ) { if( this->pointer ) @@ -107,16 +216,36 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer return *this; } + /** + * \brief Move-assignment operator. + * + * It assigns the object owned by the pointer \ref ptr to \ref this pointer. + * The original pointer \ref ptr is reset to the empty state. + * + * \param ptr input pointer + * \return constant reference to \ref this + */ const UniquePointer& operator=( UniquePointer&& ptr ) { return this->operator=( ptr ); } + /** + * \brief Cross-device pointer synchronization. + * + * This method is usually called by the smart pointer register when calling + * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >() + * + * \return true if the synchronization was successful, false otherwise. + */ bool synchronize() { return true; } + /** + * \brief Destructor. + */ ~UniquePointer() { if( this->pointer ) @@ -129,19 +258,41 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer Object* pointer; }; +/** + * \brief Specialization of the unique smart pointer for the CUDA device. + * + * \tparam Object is a type of object to be owned by the pointer. + */ template< typename Object > class UniquePointer< Object, Devices::Cuda > : public SmartPointer { public: - typedef Object ObjectType; - typedef Devices::Cuda DeviceType; + /** + * \typedef ObjectType is the type of object owned by the pointer. + */ + using ObjectType = Object; + + /** + * \typedef DeviceType is the type of device where the object is to be + * mirrored. + */ + using DeviceType = Devices::Cuda; + /** + * \brief Constructor of empty pointer. 
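+ *
+ * The pointer can be made non-empty later, e.g. by assigning a newly
+ * constructed pointer (an illustrative sketch; `Object` stands for any
+ * suitable type and `constructorArgs` for its constructor arguments):
+ * \code
+ * UniquePointer< Object > ptr( nullptr );
+ * if( ! ptr )
+ *    ptr = UniquePointer< Object >( constructorArgs );
+ * \endcode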
+ */ UniquePointer( std::nullptr_t ) : pd( nullptr ), cuda_pointer( nullptr ) {} + /** + * \brief Constructor with parameters of the Object constructor. + * + * \tparam Args is the variadic template type of the arguments of the Object constructor. + * \param args are the arguments passed to the Object constructor. + */ template< typename... Args > explicit UniquePointer( const Args... args ) : pd( nullptr ), @@ -150,12 +301,22 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer cuda_pointer( nullptr ) { this->allocate( args... ); } + /** + * \brief Arrow operator for accessing the object owned by a constant smart pointer. + * + * \return constant pointer to the object owned by this smart pointer. + */ const Object* operator->() const { TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); return &this->pd->data; } + /** + * \brief Arrow operator for accessing the object owned by a non-constant smart pointer. + * + * \return pointer to the object owned by this smart pointer. + */ Object* operator->() { TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); @@ -163,12 +324,22 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer return &this->pd->data; } + /** + * \brief Dereferencing operator for accessing the object owned by a constant smart pointer. + * + * \return constant reference to the object owned by this smart pointer. + */ const Object& operator *() const { TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); return this->pd->data; } + /** + * \brief Dereferencing operator for accessing the object owned by a non-constant smart pointer. + * + * \return reference to the object owned by this smart pointer. + */ Object& operator *() { TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); @@ -176,18 +347,38 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer return this->pd->data; } + /** + * \brief Conversion to boolean type. + * + * \return Returns true if the pointer is not empty, false otherwise. + */ __cuda_callable__ operator bool() const { return this->pd; } + /** + * \brief Negation operator. + * + * \return Returns false if the pointer is not empty, true otherwise. + */ __cuda_callable__ bool operator!() const { return ! this->pd; } + /** + * \brief Constant object reference getter. + * + * No synchronization of this pointer will be performed due to calling + * this method. + * + * \tparam Device says which image of the object one wants to dereference. It + * can be either \ref DeviceType or Devices::Host. + * \return constant reference to the object image on the given device. + */ template< typename Device = Devices::Host > const Object& getData() const { @@ -200,6 +391,18 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer return *( this->cuda_pointer ); } + /** + * \brief Non-constant object reference getter. + * + * After calling this method, the object owned by the pointer might need + * to be synchronized. One should not forget to call + * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >() + * before calling a CUDA kernel using the object from this smart pointer. + * + * \tparam Device says which image of the object one wants to dereference. It + * can be either \ref DeviceType or Devices::Host. + * \return reference to the object image on the given device. 
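+ *
+ * A typical usage pattern, as in the examples of this documentation (assuming
+ * the owned object is an `Array`):
+ * \code
+ * ptr.modifyData< Devices::Host >().setSize( 5 );  // modify the host image
+ * Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();  // transfer the change to the device
+ * \endcode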
+ */ template< typename Device = Devices::Host > Object& modifyData() { @@ -215,6 +418,15 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer return *( this->cuda_pointer ); } + /** + * \brief Assignment operator. + * + * It assigns the object owned by the pointer \ref ptr to \ref this pointer. + * The original pointer \ref ptr is reset to the empty state. + * + * \param ptr input pointer + * \return constant reference to \ref this + */ const UniquePointer& operator=( UniquePointer& ptr ) { this->free(); @@ -225,11 +437,28 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer return *this; } + /** + * \brief Move-assignment operator. + * + * It assigns the object owned by the pointer \ref ptr to \ref this pointer. + * The original pointer \ref ptr is reset to the empty state. + * + * \param ptr input pointer + * \return constant reference to \ref this + */ const UniquePointer& operator=( UniquePointer&& ptr ) { return this->operator=( ptr ); } + /** + * \brief Cross-device pointer synchronization. + * + * This method is usually called by the smart pointer register when calling + * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >() + * + * \return true if the synchronization was successful, false otherwise. + */ bool synchronize() { if( ! this->pd ) @@ -248,6 +477,9 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer #endif } + /** + * \brief Destructor. + */ ~UniquePointer() { this->free(); -- GitLab From 9c8ef8a8c109461fa3e424e79f19a37ac0221ab0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 25 Nov 2019 20:46:58 +0100 Subject: [PATCH 36/43] Writing documentation for SharedPointer. --- .../Pointers/DevicePointerExample.cpp | 2 + .../Pointers/SharedPointerExample.cpp | 2 + src/TNL/Pointers/SharedPointer.h | 26 + src/TNL/Pointers/SharedPointerCuda.h | 558 ++++++++++++------ src/TNL/Pointers/SharedPointerHost.h | 202 ++++++- src/TNL/Pointers/UniquePointer.h | 13 +- 6 files changed, 579 insertions(+), 224 deletions(-) diff --git a/Documentation/Examples/Pointers/DevicePointerExample.cpp b/Documentation/Examples/Pointers/DevicePointerExample.cpp index 144ae98b0..897b92962 100644 --- a/Documentation/Examples/Pointers/DevicePointerExample.cpp +++ b/Documentation/Examples/Pointers/DevicePointerExample.cpp @@ -15,6 +15,7 @@ struct Tuple Pointers::DevicePointer< ArrayCuda > a1, a2; }; +#ifdef HAVE_CUDA __global__ void printTuple( const Tuple t ) { printf( "Tuple size is: %d\n", t.a1->getSize() ); @@ -24,6 +25,7 @@ __global__ void printTuple( const Tuple t ) printf( "a2[ %d ] = %d \n", i, ( *t.a2 )[ i ] ); } } +#endif int main( int argc, char* argv[] ) { diff --git a/Documentation/Examples/Pointers/SharedPointerExample.cpp b/Documentation/Examples/Pointers/SharedPointerExample.cpp index 287aae8e8..be149518c 100644 --- a/Documentation/Examples/Pointers/SharedPointerExample.cpp +++ b/Documentation/Examples/Pointers/SharedPointerExample.cpp @@ -21,6 +21,7 @@ struct Tuple Pointers::SharedPointer< ArrayCuda > a1, a2; }; +#ifdef HAVE_CUDA __global__ void printTuple( const Tuple t ) { printf( "Tuple size is: %d\n", t.a1->getSize() ); @@ -30,6 +31,7 @@ __global__ void printTuple( const Tuple t ) printf( "a2[ %d ] = %d \n", i, ( *t.a2 )[ i ] ); } } +#endif int main( int argc, char* argv[] ) { diff --git a/src/TNL/Pointers/SharedPointer.h b/src/TNL/Pointers/SharedPointer.h index 93f63f807..293434ccd 100644 --- a/src/TNL/Pointers/SharedPointer.h +++ b/src/TNL/Pointers/SharedPointer.h @@ -22,6 +22,32 @@ namespace TNL { namespace 
Pointers { +/** + * \brief Cross-device shared smart pointer. + * + * This smart pointer is inspired by std::shared_ptr from the STL library. It means + * that the object owned by the smart pointer can be shared with other + * smart pointers. One can make a copy of this smart pointer. In addition, + * the smart pointer is able to work across different devices, which means that the + * object owned by the smart pointer is mirrored on both host and device. + * + * **NOTE: When using smart pointers to pass objects on GPU, one must call + * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >() + * before calling a CUDA kernel working with smart pointers.** + * + * \tparam Object is a type of object to be owned by the pointer. + * \tparam Device is the device where the object is to be allocated. The object is + * always allocated on the host system as well for easier object manipulation. + * + * See also \ref UniquePointer and \ref DevicePointer. + * + * See also \ref SharedPointer< Object, Devices::Host > and \ref SharedPointer< Object, Devices::Cuda >. + * + * \par Example + * \include Pointers/SharedPointerExample.cpp + * \par Output + * \include SharedPointerExample.out + */ template< typename Object, typename Device = typename Object::DeviceType > class SharedPointer diff --git a/src/TNL/Pointers/SharedPointerCuda.h b/src/TNL/Pointers/SharedPointerCuda.h index 510f172d8..81951a5e9 100644 --- a/src/TNL/Pointers/SharedPointerCuda.h +++ b/src/TNL/Pointers/SharedPointerCuda.h @@ -28,15 +28,25 @@ namespace Pointers { //#define HAVE_CUDA_UNIFIED_MEMORY -#ifdef HAVE_CUDA_UNIFIED_MEMORY +#if ! defined HAVE_CUDA_UNIFIED_MEMORY + +/** + * \brief Specialization of the UniquePointer for the CUDA device. + * + * \tparam Object is a type of object to be owned by the pointer. + */ template< typename Object > class SharedPointer< Object, Devices::Cuda > : public SmartPointer { private: - // Convenient template alias for controlling the selection of copy- and - // move-constructors and assignment operators using SFINAE. - // The type Object_ is "enabled" iff Object_ and Object are not the same, - // but after removing const and volatile qualifiers they are the same. + /** + * \typedef Enabler + * + * Convenient template alias for controlling the selection of copy- and + * move-constructors and assignment operators using SFINAE. + * The type Object_ is "enabled" iff Object_ and Object are not the same, + * but after removing const and volatile qualifiers they are the same. + */ template< typename Object_ > using Enabler = std::enable_if< ! std::is_same< Object_, Object >::value && std::is_same< typename std::remove_cv< Object >::type, Object_ >::value >; @@ -47,71 +57,129 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer public: + /** + * \typedef ObjectType is the type of object owned by the pointer. + */ using ObjectType = Object; - using DeviceType = Devices::Cuda; + /** + * \typedef DeviceType is the type of device where the object is to be + * mirrored. + */ + using DeviceType = Devices::Cuda; + + /** + * \brief Constructor of empty pointer. + */ SharedPointer( std::nullptr_t ) - : pd( nullptr ) + : pd( nullptr ), + cuda_pointer( nullptr ) {} + /** + * \brief Constructor with parameters of the Object constructor. + * + * \tparam Args is variadic template type of arguments of the Object constructor. + * \tparam args are arguments passed to the Object constructor. + */ template< typename... Args > explicit SharedPointer( Args...
args ) - : pd( nullptr ) + : pd( nullptr ), + cuda_pointer( nullptr ) { -#ifdef TNL_DEBUG_SHARED_POINTERS - std::cerr << "Creating shared pointer to " << getType< ObjectType >() << std::endl; -#endif this->allocate( args... ); } - // this is needed only to avoid the default compiler-generated constructor - SharedPointer( const SharedPointer& pointer ) - : pd( (PointerData*) pointer.pd ) + /** + * \brief Copy constructor. + * + * \param pointer is the source shared pointer. + */ + SharedPointer( const SharedPointer& pointer ) // this is needed only to avoid the default compiler-generated constructor + : pd( (PointerData*) pointer.pd ), + cuda_pointer( pointer.cuda_pointer ) { this->pd->counter += 1; } - // conditional constructor for non-const -> const data + /** + * \brief Copy constructor. + * + * This is specialization for compatible object types. + * + * See \ref Enabler. + * + * \param pointer is the source shared pointer. + */ template< typename Object_, typename = typename Enabler< Object_ >::type > - SharedPointer( const SharedPointer< Object_, DeviceType >& pointer ) - : pd( (PointerData*) pointer.pd ) + SharedPointer( const SharedPointer< Object_, DeviceType >& pointer ) // conditional constructor for non-const -> const data + : pd( (PointerData*) pointer.pd ), + cuda_pointer( pointer.cuda_pointer ) { this->pd->counter += 1; } - // this is needed only to avoid the default compiler-generated constructor - SharedPointer( SharedPointer&& pointer ) - : pd( (PointerData*) pointer.pd ) + /** + * \brief Move constructor. + * + * \param pointer is the source shared pointer. + */ + SharedPointer( SharedPointer&& pointer ) // this is needed only to avoid the default compiler-generated constructor + : pd( (PointerData*) pointer.pd ), + cuda_pointer( pointer.cuda_pointer ) { pointer.pd = nullptr; + pointer.cuda_pointer = nullptr; } - // conditional constructor for non-const -> const data + /** + * \brief Move constructor. + * + * This is specialization for compatible object types. + * + * See \ref Enabler. + * + * \param pointer is the source shared pointer. + */ template< typename Object_, typename = typename Enabler< Object_ >::type > - SharedPointer( SharedPointer< Object_, DeviceType >&& pointer ) - : pd( (PointerData*) pointer.pd ) + SharedPointer( SharedPointer< Object_, DeviceType >&& pointer ) // conditional constructor for non-const -> const data + : pd( (PointerData*) pointer.pd ), + cuda_pointer( pointer.cuda_pointer ) { pointer.pd = nullptr; + pointer.cuda_pointer = nullptr; } + /** + * \brief Create new object based in given constructor parameters. + * + * \tparam Args is variadic template type of arguments to be passed to the + * object constructor. + * \param args are arguments to be passed to the object constructor. + * \return true if recreation was successful, false otherwise. + */ template< typename... Args > bool recreate( Args... args ) { #ifdef TNL_DEBUG_SHARED_POINTERS std::cerr << "Recreating shared pointer to " << getType< ObjectType >() << std::endl; #endif - if( ! this->counter ) + if( ! this->pd ) return this->allocate( args... ); - if( *this->pd->counter == 1 ) + if( this->pd->counter == 1 ) { /**** * The object is not shared -> recreate it in-place, without reallocation */ - this->pd->data.~ObjectType(); - new ( this->pd->data ) ObjectType( args... ); + this->pd->data.~Object(); + new ( &this->pd->data ) Object( args... 
); +#ifdef HAVE_CUDA + cudaMemcpy( (void*) this->cuda_pointer, (void*) &this->pd->data, sizeof( Object ), cudaMemcpyHostToDevice ); +#endif + this->set_last_sync_state(); return true; } @@ -121,167 +189,380 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer return this->allocate( args... ); } + /** + * \brief Arrow operator for accessing the object owned by constant smart pointer. + * + * \return constant pointer to the object owned by this smart pointer. It + * returns pointer to object image on the CUDA device if it is called from CUDA + * kernel and pointer to host image otherwise. + */ + __cuda_callable__ const Object* operator->() const { TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); +#ifdef __CUDA_ARCH__ + return this->cuda_pointer; +#else return &this->pd->data; +#endif } + /** + * \brief Arrow operator for accessing the object owned by non-constant smart pointer. + * + * \return pointer to the object owned by this smart pointer. It + * returns pointer to object image on the CUDA device if it is called from CUDA + * kernel and pointer to host image otherwise. + */ + __cuda_callable__ Object* operator->() { TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); +#ifdef __CUDA_ARCH__ + return this->cuda_pointer; +#else + this->pd->maybe_modified = true; return &this->pd->data; +#endif } + /** + * \brief Dereferencing operator for accessing the object owned by constant smart pointer. + * + * \return constant reference to the object owned by this smart pointer. It + * returns reference to object image on the CUDA device if it is called from CUDA + * kernel and reference to host image otherwise. + */ + __cuda_callable__ const Object& operator *() const { TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); +#ifdef __CUDA_ARCH__ + return *( this->cuda_pointer ); +#else return this->pd->data; +#endif } + /** + * \brief Dereferencing operator for accessing the object owned by non-constant smart pointer. + * + * \return reference to the object owned by this smart pointer. It + * returns reference to object image on the CUDA device if it is called from CUDA + * kernel and reference to host image otherwise. + */ + __cuda_callable__ Object& operator *() { TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); +#ifdef __CUDA_ARCH__ + return *( this->cuda_pointer ); +#else + this->pd->maybe_modified = true; return this->pd->data; +#endif } + /** + * \brief Conversion to boolean type. + * + * \return Returns true if the pointer is not empty, false otherwise. + */ __cuda_callable__ operator bool() const { return this->pd; } + /** + * \brief Negation operator. + * + * \return Returns false if the pointer is not empty, true otherwise. + */ __cuda_callable__ bool operator!() const { return ! this->pd; } + /** + * \brief Constant object reference getter. + * + * No synchronization of this pointer will be performed due to calling + * this method. + * + * \tparam Device says what image of the object one want to dereference. It + * can be either \ref DeviceType or Devices::Host. + * \return constant reference to the object image on given device. + */ template< typename Device = Devices::Host > __cuda_callable__ const Object& getData() const { + static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value, "Only Devices::Host or Devices::Cuda devices are accepted here." 
); TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); - return this->pd->data; + TNL_ASSERT_TRUE( this->cuda_pointer, "Attempt to dereference a null pointer" ); + if( std::is_same< Device, Devices::Host >::value ) + return this->pd->data; + if( std::is_same< Device, Devices::Cuda >::value ) + return *( this->cuda_pointer ); } + /** + * \brief Non-constant object reference getter. + * + * After calling this method, the object owned by the pointer might need + * to be synchronized. One should not forget to call + * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >() + * before calling CUDA kernel using object from this smart pointer. + * + * \tparam Device says what image of the object one want to dereference. It + * can be either \ref DeviceType or Devices::Host. + * \return constant reference to the object image on given device. + */ template< typename Device = Devices::Host > __cuda_callable__ Object& modifyData() { + static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value, "Only Devices::Host or Devices::Cuda devices are accepted here." ); TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); - return this->pd->data; + TNL_ASSERT_TRUE( this->cuda_pointer, "Attempt to dereference a null pointer" ); + if( std::is_same< Device, Devices::Host >::value ) + { + this->pd->maybe_modified = true; + return this->pd->data; + } + if( std::is_same< Device, Devices::Cuda >::value ) + return *( this->cuda_pointer ); } - // this is needed only to avoid the default compiler-generated operator - const SharedPointer& operator=( const SharedPointer& ptr ) + /** + * \brief Assignment operator. + * + * It assigns object owned by the pointer \ref ptr to \ref this pointer. + * + * \param ptr input pointer + * \return constant reference to \ref this + */ + const SharedPointer& operator=( const SharedPointer& ptr ) // this is needed only to avoid the default compiler-generated operator { this->free(); this->pd = (PointerData*) ptr.pd; - if( this->pd != nullptr ) + this->cuda_pointer = ptr.cuda_pointer; + if( this->pd != nullptr ) this->pd->counter += 1; +#ifdef TNL_DEBUG_SHARED_POINTERS + std::cerr << "Copy-assigned shared pointer: counter = " << this->pd->counter << ", type: " << getType< ObjectType >() << std::endl; +#endif return *this; } - // conditional operator for non-const -> const data + /** + * \brief Assignment operator for compatible object types. + * + * It assigns object owned by the pointer \ref ptr to \ref this pointer. + * + * See \ref Enabler. + * + * \param ptr input pointer + * \return constant reference to \ref this + */ template< typename Object_, typename = typename Enabler< Object_ >::type > - const SharedPointer& operator=( const SharedPointer< Object_, DeviceType >& ptr ) + const SharedPointer& operator=( const SharedPointer< Object_, DeviceType >& ptr ) // conditional operator for non-const -> const data { this->free(); this->pd = (PointerData*) ptr.pd; + this->cuda_pointer = ptr.cuda_pointer; if( this->pd != nullptr ) this->pd->counter += 1; +#ifdef TNL_DEBUG_SHARED_POINTERS + std::cerr << "Copy-assigned shared pointer: counter = " << this->pd->counter << ", type: " << getType< ObjectType >() << std::endl; +#endif return *this; } - // this is needed only to avoid the default compiler-generated operator - const SharedPointer& operator=( SharedPointer&& ptr ) + /** + * \brief Move operator. + * + * It assigns object owned by the pointer \ref ptr to \ref this pointer. 
+ * + * \param ptr input pointer + * \return constant reference to \ref this + */ const SharedPointer& operator=( SharedPointer&& ptr ) // this is needed only to avoid the default compiler-generated operator { this->free(); this->pd = (PointerData*) ptr.pd; + this->cuda_pointer = ptr.cuda_pointer; ptr.pd = nullptr; + ptr.cuda_pointer = nullptr; +#ifdef TNL_DEBUG_SHARED_POINTERS + std::cerr << "Move-assigned shared pointer: counter = " << this->pd->counter << ", type: " << getType< ObjectType >() << std::endl; +#endif return *this; } - // conditional operator for non-const -> const data + /** + * \brief Move operator. + * + * It assigns object owned by the pointer \ref ptr to \ref this pointer. + * + * See \ref Enabler. + * + * \param ptr input pointer + * \return constant reference to \ref this + */ template< typename Object_, typename = typename Enabler< Object_ >::type > - const SharedPointer& operator=( SharedPointer< Object_, DeviceType >&& ptr ) + const SharedPointer& operator=( SharedPointer< Object_, DeviceType >&& ptr ) // conditional operator for non-const -> const data { this->free(); this->pd = (PointerData*) ptr.pd; + this->cuda_pointer = ptr.cuda_pointer; ptr.pd = nullptr; + ptr.cuda_pointer = nullptr; +#ifdef TNL_DEBUG_SHARED_POINTERS + std::cerr << "Move-assigned shared pointer: counter = " << this->pd->counter << ", type: " << getType< ObjectType >() << std::endl; +#endif return *this; } + /** + * \brief Cross-device pointer synchronization. + * + * It copies the host image of the object to the CUDA device image if the + * host image was modified. This method is usually called by the smart pointers register when calling + * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(). + * + * \return true if the synchronization was successful, false otherwise. + */ bool synchronize() { + if( ! this->pd ) + return true; +#ifdef HAVE_CUDA + if( this->modified() ) + { +#ifdef TNL_DEBUG_SHARED_POINTERS + std::cerr << "Synchronizing shared pointer: counter = " << this->pd->counter << ", type: " << getType< ObjectType >() << std::endl; + std::cerr << "   ( " << sizeof( Object ) << " bytes, CUDA address " << this->cuda_pointer << " )" << std::endl; +#endif + TNL_ASSERT( this->cuda_pointer, ); + cudaMemcpy( (void*) this->cuda_pointer, (void*) &this->pd->data, sizeof( Object ), cudaMemcpyHostToDevice ); + TNL_CHECK_CUDA_DEVICE; + this->set_last_sync_state(); + return true; + } return true; +#else + return false; +#endif } + /** + * \brief Reset the pointer to empty state. + */ void clear() { this->free(); } + /** + * \brief Swap the owned object with another pointer. + * + * \param ptr2 the other shared pointer for swapping. + */ void swap( SharedPointer& ptr2 ) { std::swap( this->pd, ptr2.pd ); + std::swap( this->cuda_pointer, ptr2.cuda_pointer ); } + /** + * \brief Destructor. + */ ~SharedPointer() { this->free(); + getSmartPointersRegister< DeviceType >().remove( this ); } - protected: struct PointerData { Object data; + char data_image[ sizeof(Object) ]; int counter; + bool maybe_modified; template< typename... Args > explicit PointerData( Args... args ) : data( args... ), - counter( 1 ) + counter( 1 ), + maybe_modified( false ) {} }; template< typename... Args > bool allocate( Args... args ) { -#ifdef HAVE_CUDA - if( cudaMallocManaged( ( void** ) &this->pd, sizeof( PointerData ) != cudaSuccess ) ) - return false; - new ( this->pd ) PointerData( args... ); - return true; -#else - return false; + this->pd = new PointerData( args...
); + // pass to device + this->cuda_pointer = Cuda::passToDevice( this->pd->data ); + // set last-sync state + this->set_last_sync_state(); +#ifdef TNL_DEBUG_SHARED_POINTERS + std::cerr << "Created shared pointer to " << getType< ObjectType >() << " (cuda_pointer = " << this->cuda_pointer << ")" << std::endl; #endif + getSmartPointersRegister< DeviceType >().insert( this ); + return true; + } + + void set_last_sync_state() + { + TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); + std::memcpy( (void*) &this->pd->data_image, (void*) &this->pd->data, sizeof( Object ) ); + this->pd->maybe_modified = false; + } + + bool modified() + { + TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); + // optimization: skip bitwise comparison if we're sure that the data is the same + if( ! this->pd->maybe_modified ) + return false; + return std::memcmp( (void*) &this->pd->data_image, (void*) &this->pd->data, sizeof( Object ) ) != 0; } void free() { if( this->pd ) { +#ifdef TNL_DEBUG_SHARED_POINTERS + std::cerr << "Freeing shared pointer: counter = " << this->pd->counter << ", cuda_pointer = " << this->cuda_pointer << ", type: " << getType< ObjectType >() << std::endl; +#endif if( ! --this->pd->counter ) { -#ifdef HAVE_CUDA - cudaFree( this->pd ); -#endif + delete this->pd; this->pd = nullptr; + if( this->cuda_pointer ) + Cuda::freeFromDevice( this->cuda_pointer ); +#ifdef TNL_DEBUG_SHARED_POINTERS + std::cerr << "...deleted data." << std::endl; +#endif } } } PointerData* pd; -}; - -#else // HAVE_CUDA_UNIFIED_MEMORY + // cuda_pointer can't be part of PointerData structure, since we would be + // unable to dereference this-pd on the device + Object* cuda_pointer; +}; + +#else +// Implementation with CUDA unified memory. It is very slow, we keep it only for experimental reasons. template< typename Object > class SharedPointer< Object, Devices::Cuda > : public SmartPointer { @@ -304,22 +585,22 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer using DeviceType = Devices::Cuda; SharedPointer( std::nullptr_t ) - : pd( nullptr ), - cuda_pointer( nullptr ) + : pd( nullptr ) {} template< typename... Args > explicit SharedPointer( Args... args ) - : pd( nullptr ), - cuda_pointer( nullptr ) + : pd( nullptr ) { +#ifdef TNL_DEBUG_SHARED_POINTERS + std::cerr << "Creating shared pointer to " << getType< ObjectType >() << std::endl; +#endif this->allocate( args... 
); } // this is needed only to avoid the default compiler-generated constructor SharedPointer( const SharedPointer& pointer ) - : pd( (PointerData*) pointer.pd ), - cuda_pointer( pointer.cuda_pointer ) + : pd( (PointerData*) pointer.pd ) { this->pd->counter += 1; } @@ -328,30 +609,25 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer template< typename Object_, typename = typename Enabler< Object_ >::type > SharedPointer( const SharedPointer< Object_, DeviceType >& pointer ) - : pd( (PointerData*) pointer.pd ), - cuda_pointer( pointer.cuda_pointer ) + : pd( (PointerData*) pointer.pd ) { this->pd->counter += 1; } // this is needed only to avoid the default compiler-generated constructor SharedPointer( SharedPointer&& pointer ) - : pd( (PointerData*) pointer.pd ), - cuda_pointer( pointer.cuda_pointer ) + : pd( (PointerData*) pointer.pd ) { pointer.pd = nullptr; - pointer.cuda_pointer = nullptr; } // conditional constructor for non-const -> const data template< typename Object_, typename = typename Enabler< Object_ >::type > SharedPointer( SharedPointer< Object_, DeviceType >&& pointer ) - : pd( (PointerData*) pointer.pd ), - cuda_pointer( pointer.cuda_pointer ) + : pd( (PointerData*) pointer.pd ) { pointer.pd = nullptr; - pointer.cuda_pointer = nullptr; } template< typename... Args > bool recreate( Args... args ) { #ifdef TNL_DEBUG_SHARED_POINTERS std::cerr << "Recreating shared pointer to " << getType< ObjectType >() << std::endl; #endif - if( ! this->pd ) + if( ! this->pd ) return this->allocate( args... ); - if( this->pd->counter == 1 ) + if( this->pd->counter == 1 ) { /**** * The object is not shared -> recreate it in-place, without reallocation */ - this->pd->data.~Object(); - new ( &this->pd->data ) Object( args... ); -#ifdef HAVE_CUDA - cudaMemcpy( (void*) this->cuda_pointer, (void*) &this->pd->data, sizeof( Object ), cudaMemcpyHostToDevice ); -#endif - this->set_last_sync_state(); + this->pd->data.~ObjectType(); + new ( &this->pd->data ) ObjectType( args... ); return true; } @@ -383,50 +655,28 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer return this->allocate( args... ); } - __cuda_callable__ const Object* operator->() const { TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); -#ifdef __CUDA_ARCH__ - return this->cuda_pointer; -#else return &this->pd->data; -#endif } - __cuda_callable__ Object* operator->() { TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); -#ifdef __CUDA_ARCH__ - return this->cuda_pointer; -#else - this->pd->maybe_modified = true; return &this->pd->data; -#endif } - __cuda_callable__ const Object& operator *() const { TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); -#ifdef __CUDA_ARCH__ - return *( this->cuda_pointer ); -#else return this->pd->data; -#endif } - __cuda_callable__ Object& operator *() { TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); -#ifdef __CUDA_ARCH__ - return *( this->cuda_pointer ); -#else - this->pd->maybe_modified = true; return this->pd->data; -#endif } __cuda_callable__ @@ -445,29 +695,16 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer __cuda_callable__ const Object& getData() const { - static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value, "Only Devices::Host or Devices::Cuda devices are accepted here."
); TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); - TNL_ASSERT_TRUE( this->cuda_pointer, "Attempt to dereference a null pointer" ); - if( std::is_same< Device, Devices::Host >::value ) - return this->pd->data; - if( std::is_same< Device, Devices::Cuda >::value ) - return *( this->cuda_pointer ); + return this->pd->data; } template< typename Device = Devices::Host > __cuda_callable__ Object& modifyData() { - static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value, "Only Devices::Host or Devices::Cuda devices are accepted here." ); TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); - TNL_ASSERT_TRUE( this->cuda_pointer, "Attempt to dereference a null pointer" ); - if( std::is_same< Device, Devices::Host >::value ) - { - this->pd->maybe_modified = true; - return this->pd->data; - } - if( std::is_same< Device, Devices::Cuda >::value ) - return *( this->cuda_pointer ); + return this->pd->data; } // this is needed only to avoid the default compiler-generated operator @@ -475,12 +712,8 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer { this->free(); this->pd = (PointerData*) ptr.pd; - this->cuda_pointer = ptr.cuda_pointer; - if( this->pd != nullptr ) + if( this->pd != nullptr ) this->pd->counter += 1; -#ifdef TNL_DEBUG_SHARED_POINTERS - std::cerr << "Copy-assigned shared pointer: counter = " << this->pd->counter << ", type: " << getType< ObjectType >() << std::endl; -#endif return *this; } @@ -491,12 +724,8 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer { this->free(); this->pd = (PointerData*) ptr.pd; - this->cuda_pointer = ptr.cuda_pointer; if( this->pd != nullptr ) this->pd->counter += 1; -#ifdef TNL_DEBUG_SHARED_POINTERS - std::cerr << "Copy-assigned shared pointer: counter = " << this->pd->counter << ", type: " << getType< ObjectType >() << std::endl; -#endif return *this; } @@ -505,12 +734,7 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer { this->free(); this->pd = (PointerData*) ptr.pd; - this->cuda_pointer = ptr.cuda_pointer; ptr.pd = nullptr; - ptr.cuda_pointer = nullptr; -#ifdef TNL_DEBUG_SHARED_POINTERS - std::cerr << "Move-assigned shared pointer: counter = " << this->pd->counter << ", type: " << getType< ObjectType >() << std::endl; -#endif return *this; } @@ -521,36 +745,13 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer { this->free(); this->pd = (PointerData*) ptr.pd; - this->cuda_pointer = ptr.cuda_pointer; ptr.pd = nullptr; - ptr.cuda_pointer = nullptr; -#ifdef TNL_DEBUG_SHARED_POINTERS - std::cerr << "Move-assigned shared pointer: counter = " << this->pd->counter << ", type: " << getType< ObjectType >() << std::endl; -#endif return *this; } bool synchronize() { - if( ! 
this->pd ) - return true; -#ifdef HAVE_CUDA - if( this->modified() ) - { -#ifdef TNL_DEBUG_SHARED_POINTERS - std::cerr << "Synchronizing shared pointer: counter = " << this->pd->counter << ", type: " << getType< ObjectType >() << std::endl; - std::cerr << "   ( " << sizeof( Object ) << " bytes, CUDA adress " << this->cuda_pointer << " )" << std::endl; -#endif - TNL_ASSERT( this->cuda_pointer, ); - cudaMemcpy( (void*) this->cuda_pointer, (void*) &this->pd->data, sizeof( Object ), cudaMemcpyHostToDevice ); - TNL_CHECK_CUDA_DEVICE; - this->set_last_sync_state(); - return true; - } return true; -#else - return false; -#endif } void clear() @@ -561,90 +762,59 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer void swap( SharedPointer& ptr2 ) { std::swap( this->pd, ptr2.pd ); - std::swap( this->cuda_pointer, ptr2.cuda_pointer ); } ~SharedPointer() { this->free(); - getSmartPointersRegister< DeviceType >().remove( this ); } + protected: struct PointerData { Object data; - char data_image[ sizeof(Object) ]; int counter; - bool maybe_modified; template< typename... Args > explicit PointerData( Args... args ) : data( args... ), - counter( 1 ), - maybe_modified( false ) + counter( 1 ) {} }; template< typename... Args > bool allocate( Args... args ) { - this->pd = new PointerData( args... ); - // pass to device - this->cuda_pointer = Cuda::passToDevice( this->pd->data ); - // set last-sync state - this->set_last_sync_state(); -#ifdef TNL_DEBUG_SHARED_POINTERS - std::cerr << "Created shared pointer to " << getType< ObjectType >() << " (cuda_pointer = " << this->cuda_pointer << ")" << std::endl; -#endif - getSmartPointersRegister< DeviceType >().insert( this ); - return true; - } - - void set_last_sync_state() - { - TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); - std::memcpy( (void*) &this->pd->data_image, (void*) &this->pd->data, sizeof( Object ) ); - this->pd->maybe_modified = false; - } - - bool modified() - { - TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); - // optimization: skip bitwise comparison if we're sure that the data is the same - if( ! this->pd->maybe_modified ) +#ifdef HAVE_CUDA + if( cudaMallocManaged( ( void** ) &this->pd, sizeof( PointerData ) ) != cudaSuccess ) return false; - return std::memcmp( (void*) &this->pd->data_image, (void*) &this->pd->data, sizeof( Object ) ) != 0; + new ( this->pd ) PointerData( args... ); + return true; +#else + return false; +#endif } void free() { if( this->pd ) { -#ifdef TNL_DEBUG_SHARED_POINTERS - std::cerr << "Freeing shared pointer: counter = " << this->pd->counter << ", cuda_pointer = " << this->cuda_pointer << ", type: " << getType< ObjectType >() << std::endl; -#endif if( ! --this->pd->counter ) { - delete this->pd; - this->pd = nullptr; - if( this->cuda_pointer ) - Cuda::freeFromDevice( this->cuda_pointer ); -#ifdef TNL_DEBUG_SHARED_POINTERS - std::cerr << "...deleted data." << std::endl; +#ifdef HAVE_CUDA + cudaFree( this->pd ); #endif + this->pd = nullptr; } } } PointerData* pd; - - // cuda_pointer can't be part of PointerData structure, since we would be - // unable to dereference this-pd on the device - Object* cuda_pointer; }; -#endif // HAVE_CUDA_UNIFIED_MEMORY + +#endif // !
HAVE_CUDA_UNIFIED_MEMORY } // namespace Pointers } // namespace TNL diff --git a/src/TNL/Pointers/SharedPointerHost.h b/src/TNL/Pointers/SharedPointerHost.h index 39a6d4da4..9e71205f5 100644 --- a/src/TNL/Pointers/SharedPointerHost.h +++ b/src/TNL/Pointers/SharedPointerHost.h @@ -24,14 +24,23 @@ namespace TNL { namespace Pointers { +/** + * \brief Specialization of the UniquePointer for the host system. + * + * \tparam Object is a type of object to be owned by the pointer. + */ template< typename Object > class SharedPointer< Object, Devices::Host > : public SmartPointer { private: - // Convenient template alias for controlling the selection of copy- and - // move-constructors and assignment operators using SFINAE. - // The type Object_ is "enabled" iff Object_ and Object are not the same, - // but after removing const and volatile qualifiers they are the same. + + /** + * \typedef Enabler + * Convenient template alias for controlling the selection of copy- and + * move-constructors and assignment operators using SFINAE. + * The type Object_ is "enabled" iff Object_ and Object are not the same, + * but after removing const and volatile qualifiers they are the same. + */ template< typename Object_ > using Enabler = std::enable_if< ! std::is_same< Object_, Object >::value && std::is_same< typename std::remove_cv< Object >::type, Object_ >::value >; @@ -42,13 +51,30 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer public: + /** + * \typedef ObjectType is the type of object owned by the pointer. + */ using ObjectType = Object; - using DeviceType = Devices::Host; + /** + * \typedef DeviceType is the type of device where the object is to be + * mirrored. + */ + using DeviceType = Devices::Host; + + /** + * \brief Constructor of empty pointer. + */ SharedPointer( std::nullptr_t ) : pd( nullptr ) {} + /** + * \brief Constructor with parameters of the Object constructor. + * + * \tparam Args is variadic template type of arguments of the Object constructor. + * \tparam args are arguments passed to the Object constructor. + */ template< typename... Args > explicit SharedPointer( Args... args ) : pd( nullptr ) @@ -59,38 +85,70 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer this->allocate( args... ); } - // this is needed only to avoid the default compiler-generated constructor - SharedPointer( const SharedPointer& pointer ) + /** + * \brief Copy constructor. + * + * \param pointer is the source shared pointer. + */ + SharedPointer( const SharedPointer& pointer ) // this is needed only to avoid the default compiler-generated constructor : pd( (PointerData*) pointer.pd ) { this->pd->counter += 1; } - // conditional constructor for non-const -> const data + /** + * \brief Copy constructor. + * + * This is specialization for compatible object types. + * + * See \ref Enabler. + * + * \param pointer is the source shared pointer. + */ template< typename Object_, typename = typename Enabler< Object_ >::type > - SharedPointer( const SharedPointer< Object_, DeviceType >& pointer ) + SharedPointer( const SharedPointer< Object_, DeviceType >& pointer ) // conditional constructor for non-const -> const data : pd( (PointerData*) pointer.pd ) { this->pd->counter += 1; } - // this is needed only to avoid the default compiler-generated constructor - SharedPointer( SharedPointer&& pointer ) + /** + * \brief Move constructor. + * + * \param pointer is the source shared pointer. 
+ */ + SharedPointer( SharedPointer&& pointer ) // this is needed only to avoid the default compiler-generated constructor : pd( (PointerData*) pointer.pd ) { pointer.pd = nullptr; } - // conditional constructor for non-const -> const data + /** + * \brief Move constructor. + * + * This is specialization for compatible object types. + * + * See \ref Enabler. + * + * \param pointer is the source shared pointer. + */ template< typename Object_, typename = typename Enabler< Object_ >::type > - SharedPointer( SharedPointer< Object_, DeviceType >&& pointer ) + SharedPointer( SharedPointer< Object_, DeviceType >&& pointer ) // conditional constructor for non-const -> const data : pd( (PointerData*) pointer.pd ) { pointer.pd = nullptr; } + /** + * \brief Create new object based in given constructor parameters. + * + * \tparam Args is variadic template type of arguments to be passed to the + * object constructor. + * \param args are arguments to be passed to the object constructor. + * \return true if recreation was successful, false otherwise. + */ template< typename... Args > bool recreate( Args... args ) { @@ -116,42 +174,80 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer return this->allocate( args... ); } + /** + * \brief Arrow operator for accessing the object owned by constant smart pointer. + * + * \return constant pointer to the object owned by this smart pointer. + */ const Object* operator->() const { TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); return &this->pd->data; } + /** + * \brief Arrow operator for accessing the object owned by non-constant smart pointer. + * + * \return pointer to the object owned by this smart pointer. + */ Object* operator->() { TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); return &this->pd->data; } + /** + * \brief Dereferencing operator for accessing the object owned by constant smart pointer. + * + * \return constant reference to the object owned by this smart pointer. + */ const Object& operator *() const { TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); return this->pd->data; } + /** + * \brief Dereferencing operator for accessing the object owned by non-constant smart pointer. + * + * \return reference to the object owned by this smart pointer. + */ Object& operator *() { TNL_ASSERT_TRUE( this->pd, "Attempt to dereference a null pointer" ); return this->pd->data; } - __cuda_callable__ + /** + * \brief Conversion to boolean type. + * + * \return Returns true if the pointer is not empty, false otherwise. + */ operator bool() const { return this->pd; } - __cuda_callable__ + /** + * \brief Negation operator. + * + * \return Returns false if the pointer is not empty, true otherwise. + */ bool operator!() const { return ! this->pd; } + /** + * \brief Constant object reference getter. + * + * No synchronization of this pointer will be performed due to calling + * this method. + * + * \tparam Device says what image of the object one want to dereference. It + * can be either \ref DeviceType or Devices::Host. + * \return constant reference to the object image on given device. + */ template< typename Device = Devices::Host > __cuda_callable__ const Object& getData() const @@ -160,6 +256,16 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer return this->pd->data; } + /** + * \brief Non-constant object reference getter. + * + * No synchronization of this pointer will be performed due to calling + * this method. 
+ * + * \tparam Device says what image of the object one want to dereference. It + * can be either \ref DeviceType or Devices::Host. + * \return constant reference to the object image on given device. + */ template< typename Device = Devices::Host > __cuda_callable__ Object& modifyData() @@ -168,8 +274,15 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer return this->pd->data; } - // this is needed only to avoid the default compiler-generated operator - const SharedPointer& operator=( const SharedPointer& ptr ) + /** + * \brief Assignment operator. + * + * It assigns object owned by the pointer \ref ptr to \ref this pointer. + * + * \param ptr input pointer + * \return constant reference to \ref this + */ + const SharedPointer& operator=( const SharedPointer& ptr ) // this is needed only to avoid the default compiler-generated operator { this->free(); this->pd = (PointerData*) ptr.pd; @@ -178,10 +291,19 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer return *this; } - // conditional operator for non-const -> const data + /** + * \brief Assignment operator for compatible object types. + * + * It assigns object owned by the pointer \ref ptr to \ref this pointer. + * + * See \ref Enabler. + * + * \param ptr input pointer + * \return constant reference to \ref this + */ template< typename Object_, typename = typename Enabler< Object_ >::type > - const SharedPointer& operator=( const SharedPointer< Object_, DeviceType >& ptr ) + const SharedPointer& operator=( const SharedPointer< Object_, DeviceType >& ptr ) // conditional operator for non-const -> const data { this->free(); this->pd = (PointerData*) ptr.pd; @@ -190,8 +312,15 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer return *this; } - // this is needed only to avoid the default compiler-generated operator - const SharedPointer& operator=( SharedPointer&& ptr ) + /** + * \brief Move operator. + * + * It assigns object owned by the pointer \ref ptr to \ref this pointer. + * + * \param ptr input pointer + * \return constant reference to \ref this + */ + const SharedPointer& operator=( SharedPointer&& ptr ) // this is needed only to avoid the default compiler-generated operator { this->free(); this->pd = (PointerData*) ptr.pd; @@ -199,10 +328,19 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer return *this; } - // conditional operator for non-const -> const data + /** + * \brief Move operator. + * + * It assigns object owned by the pointer \ref ptr to \ref this pointer. + * + * See \ref Enabler. + * + * \param ptr input pointer + * \return constant reference to \ref this + */ template< typename Object_, typename = typename Enabler< Object_ >::type > - const SharedPointer& operator=( SharedPointer< Object_, DeviceType >&& ptr ) + const SharedPointer& operator=( SharedPointer< Object_, DeviceType >&& ptr ) // conditional operator for non-const -> const data { this->free(); this->pd = (PointerData*) ptr.pd; @@ -210,21 +348,39 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer return *this; } + /** + * \brief Cross-device pointer synchronization. + * + * For the smart pointers in the host, this method does nothing. + * + * \return true. + */ bool synchronize() { return true; } + /** + * \brief Reset the pointer to empty state. + */ void clear() { this->free(); } + /** + * \brief Swap the owned object with another pointer. + * + * \param ptr2 the other shared pointer for swapping. 
+ */ void swap( SharedPointer& ptr2 ) { std::swap( this->pd, ptr2.pd ); } + /** + * \brief Destructor. + */ ~SharedPointer() { this->free(); diff --git a/src/TNL/Pointers/UniquePointer.h b/src/TNL/Pointers/UniquePointer.h index baa93e589..76f06f523 100644 --- a/src/TNL/Pointers/UniquePointer.h +++ b/src/TNL/Pointers/UniquePointer.h @@ -57,7 +57,7 @@ class UniquePointer }; /** - * \brief Specialization of the UniqueSmart pointer for the host system. + * \brief Specialization of the UniquePointer for the host system. * * \tparam Object is a type of object to be owned by the pointer. */ @@ -217,7 +217,7 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer } /** - * \brief Assignment operator. + * \brief Move operator. * * It assigns object owned by the pointer \ref ptr to \ref this pointer. * The original pointer \ref ptr is reset to empty state. @@ -233,10 +233,9 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer /** * \brief Cross-device pointer synchronization. * - * This method is usually called by the smart pointers register when calling - * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >() + * For the smart pointers in the host, this method does nothing. * - * \return true if the synchronization was successful, false otherwise. + * \return true. */ bool synchronize() { @@ -259,7 +258,7 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer }; /** - * \brief Specialization of the UniqueSmart pointer for the CUDA device. + * \brief Specialization of the UniquePointer for the CUDA device. * * \tparam Object is a type of object to be owned by the pointer. */ @@ -438,7 +437,7 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer } /** - * \brief Assignment operator. + * \brief Move operator. * * It assigns object owned by the pointer \ref ptr to \ref this pointer. * The original pointer \ref ptr is reset to empty state. -- GitLab From 8c030dc02a20549ff75f99f63335f1b3b1aef93c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 25 Nov 2019 22:06:12 +0100 Subject: [PATCH 37/43] Fixed Documentation/Examples/Pointers/CMakeLists.txt. --- Documentation/Examples/Pointers/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/Examples/Pointers/CMakeLists.txt b/Documentation/Examples/Pointers/CMakeLists.txt index e33b34f28..ef7a5f615 100644 --- a/Documentation/Examples/Pointers/CMakeLists.txt +++ b/Documentation/Examples/Pointers/CMakeLists.txt @@ -5,10 +5,11 @@ IF( BUILD_CUDA ) ADD_CUSTOM_COMMAND( COMMAND SharedPointerExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SharedPointerExample.out OUTPUT SharedPointerExample.out ) CUDA_ADD_EXECUTABLE(DevicePointerExampleCuda DevicePointerExample.cu) ADD_CUSTOM_COMMAND( COMMAND DevicePointerExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/DevicePointerExample.out OUTPUT DevicePointerExample.out ) -ENDIF() ADD_CUSTOM_TARGET( RunPointersExamples ALL DEPENDS UniquePointerExample.out SharedPointerExample.out DevicePointerExample.out ) + +ENDIF() -- GitLab From 6d1d4bf3b689ace9420701c984e119feed20b144 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Tue, 26 Nov 2019 12:32:48 +0100 Subject: [PATCH 38/43] Added nullptr constructor and swap method to DevicePointer. 
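
A minimal usage sketch of the two additions (illustrative only, not part of the patch; ArrayCuda stands for the Containers::Array< int, Devices::Cuda > alias used in the pointer examples earlier in this series):

   ArrayCuda a1( 10 ), a2( 20 );
   Pointers::DevicePointer< ArrayCuda > p1( a1 ), p2( a2 );
   Pointers::DevicePointer< ArrayCuda > p3( nullptr );  // new: empty pointer

   p1.swap( p2 );  // new: p1 now mirrors a2, p2 mirrors a1
   p1 = p3;        // assigning an empty pointer no longer dereferences a null counter
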
--- src/TNL/Pointers/DevicePointer.h | 48 ++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/src/TNL/Pointers/DevicePointer.h b/src/TNL/Pointers/DevicePointer.h index b72aaf9b1..acac6aafa 100644 --- a/src/TNL/Pointers/DevicePointer.h +++ b/src/TNL/Pointers/DevicePointer.h @@ -20,7 +20,8 @@ #include #include -#include // std::memcpy, std::memcmp +#include +#include // std::memcpy, std::memcmp namespace TNL { namespace Pointers { @@ -60,6 +61,13 @@ class DevicePointer< Object, Devices::Host > : public SmartPointer typedef Object ObjectType; typedef Devices::Host DeviceType; + /** + * \brief Constructor of empty pointer. + */ + DevicePointer( std::nullptr_t ) + : pointer( nullptr ) + {} + explicit DevicePointer( ObjectType& obj ) : pointer( nullptr ) { @@ -181,6 +189,17 @@ class DevicePointer< Object, Devices::Host > : public SmartPointer return true; } + /** + * \brief Swap the owned object with another pointer. + * + * \param ptr2 the other shared pointer for swapping. + */ + void swap( DevicePointer& ptr2 ) + { + std::swap( this->pointer, ptr2.pointer ); + } + + ~DevicePointer() { } @@ -215,6 +234,14 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer typedef Object ObjectType; typedef Devices::Cuda DeviceType; + /** + * \brief Constructor of empty pointer. + */ + DevicePointer( std::nullptr_t ) + : pointer( nullptr ), + pd( nullptr ), + cuda_pointer( nullptr ) {} + explicit DevicePointer( ObjectType& obj ) : pointer( nullptr ), pd( nullptr ), @@ -359,7 +386,8 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer this->pointer = ptr.pointer; this->pd = (PointerData*) ptr.pd; this->cuda_pointer = ptr.cuda_pointer; - this->pd->counter += 1; + if( this->pd ) + this->pd->counter += 1; return *this; } @@ -372,7 +400,8 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer this->pointer = ptr.pointer; this->pd = (PointerData*) ptr.pd; this->cuda_pointer = ptr.cuda_pointer; - this->pd->counter += 1; + if( this->pd ) + this->pd->counter += 1; return *this; } @@ -424,6 +453,19 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer #endif } + /** + * \brief Swap the owned object with another pointer. + * + * \param ptr2 the other shared pointer for swapping. + */ + void swap( DevicePointer& ptr2 ) + { + std::swap( this->pointer, ptr2.pointer ); + std::swap( this->pd, ptr2.pd ); + std::swap( this->cuda_pointer, ptr2.cuda_pointer ); + } + + ~DevicePointer() { this->free(); -- GitLab From 27dc6c5341975d99f9ecc8587379a0dd46a446e4 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Tue, 26 Nov 2019 12:33:24 +0100 Subject: [PATCH 39/43] Added DevicePointer unit test. 
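
The test exercises the usual modify-on-host / synchronize / use-in-kernel cycle. Condensed, the checked pattern looks like this (an illustrative sketch only; copyArrayKernel and the device buffer out are defined in the test below):

   TNL::Containers::Array< int, Devices::Cuda > array;
   Pointers::DevicePointer< TNL::Containers::Array< int, Devices::Cuda > > ptr( array );
   ptr->setSize( 2 );        // modifies only the host image of the Array object
   ptr->setElement( 0, 1 );
   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
   copyArrayKernel<<< 1, 2 >>>( &ptr.getData< Devices::Cuda >(), out );
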
--- src/UnitTests/Pointers/CMakeLists.txt | 4 + .../Pointers/DevicePointerCudaTest.cu | 153 ++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 src/UnitTests/Pointers/DevicePointerCudaTest.cu diff --git a/src/UnitTests/Pointers/CMakeLists.txt b/src/UnitTests/Pointers/CMakeLists.txt index 0f5e98656..8845cdfbd 100644 --- a/src/UnitTests/Pointers/CMakeLists.txt +++ b/src/UnitTests/Pointers/CMakeLists.txt @@ -14,4 +14,8 @@ if( BUILD_CUDA ) OPTIONS ${CXX_TESTS_FLAGS} ) TARGET_LINK_LIBRARIES( SharedPointerCudaTest ${GTEST_BOTH_LIBRARIES} ) ADD_TEST( SharedPointerCudaTest ${EXECUTABLE_OUTPUT_PATH}/SharedPointerCudaTest${CMAKE_EXECUTABLE_SUFFIX} ) + CUDA_ADD_EXECUTABLE( DevicePointerCudaTest DevicePointerCudaTest.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DevicePointerCudaTest ${GTEST_BOTH_LIBRARIES} ) + ADD_TEST( DevicePointerCudaTest ${EXECUTABLE_OUTPUT_PATH}/DevicePointerCudaTest${CMAKE_EXECUTABLE_SUFFIX} ) endif( BUILD_CUDA ) diff --git a/src/UnitTests/Pointers/DevicePointerCudaTest.cu b/src/UnitTests/Pointers/DevicePointerCudaTest.cu new file mode 100644 index 000000000..76320904a --- /dev/null +++ b/src/UnitTests/Pointers/DevicePointerCudaTest.cu @@ -0,0 +1,153 @@ +/*************************************************************************** + DevicePointerCudaTest.cpp - description + ------------------- + begin : Nov 26, 2019 + copyright : (C) 2019 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include +#include +#include +#include +#include + +#ifdef HAVE_GTEST +#include +#endif + +#include + +using namespace TNL; + +#ifdef HAVE_GTEST +TEST( DevicePointerCudaTest, ConstructorTest ) +{ +#ifdef HAVE_CUDA + using TestType = TNL::Containers::StaticArray< 2, int >; + TestType obj1; + Pointers::DevicePointer< TestType, Devices::Cuda > ptr1( obj1 ); + + ptr1->x() = 0; + ptr1->y() = 0; + ASSERT_EQ( ptr1->x(), 0 ); + ASSERT_EQ( ptr1->y(), 0 ); + + TestType obj2( 1,2 ); + Pointers::DevicePointer< TestType, Devices::Cuda > ptr2( obj2 ); + ASSERT_EQ( ptr2->x(), 1 ); + ASSERT_EQ( ptr2->y(), 2 ); + + ptr1 = ptr2; + ASSERT_EQ( ptr1->x(), 1 ); + ASSERT_EQ( ptr1->y(), 2 ); +#endif +}; + +TEST( DevicePointerCudaTest, getDataTest ) +{ +#ifdef HAVE_CUDA + using TestType = TNL::Containers::StaticArray< 2, int >; + TestType obj1( 1, 2 ); + Pointers::DevicePointer< TestType, Devices::Cuda > ptr1( obj1 ); + + Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); + + TestType aux; + + cudaMemcpy( ( void*) &aux, &ptr1.getData< Devices::Cuda >(), sizeof( TestType ), cudaMemcpyDeviceToHost ); + + ASSERT_EQ( aux[ 0 ], 1 ); + ASSERT_EQ( aux[ 1 ], 2 ); +#endif // HAVE_CUDA +}; + +#ifdef HAVE_CUDA +__global__ void copyArrayKernel( const TNL::Containers::Array< int, Devices::Cuda >* inArray, + int* outArray ) +{ + if( threadIdx.x < 2 ) + { + outArray[ threadIdx.x ] = ( *inArray )[ threadIdx.x ]; + } +} + +__global__ void copyArrayKernel2( const Pointers::DevicePointer< TNL::Containers::Array< int, Devices::Cuda > > inArray, + int* outArray ) +{ + if( threadIdx.x < 2 ) + { + outArray[ threadIdx.x ] = ( *inArray )[ threadIdx.x ]; + } +} +#endif + +TEST( DevicePointerCudaTest, getDataArrayTest ) +{ +#ifdef HAVE_CUDA + using TestType = TNL::Containers::Array< int, Devices::Cuda >; + TestType obj; + Pointers::DevicePointer< TestType > ptr( obj ); + + ptr->setSize( 2 ); + ptr->setElement( 0, 1 ); + ptr->setElement( 1, 2 ); + + 
Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); + + int *testArray_device, *testArray_host; + cudaMalloc( ( void** ) &testArray_device, 2 * sizeof( int ) ); + copyArrayKernel<<< 1, 2 >>>( &ptr.getData< Devices::Cuda >(), testArray_device ); + testArray_host = new int [ 2 ]; + cudaMemcpy( testArray_host, testArray_device, 2 * sizeof( int ), cudaMemcpyDeviceToHost ); + + ASSERT_EQ( testArray_host[ 0 ], 1 ); + ASSERT_EQ( testArray_host[ 1 ], 2 ); + + copyArrayKernel2<<< 1, 2 >>>( ptr, testArray_device ); + cudaMemcpy( testArray_host, testArray_device, 2 * sizeof( int ), cudaMemcpyDeviceToHost ); + + ASSERT_EQ( testArray_host[ 0 ], 1 ); + ASSERT_EQ( testArray_host[ 1 ], 2 ); + + delete[] testArray_host; + cudaFree( testArray_device ); + +#endif +}; + +TEST( DevicePointerCudaTest, nullptrAssignement ) +{ +#ifdef HAVE_CUDA + using TestType = Pointers::DevicePointer< double, Devices::Cuda >; + double o1 = 5; + TestType p1( o1 ), p2( nullptr ); + + // This should not crash + p1 = p2; + + ASSERT_FALSE( p1 ); + ASSERT_FALSE( p2 ); +#endif +} + +TEST( DevicePointerCudaTest, swap ) +{ +#ifdef HAVE_CUDA + using TestType = Pointers::DevicePointer< double, Devices::Cuda >; + double o1( 1 ), o2( 2 ); + TestType p1( o1 ), p2( o2 ); + + p1.swap( p2 ); + + ASSERT_EQ( *p1, 2 ); + ASSERT_EQ( *p2, 1 ); +#endif +} + +#endif + + +#include "../main.h" -- GitLab From df86df23455fef169c27e26930ccbd8554dd1645 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Tue, 26 Nov 2019 12:33:43 +0100 Subject: [PATCH 40/43] Avoided testing with CUDA Unified Memory in SharedPointer. --- src/UnitTests/Pointers/SharedPointerCudaTest.cu | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/UnitTests/Pointers/SharedPointerCudaTest.cu b/src/UnitTests/Pointers/SharedPointerCudaTest.cu index d21f4319c..37cfc56b7 100644 --- a/src/UnitTests/Pointers/SharedPointerCudaTest.cu +++ b/src/UnitTests/Pointers/SharedPointerCudaTest.cu @@ -50,11 +50,6 @@ TEST( SharedPointerCudaTest, getDataTest ) typedef TNL::Containers::StaticArray< 2, int > TestType; Pointers::SharedPointer< TestType, Devices::Cuda > ptr1( 1, 2 ); -#ifdef HAVE_CUDA_UNIFIED_MEMORY - ASSERT_EQ( ptr1->x(), 1 ); - ASSERT_EQ( ptr1->y(), 2 ); -#else - Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >(); TestType aux; @@ -63,7 +58,6 @@ TEST( SharedPointerCudaTest, getDataTest ) ASSERT_EQ( aux[ 0 ], 1 ); ASSERT_EQ( aux[ 1 ], 2 ); -#endif // HAVE_CUDA_UNIFIED_MEMORY #endif // HAVE_CUDA }; -- GitLab From cc136efa4c508dcaee4275c0bb1f55fc24e6c51e Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Tue, 26 Nov 2019 13:18:49 +0100 Subject: [PATCH 41/43] Fixes in UniquePointer documentation. --- src/TNL/Pointers/UniquePointer.h | 100 +++++++++++++++---------------- 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/src/TNL/Pointers/UniquePointer.h b/src/TNL/Pointers/UniquePointer.h index 76f06f523..66bc4a33c 100644 --- a/src/TNL/Pointers/UniquePointer.h +++ b/src/TNL/Pointers/UniquePointer.h @@ -27,23 +27,23 @@ namespace Pointers { /** * \brief Cross-device unique smart pointer. - * + * * This smart pointer is inspired by std::unique_ptr from STL library. It means * that the object owned by the smart pointer is accessible only through this * smart pointer. One cannot make any copy of this smart pointer. In addition, * the smart pointer is able to work across different devices which means that the * object owned by the smart pointer is mirrored on both host and device. 
- * - * **NOTE: When using smart pointers to pass objects on GPU, one must call - * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >() + * + * **NOTE: When using smart pointers to pass objects on GPU, one must call + * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >() * before calling a CUDA kernel working with smart pointers.** - * + * * \tparam Object is a type of object to be owned by the pointer. * \tparam Device is device where the object is to be allocated. The object is * always allocated on the host system as well for easier object manipulation. - * + * * See also \ref SharedPointer and \ref DevicePointer. - * + * * See also \ref UniquePointer< Object, Devices::Host > and \ref UniquePointer< Object, Devices::Cuda >. * * \par Example @@ -57,8 +57,8 @@ class UniquePointer }; /** - * \brief Specialization of the UniquePointer for the host system. - * + * \brief Specialization of the \ref UniquePointer for the host system. + * * \tparam Object is a type of object to be owned by the pointer. */ template< typename Object > @@ -67,7 +67,7 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer public: /** - * \typedef ObjectType is the type of object owned by the pointer. + * \typedef ObjectType is the type of object owned by the pointer. */ using ObjectType = Object; @@ -86,7 +86,7 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer /** * \brief Constructor with parameters of the Object constructor. - * + * * \tparam Args is variadic template type of arguments of the Object constructor. * \tparam args are arguments passed to the Object constructor. */ @@ -98,7 +98,7 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer /** * \brief Arrow operator for accessing the object owned by constant smart pointer. - * + * * \return constant pointer to the object owned by this smart pointer. */ const Object* operator->() const @@ -109,7 +109,7 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer /** * \brief Arrow operator for accessing the object owned by non-constant smart pointer. - * + * * \return pointer to the object owned by this smart pointer. */ Object* operator->() @@ -120,7 +120,7 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer /** * \brief Dereferencing operator for accessing the object owned by constant smart pointer. - * + * * \return constant reference to the object owned by this smart pointer. */ const Object& operator *() const @@ -131,7 +131,7 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer /** * \brief Dereferencing operator for accessing the object owned by non-constant smart pointer. - * + * * \return reference to the object owned by this smart pointer. */ Object& operator *() @@ -142,7 +142,7 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer /** * \brief Conversion to boolean type. - * + * * \return Returns true if the pointer is not empty, false otherwise. */ __cuda_callable__ @@ -167,7 +167,7 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer * * No synchronization of this pointer will be performed due to calling * this method. - * + * * \tparam Device says what image of the object one want to dereference. It * can be either \ref DeviceType or Devices::Host. * \return constant reference to the object image on given device. @@ -183,10 +183,10 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer * \brief Non-constant object reference getter. 
* * After calling this method, the object owned by the pointer might need - * to be synchronized. One should not forget to call - * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >() + * to be synchronized. One should not forget to call + * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >() * before calling CUDA kernel using object from this smart pointer. - * + * * \tparam Device says what image of the object one want to dereference. It * can be either \ref DeviceType or Devices::Host. * \return constant reference to the object image on given device. @@ -200,10 +200,10 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer /** * \brief Assignment operator. - * - * It assigns object owned by the pointer \ref ptr to \ref this pointer. + * + * It assigns object owned by the pointer \ref ptr to \ref this pointer. * The original pointer \ref ptr is reset to empty state. - * + * * \param ptr input pointer * \return constant reference to \ref this */ @@ -218,10 +218,10 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer /** * \brief Move operator. - * - * It assigns object owned by the pointer \ref ptr to \ref this pointer. + * + * It assigns object owned by the pointer \ref ptr to \ref this pointer. * The original pointer \ref ptr is reset to empty state. - * + * * \param ptr input pointer * \return constant reference to \ref this */ @@ -232,9 +232,9 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer /** * \brief Cross-device pointer synchronization. - * + * * For the smart pointers in the host, this method does nothing. - * + * * \return true. */ bool synchronize() @@ -258,8 +258,8 @@ class UniquePointer< Object, Devices::Host > : public SmartPointer }; /** - * \brief Specialization of the UniquePointer for the CUDA device. - * + * \brief Specialization of the \ref UniquePointer for the CUDA device. + * * \tparam Object is a type of object to be owned by the pointer. */ template< typename Object > @@ -268,7 +268,7 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer public: /** - * \typedef ObjectType is the type of object owned by the pointer. + * \typedef ObjectType is the type of object owned by the pointer. */ using ObjectType = Object; @@ -288,7 +288,7 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer /** * \brief Constructor with parameters of the Object constructor. - * + * * \tparam Args is variadic template type of arguments of the Object constructor. * \tparam args are arguments passed to the Object constructor. */ @@ -302,7 +302,7 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer /** * \brief Arrow operator for accessing the object owned by constant smart pointer. - * + * * \return constant pointer to the object owned by this smart pointer. */ const Object* operator->() const @@ -313,7 +313,7 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer /** * \brief Arrow operator for accessing the object owned by non-constant smart pointer. - * + * * \return pointer to the object owned by this smart pointer. */ Object* operator->() @@ -325,7 +325,7 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer /** * \brief Dereferencing operator for accessing the object owned by constant smart pointer. - * + * * \return constant reference to the object owned by this smart pointer. 
*/ const Object& operator *() const @@ -336,7 +336,7 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer /** * \brief Dereferencing operator for accessing the object owned by non-constant smart pointer. - * + * * \return reference to the object owned by this smart pointer. */ Object& operator *() @@ -348,7 +348,7 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer /** * \brief Conversion to boolean type. - * + * * \return Returns true if the pointer is not empty, false otherwise. */ __cuda_callable__ @@ -373,7 +373,7 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer * * No synchronization of this pointer will be performed due to calling * this method. - * + * * \tparam Device says what image of the object one want to dereference. It * can be either \ref DeviceType or Devices::Host. * \return constant reference to the object image on given device. @@ -394,10 +394,10 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer * \brief Non-constant object reference getter. * * After calling this method, the object owned by the pointer might need - * to be synchronized. One should not forget to call - * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >() + * to be synchronized. One should not forget to call + * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >() * before calling CUDA kernel using object from this smart pointer. - * + * * \tparam Device says what image of the object one want to dereference. It * can be either \ref DeviceType or Devices::Host. * \return constant reference to the object image on given device. @@ -419,10 +419,10 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer /** * \brief Assignment operator. - * - * It assigns object owned by the pointer \ref ptr to \ref this pointer. + * + * It assigns object owned by the pointer \ref ptr to \ref this pointer. * The original pointer \ref ptr is reset to empty state. - * + * * \param ptr input pointer * \return constant reference to \ref this */ @@ -438,10 +438,10 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer /** * \brief Move operator. - * - * It assigns object owned by the pointer \ref ptr to \ref this pointer. + * + * It assigns object owned by the pointer \ref ptr to \ref this pointer. * The original pointer \ref ptr is reset to empty state. - * + * * \param ptr input pointer * \return constant reference to \ref this */ @@ -452,10 +452,10 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer /** * \brief Cross-device pointer synchronization. - * + * * This method is usually called by the smart pointers register when calling - * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >() - * + * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >() + * * \return true if the synchronization was successful, false otherwise. */ bool synchronize() -- GitLab From 57486ffeea477a3c18b66332864ffe7473ca3097 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Tue, 26 Nov 2019 13:19:16 +0100 Subject: [PATCH 42/43] Fixes in SharedPointer documentation. 
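
A minimal sketch of the synchronization workflow that the documentation
touched by this patch describes. The object type Particle, the kernel,
and the header paths below are illustrative assumptions, not confirmed
by this series:

    #include <cstdio>
    #include <TNL/Pointers/SharedPointer.h>   // assumed header path
    #include <TNL/Devices/Cuda.h>             // assumed header path

    using namespace TNL;

    struct Particle
    {
       double mass = 1.0;
    };

    #ifdef HAVE_CUDA
    __global__ void readParticle( const Particle* p )
    {
       // The kernel works with the device image of the owned object.
       printf( "mass = %f\n", p->mass );
    }
    #endif

    int main()
    {
       // Allocates a default-constructed Particle on the host and the device.
       Pointers::SharedPointer< Particle, Devices::Cuda > p;

       // modifyData() may leave the host image newer than the device image...
       p.modifyData< Devices::Host >().mass = 2.0;

       // ...so the smart pointers must be synchronized before a kernel call.
       Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();

    #ifdef HAVE_CUDA
       readParticle<<< 1, 1 >>>( &p.getData< Devices::Cuda >() );
       cudaDeviceSynchronize();
    #endif
       return 0;
    }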
---
 src/TNL/Pointers/SharedPointerCuda.h | 99 ++++++++++++++--------------
 src/TNL/Pointers/SharedPointerHost.h | 78 +++++++++++-----------
 2 files changed, 89 insertions(+), 88 deletions(-)

diff --git a/src/TNL/Pointers/SharedPointerCuda.h b/src/TNL/Pointers/SharedPointerCuda.h
index 81951a5e9..f4f73ec39 100644
--- a/src/TNL/Pointers/SharedPointerCuda.h
+++ b/src/TNL/Pointers/SharedPointerCuda.h
@@ -31,8 +31,8 @@ namespace Pointers {
 
 #if ! defined HAVE_CUDA_UNIFIED_MEMORY
 /**
- * \brief Specialization of the UniquePointer for the CUDA device.
- *
+ * \brief Specialization of the \ref SharedPointer for the CUDA device.
+ *
  * \tparam Object is a type of object to be owned by the pointer.
  */
 template< typename Object >
@@ -41,7 +41,7 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
    private:
       /**
       * \typedef Enabler
-       *
+       *
       * Convenient template alias for controlling the selection of copy- and
       * move-constructors and assignment operators using SFINAE.
       * The type Object_ is "enabled" iff Object_ and Object are not the same,
@@ -58,7 +58,7 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
    public:
 
      /**
-       * \typedef ObjectType is the type of object owned by the pointer.
+       * \typedef ObjectType is the type of object owned by the pointer.
       */
      using ObjectType = Object;
 
@@ -78,7 +78,7 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
 
      /**
       * \brief Constructor with parameters of the Object constructor.
-       *
+       *
       * \tparam Args is a variadic template type of arguments of the Object constructor.
       * \param args are arguments passed to the Object constructor.
       */
@@ -92,7 +92,7 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
 
      /**
       * \brief Copy constructor.
-       *
+       *
       * \param pointer is the source shared pointer.
       */
      SharedPointer( const SharedPointer& pointer ) // this is needed only to avoid the default compiler-generated constructor
@@ -104,11 +104,11 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
 
      /**
       * \brief Copy constructor.
-       *
+       *
       * This is a specialization for compatible object types.
-       *
+       *
       * See \ref Enabler.
-       *
+       *
       * \param pointer is the source shared pointer.
       */
      template< typename Object_,
@@ -122,7 +122,7 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
 
      /**
       * \brief Move constructor.
-       *
+       *
       * \param pointer is the source shared pointer.
       */
      SharedPointer( SharedPointer&& pointer ) // this is needed only to avoid the default compiler-generated constructor
@@ -135,11 +135,11 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
 
      /**
       * \brief Move constructor.
-       *
+       *
       * This is a specialization for compatible object types.
-       *
+       *
       * See \ref Enabler.
-       *
+       *
       * \param pointer is the source shared pointer.
       */
      template< typename Object_,
@@ -154,7 +154,7 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
 
      /**
       * \brief Create a new object based on given constructor parameters.
-       *
+       *
       * \tparam Args is a variadic template type of arguments to be passed to the
       * object constructor.
       * \param args are arguments to be passed to the object constructor.
@@ -191,9 +191,9 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
 
      /**
       * \brief Arrow operator for accessing the object owned by constant smart pointer.
-       *
+       *
       * \return constant pointer to the object owned by this smart pointer.
It - * returns pointer to object image on the CUDA device if it is called from CUDA + * returns pointer to object image on the CUDA device if it is called from CUDA * kernel and pointer to host image otherwise. */ __cuda_callable__ @@ -209,9 +209,9 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer /** * \brief Arrow operator for accessing the object owned by non-constant smart pointer. - * + * * \return pointer to the object owned by this smart pointer. It - * returns pointer to object image on the CUDA device if it is called from CUDA + * returns pointer to object image on the CUDA device if it is called from CUDA * kernel and pointer to host image otherwise. */ __cuda_callable__ @@ -228,9 +228,9 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer /** * \brief Dereferencing operator for accessing the object owned by constant smart pointer. - * + * * \return constant reference to the object owned by this smart pointer. It - * returns reference to object image on the CUDA device if it is called from CUDA + * returns reference to object image on the CUDA device if it is called from CUDA * kernel and reference to host image otherwise. */ __cuda_callable__ @@ -246,9 +246,9 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer /** * \brief Dereferencing operator for accessing the object owned by non-constant smart pointer. - * + * * \return reference to the object owned by this smart pointer. It - * returns reference to object image on the CUDA device if it is called from CUDA + * returns reference to object image on the CUDA device if it is called from CUDA * kernel and reference to host image otherwise. */ __cuda_callable__ @@ -265,7 +265,7 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer /** * \brief Conversion to boolean type. - * + * * \return Returns true if the pointer is not empty, false otherwise. */ __cuda_callable__ @@ -290,7 +290,7 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer * * No synchronization of this pointer will be performed due to calling * this method. - * + * * \tparam Device says what image of the object one want to dereference. It * can be either \ref DeviceType or Devices::Host. * \return constant reference to the object image on given device. @@ -312,10 +312,10 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer * \brief Non-constant object reference getter. * * After calling this method, the object owned by the pointer might need - * to be synchronized. One should not forget to call - * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >() + * to be synchronized. One should not forget to call + * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >() * before calling CUDA kernel using object from this smart pointer. - * + * * \tparam Device says what image of the object one want to dereference. It * can be either \ref DeviceType or Devices::Host. * \return constant reference to the object image on given device. @@ -338,9 +338,9 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer /** * \brief Assignment operator. - * - * It assigns object owned by the pointer \ref ptr to \ref this pointer. - * + * + * It assigns object owned by the pointer \ref ptr to \ref this pointer. + * * \param ptr input pointer * \return constant reference to \ref this */ @@ -359,11 +359,11 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer /** * \brief Assignment operator for compatible object types. 
- * - * It assigns object owned by the pointer \ref ptr to \ref this pointer. - * + * + * It assigns object owned by the pointer \ref ptr to \ref this pointer. + * * See \ref Enabler. - * + * * \param ptr input pointer * \return constant reference to \ref this */ @@ -384,9 +384,9 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer /** * \brief Move operator. - * - * It assigns object owned by the pointer \ref ptr to \ref this pointer. - * + * + * It assigns object owned by the pointer \ref ptr to \ref this pointer. + * * \param ptr input pointer * \return constant reference to \ref this */ @@ -405,11 +405,11 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer /** * \brief Move operator. - * - * It assigns object owned by the pointer \ref ptr to \ref this pointer. - * + * + * It assigns object owned by the pointer \ref ptr to \ref this pointer. + * * See \ref Enabler. - * + * * \param ptr input pointer * \return constant reference to \ref this */ @@ -430,10 +430,11 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer /** * \brief Cross-device pointer synchronization. - * - * For the smart pointers in the host, this method does nothing. - * - * \return true. + * + * This method is usually called by the smart pointers register when calling + * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >() + * + * \return true if the synchronization was successful, false otherwise. */ bool synchronize() { @@ -560,8 +561,8 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer // unable to dereference this-pd on the device Object* cuda_pointer; }; - -#else + +#else // Implementation with CUDA unified memory. It is very slow, we keep it only for experimental reasons. template< typename Object > class SharedPointer< Object, Devices::Cuda > : public SmartPointer @@ -582,7 +583,7 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer public: using ObjectType = Object; - using DeviceType = Devices::Cuda; + using DeviceType = Devices::Cuda; SharedPointer( std::nullptr_t ) : pd( nullptr ) @@ -712,7 +713,7 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer { this->free(); this->pd = (PointerData*) ptr.pd; - if( this->pd != nullptr ) + if( this->pd != nullptr ) this->pd->counter += 1; return *this; } diff --git a/src/TNL/Pointers/SharedPointerHost.h b/src/TNL/Pointers/SharedPointerHost.h index 9e71205f5..ea8654d16 100644 --- a/src/TNL/Pointers/SharedPointerHost.h +++ b/src/TNL/Pointers/SharedPointerHost.h @@ -25,15 +25,15 @@ namespace TNL { namespace Pointers { /** - * \brief Specialization of the UniquePointer for the host system. - * + * \brief Specialization of the \ref SharedPointer for the host system. + * * \tparam Object is a type of object to be owned by the pointer. */ template< typename Object > class SharedPointer< Object, Devices::Host > : public SmartPointer { private: - + /** * \typedef Enabler * Convenient template alias for controlling the selection of copy- and @@ -52,7 +52,7 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer public: /** - * \typedef ObjectType is the type of object owned by the pointer. + * \typedef ObjectType is the type of object owned by the pointer. */ using ObjectType = Object; @@ -63,7 +63,7 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer using DeviceType = Devices::Host; /** - * \brief Constructor of empty pointer. + * \brief Constructor of an empty pointer. 
*/ SharedPointer( std::nullptr_t ) : pd( nullptr ) @@ -71,7 +71,7 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer /** * \brief Constructor with parameters of the Object constructor. - * + * * \tparam Args is variadic template type of arguments of the Object constructor. * \tparam args are arguments passed to the Object constructor. */ @@ -87,7 +87,7 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer /** * \brief Copy constructor. - * + * * \param pointer is the source shared pointer. */ SharedPointer( const SharedPointer& pointer ) // this is needed only to avoid the default compiler-generated constructor @@ -98,11 +98,11 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer /** * \brief Copy constructor. - * + * * This is specialization for compatible object types. - * + * * See \ref Enabler. - * + * * \param pointer is the source shared pointer. */ template< typename Object_, @@ -115,7 +115,7 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer /** * \brief Move constructor. - * + * * \param pointer is the source shared pointer. */ SharedPointer( SharedPointer&& pointer ) // this is needed only to avoid the default compiler-generated constructor @@ -126,11 +126,11 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer /** * \brief Move constructor. - * + * * This is specialization for compatible object types. - * + * * See \ref Enabler. - * + * * \param pointer is the source shared pointer. */ template< typename Object_, @@ -143,7 +143,7 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer /** * \brief Create new object based in given constructor parameters. - * + * * \tparam Args is variadic template type of arguments to be passed to the * object constructor. * \param args are arguments to be passed to the object constructor. @@ -176,7 +176,7 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer /** * \brief Arrow operator for accessing the object owned by constant smart pointer. - * + * * \return constant pointer to the object owned by this smart pointer. */ const Object* operator->() const @@ -187,7 +187,7 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer /** * \brief Arrow operator for accessing the object owned by non-constant smart pointer. - * + * * \return pointer to the object owned by this smart pointer. */ Object* operator->() @@ -198,7 +198,7 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer /** * \brief Dereferencing operator for accessing the object owned by constant smart pointer. - * + * * \return constant reference to the object owned by this smart pointer. */ const Object& operator *() const @@ -209,7 +209,7 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer /** * \brief Dereferencing operator for accessing the object owned by non-constant smart pointer. - * + * * \return reference to the object owned by this smart pointer. */ Object& operator *() @@ -220,7 +220,7 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer /** * \brief Conversion to boolean type. - * + * * \return Returns true if the pointer is not empty, false otherwise. */ operator bool() const @@ -243,7 +243,7 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer * * No synchronization of this pointer will be performed due to calling * this method. - * + * * \tparam Device says what image of the object one want to dereference. It * can be either \ref DeviceType or Devices::Host. 
* \return constant reference to the object image on given device. @@ -261,7 +261,7 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer * * No synchronization of this pointer will be performed due to calling * this method. - * + * * \tparam Device says what image of the object one want to dereference. It * can be either \ref DeviceType or Devices::Host. * \return constant reference to the object image on given device. @@ -276,9 +276,9 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer /** * \brief Assignment operator. - * - * It assigns object owned by the pointer \ref ptr to \ref this pointer. - * + * + * It assigns object owned by the pointer \ref ptr to \ref this pointer. + * * \param ptr input pointer * \return constant reference to \ref this */ @@ -293,11 +293,11 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer /** * \brief Assignment operator for compatible object types. - * - * It assigns object owned by the pointer \ref ptr to \ref this pointer. - * + * + * It assigns object owned by the pointer \ref ptr to \ref this pointer. + * * See \ref Enabler. - * + * * \param ptr input pointer * \return constant reference to \ref this */ @@ -314,9 +314,9 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer /** * \brief Move operator. - * - * It assigns object owned by the pointer \ref ptr to \ref this pointer. - * + * + * It assigns object owned by the pointer \ref ptr to \ref this pointer. + * * \param ptr input pointer * \return constant reference to \ref this */ @@ -330,11 +330,11 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer /** * \brief Move operator. - * - * It assigns object owned by the pointer \ref ptr to \ref this pointer. - * + * + * It assigns object owned by the pointer \ref ptr to \ref this pointer. + * * See \ref Enabler. - * + * * \param ptr input pointer * \return constant reference to \ref this */ @@ -350,9 +350,9 @@ class SharedPointer< Object, Devices::Host > : public SmartPointer /** * \brief Cross-device pointer synchronization. - * - * For the smart pointers in the host, this method does nothing. - * + * + * For the smart pointers on the host, this method does nothing. + * * \return true. */ bool synchronize() -- GitLab From 91166cb2e20b20f767c0516ed61095da75509d81 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Tue, 26 Nov 2019 13:19:42 +0100 Subject: [PATCH 43/43] Writing device pointer documentation. --- src/TNL/Pointers/DevicePointer.h | 408 ++++++++++++++++++++++++++----- 1 file changed, 351 insertions(+), 57 deletions(-) diff --git a/src/TNL/Pointers/DevicePointer.h b/src/TNL/Pointers/DevicePointer.h index acac6aafa..df3e1d5c2 100644 --- a/src/TNL/Pointers/DevicePointer.h +++ b/src/TNL/Pointers/DevicePointer.h @@ -20,15 +20,31 @@ #include #include -#include -#include // std::memcpy, std::memcmp +#include // std::memcpy, std::memcmp namespace TNL { namespace Pointers { -/*** - * The DevicePointer is like SharedPointer, except it takes an existing host +/** + * \brief The DevicePointer is like SharedPointer, except it takes an existing host * object - there is no call to the ObjectType's constructor nor destructor. + * + * **NOTE: When using smart pointers to pass objects on GPU, one must call + * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >() + * before calling a CUDA kernel working with smart pointers.** + * + * \tparam Object is a type of object to be owned by the pointer. + * \tparam Device is device where the object is to be allocated. 
The object is + * always allocated on the host system as well for easier object manipulation. + * + * See also \ref UniquePointer and \ref SharedPointer. + * + * See also \ref DevicePointer< Object, Devices::Host > and \ref DevicePointer< Object, Devices::Cuda >. + * + * \par Example + * \include Pointers/DevicePointerExample.cpp + * \par Output + * \include DevicePointerExample.out */ template< typename Object, typename Device = typename Object::DeviceType > @@ -37,17 +53,22 @@ class DevicePointer static_assert( ! std::is_same< Device, void >::value, "The device cannot be void. You need to specify the device explicitly in your code." ); }; -/**** - * Specialization for Devices::Host +/** + * \brief Specialization of the \ref DevicePointer for the host system. + * + * \tparam Object is a type of object to be owned by the pointer. */ template< typename Object > class DevicePointer< Object, Devices::Host > : public SmartPointer { private: - // Convenient template alias for controlling the selection of copy- and - // move-constructors and assignment operators using SFINAE. - // The type Object_ is "enabled" iff Object_ and Object are not the same, - // but after removing const and volatile qualifiers they are the same. + /** + * \typedef Enabler + * Convenient template alias for controlling the selection of copy- and + * move-constructors and assignment operators using SFINAE. + * The type Object_ is "enabled" iff Object_ and Object are not the same, + * but after removing const and volatile qualifiers they are the same. + */ template< typename Object_ > using Enabler = std::enable_if< ! std::is_same< Object_, Object >::value && std::is_same< typename std::remove_cv< Object >::type, Object_ >::value >; @@ -58,84 +79,161 @@ class DevicePointer< Object, Devices::Host > : public SmartPointer public: - typedef Object ObjectType; - typedef Devices::Host DeviceType; + /** + * \typedef ObjectType is the type of object owned by the pointer. + */ + using ObjectType = Object; /** - * \brief Constructor of empty pointer. + * \typedef DeviceType is the type of device where the object is to be + * mirrored. + */ + using DeviceType = Devices::Host; + + /** + * \brief Constructor of an empty pointer. */ DevicePointer( std::nullptr_t ) : pointer( nullptr ) {} + /** + * \brief Constructor with an object reference. + * + * \param obj reference to an object to be managed by the pointer. + */ explicit DevicePointer( ObjectType& obj ) : pointer( nullptr ) { this->pointer = &obj; } - // this is needed only to avoid the default compiler-generated constructor - DevicePointer( const DevicePointer& pointer ) + /** + * \brief Copy constructor. + * + * \param pointer is the source device pointer. + */ + DevicePointer( const DevicePointer& pointer ) // this is needed only to avoid the default compiler-generated constructor : pointer( pointer.pointer ) { } - // conditional constructor for non-const -> const data + /** + * \brief Copy constructor. + * + * This is specialization for compatible object types. + * + * See \ref Enabler. + * + * \param pointer is the source device pointer. 
+       */
      template< typename Object_,
                typename = typename Enabler< Object_ >::type >
-      DevicePointer( const DevicePointer< Object_, DeviceType >& pointer )
+      DevicePointer( const DevicePointer< Object_, DeviceType >& pointer ) // conditional constructor for non-const -> const data
      : pointer( pointer.pointer )
      {
      }
 
-      // this is needed only to avoid the default compiler-generated constructor
-      DevicePointer( DevicePointer&& pointer )
+      /**
+       * \brief Move constructor.
+       *
+       * \param pointer is the source device pointer.
+       */
+      DevicePointer( DevicePointer&& pointer ) // this is needed only to avoid the default compiler-generated constructor
      : pointer( pointer.pointer )
      {
         pointer.pointer = nullptr;
      }
 
-      // conditional constructor for non-const -> const data
+      /**
+       * \brief Move constructor.
+       *
+       * This is a specialization for compatible object types.
+       *
+       * See \ref Enabler.
+       *
+       * \param pointer is the source device pointer.
+       */
      template< typename Object_,
                typename = typename Enabler< Object_ >::type >
-      DevicePointer( DevicePointer< Object_, DeviceType >&& pointer )
+      DevicePointer( DevicePointer< Object_, DeviceType >&& pointer ) // conditional constructor for non-const -> const data
      : pointer( pointer.pointer )
      {
         pointer.pointer = nullptr;
      }
 
+      /**
+       * \brief Arrow operator for accessing the object owned by constant smart pointer.
+       *
+       * \return constant pointer to the object owned by this smart pointer.
+       */
      const Object* operator->() const
      {
         return this->pointer;
      }
 
+      /**
+       * \brief Arrow operator for accessing the object owned by non-constant smart pointer.
+       *
+       * \return pointer to the object owned by this smart pointer.
+       */
      Object* operator->()
      {
         return this->pointer;
      }
 
+      /**
+       * \brief Dereferencing operator for accessing the object owned by constant smart pointer.
+       *
+       * \return constant reference to the object owned by this smart pointer.
+       */
      const Object& operator *() const
      {
         return *( this->pointer );
      }
 
+      /**
+       * \brief Dereferencing operator for accessing the object owned by non-constant smart pointer.
+       *
+       * \return reference to the object owned by this smart pointer.
+       */
      Object& operator *()
      {
         return *( this->pointer );
      }
 
+      /**
+       * \brief Conversion to boolean type.
+       *
+       * \return Returns true if the pointer is not empty, false otherwise.
+       */
      __cuda_callable__
      operator bool() const
      {
         return this->pointer;
      }
 
+      /**
+       * \brief Negation operator.
+       *
+       * \return Returns false if the pointer is not empty, true otherwise.
+       */
      __cuda_callable__
      bool operator!() const
      {
         return ! this->pointer;
      }
 
+      /**
+       * \brief Constant object reference getter.
+       *
+       * No synchronization of this pointer will be performed due to calling
+       * this method.
+       *
+       * \tparam Device says what image of the object one wants to dereference. It
+       * can be either \ref DeviceType or Devices::Host.
+       * \return constant reference to the object image on given device.
+       */
      template< typename Device = Devices::Host >
      __cuda_callable__
      const Object& getData() const
@@ -143,6 +241,16 @@ class DevicePointer< Object, Devices::Host > : public SmartPointer
         return *( this->pointer );
      }
 
+      /**
+       * \brief Non-constant object reference getter.
+       *
+       * No synchronization of this pointer will be performed due to calling
+       * this method.
+       *
+       * \tparam Device says what image of the object one wants to dereference. It
+       * can be either \ref DeviceType or Devices::Host.
+       * \return reference to the object image on given device.
+ */ template< typename Device = Devices::Host > __cuda_callable__ Object& modifyData() @@ -150,40 +258,79 @@ class DevicePointer< Object, Devices::Host > : public SmartPointer return *( this->pointer ); } - // this is needed only to avoid the default compiler-generated operator - const DevicePointer& operator=( const DevicePointer& ptr ) + /** + * \brief Assignment operator. + * + * It assigns object owned by the pointer \ref ptr to \ref this pointer. + * + * \param ptr input pointer + * \return constant reference to \ref this + */ + const DevicePointer& operator=( const DevicePointer& ptr ) // this is needed only to avoid the default compiler-generated operator { this->pointer = ptr.pointer; return *this; } - // conditional operator for non-const -> const data + /** + * \brief Assignment operator for compatible object types. + * + * It assigns object owned by the pointer \ref ptr to \ref this pointer. + * + * See \ref Enabler. + * + * \param ptr input pointer + * \return constant reference to \ref this + */ template< typename Object_, typename = typename Enabler< Object_ >::type > - const DevicePointer& operator=( const DevicePointer< Object_, DeviceType >& ptr ) + const DevicePointer& operator=( const DevicePointer< Object_, DeviceType >& ptr ) // conditional operator for non-const -> const data { this->pointer = ptr.pointer; return *this; } - // this is needed only to avoid the default compiler-generated operator - const DevicePointer& operator=( DevicePointer&& ptr ) + /** + * \brief Move operator. + * + * It assigns object owned by the pointer \ref ptr to \ref this pointer. + * + * \param ptr input pointer + * \return constant reference to \ref this + */ + const DevicePointer& operator=( DevicePointer&& ptr ) // this is needed only to avoid the default compiler-generated operator { this->pointer = ptr.pointer; ptr.pointer = nullptr; return *this; } - // conditional operator for non-const -> const data + /** + * \brief Move operator. + * + * It assigns object owned by the pointer \ref ptr to \ref this pointer. + * + * See \ref Enabler. + * + * \param ptr input pointer + * \return constant reference to \ref this + */ template< typename Object_, typename = typename Enabler< Object_ >::type > - const DevicePointer& operator=( DevicePointer< Object_, DeviceType >&& ptr ) + const DevicePointer& operator=( DevicePointer< Object_, DeviceType >&& ptr ) // conditional operator for non-const -> const data { this->pointer = ptr.pointer; ptr.pointer = nullptr; return *this; } + /** + * \brief Cross-device pointer synchronization. + * + * For the smart pointers on the host, this method does nothing. + * + * \return true. + */ bool synchronize() { return true; @@ -192,14 +339,16 @@ class DevicePointer< Object, Devices::Host > : public SmartPointer /** * \brief Swap the owned object with another pointer. * - * \param ptr2 the other shared pointer for swapping. + * \param ptr2 the other device pointer for swapping. */ void swap( DevicePointer& ptr2 ) { std::swap( this->pointer, ptr2.pointer ); } - + /** + * \brief Destructor. + */ ~DevicePointer() { } @@ -210,17 +359,23 @@ class DevicePointer< Object, Devices::Host > : public SmartPointer Object* pointer; }; -/**** - * Specialization for CUDA +/** + * \brief Specialization of the \ref DevicePointer for the CUDA device. + * + * \tparam Object is a type of object to be owned by the pointer. 
*/ template< typename Object > class DevicePointer< Object, Devices::Cuda > : public SmartPointer { private: - // Convenient template alias for controlling the selection of copy- and - // move-constructors and assignment operators using SFINAE. - // The type Object_ is "enabled" iff Object_ and Object are not the same, - // but after removing const and volatile qualifiers they are the same. + /** + * \typedef Enabler + * + * Convenient template alias for controlling the selection of copy- and + * move-constructors and assignment operators using SFINAE. + * The type Object_ is "enabled" iff Object_ and Object are not the same, + * but after removing const and volatile qualifiers they are the same. + */ template< typename Object_ > using Enabler = std::enable_if< ! std::is_same< Object_, Object >::value && std::is_same< typename std::remove_cv< Object >::type, Object_ >::value >; @@ -231,8 +386,16 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer public: - typedef Object ObjectType; - typedef Devices::Cuda DeviceType; + /** + * \typedef ObjectType is the type of object owned by the pointer. + */ + using ObjectType = Object; + + /** + * \typedef DeviceType is the type of device where the object is to be + * mirrored. + */ + using DeviceType = Devices::Cuda; /** * \brief Constructor of empty pointer. @@ -242,6 +405,11 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer pd( nullptr ), cuda_pointer( nullptr ) {} + /** + * \brief Constructor with an object reference. + * + * \param obj is a reference on an object to be managed by the pointer. + */ explicit DevicePointer( ObjectType& obj ) : pointer( nullptr ), pd( nullptr ), @@ -250,8 +418,12 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer this->allocate( obj ); } - // this is needed only to avoid the default compiler-generated constructor - DevicePointer( const DevicePointer& pointer ) + /** + * \brief Copy constructor. + * + * \param pointer is the source device pointer. + */ + DevicePointer( const DevicePointer& pointer ) // this is needed only to avoid the default compiler-generated constructor : pointer( pointer.pointer ), pd( (PointerData*) pointer.pd ), cuda_pointer( pointer.cuda_pointer ) @@ -259,10 +431,18 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer this->pd->counter += 1; } - // conditional constructor for non-const -> const data + /** + * \brief Copy constructor. + * + * This is specialization for compatible object types. + * + * See \ref Enabler. + * + * \param pointer is the source device pointer. + */ template< typename Object_, typename = typename Enabler< Object_ >::type > - DevicePointer( const DevicePointer< Object_, DeviceType >& pointer ) + DevicePointer( const DevicePointer< Object_, DeviceType >& pointer ) // conditional constructor for non-const -> const data : pointer( pointer.pointer ), pd( (PointerData*) pointer.pd ), cuda_pointer( pointer.cuda_pointer ) @@ -270,8 +450,12 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer this->pd->counter += 1; } - // this is needed only to avoid the default compiler-generated constructor - DevicePointer( DevicePointer&& pointer ) + /** + * \brief Move constructor. + * + * \param pointer is the source device pointer. 
+ */ + DevicePointer( DevicePointer&& pointer ) // this is needed only to avoid the default compiler-generated constructor : pointer( pointer.pointer ), pd( (PointerData*) pointer.pd ), cuda_pointer( pointer.cuda_pointer ) @@ -281,10 +465,18 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer pointer.cuda_pointer = nullptr; } - // conditional constructor for non-const -> const data + /** + * \brief Move constructor. + * + * This is specialization for compatible object types. + * + * See \ref Enabler. + * + * \param pointer is the source device pointer. + */ template< typename Object_, typename = typename Enabler< Object_ >::type > - DevicePointer( DevicePointer< Object_, DeviceType >&& pointer ) + DevicePointer( DevicePointer< Object_, DeviceType >&& pointer ) // conditional constructor for non-const -> const data : pointer( pointer.pointer ), pd( (PointerData*) pointer.pd ), cuda_pointer( pointer.cuda_pointer ) @@ -294,6 +486,13 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer pointer.cuda_pointer = nullptr; } + /** + * \brief Arrow operator for accessing the object owned by constant smart pointer. + * + * \return constant pointer to the object owned by this smart pointer. It + * returns pointer to object image on the CUDA device if it is called from CUDA + * kernel and pointer to host image otherwise. + */ __cuda_callable__ const Object* operator->() const { @@ -304,6 +503,13 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer #endif } + /** + * \brief Arrow operator for accessing the object owned by non-constant smart pointer. + * + * \return pointer to the object owned by this smart pointer. It + * returns pointer to object image on the CUDA device if it is called from CUDA + * kernel and pointer to host image otherwise. + */ __cuda_callable__ Object* operator->() { @@ -315,6 +521,13 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer #endif } + /** + * \brief Dereferencing operator for accessing the object owned by constant smart pointer. + * + * \return constant reference to the object owned by this smart pointer. It + * returns reference to object image on the CUDA device if it is called from CUDA + * kernel and reference to host image otherwise. + */ __cuda_callable__ const Object& operator *() const { @@ -325,6 +538,13 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer #endif } + /** + * \brief Dereferencing operator for accessing the object owned by non-constant smart pointer. + * + * \return reference to the object owned by this smart pointer. It + * returns reference to object image on the CUDA device if it is called from CUDA + * kernel and reference to host image otherwise. + */ __cuda_callable__ Object& operator *() { @@ -336,18 +556,38 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer #endif } + /** + * \brief Conversion to boolean type. + * + * \return Returns true if the pointer is not empty, false otherwise. + */ __cuda_callable__ operator bool() const { return this->pd; } + /** + * \brief Negation operator. + * + * \return Returns false if the pointer is not empty, true otherwise. + */ __cuda_callable__ bool operator!() const { return ! this->pd; } + /** + * \brief Constant object reference getter. + * + * No synchronization of this pointer will be performed due to calling + * this method. + * + * \tparam Device says what image of the object one want to dereference. It + * can be either \ref DeviceType or Devices::Host. 
+       * \return constant reference to the object image on given device.
+       */
      template< typename Device = Devices::Host >
      __cuda_callable__
      const Object& getData() const
@@ -362,6 +602,18 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
         return *( this->cuda_pointer );
      }
 
+      /**
+       * \brief Non-constant object reference getter.
+       *
+       * After calling this method, the object owned by the pointer might need
+       * to be synchronized. One should not forget to call
+       * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >()
+       * before calling a CUDA kernel using the object from this smart pointer.
+       *
+       * \tparam Device says what image of the object one wants to dereference. It
+       * can be either \ref DeviceType or Devices::Host.
+       * \return reference to the object image on given device.
+       */
      template< typename Device = Devices::Host >
      __cuda_callable__
      Object& modifyData()
@@ -379,8 +631,15 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
         return *( this->cuda_pointer );
      }
 
-      // this is needed only to avoid the default compiler-generated operator
-      const DevicePointer& operator=( const DevicePointer& ptr )
+      /**
+       * \brief Assignment operator.
+       *
+       * It assigns the object owned by the pointer \ref ptr to \ref this pointer.
+       *
+       * \param ptr input pointer
+       * \return constant reference to \ref this
+       */
+      const DevicePointer& operator=( const DevicePointer& ptr ) // this is needed only to avoid the default compiler-generated operator
      {
         this->free();
         this->pointer = ptr.pointer;
@@ -391,10 +650,19 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
         return *this;
      }
 
-      // conditional operator for non-const -> const data
+      /**
+       * \brief Assignment operator for compatible object types.
+       *
+       * It assigns the object owned by the pointer \ref ptr to \ref this pointer.
+       *
+       * See \ref Enabler.
+       *
+       * \param ptr input pointer
+       * \return constant reference to \ref this
+       */
      template< typename Object_,
                typename = typename Enabler< Object_ >::type >
-      const DevicePointer& operator=( const DevicePointer< Object_, DeviceType >& ptr )
+      const DevicePointer& operator=( const DevicePointer< Object_, DeviceType >& ptr ) // conditional operator for non-const -> const data
      {
         this->free();
         this->pointer = ptr.pointer;
@@ -405,8 +673,15 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
         return *this;
      }
 
-      // this is needed only to avoid the default compiler-generated operator
-      const DevicePointer& operator=( DevicePointer&& ptr )
+      /**
+       * \brief Move operator.
+       *
+       * It assigns the object owned by the pointer \ref ptr to \ref this pointer.
+       *
+       * \param ptr input pointer
+       * \return constant reference to \ref this
+       */
+      const DevicePointer& operator=( DevicePointer&& ptr ) // this is needed only to avoid the default compiler-generated operator
      {
         this->free();
         this->pointer = ptr.pointer;
@@ -418,10 +693,19 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
         return *this;
      }
 
-      // conditional operator for non-const -> const data
+      /**
+       * \brief Move operator.
+       *
+       * It assigns the object owned by the pointer \ref ptr to \ref this pointer.
+       *
+       * See \ref Enabler.
+ * + * \param ptr input pointer + * \return constant reference to \ref this + */ template< typename Object_, typename = typename Enabler< Object_ >::type > - const DevicePointer& operator=( DevicePointer< Object_, DeviceType >&& ptr ) + const DevicePointer& operator=( DevicePointer< Object_, DeviceType >&& ptr ) // conditional operator for non-const -> const data { this->free(); this->pointer = ptr.pointer; @@ -433,6 +717,14 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer return *this; } + /** + * \brief Cross-device pointer synchronization. + * + * This method is usually called by the smart pointers register when calling + * \ref Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >() + * + * \return true if the synchronization was successful, false otherwise. + */ bool synchronize() { if( ! this->pd ) @@ -456,7 +748,7 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer /** * \brief Swap the owned object with another pointer. * - * \param ptr2 the other shared pointer for swapping. + * \param ptr2 the other device pointer for swapping. */ void swap( DevicePointer& ptr2 ) { @@ -465,7 +757,9 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer std::swap( this->cuda_pointer, ptr2.cuda_pointer ); } - + /** + * \brief Destructor. + */ ~DevicePointer() { this->free(); -- GitLab
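
A companion sketch for the DevicePointer workflow documented in patch 43,
under the same caveats: the Grid type, the kernel, and the header paths
are illustrative assumptions.

    #include <cstdio>
    #include <TNL/Pointers/DevicePointer.h>   // assumed header path
    #include <TNL/Devices/Cuda.h>             // assumed header path

    using namespace TNL;

    struct Grid
    {
       int size = 0;
    };

    #ifdef HAVE_CUDA
    __global__ void printGridSize( const Grid* grid )
    {
       printf( "size = %d\n", grid->size );
    }
    #endif

    int main()
    {
       // DevicePointer wraps an existing host object: unlike SharedPointer,
       // it calls neither the constructor nor the destructor of Grid.
       Grid grid;
       Pointers::DevicePointer< Grid, Devices::Cuda > gridPointer( grid );

       // Modify the host image, then synchronize it to the device image.
       gridPointer.modifyData< Devices::Host >().size = 100;
       Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();

    #ifdef HAVE_CUDA
       printGridSize<<< 1, 1 >>>( &gridPointer.getData< Devices::Cuda >() );
       cudaDeviceSynchronize();
    #endif
       return 0;
    }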