diff --git a/.gitignore b/.gitignore
index 95228a33e88d9365ddb32d498eea03a54192e3f0..26b082851c1968ea8a0896a009be495c73f884b5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,44 +1,8 @@
-
-# /
-/build
-/missing
-/Makefile.in
-/ltmain.sh
-/install-sh
-/depcomp
-/configure
-/config.*
-/aclocal.m4
-/m4
-/autom4te.cache
+.settings
+/nbproject
 /Debug
 /Release
-.settings
-
-.settings
-
-# /src/
-/src/Makefile.in
-
-# /src/core/
-/src/core/Makefile.in
-
-# /src/debug/
-/src/debug/Makefile.in
-
-# /src/diff/
-/src/diff/Makefile.in
-
-# /src/matrix/
-/src/matrix/*.in
-
-# /tools/
-/tools/Makefile.in
-
-# /tools/share/
-/tools/share/Makefile.in
-
-# /tools/src/
-/tools/src/Makefile.in
 /Testing
 /CMakeLists.txt.user
+/doc/_build
+/Build
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d343d7ff6caa8defdd9897ef041e6502beee288a..2d21da4e56e7651962cf73363c667ced452b18e8 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,23 +15,22 @@ include( UseCodeCoverage )
 if( CMAKE_BUILD_TYPE STREQUAL "Debug")
     set( PROJECT_BUILD_PATH ${PROJECT_SOURCE_DIR}/Debug/src )
     set( PROJECT_TESTS_PATH ${PROJECT_SOURCE_DIR}/Debug/tests )
+    set( PROJECT_TOOLS_PATH ${PROJECT_SOURCE_DIR}/Debug/tools )
     set( LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/Debug/lib )
     set( EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/Debug/bin )
     set( debugExt -dbg )
-    AddCompilerFlag( "-g" )
+    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g ")
+    #AddCompilerFlag( "-g" )
 else()
     set( PROJECT_BUILD_PATH ${PROJECT_SOURCE_DIR}/Release/src )
     set( PROJECT_TESTS_PATH ${PROJECT_SOURCE_DIR}/Release/tests )
+    set( PROJECT_TOOLS_PATH ${PROJECT_SOURCE_DIR}/Release/tools )
     set( LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/Release/lib)
     set( EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/Release/bin)
-    OptimizeForArchitecture()
-    AddCompilerFlag( "-O3 -DNDEBUG" )
+    #OptimizeForArchitecture()
+    AddCompilerFlag( "-O3 -march=native -DNDEBUG" )
 endif()
 
-if( WITH_TEMPLATE_EXPLICIT_INSTANTIATION STREQUAL "yes" )
-   AddCompilerFlag( "-DTEMPLATE_EXPLICIT_INSTANTIATION " )
-endif()   
-
 #####
 # Check for CUDA
 #
@@ -44,10 +43,60 @@ if( WITH_CUDA STREQUAL "yes" )
         set(CUDA_SEPARABLE_COMPILATION ON)
         set(CUSPARSE_LIBRARY /usr/local/cuda/lib64/libcusparse.so) # TODO: fix this              
         set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -DHAVE_CUDA )
-        AddCompilerFlag( "-DHAVE_NOT_CXX11 -U_GLIBCXX_ATOMIC_BUILTINS -U_GLIBCXX_USE_INT128 " )          
-        set( CUDA_ADD_EXECUTABLE_OPTIONS -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 )
-        set( CUDA_ADD_LIBRARY_OPTIONS -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -shared )
+        AddCompilerFlag( "-DHAVE_NOT_CXX11" ) # -U_GLIBCXX_ATOMIC_BUILTINS -U_GLIBCXX_USE_INT128 " )
+        set( ALL_CUDA_ARCHS -gencode arch=compute_20,code=sm_20
+                            -gencode arch=compute_30,code=sm_30
+                            -gencode arch=compute_32,code=sm_32 
+                            -gencode arch=compute_37,code=sm_37 
+                            -gencode arch=compute_35,code=sm_35 
+                            -gencode arch=compute_50,code=sm_50 
+                            -gencode arch=compute_52,code=sm_52 )
+        if( WITH_CUDA_ARCH STREQUAL "all" )
+           set( CUDA_ARCH ${ALL_CUDA_ARCHS} )   
+        else()
+            if( WITH_CUDA_ARCH STREQUAL "auto")
+                ####
+                # Select GPU architecture
+                #
+                set( CUDA_ARCH_EXECUTABLE ${EXECUTABLE_OUTPUT_PATH}/tnl-cuda-arch)
+                set( CUDA_ARCH_SOURCE ${PROJECT_SOURCE_DIR}/tools/src/tnl-cuda-arch.cu)
+                message( "Compiling tnl-cuda-arch ..." )
+                file( MAKE_DIRECTORY ${EXECUTABLE_OUTPUT_PATH} )
+                execute_process( COMMAND nvcc ${CUDA_ARCH_SOURCE} -o ${CUDA_ARCH_EXECUTABLE}
+                                 RESULT_VARIABLE CUDA_ARCH_RESULT
+                                 OUTPUT_VARIABLE CUDA_ARCH_OUTPUT
+                                 ERROR_VARIABLE CUDA_ARCH_OUTPUT )
+                execute_process( COMMAND ${CUDA_ARCH_EXECUTABLE}
+                                 OUTPUT_VARIABLE CUDA_ARCH )
+                if( NOT CUDA_ARCH_RESULT )
+                    # strip linebreaks and convert to list delimited with ';'
+                    string( REGEX REPLACE "[\n ]" ";" CUDA_ARCH ${CUDA_ARCH} )
+                    # cache the result
+                    set( CUDA_ARCH ${CUDA_ARCH} CACHE STRING "GPU architecture options" )
+                else()
+                    message( "Failed to detect GPU architecture:\n${CUDA_ARCH_OUTPUT}" )
+                    message( "Using (almost) all GPU architectures as fallback." )
+                    set( CUDA_ARCH ${ALL_CUDA_ARCHS} )
+                endif()
+                message( "GPU architecture options:  ${CUDA_ARCH}" )
+            else()
+                set( CUDA_ARCH -gencode arch=compute_${WITH_CUDA_ARCH},code=sm_${WITH_CUDA_ARCH} )
+            endif()
+        endif()
+        set( CUDA_ADD_EXECUTABLE_OPTIONS ${CUDA_ARCH} )
+        set( CUDA_ADD_LIBRARY_OPTIONS ${CUDA_ARCH} -shared )
         set( CUDA_LINKER_OPTIONS "-arch sm_20 -shared " )
+
+
+        ####
+        # Check for cuBLAS
+        #
+        if( WITH_CUBLAS STREQUAL "yes" ) 
+            message( "Enabling CUBLAS." )
+            # NOTE(review): the string form below is what configure_file substitutes into the header
+            set( HAVE_CUBLAS "#define HAVE_CUBLAS" )
+        endif( WITH_CUBLAS STREQUAL "yes" )       
+
         ####
         # Check for CUSP
         #
@@ -85,6 +134,7 @@ if( WITH_CUDA STREQUAL "yes" )
       AddCompilerFlag( "-std=gnu++0x" )         
     endif( CUDA_FOUND )
 else( WITH_CUDA STREQUAL "yes" )
+   #AddCompilerFlag( "-std=gnu++0x -ftree-vectorizer-verbose=1" )       
    AddCompilerFlag( "-std=gnu++0x" )       
 endif( WITH_CUDA STREQUAL "yes" )    
 
@@ -93,7 +143,8 @@ endif( WITH_CUDA STREQUAL "yes" )
 #
 find_package( OpenMP ) 
 if( OPENMP_FOUND )
-   AddCompilerFlag( "-DHAVE_OPENMP -fopenmp" )
+   #AddCompilerFlag( "-DHAVE_OPENMP -fopenmp" )
+   set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_OPENMP -fopenmp")
 # TODO: finish this
 endif()
 
@@ -139,51 +190,55 @@ endif()
 ####
 # Check for cppunit
 #
-FIND_PATH(CPPUNIT_INCLUDE_DIR cppunit/TestCase.h
-  /usr/local/include
-  /usr/include
-  DOC "CppUnit headers."
-)
+if( WITH_TESTS STREQUAL "yes" )
+    FIND_PATH(CPPUNIT_INCLUDE_DIR cppunit/TestCase.h
+      /usr/local/include
+      /usr/include
+      DOC "CppUnit headers."
+    )
 
-####
-# With Win32, important to have both
-#
-if(WIN32)
-  FIND_LIBRARY(CPPUNIT_LIBRARY cppunit
-               ${CPPUNIT_INCLUDE_DIR}/../lib
-               /usr/local/lib
-               /usr/lib)
-  FIND_LIBRARY(CPPUNIT_DEBUG_LIBRARY cppunitd
-               ${CPPUNIT_INCLUDE_DIR}/../lib
-               /usr/local/lib
-               /usr/lib)
-else(WIN32)
-  # On unix system, debug and release have the same name
-  FIND_LIBRARY(CPPUNIT_LIBRARY cppunit
-               ${CPPUNIT_INCLUDE_DIR}/../lib
-               /usr/local/lib
-               /usr/lib)
-  FIND_LIBRARY(CPPUNIT_DEBUG_LIBRARY cppunit
-               ${CPPUNIT_INCLUDE_DIR}/../lib
-               /usr/local/lib
-               /usr/lib)
-endif(WIN32)
-
-
-if( ${CPPUNIT_INCLUDE_DIR} STREQUAL "CPPUNIT_INCLUDE_DIR-NOTFOUND" )
-      message( "CPPUNIT not found." )
-      set( HAVE_CPPUNIT "//#define HAVE_CPPUNIT" )
-else( ${CPPUNIT_INCLUDE_DIR} STREQUAL "CPPUNIT_INCLUDE_DIR-NOTFOUND" )
-  message( "CPPUNIT headers found -- ${CPPUNIT_INCLUDE_DIR}" )
-  if(CPPUNIT_LIBRARY)
-    message( "CPPUNIT library found -- ${CPPUNIT_LIBRARY}" )
-    set(CPPUNIT_FOUND "YES")
-    set(CPPUNIT_LIBRARIES ${CPPUNIT_LIBRARY} ${CMAKE_DL_LIBS})
-    set(CPPUNIT_DEBUG_LIBRARIES ${CPPUNIT_DEBUG_LIBRARY}
-                                ${CMAKE_DL_LIBS})
-   set( HAVE_CPPUNIT "#define HAVE_CPPUNIT" )
-  endif(CPPUNIT_LIBRARY)
-endif( ${CPPUNIT_INCLUDE_DIR} STREQUAL "CPPUNIT_INCLUDE_DIR-NOTFOUND" )
+    ####
+    # With Win32, important to have both
+    #
+    if(WIN32)
+      FIND_LIBRARY(CPPUNIT_LIBRARY cppunit
+                   ${CPPUNIT_INCLUDE_DIR}/../lib
+                   /usr/local/lib
+                   /usr/lib)
+      FIND_LIBRARY(CPPUNIT_DEBUG_LIBRARY cppunitd
+                   ${CPPUNIT_INCLUDE_DIR}/../lib
+                   /usr/local/lib
+                   /usr/lib)
+    else(WIN32)
+      # On unix system, debug and release have the same name
+      FIND_LIBRARY(CPPUNIT_LIBRARY cppunit
+                   ${CPPUNIT_INCLUDE_DIR}/../lib
+                   /usr/local/lib
+                   /usr/lib)
+      FIND_LIBRARY(CPPUNIT_DEBUG_LIBRARY cppunit
+                   ${CPPUNIT_INCLUDE_DIR}/../lib
+                   /usr/local/lib
+                   /usr/lib)
+    endif(WIN32)
+
+
+    if( ${CPPUNIT_INCLUDE_DIR} STREQUAL "CPPUNIT_INCLUDE_DIR-NOTFOUND" )
+          message( "CPPUNIT not found." )
+          set( HAVE_CPPUNIT "//#define HAVE_CPPUNIT" )
+    else( ${CPPUNIT_INCLUDE_DIR} STREQUAL "CPPUNIT_INCLUDE_DIR-NOTFOUND" )
+      message( "CPPUNIT headers found -- ${CPPUNIT_INCLUDE_DIR}" )
+      if(CPPUNIT_LIBRARY)
+        message( "CPPUNIT library found -- ${CPPUNIT_LIBRARY}" )
+        set(CPPUNIT_FOUND "YES")
+        set(CPPUNIT_LIBRARIES ${CPPUNIT_LIBRARY} ${CMAKE_DL_LIBS})
+        set(CPPUNIT_DEBUG_LIBRARIES ${CPPUNIT_DEBUG_LIBRARY}
+                                    ${CMAKE_DL_LIBS})
+       set( HAVE_CPPUNIT "#define HAVE_CPPUNIT" )
+      endif(CPPUNIT_LIBRARY)
+    endif( ${CPPUNIT_INCLUDE_DIR} STREQUAL "CPPUNIT_INCLUDE_DIR-NOTFOUND" )
+    ENABLE_TESTING()
+    INCLUDE( Dart )
+endif( WITH_TESTS STREQUAL "yes" )
 
 #if( BUILD_MPI )
 #   FIND_PATH( PETSC_INCLUDE_DIR petsc.h
@@ -210,9 +265,33 @@ endif( ${CPPUNIT_INCLUDE_DIR} STREQUAL "CPPUNIT_INCLUDE_DIR-NOTFOUND" )
 #   endif()
 #endif()
 
+####
+# Explicit template instantiation
+#
+if( WITH_TEMPLATE_INSTANTIATION STREQUAL "yes" )
+   AddCompilerFlag( "-DTEMPLATE_EXPLICIT_INSTANTIATION " )
+endif()   
+
+if( INSTANTIATE_INT STREQUAL "yes" )
+   AddCompilerFlag( "-DINSTANTIATE_INT " )
+endif()   
+
+if( INSTANTIATE_LONG_INT STREQUAL "yes" )
+   AddCompilerFlag( "-DINSTANTIATE_LONG_INT " )
+endif()   
+
+if( INSTANTIATE_FLOAT STREQUAL "yes" )
+   AddCompilerFlag( "-DINSTANTIATE_FLOAT " )
+endif()   
+
+if( INSTANTIATE_DOUBLE STREQUAL "yes" )
+   AddCompilerFlag( "-DINSTANTIATE_DOUBLE " )
+endif()   
+
+if( INSTANTIATE_LONG_DOUBLE STREQUAL "yes" )
+   AddCompilerFlag( "-DINSTANTIATE_LONG_DOUBLE " )
+endif()   
 
-ENABLE_TESTING()
-INCLUDE( Dart )
 set( CXX_TEST_FLAGS "-fprofile-arcs -ftest-coverage" )
 set( LD_TEST_FLAGS "-lgcov -coverage" )
 
diff --git a/TODO b/TODO
index 410049b00d07ddf33a2ec43ab8ef72d12246c678..c8ed457124545ba0f48fc89c85a1777e92ca1339 100644
--- a/TODO
+++ b/TODO
@@ -1,3 +1,5 @@
+TODO: v tnlMeshResolver se provadi preklad pro vsechny mozne sablonove parametry => prorezat
+
 TODO: napsat FunctionDiscretizer pro jednotne rozhrani RightHandSide
 
 TODO: doplnit mesh travelsals pro jine mesh entity nez cell
diff --git a/build b/build
new file mode 100755
index 0000000000000000000000000000000000000000..cf269c8138fe75ea889184025d938a10bb04f52c
--- /dev/null
+++ b/build
@@ -0,0 +1,105 @@
+#!/bin/bash
+
+TARGET=TNL
+PREFIX=${HOME}/local
+WITH_CUDA="yes"
+WITH_TESTS="yes"
+
+WITH_CUDA_ARCH="auto"
+WITH_CUBLAS="no"
+WITH_TEMPLATE_INSTANTIATION="yes"
+INSTANTIATE_LONG_INT="yes"
+INSTANTIATE_INT="yes"
+INSTANTIATE_LONG_DOUBLE="yes"
+INSTANTIATE_DOUBLE="yes"
+INSTANTIATE_FLOAT="yes"
+CMAKE="cmake"
+CMAKE_ONLY="no"
+HELP="no"
+VERBOSE=""
+ROOT_DIR="."
+BUILD_JOBS=`grep -c processor /proc/cpuinfo`
+
+for option in "$@"
+do
+    case $option in
+        --prefix=*                     ) PREFIX="${option#*=}" ;;
+        --build=*                      ) BUILD="${option#*=}" ;;
+        --with-tests=*                 ) WITH_TESTS="${option#*=}" ;;
+        --with-cuda=*                  ) WITH_CUDA="${option#*=}" ;;
+        --with-cublas=*                ) WITH_CUBLAS="${option#*=}" ;;
+        --with-cuda-arch=*             ) WITH_CUDA_ARCH="${option#*=}";;
+        --with-templates-instantiation=* ) WITH_TEMPLATE_INSTANTIATION="${option#*=}" ;;
+        --instantiate-long-int=*       ) INSTANTIATE_LONG_INT="${option#*=}" ;;
+        --instantiate-int=*            ) INSTANTIATE_INT="${option#*=}" ;;
+        --instantiate-long-double=*    ) INSTANTIATE_LONG_DOUBLE="${option#*=}" ;;
+        --instantiate-double=*         ) INSTANTIATE_DOUBLE="${option#*=}" ;;
+        --instantiate-float=*          ) INSTANTIATE_FLOAT="${option#*=}" ;;
+        --fast-build                   ) INSTANTIATE_LONG_INT="no"
+                                         INSTANTIATE_INT="yes"
+                                         INSTANTIATE_LONG_DOUBLE="no"
+                                         INSTANTIATE_DOUBLE="yes"
+                                         INSTANTIATE_FLOAT="no"
+                                         WITH_CUDA_ARCH="auto" ;;
+        --with-cmake=*                 ) CMAKE="${option#*=}" ;;
+        --build-jobs=*                 ) BUILD_JOBS="${option#*=}" ;;
+        --cmake-only=*                 ) CMAKE_ONLY="${option#*=}" ;;
+        --verbose                      ) VERBOSE="VERBOSE=1" ;;
+        --root-dir=*                   ) ROOT_DIR="${option#*=}" ;;
+        --help                         ) HELP="yes" ;;
+        *                              ) 
+           echo "Unknown option ${option}. Use --help for more information."
+           exit 1 ;;
+    esac
+done
+
+if test ${HELP} = "yes";
+then
+    echo "TNL build options:"
+    echo ""
+    echo "   --prefix=PATH                         Prefix for the installation directory. ${HOME}/local by default."
+    echo "   --build=Debug/Release                 Build type."
+    echo "   --with-tests=yes/no                   Enable unit tests. 'yes' by default (libcppunit-dev is required)."
+    echo "   --with-cuda=yes/no                    Enable CUDA. 'yes' by default (CUDA Toolkit is required)."
+    echo "   --with-cuda-arch=all/auto/30/35/...   Choose CUDA architecture."   
+    echo "   --with-templates-instantiation=yes/no Some TNL templates are precompiled during the build. 'yes' by default."
+    echo "   --with-cmake=CMAKE                    Path to cmake. 'cmake' by default."
+    echo "   --build-jobs=NUM                      Number of processes to be used for the build. It is set to a number of CPU cores by default."
+    echo "   --verbose                             It enables verbose build."
+    echo "   --root-dir=PATH                       Path to the TNL source code root dir."
+    echo "   --help                                Write this help."
+    exit 0
+fi
+
+echo "Configuring ${BUILD} $TARGET ..."
+
+${CMAKE} ${ROOT_DIR} \
+         -DCMAKE_BUILD_TYPE=${BUILD:-Release} \
+         -DCMAKE_INSTALL_PREFIX=${PREFIX} \
+         -DWITH_CUDA=${WITH_CUDA} \
+         -DWITH_CUDA_ARCH=${WITH_CUDA_ARCH} \
+         -DWITH_CUBLAS=${WITH_CUBLAS} \
+         -DWITH_TESTS=${WITH_TESTS} \
+         -DPETSC_DIR=${PETSC_DIR} \
+         -DWITH_TEMPLATE_INSTANTIATION=${WITH_TEMPLATE_INSTANTIATION} \
+         -DINSTANTIATE_FLOAT=${INSTANTIATE_FLOAT} \
+         -DINSTANTIATE_DOUBLE=${INSTANTIATE_DOUBLE} \
+         -DINSTANTIATE_LONG_DOUBLE=${INSTANTIATE_LONG_DOUBLE} \
+         -DINSTANTIATE_INT=${INSTANTIATE_INT} \
+         -DINSTANTIATE_LONG_INT=${INSTANTIATE_LONG_INT}
+
+if test ${CMAKE_ONLY} = "yes";
+then
+    exit 0
+fi
+
+echo "Building ${BUILD} $TARGET using $BUILD_JOBS processors ..."
+
+make -j${BUILD_JOBS} ${VERBOSE}
+
+if test ${WITH_TESTS} = "yes";
+then
+    make -j${BUILD_JOBS} test
+fi
+
+exit 0
\ No newline at end of file
diff --git a/doc/Makefile b/doc/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..0c09d9529610ac26a27bb1783e8d003721ad3315
--- /dev/null
+++ b/doc/Makefile
@@ -0,0 +1,177 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = _build
+
+# User-friendly check for sphinx-build
+ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
+$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
+endif
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
+
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html       to make standalone HTML files"
+	@echo "  dirhtml    to make HTML files named index.html in directories"
+	@echo "  singlehtml to make a single large HTML file"
+	@echo "  pickle     to make pickle files"
+	@echo "  json       to make JSON files"
+	@echo "  htmlhelp   to make HTML files and a HTML help project"
+	@echo "  qthelp     to make HTML files and a qthelp project"
+	@echo "  devhelp    to make HTML files and a Devhelp project"
+	@echo "  epub       to make an epub"
+	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+	@echo "  text       to make text files"
+	@echo "  man        to make manual pages"
+	@echo "  texinfo    to make Texinfo files"
+	@echo "  info       to make Texinfo files and run them through makeinfo"
+	@echo "  gettext    to make PO message catalogs"
+	@echo "  changes    to make an overview of all changed/added/deprecated items"
+	@echo "  xml        to make Docutils-native XML files"
+	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
+	@echo "  linkcheck  to check all external links for integrity"
+	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+	rm -rf $(BUILDDIR)/*
+
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+	@echo
+	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/TNL.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/TNL.qhc"
+
+devhelp:
+	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+	@echo
+	@echo "Build finished."
+	@echo "To view the help file:"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/TNL"
+	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/TNL"
+	@echo "# devhelp"
+
+epub:
+	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+	@echo
+	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+	@echo "Run \`make' in that directory to run these through (pdf)latex" \
+	      "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through pdflatex..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+latexpdfja:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through platex and dvipdfmx..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+	@echo
+	@echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+	@echo
+	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo
+	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+	@echo "Run \`make' in that directory to run these through makeinfo" \
+	      "(use \`make info' here to do that automatically)."
+
+info:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo "Running Texinfo files through makeinfo..."
+	make -C $(BUILDDIR)/texinfo info
+	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+	@echo
+	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/doctest/output.txt."
+
+xml:
+	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
+	@echo
+	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
+
+pseudoxml:
+	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
+	@echo
+	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdcf0336e26e9c90b32d3bfe140235ed9178156f
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,331 @@
+# -*- coding: utf-8 -*-
+#
+# TNL documentation build configuration file, created by
+# sphinx-quickstart on Sun Mar 29 13:12:39 2015.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys
+import os
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.coverage',
+    'sphinx.ext.mathjax',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'TNL'
+copyright = u'2015, Tomáš Oberhuber'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '0.1'
+# The full version, including alpha/beta/rc tags.
+release = '0.1'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+#keep_warnings = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+html_theme = 'default'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents.  If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#html_extra_path = []
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'TNLdoc'
+
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+  ('index', 'TNL.tex', u'TNL Documentation',
+   u'Tomáš Oberhuber', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    ('index', 'tnl', u'TNL Documentation',
+     [u'Tomáš Oberhuber'], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+  ('index', 'TNL', u'TNL Documentation',
+   u'Tomáš Oberhuber', 'TNL', 'One line description of project.',
+   'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#texinfo_no_detailmenu = False
+
+
+# -- Options for Epub output ----------------------------------------------
+
+# Bibliographic Dublin Core info.
+epub_title = u'TNL'
+epub_author = u'Tomáš Oberhuber'
+epub_publisher = u'Tomáš Oberhuber'
+epub_copyright = u'2015, Tomáš Oberhuber'
+
+# The basename for the epub file. It defaults to the project name.
+#epub_basename = u'TNL'
+
+# The HTML theme for the epub output. Since the default themes are not optimized
+# for small screen space, using the same theme for HTML and epub output is
+# usually not wise. This defaults to 'epub', a theme designed to save visual
+# space.
+#epub_theme = 'epub'
+
+# The language of the text. It defaults to the language option
+# or en if the language is not set.
+#epub_language = ''
+
+# The scheme of the identifier. Typical schemes are ISBN or URL.
+#epub_scheme = ''
+
+# The unique identifier of the text. This can be a ISBN number
+# or the project homepage.
+#epub_identifier = ''
+
+# A unique identification for the text.
+#epub_uid = ''
+
+# A tuple containing the cover image and cover page html template filenames.
+#epub_cover = ()
+
+# A sequence of (type, uri, title) tuples for the guide element of content.opf.
+#epub_guide = ()
+
+# HTML files that should be inserted before the pages created by sphinx.
+# The format is a list of tuples containing the path and title.
+#epub_pre_files = []
+
+# HTML files shat should be inserted after the pages created by sphinx.
+# The format is a list of tuples containing the path and title.
+#epub_post_files = []
+
+# A list of files that should not be packed into the epub file.
+epub_exclude_files = ['search.html']
+
+# The depth of the table of contents in toc.ncx.
+#epub_tocdepth = 3
+
+# Allow duplicate toc entries.
+#epub_tocdup = True
+
+# Choose between 'default' and 'includehidden'.
+#epub_tocscope = 'default'
+
+# Fix unsupported image types using the PIL.
+#epub_fix_images = False
+
+# Scale large images.
+#epub_max_image_width = 0
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#epub_show_urls = 'inline'
+
+# If false, no index is generated.
+#epub_use_index = True
diff --git a/doc/index.rst b/doc/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e76c24afab71f694da91ad2ea6b4764ca75d3b88
--- /dev/null
+++ b/doc/index.rst
@@ -0,0 +1,27 @@
+.. TNL documentation master file, created by
+   sphinx-quickstart on Sun Mar 29 13:12:39 2015.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to TNL's documentation!
+===============================
+
+Contents:
+
+.. toctree::
+   :maxdepth: 2
+
+   Introduction to TNL <intro>
+   Installation <install>
+   Users guide to PDE solvers <pde-solvers>
+
+   
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
diff --git a/doc/install.rst b/doc/install.rst
new file mode 100644
index 0000000000000000000000000000000000000000..dc8261a102d61036f38566ea6f258e02759c1f46
--- /dev/null
+++ b/doc/install.rst
@@ -0,0 +1,8 @@
+============
+Installation
+============
+
+TNL can be downloaded from GitHub.
+
+
+
diff --git a/doc/intro.rst b/doc/intro.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d00f28855a518b785c947006a79002f1177acd5c
--- /dev/null
+++ b/doc/intro.rst
@@ -0,0 +1,33 @@
+============
+Introduction
+============
+
+TNL means *Template Numerical Library*. The aim of this project is to develop an *efficient, flexible and easy to use* numerical library.
+
+**Efficiency**
+   Complex numerical simulations may take hundreds of hours. Fast and efficient solvers are therefore very important. TNL is designed to profit from abilities of new accelerators like GPUs (NVidia GeForce, Tesla) and MICs (Xeon Phi). To generate efficient executables, we avoid use of virtual methods on low levels of the code. Instead, C++ templates are used. 
+
+**Flexibility**
+   Development of new numerical schemes and solvers often requires to test many different approaches. Thanks to C++ templates and the design of TNL, it should be quite easy to switch between different schemes, solvers, meshes, precision of the floating point arithmetic or parallel architectures.
+
+**Easy to use**
+   Thanks to C++ templates, TNL offers automatic set-up of underlying structures (numerical meshes, sparse matrices, etc.), solvers (linear solvers, Runge-Kutta solvers, PDE solvers) and parallel architectures (GPU, MIC or MPI (not implemented yet)). TNL can also manage configuration parameters passed from the command line. The user may then concentrate only on the numerical model. 
+
+:Authors:
+   **Tomáš Oberhuber** - TNL design
+
+   **Vítězslav Žabka** - unstructured numerical mesh
+
+   **Vladimír Klement** - multigrid methods
+
+   **Tomáš Sobotík** - numerical methods for signed distance function
+
+   **Ondřej Székely** - FDM solvers for non-linear diffusion problems
+
+   **Libor Bakajsa** - sparse matrix formats for GPUs
+
+   **Jan Vacata** - sparse matrix formats for GPUs
+
+   **Martin Heller** - sparse matrix formats for GPUs
+
+   **Matěj Novotný** - quad double arithmetics
\ No newline at end of file
diff --git a/doc/pde-solvers.rst b/doc/pde-solvers.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6f7eed46d377e19862f297d31ec6d70f3846ed56
--- /dev/null
+++ b/doc/pde-solvers.rst
@@ -0,0 +1,3 @@
+===========
+PDE Solvers
+===========
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 6063a109864aaa2e0aec4e9e4b017def62c202a9..80e1cf9833e94a871c812d04cc1ede21ab3ebaa3 100755
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,5 +1,3 @@
-add_subdirectory( make-project )
-add_subdirectory( simple-solver )
 add_subdirectory( heat-equation )
 add_subdirectory( navier-stokes )
 add_subdirectory( mean-curvature-flow )
diff --git a/examples/heat-equation/tnl-heat-equation-eoc.h b/examples/heat-equation/tnl-heat-equation-eoc.h
index 21157bfaaeb3fd759da28fd7be323824525c1b28..7001a2d71c0dc45166468a752844847794c0cc82 100644
--- a/examples/heat-equation/tnl-heat-equation-eoc.h
+++ b/examples/heat-equation/tnl-heat-equation-eoc.h
@@ -19,16 +19,16 @@
 #define TNL_HEAT_EQUATION_EOC_H_
 
 #include <solvers/tnlSolver.h>
-#include <solvers/tnlFastBuildConfig.h>
-#include <solvers/tnlConfigTags.h>
-#include <functions/tnlTestFunction.h>
+#include <solvers/tnlFastBuildConfigTag.h>
+#include <solvers/tnlBuildConfigTags.h>
+#include <functors/tnlTestFunction.h>
 #include <operators/diffusion/tnlLinearDiffusion.h>
 #include <operators/diffusion/tnlExactLinearDiffusion.h>
 #include <operators/tnlAnalyticDirichletBoundaryConditions.h>
 #include <problems/tnlHeatEquationEocRhs.h>
 #include <problems/tnlHeatEquationEocProblem.h>
 
-//typedef tnlDefaultConfigTag BuildConfig;
+//typedef tnlDefaultBuildConfigTag BuildConfig;
 typedef tnlFastBuildConfig BuildConfig;
 
 template< typename ConfigTag >
diff --git a/examples/heat-equation/tnl-heat-equation.h b/examples/heat-equation/tnl-heat-equation.h
index 9eaf50c9e7bd583d899ad100e1741d177cb5121b..77c3121bc4718c06921cb67789224054ff1ed041 100644
--- a/examples/heat-equation/tnl-heat-equation.h
+++ b/examples/heat-equation/tnl-heat-equation.h
@@ -19,17 +19,17 @@
 #define TNL_HEAT_EQUATION_H_
 
 #include <solvers/tnlSolver.h>
-#include <solvers/tnlFastBuildConfig.h>
-#include <solvers/tnlConfigTags.h>
+#include <solvers/tnlFastBuildConfigTag.h>
+#include <solvers/tnlBuildConfigTags.h>
 #include <operators/diffusion/tnlLinearDiffusion.h>
 #include <operators/tnlAnalyticDirichletBoundaryConditions.h>
 #include <operators/tnlDirichletBoundaryConditions.h>
 #include <operators/tnlAnalyticNeumannBoundaryConditions.h>
 #include <operators/tnlNeumannBoundaryConditions.h>
-#include <functions/tnlConstantFunction.h>
+#include <functors/tnlConstantFunction.h>
 #include <problems/tnlHeatEquationProblem.h>
 
-//typedef tnlDefaultConfigTag BuildConfig;
+//typedef tnlDefaultBuildConfigTag BuildConfig;
 typedef tnlFastBuildConfig BuildConfig;
 
 template< typename ConfigTag >
@@ -80,34 +80,33 @@ class heatEquationSetter
          if( boundaryConditionsType == "dirichlet" )
          {
             typedef tnlAnalyticDirichletBoundaryConditions< MeshType, ConstantFunction, Real, Index > BoundaryConditions;
-            typedef tnlHeatEquationProblem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Solver;
+            typedef tnlHeatEquationProblem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Problem;
             SolverStarter solverStarter;
-            return solverStarter.template run< Solver >( parameters );
+            return solverStarter.template run< Problem >( parameters );
          }
          typedef tnlAnalyticNeumannBoundaryConditions< MeshType, ConstantFunction, Real, Index > BoundaryConditions;
-         typedef tnlHeatEquationProblem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Solver;
+         typedef tnlHeatEquationProblem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Problem;
          SolverStarter solverStarter;
-         return solverStarter.template run< Solver >( parameters );
+         return solverStarter.template run< Problem >( parameters );
       }
       typedef tnlVector< Real, Device, Index > VectorType;
       if( boundaryConditionsType == "dirichlet" )
       {
          typedef tnlDirichletBoundaryConditions< MeshType, VectorType, Real, Index > BoundaryConditions;
-         typedef tnlHeatEquationProblem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Solver;
+         typedef tnlHeatEquationProblem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Problem;
          SolverStarter solverStarter;
-         return solverStarter.template run< Solver >( parameters );
+         return solverStarter.template run< Problem >( parameters );
       }
       typedef tnlNeumannBoundaryConditions< MeshType, VectorType, Real, Index > BoundaryConditions;
-      typedef tnlHeatEquationProblem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Solver;
+      typedef tnlHeatEquationProblem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Problem;
       SolverStarter solverStarter;
-      return solverStarter.template run< Solver >( parameters );
+      return solverStarter.template run< Problem >( parameters );
    };
 };
 
 int main( int argc, char* argv[] )
 {
-   tnlSolver< heatEquationSetter, heatEquationConfig, BuildConfig > solver;
-   if( ! solver. run( argc, argv ) )
+   if( ! tnlSolver< heatEquationSetter, heatEquationConfig, BuildConfig >::run( argc, argv ) )
       return EXIT_FAILURE;
    return EXIT_SUCCESS;
 }
diff --git a/examples/make-project/CMakeLists.txt b/examples/make-project/CMakeLists.txt
deleted file mode 100755
index 85e54342b7ccff7a625b52249cb0bba30b5c4455..0000000000000000000000000000000000000000
--- a/examples/make-project/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-INSTALL( FILES Makefile
-               main.cpp
-               program-name.cfg.desc
-         DESTINATION share/tnl-${tnlVersion}/examples/make-project )
\ No newline at end of file
diff --git a/examples/make-project/Makefile b/examples/make-project/Makefile
deleted file mode 100644
index b00118da70bb6ce3fcc31b53e594be3119bd0875..0000000000000000000000000000000000000000
--- a/examples/make-project/Makefile
+++ /dev/null
@@ -1,42 +0,0 @@
-TNL_VERSION=0.1
-TNL_INSTALL_DIR=${HOME}/local/lib
-TNL_INCLUDE_DIR=${HOME}/local/include/tnl-${TNL_VERSION}
-
-TARGET = program-name
-CONFIG_FILE = $(TARGET).cfg.desc
-INSTALL_DIR = ${HOME}/local
-CXX = g++
-CUDA_CXX = nvcc
-CXX_FLAGS = -std=gnu++0x -I$(TNL_INCLUDE_DIR)
-LD_FLAGS = -L$(TNL_INSTALL_DIR) -ltnl-0.1
-
-SOURCES = main.cpp
-HEADERS = 
-OBJECTS = main.o
-DIST = $(SOURCES) Makefile
-
-all: $(TARGET)
-clean: 
-	rm -f $(OBJECTS)
-	rm -f $(TARGET)-conf.h	
-
-dist: $(DIST)
-	tar zcvf $(TARGET).tgz $(DIST) 
-
-install: $(TARGET)
-	cp $(TARGET) $(INSTALL_DIR)/bin
-	cp $(CONFIG_FILE) $(INSTALL_DIR)/share
-
-uninstall: $(TARGET)
-	rm -f $(INSTALL_DIR)/bin/$(TARGET) 
-	rm -f $(CONFIG_FILE) $(INSTALL_DIR)/share
-
-$(TARGET): $(OBJECTS)
-	$(CXX) -o $(TARGET) $(OBJECTS) $(LD_FLAGS)
-
-%.o: %.cpp $(TARGET)-conf.h $(HEADERS)
-	$(CXX) -c -o $@ $(CXX_FLAGS) $<
-
-$(TARGET)-conf.h:
-	echo "#define CONFIG_FILE \"${INSTALL_DIR}/share/${CONFIG_FILE}\" " > $(TARGET)-conf.h 
-
diff --git a/examples/make-project/main.cpp b/examples/make-project/main.cpp
deleted file mode 100644
index 80da9581b78d20815510d1b1c7b6ad4ebf7148a5..0000000000000000000000000000000000000000
--- a/examples/make-project/main.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/***************************************************************************
-                          main.cpp  -  description
-                             -------------------
-    begin                : Jan 12, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#include "program-name-conf.h"
-#include <config/tnlConfigDescription.h>
-#include <config/tnlParameterContainer.h>
-
-int main( int argc, char* argv[] )
-{
-   tnlParameterContainer parameters;
-   tnlConfigDescription conf_desc;
-   if( conf_desc.parseConfigDescription( CONFIG_FILE ) != 0 )
-      return EXIT_FAILURE;
-   if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
-   {
-      conf_desc.printUsage( argv[ 0 ] );
-      return EXIT_FAILURE;
-   }
-
-   /****
-    * Write your code here
-    */
-   return EXIT_SUCCESS;
-}
-
-
diff --git a/examples/make-project/program-name.cfg.desc b/examples/make-project/program-name.cfg.desc
deleted file mode 100644
index f2ac640391feb48e75ad4e3f9c4fc215afd5d52a..0000000000000000000000000000000000000000
--- a/examples/make-project/program-name.cfg.desc
+++ /dev/null
@@ -1,22 +0,0 @@
-group IO
-{
-   string input-file            [Input file name.];
-   string output-file           [Output file name.];
-   real output-period           [Intervals for writing the state of the computation (in the meaning of parameter t).];
-},[Arguments describing input and output data.];
-group Problem
-{
-   real final-t(!)              [When reaching this t the computation will stop.];
-
-},[Setting up the problem we solve.];
-group Method
-{
-   string method(!)             [Method for solving the problem.];
-},[Parameters controling the method we use.];
-group Solver
-{
-   string  solver-name;
-   real    max-solver-res( 1.0e-6 ); 
-   integer max-solver-iterations( 1000000 );
-},[Parameters of the solver];
-
diff --git a/examples/simple-solver/CMakeLists.txt b/examples/simple-solver/CMakeLists.txt
deleted file mode 100755
index 5b8bcc644608ccc1cd6ba3338de994df7f5ec09a..0000000000000000000000000000000000000000
--- a/examples/simple-solver/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-INSTALL( FILES Makefile
-               main.cpp
-               simpleProblemSolver.h
-               simpleProblemSolver_impl.h
-               simpleProblemSetter.h
-               simpleProblemSetter_impl.h
-               simpleProblemConfig.h
-               run-simple-solver
-         DESTINATION share/tnl-${tnlVersion}/examples/simple-solver )
\ No newline at end of file
diff --git a/examples/simple-solver/Makefile b/examples/simple-solver/Makefile
deleted file mode 100644
index 2e9fb8bb68dde48d07ffa7c3fc2fa979742fc378..0000000000000000000000000000000000000000
--- a/examples/simple-solver/Makefile
+++ /dev/null
@@ -1,37 +0,0 @@
-TNL_VERSION=0.1
-TNL_INSTALL_DIR=${HOME}/local/lib
-TNL_INCLUDE_DIR=${HOME}/local/include/tnl-${TNL_VERSION}
-
-TARGET = simple-solver
-CONFIG_FILE = $(TARGET).cfg.desc
-INSTALL_DIR = ${HOME}/local
-CXX = g++
-CUDA_CXX = nvcc
-CXX_FLAGS = -std=gnu++0x -I$(TNL_INCLUDE_DIR)
-LD_FLAGS = -L$(TNL_INSTALL_DIR) -ltnl-0.1
-
-SOURCES = main.cpp
-HEADERS = 
-OBJECTS = main.o
-DIST = $(SOURCES) Makefile
-
-all: $(TARGET)
-clean: 
-	rm -f $(OBJECTS)	
-
-dist: $(DIST)
-	tar zcvf $(TARGET).tgz $(DIST) 
-
-install: $(TARGET)
-	cp $(TARGET) $(INSTALL_DIR)/bin
-	cp $(CONFIG_FILE) $(INSTALL_DIR)/share
-
-uninstall: $(TARGET)
-	rm -f $(INSTALL_DIR)/bin/$(TARGET) 
-	rm -f $(CONFIG_FILE) $(INSTALL_DIR)/share
-
-$(TARGET): $(OBJECTS)
-	$(CXX) -o $(TARGET) $(OBJECTS) $(LD_FLAGS)
-
-%.o: %.cpp $(HEADERS)
-	$(CXX) -c -o $@ $(CXX_FLAGS) $<
diff --git a/examples/simple-solver/main.cpp b/examples/simple-solver/main.cpp
deleted file mode 100644
index 2ab0f85a74e0cdeeeb1f4c41f316b36b6022e472..0000000000000000000000000000000000000000
--- a/examples/simple-solver/main.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/***************************************************************************
-                          main.cpp  -  description
-                             -------------------
-    begin                : Jan 12, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#include "simpleProblemSetter.h"
-#include "simpleProblemConfig.h"
-#include <solvers/tnlSolver.h>
-#include <solvers/tnlFastBuildConfig.h>
-#include <solvers/tnlConfigTags.h>
-
-//typedef tnlDefaultConfigTag BuildConfig;
-typedef tnlFastBuildConfig BuildConfig;
-
-int main( int argc, char* argv[] )
-{
-   tnlSolver< simpleProblemSetter, simpleProblemConfig, BuildConfig > solver;
-   if( ! solver. run( argc, argv ) )
-      return EXIT_FAILURE;
-   return EXIT_SUCCESS;
-}
-
-
diff --git a/examples/simple-solver/run-simple-solver b/examples/simple-solver/run-simple-solver
deleted file mode 100644
index ee7d7234e64ddec4478199640b92dde89e922e46..0000000000000000000000000000000000000000
--- a/examples/simple-solver/run-simple-solver
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-
-tnl-grid-setup --dimensions 2 \
-               --origin-x 0.0 \
-               --origin-y 0.0 \
-               --proportions-x 1.0 \
-               --proportions-y 1.0 \
-               --size-x 100 \
-               --size-y 100
-               
-tnl-discrete --function sin-waves \
-             --output-file u-ini.tnl               
-
-simple-solver --dimensions 2 \
-              --time-discretisation explicit \
-              --discrete-solver merson \
-              --snapshot-period 0.01 \
-              --final-time 1.0
-              
-tnl-view --mesh mesh.tnl *tnl              
-              
\ No newline at end of file
diff --git a/examples/simple-solver/simpleProblemConfig.h b/examples/simple-solver/simpleProblemConfig.h
deleted file mode 100644
index c29d6afc4732ea1fea47ec20977722fd3221ab58..0000000000000000000000000000000000000000
--- a/examples/simple-solver/simpleProblemConfig.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/***************************************************************************
-                          simpleProblemConfig.h  -  description
-                             -------------------
-    begin                : Jul 8, 2014
-    copyright            : (C) 2014 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef SIMPLEPROBLEMCONFIG_H_
-#define SIMPLEPROBLEMCONFIG_H_
-
-#include <config/tnlConfigDescription.h>
-
-template< typename ConfigTag >
-class simpleProblemConfig
-{
-   public:
-      static void configSetup( tnlConfigDescription& config )
-      {
-         config.addDelimiter( "Simple solver settings:" );
-         config.addEntry        < tnlString > ( "problem-name", "This defines particular problem.", "simpl" );
-      }
-};
-
-#endif /* SIMPLESOLVERCONFIG_H_ */
diff --git a/examples/simple-solver/simpleProblemSetter.h b/examples/simple-solver/simpleProblemSetter.h
deleted file mode 100644
index cd8f01c41d9279ef39e09b5b046ba220872ef6ad..0000000000000000000000000000000000000000
--- a/examples/simple-solver/simpleProblemSetter.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/***************************************************************************
-                          simpleProblemSetter.h  -  description
-                             -------------------
-    begin                : Feb 23, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef SIMPLEPROBLEMTYPESSETTER_H_
-#define SIMPLEPROBLEMTYPESSETTER_H_
-
-#include <config/tnlParameterContainer.h>
-#include <mesh/tnlGrid.h>
-#include "simpleProblemSolver.h"
-
-template< typename RealType,
-          typename DeviceType,
-          typename IndexType,
-          typename MeshType,
-          typename ConfigTag,
-          typename SolverStarter >
-class simpleProblemSetter
-{
-   public:
-
-   static bool run( const tnlParameterContainer& parameters );
-};
-
-#include "simpleProblemSetter_impl.h"
-
-#endif /* SIMPLEPROBLEMSETTER_H_ */
diff --git a/examples/simple-solver/simpleProblemSetter_impl.h b/examples/simple-solver/simpleProblemSetter_impl.h
deleted file mode 100644
index 5dae6e57158ad5006e580b3cbf471d6a7f30e3f9..0000000000000000000000000000000000000000
--- a/examples/simple-solver/simpleProblemSetter_impl.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/***************************************************************************
-                          simpleProblemSetter_impl.h  -  description
-                             -------------------
-    begin                : Mar 10, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef SIMPLEPROBLEMSETTER_IMPL_H_
-#define SIMPLEPROBLEMSETTER_IMPL_H_
-
-template< typename RealType,
-          typename DeviceType,
-          typename IndexType,
-          typename MeshType,
-          typename ConfigTag,
-          typename SolverStarter >
-bool simpleProblemSetter< RealType, DeviceType, IndexType, MeshType, ConfigTag, SolverStarter > :: run( const tnlParameterContainer& parameters )
-{
-   SolverStarter solverStarter;
-   return solverStarter. template run< simpleProblemSolver< MeshType > >( parameters );
-}
-
-
-#endif /* SIMPLEPROBLEMSETTER_IMPL_H_ */
diff --git a/examples/simple-solver/simpleProblemSolver.h b/examples/simple-solver/simpleProblemSolver.h
deleted file mode 100644
index 3ca680273366b95f5569cba0a38e23ec42de916a..0000000000000000000000000000000000000000
--- a/examples/simple-solver/simpleProblemSolver.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/***************************************************************************
-                          simpleProblemSolver.h  -  description
-                             -------------------
-    begin                : Feb 23, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef SIMPLEPROBLEMSOLVER_H_
-#define SIMPLEPROBLEMSOLVER_H_
-
-#include <matrices/tnlCSRMatrix.h>
-#include <solvers/preconditioners/tnlDummyPreconditioner.h>
-#include <solvers/tnlSolverMonitor.h>
-#include <core/tnlLogger.h>
-#include <core/vectors/tnlVector.h>
-#include <core/vectors/tnlSharedVector.h>
-
-template< typename Mesh >
-class simpleProblemSolver
-{
-   public:
-
-   typedef typename Mesh :: RealType RealType;
-   typedef typename Mesh :: DeviceType DeviceType;
-   typedef typename Mesh :: IndexType IndexType;
-   typedef Mesh MeshType;
-   typedef tnlVector< RealType, DeviceType, IndexType> DofVectorType;
-   typedef tnlCSRMatrix< RealType, DeviceType, IndexType > DiscreteSolverMatrixType;
-   typedef tnlDummyPreconditioner< RealType, DeviceType, IndexType > DiscreteSolverPreconditioner;
-
-   static tnlString getTypeStatic();
-
-   tnlString getPrologHeader() const;
-
-   void writeProlog( tnlLogger& logger,
-                     const tnlParameterContainer& parameters ) const;
-
-   bool setup( const tnlParameterContainer& parameters );
-
-   IndexType getDofs( const MeshType& mesh ) const;
-
-   IndexType getAuxiliaryDofs( const MeshType& mesh ) const;
-
-   void bindDofs( const MeshType& mesh,
-                  DofVectorType& dofs,
-                  DofVectorType& auxiliaryDofs );
-
-   bool setInitialCondition( const tnlParameterContainer& parameters );
-
-   bool makeSnapshot( const RealType& time,
-                      const IndexType& step,
-                      const MeshType& mesh );
-
-
-
-   void GetExplicitRHS( const RealType& time,
-                        const RealType& tau,
-                        const MeshType& mesh,
-                        DofVectorType& _u,
-                        DofVectorType& _fu );
-
-   tnlSolverMonitor< RealType, IndexType >* getSolverMonitor();
-
-   protected:
-
-   DofVectorType dofVector;
-
-   tnlSharedVector< RealType, DeviceType, IndexType > u, v;
-
-   MeshType mesh;
-
-};
-
-#include "simpleProblemSolver_impl.h"
-
-#endif /* SIMPLEPROBLEM_H_ */
diff --git a/examples/simple-solver/simpleProblemSolver_impl.h b/examples/simple-solver/simpleProblemSolver_impl.h
deleted file mode 100644
index b00f59111a6ac0bdbc7dd6096da92799dc3d1ead..0000000000000000000000000000000000000000
--- a/examples/simple-solver/simpleProblemSolver_impl.h
+++ /dev/null
@@ -1,192 +0,0 @@
-/***************************************************************************
-                          simpleProblemSolver_impl.h  -  description
-                             -------------------
-    begin                : Mar 10, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef SIMPLEPROBLEMSOLVER_IMPL_H_
-#define SIMPLEPROBLEMSOLVER_IMPL_H_
-
-#include <core/mfilename.h>
-
-template< typename Mesh >
-tnlString simpleProblemSolver< Mesh>::getTypeStatic()
-{
-   /****
-    * Replace 'simpleProblemSolver' by the name of your solver.
-    */
-   return tnlString( "simpleProblemSolver< " ) + Mesh :: getTypeStatic() + " >";
-}
-
-template< typename Mesh >
-tnlString simpleProblemSolver< Mesh>::getPrologHeader() const
-{
-   /****
-    * Replace 'Simple Problem' by the your desired title in the log table.
-    */
-   return tnlString( "Simple Problem" );
-}
-
-template< typename Mesh >
-void simpleProblemSolver< Mesh>::writeProlog( tnlLogger& logger,
-                                              const tnlParameterContainer& parameters ) const
-{
-   /****
-    * In prolog, write all input parameters which define the numerical simulation.
-    * Use methods:
-    *
-    *    logger. writeParameters< Type >( "Label:", "name", parameters );
-    *
-    *  or
-    *
-    *    logger. writeParameter< Type >( "Label:", value );
-    *
-    *  See tnlLogger.h for more details.
-    */
-
-   logger. WriteParameter< tnlString >( "Problem name:", "problem-name", parameters );
-   logger. WriteParameter< int >( "Simple parameter:", 1 );
-}
-
-template< typename Mesh >
-bool simpleProblemSolver< Mesh>::setup( const tnlParameterContainer& parameters )
-{
-   /****
-    * Set-up your solver here. It means:
-    * 1. Read input parameters and model coefficients like these
-    */
-   const tnlString& problemName = parameters. getParameter< tnlString >( "problem-name" );
-   return true;
-}
-
-template< typename Mesh >
-typename simpleProblemSolver< Mesh >::IndexType simpleProblemSolver< Mesh>::getDofs( const Mesh& mesh ) const
-{
-   /****
-    * Set-up DOFs and supporting grid functions
-    */
-   return 2*mesh.getDofs();
-}
-
-template< typename Mesh >
-typename simpleProblemSolver< Mesh >::IndexType simpleProblemSolver< Mesh>::getAuxiliaryDofs( const Mesh& mesh ) const
-{
-   /****
-    * Set-up DOFs and supporting grid functions
-    */
-   return 2*mesh.getDofs();
-}
-
-
-template< typename Mesh >
-void simpleProblemSolver< Mesh >::bindDofs( const MeshType& mesh,
-                                            DofVectorType& dofVector,
-                                            DofVectorType& auxiliaryDofVector )
-{
-   /****
-    * You may use tnlSharedVector if you need to split the dofVector into more
-    * grid functions like the following example:
-    */
-   const IndexType dofs = this->getDofs( mesh );
-   this -> u. bind( & dofVector. getData()[ 0 * dofs ], dofs );
-   this -> v. bind( & dofVector. getData()[ 1 * dofs ], dofs );
-   /****
-    * You may now treat u and v as usual vectors and indirectly work with this->dofVector.
-    */
-}
-
-template< typename Mesh >
-bool simpleProblemSolver< Mesh>::setInitialCondition( const tnlParameterContainer& parameters )
-{
-   /****
-    * Set the initial condition here. Manipulate only this -> dofVector.
-    */
-   /*const tnlString& initialConditionFile = parameters.getParameter< tnlString >( "initial-condition" );
-   if( ! this->u.load( initialConditionFile ) )
-   {
-      cerr << "I am not able to load the initial condition from the file " << initialConditionFile << "." << endl;
-      return false;
-   }*/
-   return true;
-}
-
-template< typename Mesh >
-bool simpleProblemSolver< Mesh>::makeSnapshot( const RealType& time,
-                                               const IndexType& step,
-                                               const MeshType& mesh )
-{
-   /****
-    * Use this method to write state of the solver to file(s).
-    * All data are stored in this -> dofVector. You may use
-    * supporting vectors and bind them with the dofVector as before.
-    */
-   cout << endl << "Writing output at time " << time << " step " << step << "." << endl;
-
-   /****
-    * Now write them to files.
-    */
-   tnlString fileName;
-   FileNameBaseNumberEnding( "u-", step, 5, ".tnl", fileName );
-   if( ! this -> u. save( fileName ) )
-      return false;
-
-   FileNameBaseNumberEnding( "v-", step, 5, ".tnl", fileName );
-   if( ! this -> v. save( fileName ) )
-      return false;
-
-   return true;
-}
-
-template< typename Mesh >
-void simpleProblemSolver< Mesh>::GetExplicitRHS( const RealType& time,
-                                                 const RealType& tau,
-                                                 const MeshType& mesh,
-                                                 DofVectorType& _u,
-                                                 DofVectorType& _fu )
-{
-   /****
-    * If you use an explicit solver like tnlEulerSolver or tnlMersonSolver, you
-    * need to implement this method. Compute the right-hand side of
-    *
-    *   d/dt u(x) = fu( x, u )
-    *
-    * You may use supporting vectors again if you need.
-    */
-
-   _fu.setValue( 1.0 );
-   if( DeviceType :: getDevice() == tnlHostDevice )
-   {
-      /****
-       *  Write the host solver here.
-       */
-   }
-#ifdef HAVE_CUDA
-   if( DeviceType :: getDevice() == tnlCudaDevice )
-   {
-      /****
-       * Write the CUDA solver here.
-       */
-   }
-#endif
-}
-
-template< typename Mesh >
-tnlSolverMonitor< typename simpleProblemSolver< Mesh > :: RealType,
-                  typename simpleProblemSolver< Mesh > :: IndexType >*
-   simpleProblemSolver< Mesh >::getSolverMonitor()
-{
-   return 0;
-}
-
-#endif /* SIMPLEPROBLEM_IMPL_H_ */
diff --git a/install b/install
index 217830e0b630bc3bb0a67552eec438508640725c..4b5331984f666120f49f85a99295a8f64aa630fc 100755
--- a/install
+++ b/install
@@ -1,43 +1,86 @@
 #!/bin/bash
 
-TARGET=TNL
-INSTALL_PREFIX=${HOME}/local
-WITH_CUDA=yes
-TEMPLATE_EXPLICIT_INSTANTIATION=yes
-#VERBOSE="VERBOSE=1"
+BUILD_DEBUG="yes"
+BUILD_RELEASE="yes"
 
-CMAKE="cmake"
-CPUS=`grep -c processor /proc/cpuinfo`
-#CPUS="1"
+OPTIONS=""
 
+for option in "$@"
+do
+    case $option in
+        --no-debug                    ) BUILD_DEBUG="no" ;;
+        --no-release                  ) BUILD_RELEASE="no" ;;        
+        *                             ) OPTIONS="${OPTIONS} ${option}" ;;
+    esac
+done
 
-echo "Building $TARGET using $CPUS processors."
+if test ${BUILD_DEBUG} = "yes";
+then
+    if [ ! -d Debug ];
+    then
+       mkdir Debug
+    fi
+    cd Debug
+    ../build --root-dir=.. --build=Debug ${OPTIONS}
+    if test $? != 0;
+    then
+       exit 1
+    fi
+    make install
+    cd ..
+fi
 
-if [ ! -d Debug ];
+if test ${BUILD_RELEASE} = "yes";
 then
-   mkdir Debug
+    if [ ! -d Release ];
+    then
+       mkdir Release
+    fi
+    cd Release
+    ../build --root-dir=.. --build=Release ${OPTIONS}
+    if test $? != 0;
+    then
+        exit 1
+    fi
+    make install
+    cd ..
 fi
-if [ ! -d Release ];
+
+TNL_TEST=`which tnl-bindir`
+
+if test x${TNL_TEST} = x;
 then
-   mkdir Release
+    echo ""
+    echo "WARNING !!!"
+    echo ""
+    echo "Your system does not see TNL which was installed right now."
+    echo "You need to add it to your system variables PATH and LD_LIBRARY_PATH."
+    echo "Add the following to your .bashrc file:"
+    echo ""
+    
+    PREFIX=${HOME}/local
+    for option in "$@"
+    do
+        case $option in
+            --prefix=*                     ) PREFIX="${option#*=}" ;;
+        esac
+    done
+
+    echo "if test x\${PATH} = x;"
+    echo "then"
+    echo "   PATH=${PREFIX}/bin"
+    echo "else"
+    echo "   PATH=\${PATH}:${PREFIX}/bin"
+    echo "fi"
+    echo "if test x\${LD_LIBRARY_PATH} = x;"
+    echo "then"
+    echo "   LD_LIBRARY_PATH=${PREFIX}/lib"
+    echo "else"
+    echo "   LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:${PREFIX}/lib"
+    echo "fi"
+    echo "export PATH"
+    echo "export LD_LIBRARY_PATH"
 fi
 
-cd Debug
-${CMAKE} .. -DCMAKE_BUILD_TYPE=Debug \
-            -DCMAKE_INSTALL_PREFIX=${HOME}/local \
-            -DWITH_CUDA=${WITH_CUDA} \
-            -DPETSC_DIR=${PETSC_DIR} \
-            -DWITH_TEMPLATE_EXPLICIT_INSTANTIATION=${TEMPLATE_EXPLICIT_INSTANTIATION}
-make -j${CPUS} ${VERBOSE}
-make -j${CPUS} test
-make -j${CPUS} install
-
-cd ../Release
-${CMAKE} .. -DCMAKE_INSTALL_PREFIX=${HOME}/local \
-            -DWITH_CUDA=${WITH_CUDA} \
-            -DPETSC_DIR=${PETSC_DIR} \
-            -DWITH_TEMPLATE_EXPLICIT_INSTANTIATION=${TEMPLATE_EXPLICIT_INSTANTIATION}
-make -j${CPUS} ${VERBOSE}
-make -j${CPUS} test
-make -j${CPUS} install
+exit 0
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d6c150901a9f30bed8ac8846dc07c38c57c53dc8..6851150184a1b4c0077b75ed992740f14442e109 100755
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,5 +1,5 @@
 INCLUDE_DIRECTORIES( config )
-ADD_SUBDIRECTORY( functions )
+ADD_SUBDIRECTORY( functors )
 ADD_SUBDIRECTORY( config )
 ADD_SUBDIRECTORY( core )
 ADD_SUBDIRECTORY( debug )
@@ -10,7 +10,7 @@ ADD_SUBDIRECTORY( problems )
 ADD_SUBDIRECTORY( solvers )
 ADD_SUBDIRECTORY( legacy )
 
-set( tnl_SOURCES ${tnl_functions_SOURCES}
+set( tnl_SOURCES ${tnl_functors_SOURCES}
                  ${tnl_config_SOURCES}
                  ${tnl_core_SOURCES}
                  ${tnl_legacy_SOURCES}
@@ -21,7 +21,7 @@ set( tnl_SOURCES ${tnl_functions_SOURCES}
                  ${tnl_problems_SOURCES}
                   )
 
-set( tnl_CUDA__SOURCES ${tnl_functions_CUDA__SOURCES}
+set( tnl_CUDA__SOURCES ${tnl_functors_CUDA__SOURCES}
                        ${tnl_config_CUDA__SOURCES}
                        ${tnl_core_CUDA__SOURCES}
                        ${tnl_legacy_CUDA__SOURCES}
@@ -35,6 +35,9 @@ set( tnl_CUDA__SOURCES ${tnl_functions_CUDA__SOURCES}
 if( BUILD_CUDA )
    CUDA_ADD_LIBRARY( tnl${debugExt}-${tnlVersion} SHARED ${tnl_CUDA__SOURCES}
                                                   OPTIONS ${CUDA_ADD_LIBRARY_OPTIONS} )
+    if( HAVE_CUBLAS )
+       CUDA_ADD_CUBLAS_TO_TARGET( tnl${debugExt}-${tnlVersion} )
+    endif( HAVE_CUBLAS )
 else( BUILD_CUDA )
    ADD_LIBRARY( tnl${debugExt}-${tnlVersion} SHARED 
                 ${tnl_SOURCES} )
@@ -51,6 +54,9 @@ IF( BUILD_MPI )
    if( BUILD_CUDA )
       CUDA_ADD_LIBRARY( tnl-mpi${debugExt}-${tnlVersion} SHARED ${tnl_CUDA__SOURCES} 
                                                          OPTIONS ${CUDA_ADD_LIBRARY_OPTIONS} )
+      if( HAVE_CUBLAS )
+         CUDA_ADD_CUBLAS_TO_TARGET( tnl-mpi${debugExt}-${tnlVersion} )
+      endif( HAVE_CUBLAS )
    else( BUILD_CUDA )
          ADD_LIBRARY( tnl-mpi${debugExt}-${tnlVersion} SHARED
                       ${tnl_SOURCES} )  
diff --git a/src/core/arrays/tnlArray.h b/src/core/arrays/tnlArray.h
index e5f96e3c91014c72bdeba15515a6c4df15a644f3..6ebf9399b58afc449d8f5ec3fefa9cdfcd0df2cd 100644
--- a/src/core/arrays/tnlArray.h
+++ b/src/core/arrays/tnlArray.h
@@ -61,24 +61,15 @@ class tnlArray : public virtual tnlObject
 
    void reset();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Index getSize() const;
+   __cuda_callable__ Index getSize() const;
 
    void setElement( const Index i, const Element& x );
 
    Element getElement( Index i ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Element& operator[] ( Index i );
+   __cuda_callable__ Element& operator[] ( Index i );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const Element& operator[] ( Index i ) const;
+   __cuda_callable__ const Element& operator[] ( Index i ) const;
 
    tnlArray< Element, Device, Index >& operator = ( const tnlArray< Element, Device, Index >& array );
 
@@ -93,15 +84,9 @@ class tnlArray : public virtual tnlObject
 
    void setValue( const Element& e );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const Element* getData() const;
+   __cuda_callable__ const Element* getData() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Element* getData();
+   __cuda_callable__ Element* getData();
 
    /*!
     * Returns true if non-zero size is set.
diff --git a/src/core/arrays/tnlArrayOperationsCuda_impl.cpp b/src/core/arrays/tnlArrayOperationsCuda_impl.cpp
index b1dba6e09237516ef773e933b372e84b817010e4..18cc3c6280e573e47c6b0a218bca034ac793d086 100644
--- a/src/core/arrays/tnlArrayOperationsCuda_impl.cpp
+++ b/src/core/arrays/tnlArrayOperationsCuda_impl.cpp
@@ -22,152 +22,275 @@
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< char,        int >( char*& data, const int size );
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< int,         int >( int*& data, const int size );
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< long int,    int >( long int*& data, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< float,       int >( float*& data, const int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< double,      int >( double*& data, const int size );
-//template bool tnlArrayOperations< tnlCuda >::allocateMemory< long double, int >( long double*& data, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::allocateMemory< long double, int >( long double*& data, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< char,        long int >( char*& data, const long int size );
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< int,         long int >( int*& data, const long int size );
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< long int,    long int >( long int*& data, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< float,       long int >( float*& data, const long int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< double,      long int >( double*& data, const long int size );
-//template bool tnlArrayOperations< tnlCuda >::allocateMemory< long double, long int >( long double*& data, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::allocateMemory< long double, long int >( long double*& data, const long int size );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlCuda >::freeMemory< char        >( char* data );
 template bool tnlArrayOperations< tnlCuda >::freeMemory< int         >( int* data );
 template bool tnlArrayOperations< tnlCuda >::freeMemory< long int    >( long int* data );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::freeMemory< float       >( float* data );
+#endif
 template bool tnlArrayOperations< tnlCuda >::freeMemory< double      >( double* data );
-//template bool tnlArrayOperations< tnlCuda >::freeMemory< long double >( long double* data );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::freeMemory< long double >( long double* data );
+#endif
 
 template void tnlArrayOperations< tnlCuda >::setMemoryElement< char        >( char* data, const char& value );
 template void tnlArrayOperations< tnlCuda >::setMemoryElement< int         >( int* data, const int& value );
 template void tnlArrayOperations< tnlCuda >::setMemoryElement< long int    >( long int* data, const long int& value );
+#ifdef INSTANTIATE_FLOAT
 template void tnlArrayOperations< tnlCuda >::setMemoryElement< float       >( float* data, const float& value );
+#endif
 template void tnlArrayOperations< tnlCuda >::setMemoryElement< double      >( double* data, const double& value );
-//template void tnlArrayOperations< tnlCuda >::setMemoryElement< long double >( long double* data, const long double& value );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template void tnlArrayOperations< tnlCuda >::setMemoryElement< long double >( long double* data, const long double& value );
+#endif
 
 template char        tnlArrayOperations< tnlCuda >::getMemoryElement< char        >( const char* data );
 template int         tnlArrayOperations< tnlCuda >::getMemoryElement< int         >( const int* data );
 template long int    tnlArrayOperations< tnlCuda >::getMemoryElement< long int    >( const long int* data );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlArrayOperations< tnlCuda >::getMemoryElement< float       >( const float* data );
+#endif
 template double      tnlArrayOperations< tnlCuda >::getMemoryElement< double      >( const double* data );
-//template long double tnlArrayOperations< tnlCuda >::getMemoryElement< long double >( const long double* data );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlArrayOperations< tnlCuda >::getMemoryElement< long double >( const long double* data );
+#endif
 
 template char&        tnlArrayOperations< tnlCuda >::getArrayElementReference< char,        int >( char* data, const int i );
 template int&         tnlArrayOperations< tnlCuda >::getArrayElementReference< int,         int >( int* data, const int i );
 template long int&    tnlArrayOperations< tnlCuda >::getArrayElementReference< long int,    int >( long int* data, const int i );
+#ifdef INSTANTIATE_FLOAT
 template float&       tnlArrayOperations< tnlCuda >::getArrayElementReference< float,       int >( float* data, const int i );
+#endif
 template double&      tnlArrayOperations< tnlCuda >::getArrayElementReference< double,      int >( double* data, const int i );
-//template long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, int >( long double* data, const int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, int >( long double* data, const int i );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template char&        tnlArrayOperations< tnlCuda >::getArrayElementReference< char,        long int >( char* data, const long int i );
 template int&         tnlArrayOperations< tnlCuda >::getArrayElementReference< int,         long int >( int* data, const long int i );
 template long int&    tnlArrayOperations< tnlCuda >::getArrayElementReference< long int,    long int >( long int* data, const long int i );
+#ifdef INSTANTIATE_FLOAT
 template float&       tnlArrayOperations< tnlCuda >::getArrayElementReference< float,       long int >( float* data, const long int i );
+#endif
 template double&      tnlArrayOperations< tnlCuda >::getArrayElementReference< double,      long int >( double* data, const long int i );
-//template long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, long int >( long double* data, const long int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, long int >( long double* data, const long int i );
+#endif
+#endif
 
 template const char&        tnlArrayOperations< tnlCuda >::getArrayElementReference< char,        int >( const char* data, const int i );
 template const int&         tnlArrayOperations< tnlCuda >::getArrayElementReference< int,         int >( const int* data, const int i );
 template const long int&    tnlArrayOperations< tnlCuda >::getArrayElementReference< long int,    int >( const long int* data, const int i );
+#ifdef INSTANTIATE_FLOAT
 template const float&       tnlArrayOperations< tnlCuda >::getArrayElementReference< float,       int >( const float* data, const int i );
+#endif
 template const double&      tnlArrayOperations< tnlCuda >::getArrayElementReference< double,      int >( const double* data, const int i );
-//template const long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, int >( const long double* data, const int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template const long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, int >( const long double* data, const int i );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template const char&        tnlArrayOperations< tnlCuda >::getArrayElementReference< char,        long int >( const char* data, const long int i );
 template const int&         tnlArrayOperations< tnlCuda >::getArrayElementReference< int,         long int >( const int* data, const long int i );
 template const long int&    tnlArrayOperations< tnlCuda >::getArrayElementReference< long int,    long int >( const long int* data, const long int i );
+#ifdef INSTANTIATE_FLOAT
 template const float&       tnlArrayOperations< tnlCuda >::getArrayElementReference< float,       long int >( const float* data, const long int i );
+#endif
 template const double&      tnlArrayOperations< tnlCuda >::getArrayElementReference< double,      long int >( const double* data, const long int i );
-//template const long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, long int >( const long double* data, const long int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template const long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, long int >( const long double* data, const long int i );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlCuda >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
 template bool tnlArrayOperations< tnlCuda >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
 template bool tnlArrayOperations< tnlCuda >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
-//template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlCuda >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
 template bool tnlArrayOperations< tnlCuda >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
 template bool tnlArrayOperations< tnlCuda >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
-//template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#endif
+#endif
+
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
+#endif
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
-//template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
+#endif
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
-//template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#endif
+#endif
+
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
+#endif
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
-//template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
+#endif
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
-//template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlCuda >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
 template bool tnlArrayOperations< tnlCuda >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
 template bool tnlArrayOperations< tnlCuda >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
-//template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlCuda >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
 template bool tnlArrayOperations< tnlCuda >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
 template bool tnlArrayOperations< tnlCuda >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
-//template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#endif
+#endif
+
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
+#endif
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
-//template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
+#endif
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
-//template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#endif
+#endif
+
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
+#endif
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
-//template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
+#endif
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
-//template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
-
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlCuda >::setMemory< char,        int >( char* destination, const char& value, const int size );
 template bool tnlArrayOperations< tnlCuda >::setMemory< int,         int >( int* destination, const int& value, const int size );
 template bool tnlArrayOperations< tnlCuda >::setMemory< long int,    int >( long int* destination, const long int& value, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::setMemory< float,       int >( float* destination, const float& value, const int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::setMemory< double,      int >( double* destination, const double& value, const int size );
-//template bool tnlArrayOperations< tnlCuda >::setMemory< long double, int >( long double* destination, const long double& value, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::setMemory< long double, int >( long double* destination, const long double& value, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlCuda >::setMemory< char,        long int >( char* destination, const char& value, const long int size );
 template bool tnlArrayOperations< tnlCuda >::setMemory< int,         long int >( int* destination, const int& value, const long int size );
 template bool tnlArrayOperations< tnlCuda >::setMemory< long int,    long int >( long int* destination, const long int& value, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::setMemory< float,       long int >( float* destination, const float& value, const long int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::setMemory< double,      long int >( double* destination, const double& value, const long int size );
-//template bool tnlArrayOperations< tnlCuda >::setMemory< long double, long int >( long double* destination, const long double& value, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::setMemory< long double, long int >( long double* destination, const long double& value, const long int size );
+#endif
+#endif
 
 #endif
 
diff --git a/src/core/arrays/tnlArrayOperationsCuda_impl.cu b/src/core/arrays/tnlArrayOperationsCuda_impl.cu
index 326ead4c762d042f5fe31c4393d4abd7ab4393d4..38f97587561979873bb702f76e671ba5b25735df 100644
--- a/src/core/arrays/tnlArrayOperationsCuda_impl.cu
+++ b/src/core/arrays/tnlArrayOperationsCuda_impl.cu
@@ -22,155 +22,274 @@
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< char,        int >( char*& data, const int size );
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< int,         int >( int*& data, const int size );
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< long int,    int >( long int*& data, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< float,       int >( float*& data, const int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< double,      int >( double*& data, const int size );
-//template bool tnlArrayOperations< tnlCuda >::allocateMemory< long double, int >( long double*& data, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::allocateMemory< long double, int >( long double*& data, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< char,        long int >( char*& data, const long int size );
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< int,         long int >( int*& data, const long int size );
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< long int,    long int >( long int*& data, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< float,       long int >( float*& data, const long int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< double,      long int >( double*& data, const long int size );
-//template bool tnlArrayOperations< tnlCuda >::allocateMemory< long double, long int >( long double*& data, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::allocateMemory< long double, long int >( long double*& data, const long int size );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlCuda >::freeMemory< char        >( char* data );
 template bool tnlArrayOperations< tnlCuda >::freeMemory< int         >( int* data );
 template bool tnlArrayOperations< tnlCuda >::freeMemory< long int    >( long int* data );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::freeMemory< float       >( float* data );
+#endif
 template bool tnlArrayOperations< tnlCuda >::freeMemory< double      >( double* data );
-//template bool tnlArrayOperations< tnlCuda >::freeMemory< long double >( long double* data );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::freeMemory< long double >( long double* data );
+#endif
 
 template void tnlArrayOperations< tnlCuda >::setMemoryElement< char        >( char* data, const char& value );
 template void tnlArrayOperations< tnlCuda >::setMemoryElement< int         >( int* data, const int& value );
 template void tnlArrayOperations< tnlCuda >::setMemoryElement< long int    >( long int* data, const long int& value );
+#ifdef INSTANTIATE_FLOAT
 template void tnlArrayOperations< tnlCuda >::setMemoryElement< float       >( float* data, const float& value );
+#endif
 template void tnlArrayOperations< tnlCuda >::setMemoryElement< double      >( double* data, const double& value );
-//template void tnlArrayOperations< tnlCuda >::setMemoryElement< long double >( long double* data, const long double& value );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template void tnlArrayOperations< tnlCuda >::setMemoryElement< long double >( long double* data, const long double& value );
+#endif
 
 template char        tnlArrayOperations< tnlCuda >::getMemoryElement< char        >( const char* data );
 template int         tnlArrayOperations< tnlCuda >::getMemoryElement< int         >( const int* data );
 template long int    tnlArrayOperations< tnlCuda >::getMemoryElement< long int    >( const long int* data );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlArrayOperations< tnlCuda >::getMemoryElement< float       >( const float* data );
+#endif
 template double      tnlArrayOperations< tnlCuda >::getMemoryElement< double      >( const double* data );
-//template long double tnlArrayOperations< tnlCuda >::getMemoryElement< long double >( const long double* data );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlArrayOperations< tnlCuda >::getMemoryElement< long double >( const long double* data );
+#endif
 
 template char&        tnlArrayOperations< tnlCuda >::getArrayElementReference< char,        int >( char* data, const int i );
 template int&         tnlArrayOperations< tnlCuda >::getArrayElementReference< int,         int >( int* data, const int i );
 template long int&    tnlArrayOperations< tnlCuda >::getArrayElementReference< long int,    int >( long int* data, const int i );
+#ifdef INSTANTIATE_FLOAT
 template float&       tnlArrayOperations< tnlCuda >::getArrayElementReference< float,       int >( float* data, const int i );
+#endif
 template double&      tnlArrayOperations< tnlCuda >::getArrayElementReference< double,      int >( double* data, const int i );
-//template long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, int >( long double* data, const int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, int >( long double* data, const int i );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template char&        tnlArrayOperations< tnlCuda >::getArrayElementReference< char,        long int >( char* data, const long int i );
 template int&         tnlArrayOperations< tnlCuda >::getArrayElementReference< int,         long int >( int* data, const long int i );
 template long int&    tnlArrayOperations< tnlCuda >::getArrayElementReference< long int,    long int >( long int* data, const long int i );
+#ifdef INSTANTIATE_FLOAT
 template float&       tnlArrayOperations< tnlCuda >::getArrayElementReference< float,       long int >( float* data, const long int i );
+#endif
 template double&      tnlArrayOperations< tnlCuda >::getArrayElementReference< double,      long int >( double* data, const long int i );
-//template long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, long int >( long double* data, const long int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, long int >( long double* data, const long int i );
+#endif
+#endif
 
 template const char&        tnlArrayOperations< tnlCuda >::getArrayElementReference< char,        int >( const char* data, const int i );
 template const int&         tnlArrayOperations< tnlCuda >::getArrayElementReference< int,         int >( const int* data, const int i );
 template const long int&    tnlArrayOperations< tnlCuda >::getArrayElementReference< long int,    int >( const long int* data, const int i );
+#ifdef INSTANTIATE_FLOAT
 template const float&       tnlArrayOperations< tnlCuda >::getArrayElementReference< float,       int >( const float* data, const int i );
+#endif
 template const double&      tnlArrayOperations< tnlCuda >::getArrayElementReference< double,      int >( const double* data, const int i );
-//template const long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, int >( const long double* data, const int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template const long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, int >( const long double* data, const int i );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template const char&        tnlArrayOperations< tnlCuda >::getArrayElementReference< char,        long int >( const char* data, const long int i );
 template const int&         tnlArrayOperations< tnlCuda >::getArrayElementReference< int,         long int >( const int* data, const long int i );
 template const long int&    tnlArrayOperations< tnlCuda >::getArrayElementReference< long int,    long int >( const long int* data, const long int i );
+#ifdef INSTANTIATE_FLOAT
 template const float&       tnlArrayOperations< tnlCuda >::getArrayElementReference< float,       long int >( const float* data, const long int i );
+#endif
 template const double&      tnlArrayOperations< tnlCuda >::getArrayElementReference< double,      long int >( const double* data, const long int i );
-//template const long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, long int >( const long double* data, const long int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template const long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, long int >( const long double* data, const long int i );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlCuda >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
 template bool tnlArrayOperations< tnlCuda >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
 template bool tnlArrayOperations< tnlCuda >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
-//template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlCuda >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
 template bool tnlArrayOperations< tnlCuda >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
 template bool tnlArrayOperations< tnlCuda >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
-//template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#endif
+#endif
+
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
+#endif
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
-//template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
+#endif
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
-//template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#endif
+#endif
+
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
+#endif
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
-//template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
+#endif
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
-//template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlCuda >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
 template bool tnlArrayOperations< tnlCuda >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
 template bool tnlArrayOperations< tnlCuda >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
-//template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlCuda >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
 template bool tnlArrayOperations< tnlCuda >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
 template bool tnlArrayOperations< tnlCuda >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
-//template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#endif
+#endif
+
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
+#endif
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
-//template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
+#endif
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
-//template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#endif
+#endif
+
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
+#endif
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
-//template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
+#endif
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
-//template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
-
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlCuda >::setMemory< char,        int >( char* destination, const char& value, const int size );
 template bool tnlArrayOperations< tnlCuda >::setMemory< int,         int >( int* destination, const int& value, const int size );
 template bool tnlArrayOperations< tnlCuda >::setMemory< long int,    int >( long int* destination, const long int& value, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::setMemory< float,       int >( float* destination, const float& value, const int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::setMemory< double,      int >( double* destination, const double& value, const int size );
-//template bool tnlArrayOperations< tnlCuda >::setMemory< long double, int >( long double* destination, const long double& value, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::setMemory< long double, int >( long double* destination, const long double& value, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlCuda >::setMemory< char,        long int >( char* destination, const char& value, const long int size );
 template bool tnlArrayOperations< tnlCuda >::setMemory< int,         long int >( int* destination, const int& value, const long int size );
 template bool tnlArrayOperations< tnlCuda >::setMemory< long int,    long int >( long int* destination, const long int& value, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::setMemory< float,       long int >( float* destination, const float& value, const long int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::setMemory< double,      long int >( double* destination, const double& value, const long int size );
-//template bool tnlArrayOperations< tnlCuda >::setMemory< long double, long int >( long double* destination, const long double& value, const long int size );
-
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::setMemory< long double, long int >( long double* destination, const long double& value, const long int size );
+#endif
 #endif
 
-
-
-
+#endif
diff --git a/src/core/arrays/tnlArrayOperationsCuda_impl.h b/src/core/arrays/tnlArrayOperationsCuda_impl.h
index 5622d4961d85ba07e018b5bedd6dbdb44abfacc1..4179c38ffa9292b72612d7b4f093b0ef58f68382 100644
--- a/src/core/arrays/tnlArrayOperationsCuda_impl.h
+++ b/src/core/arrays/tnlArrayOperationsCuda_impl.h
@@ -374,151 +374,275 @@ bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory( const Element1* host
 extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< char,        int >( char*& data, const int size );
 extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< int,         int >( int*& data, const int size );
 extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< long int,    int >( long int*& data, const int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< float,       int >( float*& data, const int size );
+#endif
 extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< double,      int >( double*& data, const int size );
-//extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< long double, int >( long double*& data, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< long double, int >( long double*& data, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< char,        long int >( char*& data, const long int size );
 extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< int,         long int >( int*& data, const long int size );
 extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< long int,    long int >( long int*& data, const long int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< float,       long int >( float*& data, const long int size );
+#endif
 extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< double,      long int >( double*& data, const long int size );
-//extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< long double, long int >( long double*& data, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< long double, long int >( long double*& data, const long int size );
+#endif
+#endif
 
 extern template bool tnlArrayOperations< tnlCuda >::freeMemory< char        >( char* data );
 extern template bool tnlArrayOperations< tnlCuda >::freeMemory< int         >( int* data );
 extern template bool tnlArrayOperations< tnlCuda >::freeMemory< long int    >( long int* data );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda >::freeMemory< float       >( float* data );
+#endif
 extern template bool tnlArrayOperations< tnlCuda >::freeMemory< double      >( double* data );
-//extern template bool tnlArrayOperations< tnlCuda >::freeMemory< long double >( long double* data );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda >::freeMemory< long double >( long double* data );
+#endif
 
 extern template void tnlArrayOperations< tnlCuda >::setMemoryElement< char        >( char* data, const char& value );
 extern template void tnlArrayOperations< tnlCuda >::setMemoryElement< int         >( int* data, const int& value );
 extern template void tnlArrayOperations< tnlCuda >::setMemoryElement< long int    >( long int* data, const long int& value );
+#ifdef INSTANTIATE_FLOAT
 extern template void tnlArrayOperations< tnlCuda >::setMemoryElement< float       >( float* data, const float& value );
+#endif
 extern template void tnlArrayOperations< tnlCuda >::setMemoryElement< double      >( double* data, const double& value );
-//extern template void tnlArrayOperations< tnlCuda >::setMemoryElement< long double >( long double* data, const long double& value );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template void tnlArrayOperations< tnlCuda >::setMemoryElement< long double >( long double* data, const long double& value );
+#endif
 
 extern template char        tnlArrayOperations< tnlCuda >::getMemoryElement< char        >( const char* data );
 extern template int         tnlArrayOperations< tnlCuda >::getMemoryElement< int         >( const int* data );
 extern template long int    tnlArrayOperations< tnlCuda >::getMemoryElement< long int    >( const long int* data );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlArrayOperations< tnlCuda >::getMemoryElement< float       >( const float* data );
+#endif
 extern template double      tnlArrayOperations< tnlCuda >::getMemoryElement< double      >( const double* data );
-//extern template long double tnlArrayOperations< tnlCuda >::getMemoryElement< long double >( const long double* data );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlArrayOperations< tnlCuda >::getMemoryElement< long double >( const long double* data );
+#endif
 
 extern template char&        tnlArrayOperations< tnlCuda >::getArrayElementReference< char,        int >( char* data, const int i );
 extern template int&         tnlArrayOperations< tnlCuda >::getArrayElementReference< int,         int >( int* data, const int i );
 extern template long int&    tnlArrayOperations< tnlCuda >::getArrayElementReference< long int,    int >( long int* data, const int i );
+#ifdef INSTANTIATE_FLOAT
 extern template float&       tnlArrayOperations< tnlCuda >::getArrayElementReference< float,       int >( float* data, const int i );
+#endif
 extern template double&      tnlArrayOperations< tnlCuda >::getArrayElementReference< double,      int >( double* data, const int i );
-//extern template long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, int >( long double* data, const int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, int >( long double* data, const int i );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template char&        tnlArrayOperations< tnlCuda >::getArrayElementReference< char,        long int >( char* data, const long int i );
 extern template int&         tnlArrayOperations< tnlCuda >::getArrayElementReference< int,         long int >( int* data, const long int i );
 extern template long int&    tnlArrayOperations< tnlCuda >::getArrayElementReference< long int,    long int >( long int* data, const long int i );
+#ifdef INSTANTIATE_FLOAT
 extern template float&       tnlArrayOperations< tnlCuda >::getArrayElementReference< float,       long int >( float* data, const long int i );
+#endif
 extern template double&      tnlArrayOperations< tnlCuda >::getArrayElementReference< double,      long int >( double* data, const long int i );
-//extern template long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, long int >( long double* data, const long int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, long int >( long double* data, const long int i );
+#endif
+#endif
 
 extern template const char&        tnlArrayOperations< tnlCuda >::getArrayElementReference< char,        int >( const char* data, const int i );
 extern template const int&         tnlArrayOperations< tnlCuda >::getArrayElementReference< int,         int >( const int* data, const int i );
 extern template const long int&    tnlArrayOperations< tnlCuda >::getArrayElementReference< long int,    int >( const long int* data, const int i );
+#ifdef INSTANTIATE_FLOAT
 extern template const float&       tnlArrayOperations< tnlCuda >::getArrayElementReference< float,       int >( const float* data, const int i );
+#endif
 extern template const double&      tnlArrayOperations< tnlCuda >::getArrayElementReference< double,      int >( const double* data, const int i );
-//extern template const long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, int >( const long double* data, const int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template const long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, int >( const long double* data, const int i );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template const char&        tnlArrayOperations< tnlCuda >::getArrayElementReference< char,        long int >( const char* data, const long int i );
 extern template const int&         tnlArrayOperations< tnlCuda >::getArrayElementReference< int,         long int >( const int* data, const long int i );
 extern template const long int&    tnlArrayOperations< tnlCuda >::getArrayElementReference< long int,    long int >( const long int* data, const long int i );
+#ifdef INSTANTIATE_FLOAT
 extern template const float&       tnlArrayOperations< tnlCuda >::getArrayElementReference< float,       long int >( const float* data, const long int i );
+#endif
 extern template const double&      tnlArrayOperations< tnlCuda >::getArrayElementReference< double,      long int >( const double* data, const long int i );
-//extern template const long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, long int >( const long double* data, const long int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template const long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, long int >( const long double* data, const long int i );
+#endif
+#endif
 
 extern template bool tnlArrayOperations< tnlCuda >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
 extern template bool tnlArrayOperations< tnlCuda >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
 extern template bool tnlArrayOperations< tnlCuda >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
+#endif
 extern template bool tnlArrayOperations< tnlCuda >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
-//extern template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template bool tnlArrayOperations< tnlCuda >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
 extern template bool tnlArrayOperations< tnlCuda >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
 extern template bool tnlArrayOperations< tnlCuda >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
+#endif
 extern template bool tnlArrayOperations< tnlCuda >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
-//extern template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#endif
+#endif
+
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
+#endif
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
-//extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
+#endif
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
-//extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#endif
+#endif
+
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
+#endif
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
-//extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
+#endif
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
-//extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#endif
+#endif
 
 extern template bool tnlArrayOperations< tnlCuda >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
 extern template bool tnlArrayOperations< tnlCuda >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
 extern template bool tnlArrayOperations< tnlCuda >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
+#endif
 extern template bool tnlArrayOperations< tnlCuda >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
-//extern template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template bool tnlArrayOperations< tnlCuda >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
 extern template bool tnlArrayOperations< tnlCuda >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
 extern template bool tnlArrayOperations< tnlCuda >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
+#endif
 extern template bool tnlArrayOperations< tnlCuda >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
-//extern template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#endif
+#endif
+
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
+#endif
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
-//extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
+#endif
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
-//extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#endif
+#endif
+
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
+#endif
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
-//extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
+#endif
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
-//extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#endif
+#endif
 
 extern template bool tnlArrayOperations< tnlCuda >::setMemory< char,        int >( char* destination, const char& value, const int size );
 extern template bool tnlArrayOperations< tnlCuda >::setMemory< int,         int >( int* destination, const int& value, const int size );
 extern template bool tnlArrayOperations< tnlCuda >::setMemory< long int,    int >( long int* destination, const long int& value, const int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda >::setMemory< float,       int >( float* destination, const float& value, const int size );
+#endif
 extern template bool tnlArrayOperations< tnlCuda >::setMemory< double,      int >( double* destination, const double& value, const int size );
-//extern template bool tnlArrayOperations< tnlCuda >::setMemory< long double, int >( long double* destination, const long double& value, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda >::setMemory< long double, int >( long double* destination, const long double& value, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template bool tnlArrayOperations< tnlCuda >::setMemory< char,        long int >( char* destination, const char& value, const long int size );
 extern template bool tnlArrayOperations< tnlCuda >::setMemory< int,         long int >( int* destination, const int& value, const long int size );
 extern template bool tnlArrayOperations< tnlCuda >::setMemory< long int,    long int >( long int* destination, const long int& value, const long int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda >::setMemory< float,       long int >( float* destination, const float& value, const long int size );
+#endif
 extern template bool tnlArrayOperations< tnlCuda >::setMemory< double,      long int >( double* destination, const double& value, const long int size );
-//extern template bool tnlArrayOperations< tnlCuda >::setMemory< long double, long int >( long double* destination, const long double& value, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda >::setMemory< long double, long int >( long double* destination, const long double& value, const long int size );
+#endif
+#endif
 
 #endif
 
diff --git a/src/core/arrays/tnlArrayOperationsHost_impl.cpp b/src/core/arrays/tnlArrayOperationsHost_impl.cpp
index 65af28f12ecb3206a2fffb37ed552ea943e83aa8..c2688d84be3411a2bba1e6ac955afdea001e5de8 100644
--- a/src/core/arrays/tnlArrayOperationsHost_impl.cpp
+++ b/src/core/arrays/tnlArrayOperationsHost_impl.cpp
@@ -22,106 +22,178 @@
 template bool tnlArrayOperations< tnlHost >::allocateMemory< char,        int >( char*& data, const int size );
 template bool tnlArrayOperations< tnlHost >::allocateMemory< int,         int >( int*& data, const int size );
 template bool tnlArrayOperations< tnlHost >::allocateMemory< long int,    int >( long int*& data, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::allocateMemory< float,       int >( float*& data, const int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::allocateMemory< double,      int >( double*& data, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::allocateMemory< long double, int >( long double*& data, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlHost >::allocateMemory< char,        long int >( char*& data, const long int size );
 template bool tnlArrayOperations< tnlHost >::allocateMemory< int,         long int >( int*& data, const long int size );
 template bool tnlArrayOperations< tnlHost >::allocateMemory< long int,    long int >( long int*& data, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::allocateMemory< float,       long int >( float*& data, const long int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::allocateMemory< double,      long int >( double*& data, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::allocateMemory< long double, long int >( long double*& data, const long int size );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlHost >::freeMemory< char        >( char* data );
 template bool tnlArrayOperations< tnlHost >::freeMemory< int         >( int* data );
 template bool tnlArrayOperations< tnlHost >::freeMemory< long int    >( long int* data );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::freeMemory< float       >( float* data );
+#endif
 template bool tnlArrayOperations< tnlHost >::freeMemory< double      >( double* data );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::freeMemory< long double >( long double* data );
+#endif
 
 template void tnlArrayOperations< tnlHost >::setMemoryElement< char        >( char* data, const char& value );
 template void tnlArrayOperations< tnlHost >::setMemoryElement< int         >( int* data, const int& value );
 template void tnlArrayOperations< tnlHost >::setMemoryElement< long int    >( long int* data, const long int& value );
+#ifdef INSTANTIATE_FLOAT
 template void tnlArrayOperations< tnlHost >::setMemoryElement< float       >( float* data, const float& value );
+#endif
 template void tnlArrayOperations< tnlHost >::setMemoryElement< double      >( double* data, const double& value );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template void tnlArrayOperations< tnlHost >::setMemoryElement< long double >( long double* data, const long double& value );
+#endif
 
 template char        tnlArrayOperations< tnlHost >::getMemoryElement< char        >( char* data );
 template int         tnlArrayOperations< tnlHost >::getMemoryElement< int         >( int* data );
 template long int    tnlArrayOperations< tnlHost >::getMemoryElement< long int    >( long int* data );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlArrayOperations< tnlHost >::getMemoryElement< float       >( float* data );
+#endif
 template double      tnlArrayOperations< tnlHost >::getMemoryElement< double      >( double* data );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlArrayOperations< tnlHost >::getMemoryElement< long double >( long double* data );
+#endif
 
 template char&        tnlArrayOperations< tnlHost >::getArrayElementReference< char,        int >( char* data, const int i );
 template int&         tnlArrayOperations< tnlHost >::getArrayElementReference< int,         int >( int* data, const int i );
 template long int&    tnlArrayOperations< tnlHost >::getArrayElementReference< long int,    int >( long int* data, const int i );
+#ifdef INSTANTIATE_FLOAT
 template float&       tnlArrayOperations< tnlHost >::getArrayElementReference< float,       int >( float* data, const int i );
+#endif
 template double&      tnlArrayOperations< tnlHost >::getArrayElementReference< double,      int >( double* data, const int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double& tnlArrayOperations< tnlHost >::getArrayElementReference< long double, int >( long double* data, const int i );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template char&        tnlArrayOperations< tnlHost >::getArrayElementReference< char,        long int >( char* data, const long int i );
 template int&         tnlArrayOperations< tnlHost >::getArrayElementReference< int,         long int >( int* data, const long int i );
 template long int&    tnlArrayOperations< tnlHost >::getArrayElementReference< long int,    long int >( long int* data, const long int i );
+#ifdef INSTANTIATE_FLOAT
 template float&       tnlArrayOperations< tnlHost >::getArrayElementReference< float,       long int >( float* data, const long int i );
+#endif
 template double&      tnlArrayOperations< tnlHost >::getArrayElementReference< double,      long int >( double* data, const long int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double& tnlArrayOperations< tnlHost >::getArrayElementReference< long double, long int >( long double* data, const long int i );
+#endif
+#endif
 
 template const char&        tnlArrayOperations< tnlHost >::getArrayElementReference< char,        int >( const char* data, const int i );
 template const int&         tnlArrayOperations< tnlHost >::getArrayElementReference< int,         int >( const int* data, const int i );
 template const long int&    tnlArrayOperations< tnlHost >::getArrayElementReference< long int,    int >( const long int* data, const int i );
+#ifdef INSTANTIATE_FLOAT
 template const float&       tnlArrayOperations< tnlHost >::getArrayElementReference< float,       int >( const float* data, const int i );
+#endif
 template const double&      tnlArrayOperations< tnlHost >::getArrayElementReference< double,      int >( const double* data, const int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template const long double& tnlArrayOperations< tnlHost >::getArrayElementReference< long double, int >( const long double* data, const int i );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template const char&        tnlArrayOperations< tnlHost >::getArrayElementReference< char,        long int >( const char* data, const long int i );
 template const int&         tnlArrayOperations< tnlHost >::getArrayElementReference< int,         long int >( const int* data, const long int i );
 template const long int&    tnlArrayOperations< tnlHost >::getArrayElementReference< long int,    long int >( const long int* data, const long int i );
+#ifdef INSTANTIATE_FLOAT
 template const float&       tnlArrayOperations< tnlHost >::getArrayElementReference< float,       long int >( const float* data, const long int i );
+#endif
 template const double&      tnlArrayOperations< tnlHost >::getArrayElementReference< double,      long int >( const double* data, const long int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template const long double& tnlArrayOperations< tnlHost >::getArrayElementReference< long double, long int >( const long double* data, const long int i );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlHost >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
 template bool tnlArrayOperations< tnlHost >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
 template bool tnlArrayOperations< tnlHost >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlHost >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
 template bool tnlArrayOperations< tnlHost >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
 template bool tnlArrayOperations< tnlHost >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlHost >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
 template bool tnlArrayOperations< tnlHost >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
 template bool tnlArrayOperations< tnlHost >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlHost >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
 template bool tnlArrayOperations< tnlHost >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
 template bool tnlArrayOperations< tnlHost >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlHost >::setMemory< char,        int >( char* destination, const char& value, const int size );
 template bool tnlArrayOperations< tnlHost >::setMemory< int,         int >( int* destination, const int& value, const int size );
 template bool tnlArrayOperations< tnlHost >::setMemory< long int,    int >( long int* destination, const long int& value, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::setMemory< float,       int >( float* destination, const float& value, const int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::setMemory< double,      int >( double* destination, const double& value, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::setMemory< long double, int >( long double* destination, const long double& value, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlHost >::setMemory< char,        long int >( char* destination, const char& value, const long int size );
 template bool tnlArrayOperations< tnlHost >::setMemory< int,         long int >( int* destination, const int& value, const long int size );
 template bool tnlArrayOperations< tnlHost >::setMemory< long int,    long int >( long int* destination, const long int& value, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::setMemory< float,       long int >( float* destination, const float& value, const long int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::setMemory< double,      long int >( double* destination, const double& value, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::setMemory< long double, long int >( long double* destination, const long double& value, const long int size );
-
+#endif
 #endif
 
-
-
-
+#endif
diff --git a/src/core/arrays/tnlArrayOperationsHost_impl.cu b/src/core/arrays/tnlArrayOperationsHost_impl.cu
index 8e719a923940a514c9d0444df192b02025f31dd4..2f7e7e4ebfae0af37fd42863f6d40e740f0daa30 100644
--- a/src/core/arrays/tnlArrayOperationsHost_impl.cu
+++ b/src/core/arrays/tnlArrayOperationsHost_impl.cu
@@ -22,106 +22,178 @@
 template bool tnlArrayOperations< tnlHost >::allocateMemory< char,        int >( char*& data, const int size );
 template bool tnlArrayOperations< tnlHost >::allocateMemory< int,         int >( int*& data, const int size );
 template bool tnlArrayOperations< tnlHost >::allocateMemory< long int,    int >( long int*& data, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::allocateMemory< float,       int >( float*& data, const int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::allocateMemory< double,      int >( double*& data, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::allocateMemory< long double, int >( long double*& data, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlHost >::allocateMemory< char,        long int >( char*& data, const long int size );
 template bool tnlArrayOperations< tnlHost >::allocateMemory< int,         long int >( int*& data, const long int size );
 template bool tnlArrayOperations< tnlHost >::allocateMemory< long int,    long int >( long int*& data, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::allocateMemory< float,       long int >( float*& data, const long int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::allocateMemory< double,      long int >( double*& data, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::allocateMemory< long double, long int >( long double*& data, const long int size );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlHost >::freeMemory< char        >( char* data );
 template bool tnlArrayOperations< tnlHost >::freeMemory< int         >( int* data );
 template bool tnlArrayOperations< tnlHost >::freeMemory< long int    >( long int* data );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::freeMemory< float       >( float* data );
+#endif
 template bool tnlArrayOperations< tnlHost >::freeMemory< double      >( double* data );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::freeMemory< long double >( long double* data );
+#endif
 
 template void tnlArrayOperations< tnlHost >::setMemoryElement< char        >( char* data, const char& value );
 template void tnlArrayOperations< tnlHost >::setMemoryElement< int         >( int* data, const int& value );
 template void tnlArrayOperations< tnlHost >::setMemoryElement< long int    >( long int* data, const long int& value );
+#ifdef INSTANTIATE_FLOAT
 template void tnlArrayOperations< tnlHost >::setMemoryElement< float       >( float* data, const float& value );
+#endif
 template void tnlArrayOperations< tnlHost >::setMemoryElement< double      >( double* data, const double& value );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template void tnlArrayOperations< tnlHost >::setMemoryElement< long double >( long double* data, const long double& value );
+#endif
 
 template char        tnlArrayOperations< tnlHost >::getMemoryElement< char        >( char* data );
 template int         tnlArrayOperations< tnlHost >::getMemoryElement< int         >( int* data );
 template long int    tnlArrayOperations< tnlHost >::getMemoryElement< long int    >( long int* data );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlArrayOperations< tnlHost >::getMemoryElement< float       >( float* data );
+#endif
 template double      tnlArrayOperations< tnlHost >::getMemoryElement< double      >( double* data );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlArrayOperations< tnlHost >::getMemoryElement< long double >( long double* data );
+#endif
 
 template char&        tnlArrayOperations< tnlHost >::getArrayElementReference< char,        int >( char* data, const int i );
 template int&         tnlArrayOperations< tnlHost >::getArrayElementReference< int,         int >( int* data, const int i );
 template long int&    tnlArrayOperations< tnlHost >::getArrayElementReference< long int,    int >( long int* data, const int i );
+#ifdef INSTANTIATE_FLOAT
 template float&       tnlArrayOperations< tnlHost >::getArrayElementReference< float,       int >( float* data, const int i );
+#endif
 template double&      tnlArrayOperations< tnlHost >::getArrayElementReference< double,      int >( double* data, const int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double& tnlArrayOperations< tnlHost >::getArrayElementReference< long double, int >( long double* data, const int i );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template char&        tnlArrayOperations< tnlHost >::getArrayElementReference< char,        long int >( char* data, const long int i );
 template int&         tnlArrayOperations< tnlHost >::getArrayElementReference< int,         long int >( int* data, const long int i );
 template long int&    tnlArrayOperations< tnlHost >::getArrayElementReference< long int,    long int >( long int* data, const long int i );
+#ifdef INSTANTIATE_FLOAT
 template float&       tnlArrayOperations< tnlHost >::getArrayElementReference< float,       long int >( float* data, const long int i );
+#endif
 template double&      tnlArrayOperations< tnlHost >::getArrayElementReference< double,      long int >( double* data, const long int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double& tnlArrayOperations< tnlHost >::getArrayElementReference< long double, long int >( long double* data, const long int i );
+#endif
+#endif
 
 template const char&        tnlArrayOperations< tnlHost >::getArrayElementReference< char,        int >( const char* data, const int i );
 template const int&         tnlArrayOperations< tnlHost >::getArrayElementReference< int,         int >( const int* data, const int i );
 template const long int&    tnlArrayOperations< tnlHost >::getArrayElementReference< long int,    int >( const long int* data, const int i );
+#ifdef INSTANTIATE_FLOAT
 template const float&       tnlArrayOperations< tnlHost >::getArrayElementReference< float,       int >( const float* data, const int i );
+#endif
 template const double&      tnlArrayOperations< tnlHost >::getArrayElementReference< double,      int >( const double* data, const int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template const long double& tnlArrayOperations< tnlHost >::getArrayElementReference< long double, int >( const long double* data, const int i );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template const char&        tnlArrayOperations< tnlHost >::getArrayElementReference< char,        long int >( const char* data, const long int i );
 template const int&         tnlArrayOperations< tnlHost >::getArrayElementReference< int,         long int >( const int* data, const long int i );
 template const long int&    tnlArrayOperations< tnlHost >::getArrayElementReference< long int,    long int >( const long int* data, const long int i );
+#ifdef INSTANTIATE_FLOAT
 template const float&       tnlArrayOperations< tnlHost >::getArrayElementReference< float,       long int >( const float* data, const long int i );
+#endif
 template const double&      tnlArrayOperations< tnlHost >::getArrayElementReference< double,      long int >( const double* data, const long int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template const long double& tnlArrayOperations< tnlHost >::getArrayElementReference< long double, long int >( const long double* data, const long int i );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlHost >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
 template bool tnlArrayOperations< tnlHost >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
 template bool tnlArrayOperations< tnlHost >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlHost >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
 template bool tnlArrayOperations< tnlHost >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
 template bool tnlArrayOperations< tnlHost >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlHost >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
 template bool tnlArrayOperations< tnlHost >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
 template bool tnlArrayOperations< tnlHost >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlHost >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
 template bool tnlArrayOperations< tnlHost >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
 template bool tnlArrayOperations< tnlHost >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlHost >::setMemory< char,        int >( char* destination, const char& value, const int size );
 template bool tnlArrayOperations< tnlHost >::setMemory< int,         int >( int* destination, const int& value, const int size );
 template bool tnlArrayOperations< tnlHost >::setMemory< long int,    int >( long int* destination, const long int& value, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::setMemory< float,       int >( float* destination, const float& value, const int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::setMemory< double,      int >( double* destination, const double& value, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::setMemory< long double, int >( long double* destination, const long double& value, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlHost >::setMemory< char,        long int >( char* destination, const char& value, const long int size );
 template bool tnlArrayOperations< tnlHost >::setMemory< int,         long int >( int* destination, const int& value, const long int size );
 template bool tnlArrayOperations< tnlHost >::setMemory< long int,    long int >( long int* destination, const long int& value, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::setMemory< float,       long int >( float* destination, const float& value, const long int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::setMemory< double,      long int >( double* destination, const double& value, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::setMemory< long double, long int >( long double* destination, const long double& value, const long int size );
-
+#endif
 #endif
 
-
-
-
+#endif
diff --git a/src/core/arrays/tnlArrayOperationsHost_impl.h b/src/core/arrays/tnlArrayOperationsHost_impl.h
index afb56febb7eae7cc3e1e1224fcf280b6fa9fe736..27ee9093ff21854ab9aff2ba573f0209af10ef13 100644
--- a/src/core/arrays/tnlArrayOperationsHost_impl.h
+++ b/src/core/arrays/tnlArrayOperationsHost_impl.h
@@ -50,22 +50,22 @@ Element tnlArrayOperations< tnlHost >::getMemoryElement( Element* data )
 
 template< typename Element, typename Index >
 Element& tnlArrayOperations< tnlHost >::getArrayElementReference( Element* data,
-                                                                    const Index i )
+                                                                  const Index i )
 {
    return data[ i ];
 };
 
 template< typename Element, typename Index >
 const Element& tnlArrayOperations< tnlHost >::getArrayElementReference( const Element* data,
-                                                                          const Index i )
+                                                                       const Index i )
 {
    return data[ i ];
 };
 
 template< typename Element, typename Index >
 bool tnlArrayOperations< tnlHost >::setMemory( Element* data,
-                                                        const Element& value,
-                                                        const Index size )
+                                               const Element& value,
+                                               const Index size )
 {
    for( Index i = 0; i < size; i ++ )
       data[ i ] = value;
@@ -111,103 +111,179 @@ bool tnlArrayOperations< tnlHost >::compareMemory( const DestinationElement* des
 extern template bool tnlArrayOperations< tnlHost >::allocateMemory< char,        int >( char*& data, const int size );
 extern template bool tnlArrayOperations< tnlHost >::allocateMemory< int,         int >( int*& data, const int size );
 extern template bool tnlArrayOperations< tnlHost >::allocateMemory< long int,    int >( long int*& data, const int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost >::allocateMemory< float,       int >( float*& data, const int size );
+#endif
 extern template bool tnlArrayOperations< tnlHost >::allocateMemory< double,      int >( double*& data, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template bool tnlArrayOperations< tnlHost >::allocateMemory< long double, int >( long double*& data, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template bool tnlArrayOperations< tnlHost >::allocateMemory< char,        long int >( char*& data, const long int size );
 extern template bool tnlArrayOperations< tnlHost >::allocateMemory< int,         long int >( int*& data, const long int size );
 extern template bool tnlArrayOperations< tnlHost >::allocateMemory< long int,    long int >( long int*& data, const long int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost >::allocateMemory< float,       long int >( float*& data, const long int size );
+#endif
 extern template bool tnlArrayOperations< tnlHost >::allocateMemory< double,      long int >( double*& data, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template bool tnlArrayOperations< tnlHost >::allocateMemory< long double, long int >( long double*& data, const long int size );
+#endif
+#endif
 
 extern template bool tnlArrayOperations< tnlHost >::freeMemory< char        >( char* data );
 extern template bool tnlArrayOperations< tnlHost >::freeMemory< int         >( int* data );
 extern template bool tnlArrayOperations< tnlHost >::freeMemory< long int    >( long int* data );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost >::freeMemory< float       >( float* data );
+#endif
 extern template bool tnlArrayOperations< tnlHost >::freeMemory< double      >( double* data );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template bool tnlArrayOperations< tnlHost >::freeMemory< long double >( long double* data );
+#endif
 
 extern template void tnlArrayOperations< tnlHost >::setMemoryElement< char        >( char* data, const char& value );
 extern template void tnlArrayOperations< tnlHost >::setMemoryElement< int         >( int* data, const int& value );
 extern template void tnlArrayOperations< tnlHost >::setMemoryElement< long int    >( long int* data, const long int& value );
+#ifdef INSTANTIATE_FLOAT
 extern template void tnlArrayOperations< tnlHost >::setMemoryElement< float       >( float* data, const float& value );
+#endif
 extern template void tnlArrayOperations< tnlHost >::setMemoryElement< double      >( double* data, const double& value );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template void tnlArrayOperations< tnlHost >::setMemoryElement< long double >( long double* data, const long double& value );
+#endif
 
 extern template char        tnlArrayOperations< tnlHost >::getMemoryElement< char        >( char* data );
 extern template int         tnlArrayOperations< tnlHost >::getMemoryElement< int         >( int* data );
 extern template long int    tnlArrayOperations< tnlHost >::getMemoryElement< long int    >( long int* data );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlArrayOperations< tnlHost >::getMemoryElement< float       >( float* data );
+#endif
 extern template double      tnlArrayOperations< tnlHost >::getMemoryElement< double      >( double* data );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlArrayOperations< tnlHost >::getMemoryElement< long double >( long double* data );
+#endif
 
 extern template char&        tnlArrayOperations< tnlHost >::getArrayElementReference< char,        int >( char* data, const int i );
 extern template int&         tnlArrayOperations< tnlHost >::getArrayElementReference< int,         int >( int* data, const int i );
 extern template long int&    tnlArrayOperations< tnlHost >::getArrayElementReference< long int,    int >( long int* data, const int i );
+#ifdef INSTANTIATE_FLOAT
 extern template float&       tnlArrayOperations< tnlHost >::getArrayElementReference< float,       int >( float* data, const int i );
+#endif
 extern template double&      tnlArrayOperations< tnlHost >::getArrayElementReference< double,      int >( double* data, const int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double& tnlArrayOperations< tnlHost >::getArrayElementReference< long double, int >( long double* data, const int i );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template char&        tnlArrayOperations< tnlHost >::getArrayElementReference< char,        long int >( char* data, const long int i );
 extern template int&         tnlArrayOperations< tnlHost >::getArrayElementReference< int,         long int >( int* data, const long int i );
 extern template long int&    tnlArrayOperations< tnlHost >::getArrayElementReference< long int,    long int >( long int* data, const long int i );
+#ifdef INSTANTIATE_FLOAT
 extern template float&       tnlArrayOperations< tnlHost >::getArrayElementReference< float,       long int >( float* data, const long int i );
+#endif
 extern template double&      tnlArrayOperations< tnlHost >::getArrayElementReference< double,      long int >( double* data, const long int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double& tnlArrayOperations< tnlHost >::getArrayElementReference< long double, long int >( long double* data, const long int i );
+#endif
+#endif
 
 extern template const char&        tnlArrayOperations< tnlHost >::getArrayElementReference< char,        int >( const char* data, const int i );
 extern template const int&         tnlArrayOperations< tnlHost >::getArrayElementReference< int,         int >( const int* data, const int i );
 extern template const long int&    tnlArrayOperations< tnlHost >::getArrayElementReference< long int,    int >( const long int* data, const int i );
+#ifdef INSTANTIATE_FLOAT
 extern template const float&       tnlArrayOperations< tnlHost >::getArrayElementReference< float,       int >( const float* data, const int i );
+#endif
 extern template const double&      tnlArrayOperations< tnlHost >::getArrayElementReference< double,      int >( const double* data, const int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template const long double& tnlArrayOperations< tnlHost >::getArrayElementReference< long double, int >( const long double* data, const int i );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template const char&        tnlArrayOperations< tnlHost >::getArrayElementReference< char,        long int >( const char* data, const long int i );
 extern template const int&         tnlArrayOperations< tnlHost >::getArrayElementReference< int,         long int >( const int* data, const long int i );
 extern template const long int&    tnlArrayOperations< tnlHost >::getArrayElementReference< long int,    long int >( const long int* data, const long int i );
+#ifdef INSTANTIATE_FLOAT
 extern template const float&       tnlArrayOperations< tnlHost >::getArrayElementReference< float,       long int >( const float* data, const long int i );
+#endif
 extern template const double&      tnlArrayOperations< tnlHost >::getArrayElementReference< double,      long int >( const double* data, const long int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template const long double& tnlArrayOperations< tnlHost >::getArrayElementReference< long double, long int >( const long double* data, const long int i );
+#endif
+#endif
 
 extern template bool tnlArrayOperations< tnlHost >::copyMemory< char,                char, int >( char* destination, const char* source, const int size );
 extern template bool tnlArrayOperations< tnlHost >::copyMemory< int,                  int, int >( int* destination, const int* source, const int size );
 extern template bool tnlArrayOperations< tnlHost >::copyMemory< long int,        long int, int >( long int* destination, const long int* source, const int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost >::copyMemory< float,              float, int >( float* destination, const float* source, const int size );
+#endif
 extern template bool tnlArrayOperations< tnlHost >::copyMemory< double,            double, int >( double* destination, const double* source, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template bool tnlArrayOperations< tnlHost >::copyMemory< long double,  long double, int >( long double* destination, const long double* source, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template bool tnlArrayOperations< tnlHost >::copyMemory< char,                char, long int >( char* destination, const char* source, const long int size );
 extern template bool tnlArrayOperations< tnlHost >::copyMemory< int,                  int, long int >( int* destination, const int* source, const long int size );
 extern template bool tnlArrayOperations< tnlHost >::copyMemory< long int,        long int, long int >( long int* destination, const long int* source, const long int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost >::copyMemory< float,              float, long int >( float* destination, const float* source, const long int size );
+#endif
 extern template bool tnlArrayOperations< tnlHost >::copyMemory< double,            double, long int >( double* destination, const double* source, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template bool tnlArrayOperations< tnlHost >::copyMemory< long double,  long double, long int >( long double* destination, const long double* source, const long int size );
+#endif
+#endif
 
 extern template bool tnlArrayOperations< tnlHost >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
 extern template bool tnlArrayOperations< tnlHost >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
 extern template bool tnlArrayOperations< tnlHost >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
+#endif
 extern template bool tnlArrayOperations< tnlHost >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template bool tnlArrayOperations< tnlHost >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template bool tnlArrayOperations< tnlHost >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
 extern template bool tnlArrayOperations< tnlHost >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
 extern template bool tnlArrayOperations< tnlHost >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
+#endif
 extern template bool tnlArrayOperations< tnlHost >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template bool tnlArrayOperations< tnlHost >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#endif
+#endif
 
 extern template bool tnlArrayOperations< tnlHost >::setMemory< char,        int >( char* destination, const char& value, const int size );
 extern template bool tnlArrayOperations< tnlHost >::setMemory< int,         int >( int* destination, const int& value, const int size );
 extern template bool tnlArrayOperations< tnlHost >::setMemory< long int,    int >( long int* destination, const long int& value, const int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost >::setMemory< float,       int >( float* destination, const float& value, const int size );
+#endif
 extern template bool tnlArrayOperations< tnlHost >::setMemory< double,      int >( double* destination, const double& value, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template bool tnlArrayOperations< tnlHost >::setMemory< long double, int >( long double* destination, const long double& value, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template bool tnlArrayOperations< tnlHost >::setMemory< char,        long int >( char* destination, const char& value, const long int size );
 extern template bool tnlArrayOperations< tnlHost >::setMemory< int,         long int >( int* destination, const int& value, const long int size );
 extern template bool tnlArrayOperations< tnlHost >::setMemory< long int,    long int >( long int* destination, const long int& value, const long int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost >::setMemory< float,       long int >( float* destination, const float& value, const long int size );
+#endif
 extern template bool tnlArrayOperations< tnlHost >::setMemory< double,      long int >( double* destination, const double& value, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template bool tnlArrayOperations< tnlHost >::setMemory< long double, long int >( long double* destination, const long double& value, const long int size );
+#endif
+#endif
 
 #endif
 
diff --git a/src/core/arrays/tnlArray_impl.cpp b/src/core/arrays/tnlArray_impl.cpp
index 6a429605828f2408f2cac263a8dcba637079be8d..c021341d1618065fb9fdea67e551c8829a2c42ee 100644
--- a/src/core/arrays/tnlArray_impl.cpp
+++ b/src/core/arrays/tnlArray_impl.cpp
@@ -19,16 +19,44 @@
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlArray< float, tnlHost, int >;
+#endif
 template class tnlArray< double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlArray< long double, tnlHost, int >;
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlArray< float, tnlHost, long int >;
+#endif
 template class tnlArray< double, tnlHost, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlArray< long double, tnlHost, long int >;
+#endif
+#endif
 
 #ifndef HAVE_CUDA
+#ifdef INSTANTIATE_FLOAT
 template class tnlArray< float, tnlCuda, int >;
+#endif
 template class tnlArray< double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlArray< long double, tnlCuda, int >;
+#endif
+
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlArray< float, tnlCuda, long int >;
+#endif
 template class tnlArray< double, tnlCuda, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlArray< long double, tnlCuda, long int >;
+#endif
+#endif
+
 #endif
 
 #endif
diff --git a/src/core/arrays/tnlArray_impl.cu b/src/core/arrays/tnlArray_impl.cu
index 21a149024cc5ce0b9d65c739752139a0eaa31bb6..f6b42f22efcaa757fc27c80728fa379b46b35958 100644
--- a/src/core/arrays/tnlArray_impl.cu
+++ b/src/core/arrays/tnlArray_impl.cu
@@ -20,10 +20,25 @@
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
 #ifdef HAVE_CUDA
+#ifdef INSTANTIATE_FLOAT
 template class tnlArray< float, tnlCuda, int >;
+#endif
 template class tnlArray< double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlArray< long double, tnlCuda, int >;
+#endif
+
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlArray< float, tnlCuda, long int >;
+#endif
 template class tnlArray< double, tnlCuda, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlArray< long double, tnlCuda, long int >;
+#endif
+#endif
+
 #endif
 
 #endif
diff --git a/src/core/arrays/tnlArray_impl.h b/src/core/arrays/tnlArray_impl.h
index 11ec6f56f620af6b6190d27b366c891028bf547a..3655f886914843aa421983532d67cf3743f3a7db 100644
--- a/src/core/arrays/tnlArray_impl.h
+++ b/src/core/arrays/tnlArray_impl.h
@@ -145,9 +145,7 @@ void tnlArray< Element, Device, Index > :: reset()
 template< typename Element,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlArray< Element, Device, Index > :: getSize() const
 {
    return this -> size;
@@ -182,9 +180,7 @@ Element tnlArray< Element, Device, Index > :: getElement( Index i ) const
 template< typename Element,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Element& tnlArray< Element, Device, Index > :: operator[] ( Index i )
 {
    tnlAssert( 0 <= i && i < this -> getSize(),
@@ -198,9 +194,7 @@ Element& tnlArray< Element, Device, Index > :: operator[] ( Index i )
 template< typename Element,
            typename Device,
            typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Element& tnlArray< Element, Device, Index > :: operator[] ( Index i ) const
 {
    tnlAssert( 0 <= i && i < this -> getSize(),
@@ -295,9 +289,7 @@ void tnlArray< Element, Device, Index > :: setValue( const Element& e )
 template< typename Element,
            typename Device,
            typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Element* tnlArray< Element, Device, Index > :: getData() const
 {
    return this -> data;
@@ -306,9 +298,7 @@ const Element* tnlArray< Element, Device, Index > :: getData() const
 template< typename Element,
            typename Device,
            typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Element* tnlArray< Element, Device, Index > :: getData()
 {
    return this -> data;
@@ -434,16 +424,34 @@ ostream& operator << ( ostream& str, const tnlArray< Element, Device, Index >& v
 
 // TODO: this does not work with CUDA 5.5 - fix it later
 
-/*extern template class tnlArray< float, tnlHost, int >;
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlArray< float, tnlHost, int >;
+#endif
 extern template class tnlArray< double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlArray< long double, tnlHost, int >;
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlArray< float, tnlHost, long int >;
-extern template class tnlArray< double, tnlHost, long int >;*/
+#endif
+extern template class tnlArray< double, tnlHost, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlArray< long double, tnlHost, long int >;
+#endif
+#endif
 
 #ifdef HAVE_CUDA
-/*extern template class tnlArray< float, tnlCuda, int >;
-extern template class tnlArray< double, tnlCuda, int >;
-extern template class tnlArray< float, tnlCuda, long int >;
-extern template class tnlArray< double, tnlCuda, long int >;*/
+/*
+ #ifdef INSTANTIATE_FLOAT
+ extern template class tnlArray< float, tnlCuda, int >;
+ #endif
+ extern template class tnlArray< double, tnlCuda, int >;
+ #ifdef INSTANTIATE_FLOAT
+ extern template class tnlArray< float, tnlCuda, long int >;
+ #endif
+ extern template class tnlArray< double, tnlCuda, long int >;*/
 #endif
 
 #endif
diff --git a/src/core/arrays/tnlConstSharedArray.h b/src/core/arrays/tnlConstSharedArray.h
index bf14408b365ce74893be48a1b63fee1bddaad313..b258bcb8bb2168fdd5cdfe58f184a8a9aaede5ee 100644
--- a/src/core/arrays/tnlConstSharedArray.h
+++ b/src/core/arrays/tnlConstSharedArray.h
@@ -59,17 +59,11 @@ class tnlConstSharedArray : public tnlObject
 
    void reset();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Index getSize() const;
+   __cuda_callable__ Index getSize() const;
 
    Element getElement( Index i ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const Element& operator[] ( Index i ) const;
+   __cuda_callable__ const Element& operator[] ( Index i ) const;
 
    tnlConstSharedArray< Element, Device, Index >& operator = ( const tnlConstSharedArray< Element, Device, Index >& array );
 
@@ -82,10 +76,7 @@ class tnlConstSharedArray : public tnlObject
    template< typename Array >
    bool operator != ( const Array& array ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const Element* getData() const;
+   __cuda_callable__ const Element* getData() const;
 
    /****
     * Returns true if non-zero size is set.
diff --git a/src/core/arrays/tnlConstSharedArray_impl.h b/src/core/arrays/tnlConstSharedArray_impl.h
index 4a541474a753728419cfe6b185b5bc7339e3fc6a..7f0b0e804246e677b2449fdc9667b9f4ff934f97 100644
--- a/src/core/arrays/tnlConstSharedArray_impl.h
+++ b/src/core/arrays/tnlConstSharedArray_impl.h
@@ -119,9 +119,7 @@ void tnlConstSharedArray< Element, Device, Index > :: reset()
 template< typename Element,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlConstSharedArray< Element, Device, Index > :: getSize() const
 {
    return this -> size;
@@ -143,9 +141,7 @@ Element tnlConstSharedArray< Element, Device, Index > :: getElement( Index i ) c
 template< typename Element,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Element& tnlConstSharedArray< Element, Device, Index > :: operator[] ( Index i ) const
 {
    tnlAssert( 0 <= i && i < this -> getSize(),
@@ -281,16 +277,43 @@ ostream& operator << ( ostream& str, const tnlConstSharedArray< Element, Device,
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlConstSharedArray< float, tnlHost, int >;
+#endif
 extern template class tnlConstSharedArray< double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlConstSharedArray< long double, tnlHost, int >;
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlConstSharedArray< float, tnlHost, long int >;
+#endif
 extern template class tnlConstSharedArray< double, tnlHost, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlConstSharedArray< long double, tnlHost, long int >;
+#endif
+#endif
 
 #ifdef HAVE_CUDA
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlConstSharedArray< float, tnlCuda, int >;
+#endif
 extern template class tnlConstSharedArray< double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlConstSharedArray< long double, tnlCuda, int >;
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlConstSharedArray< float, tnlCuda, long int >;
+#endif
 extern template class tnlConstSharedArray< double, tnlCuda, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlConstSharedArray< long double, tnlCuda, long int >;
+#endif
+
+#endif
 #endif
 
 #endif
diff --git a/src/core/arrays/tnlMultiArray.h b/src/core/arrays/tnlMultiArray.h
index 815102e5d0cfbb400e39c38c35a4287cc8cc62b1..eb24d7afa552628755684120d775a35ea0b89987 100644
--- a/src/core/arrays/tnlMultiArray.h
+++ b/src/core/arrays/tnlMultiArray.h
@@ -41,9 +41,6 @@ class tnlMultiArray< 1, Element, Device, Index > : public tnlArray< Element, Dev
    typedef tnlMultiArray< 1, Element, tnlCuda, Index > CudaType;
 
 
-#ifdef HAVE_CUDA
-   //__device__ __host__
-#endif
    tnlMultiArray();
 
    tnlMultiArray( const tnlString& name );
@@ -60,15 +57,9 @@ class tnlMultiArray< 1, Element, Device, Index > : public tnlArray< Element, Dev
 
    bool setDimensions( const tnlStaticVector< 1, Index >& dimensions );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   void getDimensions( Index& iSize ) const;
+   __cuda_callable__ void getDimensions( Index& iSize ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const tnlStaticVector< 1, Index >& getDimensions() const;
+   __cuda_callable__ const tnlStaticVector< 1, Index >& getDimensions() const;
 
    //! Set dimensions of the array using another array as a template
    template< typename MultiArray >
@@ -76,10 +67,7 @@ class tnlMultiArray< 1, Element, Device, Index > : public tnlArray< Element, Dev
    
    void reset();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Index getElementIndex( const Index i ) const;
+   __cuda_callable__ Index getElementIndex( const Index i ) const;
 
    void setElement( const Index i, Element value );
 
@@ -91,15 +79,9 @@ class tnlMultiArray< 1, Element, Device, Index > : public tnlArray< Element, Dev
    Element getElement( const Index i ) const;
 
    //! Operator for accessing elements of the array.
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Element& operator()( const Index i );
+   __cuda_callable__ Element& operator()( const Index i );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const Element& operator()( const Index i ) const;
+   __cuda_callable__ const Element& operator()( const Index i ) const;
 
 
    template< typename MultiArray >
@@ -140,9 +122,6 @@ class tnlMultiArray< 2, Element, Device, Index > : public tnlArray< Element, Dev
    typedef tnlMultiArray< 2, Element, tnlCuda, Index > CudaType;
 
 
-#ifdef HAVE_CUDA
-   //__device__ __host__
-#endif
    tnlMultiArray();
 
    tnlMultiArray( const tnlString& name );
@@ -159,15 +138,9 @@ class tnlMultiArray< 2, Element, Device, Index > : public tnlArray< Element, Dev
 
    bool setDimensions( const tnlStaticVector< 2, Index >& dimensions );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   void getDimensions( Index& jSize, Index& iSize ) const;
+   __cuda_callable__ void getDimensions( Index& jSize, Index& iSize ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const tnlStaticVector< 2, Index >& getDimensions() const;
+   __cuda_callable__ const tnlStaticVector< 2, Index >& getDimensions() const;
 
    //! Set dimensions of the array using another array as a template
    template< typename MultiArray >
@@ -175,10 +148,7 @@ class tnlMultiArray< 2, Element, Device, Index > : public tnlArray< Element, Dev
 
    void reset();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Index getElementIndex( const Index j, const Index i ) const;
+   __cuda_callable__ Index getElementIndex( const Index j, const Index i ) const;
 
    void setElement( const Index j, const Index i, Element value );
 
@@ -194,15 +164,9 @@ class tnlMultiArray< 2, Element, Device, Index > : public tnlArray< Element, Dev
     *  used to access elements of arrays in different address space
     *  (GPU device usually).
     */
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Element& operator()( const Index j, const Index i );
+   __cuda_callable__ Element& operator()( const Index j, const Index i );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const Element& operator()( const Index j, const Index i ) const;
+   __cuda_callable__ const Element& operator()( const Index j, const Index i ) const;
 
    template< typename MultiArray >
    bool operator == ( const MultiArray& array ) const;
@@ -243,9 +207,6 @@ class tnlMultiArray< 3, Element, Device, Index > : public tnlArray< Element, Dev
    typedef tnlMultiArray< 3, Element, tnlCuda, Index > CudaType;
 
 
-#ifdef HAVE_CUDA
-   //__device__ __host__
-#endif
    tnlMultiArray();
 
    tnlMultiArray( const tnlString& name );
@@ -262,15 +223,9 @@ class tnlMultiArray< 3, Element, Device, Index > : public tnlArray< Element, Dev
 
    bool setDimensions( const tnlStaticVector< 3, Index >& dimensions );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   void getDimensions( Index& k, Index& j, Index& iSize ) const;
+   __cuda_callable__ void getDimensions( Index& k, Index& j, Index& iSize ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const tnlStaticVector< 3, Index >& getDimensions() const;
+   __cuda_callable__ const tnlStaticVector< 3, Index >& getDimensions() const;
 
    //! Set dimensions of the array using another array as a template
    template< typename MultiArray >
@@ -278,10 +233,7 @@ class tnlMultiArray< 3, Element, Device, Index > : public tnlArray< Element, Dev
 
    void reset();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Index getElementIndex( const Index k, const Index j, const Index i ) const;
+   __cuda_callable__ Index getElementIndex( const Index k, const Index j, const Index i ) const;
 
    void setElement( const Index k, const Index j, const Index i, Element value );
 
@@ -297,15 +249,9 @@ class tnlMultiArray< 3, Element, Device, Index > : public tnlArray< Element, Dev
     *  used to access elements of arrays in different adress space
     *  (GPU device usualy).
     */
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Element& operator()( const Index k, const Index j, const Index i );
+   __cuda_callable__ Element& operator()( const Index k, const Index j, const Index i );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const Element& operator()( const Index k, const Index j, const Index i ) const;
+   __cuda_callable__ const Element& operator()( const Index k, const Index j, const Index i ) const;
 
    template< typename MultiArray >
    bool operator == ( const MultiArray& array ) const;
@@ -346,9 +292,6 @@ class tnlMultiArray< 4, Element, Device, Index > : public tnlArray< Element, Dev
    typedef tnlMultiArray< 4, Element, tnlCuda, Index > CudaType;
 
 
-#ifdef HAVE_CUDA
-   //__device__ __host__
-#endif
    tnlMultiArray();
 
    tnlMultiArray( const tnlString& name );
@@ -365,15 +308,9 @@ class tnlMultiArray< 4, Element, Device, Index > : public tnlArray< Element, Dev
 
    bool setDimensions( const tnlStaticVector< 4, Index >& dimensions );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   void getDimensions( Index& l, Index& k, Index& j, Index& iSize ) const;
+   __cuda_callable__ void getDimensions( Index& l, Index& k, Index& j, Index& iSize ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const tnlStaticVector< 4, Index >& getDimensions() const;
+   __cuda_callable__ const tnlStaticVector< 4, Index >& getDimensions() const;
 
    //! Set dimensions of the array using another array as a template
    template< typename MultiArray >
@@ -381,10 +318,7 @@ class tnlMultiArray< 4, Element, Device, Index > : public tnlArray< Element, Dev
 
    void reset();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Index getElementIndex( const Index l, const Index k, const Index j, const Index i ) const;
+   __cuda_callable__ Index getElementIndex( const Index l, const Index k, const Index j, const Index i ) const;
 
    void setElement( const Index l, const Index k, const Index j, const Index i, Element value );
 
@@ -400,15 +334,9 @@ class tnlMultiArray< 4, Element, Device, Index > : public tnlArray< Element, Dev
     *  used to access elements of arrays in different adress space
     *  (GPU device usualy).
     */
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Element& operator()( const Index l, const Index k, const Index j, const Index i );
+   __cuda_callable__ Element& operator()( const Index l, const Index k, const Index j, const Index i );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const Element& operator()( const Index l, const Index k, const Index j, const Index i ) const;
+   __cuda_callable__ const Element& operator()( const Index l, const Index k, const Index j, const Index i ) const;
 
    template< typename MultiArray >
    bool operator == ( const MultiArray& array ) const;
@@ -456,39 +384,83 @@ ostream& operator << ( ostream& str, const tnlMultiArray< 4, Element, device, In
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 1, float,  tnlHost, int >;
+#endif
 extern template class tnlMultiArray< 1, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 1, float,  tnlHost, long int >;
+#endif
 extern template class tnlMultiArray< 1, double, tnlHost, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 2, float,  tnlHost, int >;
+#endif
 extern template class tnlMultiArray< 2, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 2, float,  tnlHost, long int >;
+#endif
 extern template class tnlMultiArray< 2, double, tnlHost, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 3, float,  tnlHost, int >;
+#endif
 extern template class tnlMultiArray< 3, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 3, float,  tnlHost, long int >;
+#endif
 extern template class tnlMultiArray< 3, double, tnlHost, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 4, float,  tnlHost, int >;
+#endif
 extern template class tnlMultiArray< 4, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 4, float,  tnlHost, long int >;
+#endif
 extern template class tnlMultiArray< 4, double, tnlHost, long int >;
+#endif
 
 // TODO: There are problems with nvlink - it might be better in later versions
-/*extern template class tnlMultiArray< 1, float,  tnlCuda, int >;
+/*
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiArray< 1, float,  tnlCuda, int >;
+#endif
 extern template class tnlMultiArray< 1, double, tnlCuda, int >;
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 1, float,  tnlCuda, long int >;
+#endif
 extern template class tnlMultiArray< 1, double, tnlCuda, long int >;
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 2, float,  tnlCuda, int >;
+#endif
 extern template class tnlMultiArray< 2, double, tnlCuda, int >;
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 2, float,  tnlCuda, long int >;
+#endif
 extern template class tnlMultiArray< 2, double, tnlCuda, long int >;
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 3, float,  tnlCuda, int >;
+#endif
 extern template class tnlMultiArray< 3, double, tnlCuda, int >;
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 3, float,  tnlCuda, long int >;
+#endif
 extern template class tnlMultiArray< 3, double, tnlCuda, long int >;
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 4, float,  tnlCuda, int >;
+#endif
 extern template class tnlMultiArray< 4, double, tnlCuda, int >;
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 4, float,  tnlCuda, long int >;
+#endif
 extern template class tnlMultiArray< 4, double, tnlCuda, long int >;*/
 
 #endif
diff --git a/src/core/arrays/tnlMultiArray1D_impl.h b/src/core/arrays/tnlMultiArray1D_impl.h
index 7d5afb4edd05481a45d9c9571e50306bbe2c3b5e..b45aabdedb0bf4b5f5d8cb72a2d39070597cd98b 100644
--- a/src/core/arrays/tnlMultiArray1D_impl.h
+++ b/src/core/arrays/tnlMultiArray1D_impl.h
@@ -19,9 +19,6 @@
 #define TNLMULTIARRAY1D_IMPL_H_
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   //__device__ __host__
-#endif
 tnlMultiArray< 1, Element, Device, Index > :: tnlMultiArray()
 {
 }
@@ -103,27 +100,21 @@ void tnlMultiArray< 1, Element, Device, Index >::reset()
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlMultiArray< 1, Element, Device, Index > :: getDimensions( Index& xSize ) const
 {
    xSize = this -> dimensions[ 0 ];
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const tnlStaticVector< 1, Index >& tnlMultiArray< 1, Element, Device, Index > :: getDimensions() const
 {
    return this -> dimensions;
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlMultiArray< 1, Element, Device, Index > :: getElementIndex( const Index i ) const
 {
    tnlAssert( i >= 0 && i < this -> dimensions[ 0 ],
@@ -144,18 +135,14 @@ void tnlMultiArray< 1, Element, Device, Index > :: setElement( const Index i, El
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Element& tnlMultiArray< 1, Element, Device, Index > :: operator()( const Index element )
 {
    return tnlArray< Element, Device, Index > :: operator[]( getElementIndex( element ) );
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Element& tnlMultiArray< 1, Element, Device, Index > :: operator()( const Index element ) const
 {
    return tnlArray< Element, Device, Index > :: operator[]( getElementIndex( element ) );
diff --git a/src/core/arrays/tnlMultiArray2D_impl.h b/src/core/arrays/tnlMultiArray2D_impl.h
index 15c1aacd25ceecd0b4715db6f94ec80428e2c0d9..cc10ee13d4ccb96750a2f63df0e02932bbc2c27e 100644
--- a/src/core/arrays/tnlMultiArray2D_impl.h
+++ b/src/core/arrays/tnlMultiArray2D_impl.h
@@ -21,9 +21,6 @@
 
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   //__device__ __host__
-#endif
 tnlMultiArray< 2, Element, Device, Index > :: tnlMultiArray()
 {
 }
@@ -114,9 +111,7 @@ void tnlMultiArray< 2, Element, Device, Index >::reset()
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlMultiArray< 2, Element, Device, Index > :: getDimensions( Index& jSize, Index& iSize ) const
 {
    iSize = this -> dimensions[ 0 ];
@@ -124,18 +119,14 @@ void tnlMultiArray< 2, Element, Device, Index > :: getDimensions( Index& jSize,
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const tnlStaticVector< 2, Index >& tnlMultiArray< 2, Element, Device, Index > :: getDimensions() const
 {
    return this -> dimensions;
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlMultiArray< 2, Element, Device, Index > :: getElementIndex( const Index j, const Index i ) const
 {
    tnlAssert( i >= 0 && i < this -> dimensions[ 0 ] && j >= 0 && j < this -> dimensions[ 1 ],
@@ -157,18 +148,14 @@ void tnlMultiArray< 2, Element, Device, Index > :: setElement( const Index j, co
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Element& tnlMultiArray< 2, Element, Device, Index > :: operator()( const Index j, const Index i )
 {
    return tnlArray< Element, Device, Index > :: operator[]( getElementIndex( j, i ) );
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Element& tnlMultiArray< 2, Element, Device, Index > :: operator()( const Index j, const Index i ) const
 {
    return tnlArray< Element, Device, Index > :: operator[]( getElementIndex( j, i ) );
diff --git a/src/core/arrays/tnlMultiArray3D_impl.h b/src/core/arrays/tnlMultiArray3D_impl.h
index 2fa78ff7c1f9478b73e81adba2a20c984bc3fbb9..3cd8dfbd237ba327d54d77d9359041320b95ead0 100644
--- a/src/core/arrays/tnlMultiArray3D_impl.h
+++ b/src/core/arrays/tnlMultiArray3D_impl.h
@@ -21,9 +21,6 @@
 
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   //__device__ __host__
-#endif
 tnlMultiArray< 3, Element, Device, Index > :: tnlMultiArray()
 {
 }
@@ -119,9 +116,7 @@ void tnlMultiArray< 3, Element, Device, Index >::reset()
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlMultiArray< 3, Element, Device, Index > :: getDimensions( Index& kSize,
                                                                   Index& jSize,
                                                                   Index& iSize ) const
@@ -132,18 +127,14 @@ void tnlMultiArray< 3, Element, Device, Index > :: getDimensions( Index& kSize,
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const tnlStaticVector< 3, Index >& tnlMultiArray< 3, Element, Device, Index > :: getDimensions() const
 {
    return this -> dimensions;
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlMultiArray< 3, Element, Device, Index > :: getElementIndex( const Index k,
                                                                      const Index j,
                                                                      const Index i ) const
@@ -176,9 +167,7 @@ void tnlMultiArray< 3, Element, Device, Index > :: setElement( const Index k,
 
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Element& tnlMultiArray< 3, Element, Device, Index > :: operator()( const Index k,
                                                                         const Index j,
                                                                         const Index i )
@@ -187,9 +176,7 @@ Element& tnlMultiArray< 3, Element, Device, Index > :: operator()( const Index k
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Element& tnlMultiArray< 3, Element, Device, Index > :: operator()( const Index k,
                                                                                const Index j,
                                                                                const Index i ) const
diff --git a/src/core/arrays/tnlMultiArray4D_impl.h b/src/core/arrays/tnlMultiArray4D_impl.h
index a309545b613e46136088922a001433cb8312aa13..a2ebfcd21f709f8da7ad77d1ec96d3ee54f03a06 100644
--- a/src/core/arrays/tnlMultiArray4D_impl.h
+++ b/src/core/arrays/tnlMultiArray4D_impl.h
@@ -21,9 +21,6 @@
 
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   //__device__ __host__
-#endif
 tnlMultiArray< 4, Element, Device, Index > :: tnlMultiArray()
 {
 }
@@ -124,9 +121,7 @@ void tnlMultiArray< 4, Element, Device, Index >::reset()
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlMultiArray< 4, Element, Device, Index > :: getDimensions( Index& lSize,
                                                                        Index& kSize,
                                                                        Index& jSize,
@@ -139,18 +134,14 @@ void tnlMultiArray< 4, Element, Device, Index > :: getDimensions( Index& lSize,
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const tnlStaticVector< 4, Index >& tnlMultiArray< 4, Element, Device, Index > :: getDimensions() const
 {
    return this -> dimensions;
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlMultiArray< 4, Element, Device, Index > :: getElementIndex( const Index l,
                                                                      const Index k,
                                                                      const Index j,
@@ -188,9 +179,7 @@ void tnlMultiArray< 4, Element, Device, Index > :: setElement( const Index l,
 
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Element& tnlMultiArray< 4, Element, Device, Index > :: operator()( const Index l,
                                                                         const Index k,
                                                                         const Index j,
@@ -200,9 +189,7 @@ Element& tnlMultiArray< 4, Element, Device, Index > :: operator()( const Index l
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Element& tnlMultiArray< 4, Element, Device, Index > :: operator()( const Index l,
                                                                                const Index k,
                                                                                const Index j,
diff --git a/src/core/arrays/tnlMultiArray_impl.cpp b/src/core/arrays/tnlMultiArray_impl.cpp
index 8d6efafd088aca0c75525dab886a763a0cca5b9a..39695db81e0eb208c56811beb023aa250ac855f6 100644
--- a/src/core/arrays/tnlMultiArray_impl.cpp
+++ b/src/core/arrays/tnlMultiArray_impl.cpp
@@ -19,41 +19,95 @@
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 1, float,  tnlHost, int >;
+#endif
 template class tnlMultiArray< 1, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 1, float,  tnlHost, long int >;
+#endif
 template class tnlMultiArray< 1, double, tnlHost, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 2, float,  tnlHost, int >;
+#endif
 template class tnlMultiArray< 2, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 2, float,  tnlHost, long int >;
+#endif
 template class tnlMultiArray< 2, double, tnlHost, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 3, float,  tnlHost, int >;
+#endif
 template class tnlMultiArray< 3, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 3, float,  tnlHost, long int >;
+#endif
 template class tnlMultiArray< 3, double, tnlHost, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 4, float,  tnlHost, int >;
+#endif
 template class tnlMultiArray< 4, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 4, float,  tnlHost, long int >;
+#endif
 template class tnlMultiArray< 4, double, tnlHost, long int >;
+#endif
 
 #ifndef HAVE_CUDA
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 1, float,  tnlCuda, int >;
+#endif
 template class tnlMultiArray< 1, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 1, float,  tnlCuda, long int >;
+#endif
 template class tnlMultiArray< 1, double, tnlCuda, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 2, float,  tnlCuda, int >;
+#endif
 template class tnlMultiArray< 2, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 2, float,  tnlCuda, long int >;
+#endif
 template class tnlMultiArray< 2, double, tnlCuda, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 3, float,  tnlCuda, int >;
+#endif
 template class tnlMultiArray< 3, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 3, float,  tnlCuda, long int >;
+#endif
 template class tnlMultiArray< 3, double, tnlCuda, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 4, float,  tnlCuda, int >;
+#endif
 template class tnlMultiArray< 4, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 4, float,  tnlCuda, long int >;
+#endif
 template class tnlMultiArray< 4, double, tnlCuda, long int >;
+#endif
 
 #endif
 
diff --git a/src/core/arrays/tnlMultiArray_impl.cu b/src/core/arrays/tnlMultiArray_impl.cu
index a168d7111a98bc71a565b73493f69b562c64c9cf..a0ac41bde5a89f13589a8dada3506e5ace09706a 100644
--- a/src/core/arrays/tnlMultiArray_impl.cu
+++ b/src/core/arrays/tnlMultiArray_impl.cu
@@ -20,22 +20,49 @@
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
 #ifdef HAVE_CUDA
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 1, float,  tnlCuda, int >;
+#endif
 template class tnlMultiArray< 1, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 1, float,  tnlCuda, long int >;
+#endif
 template class tnlMultiArray< 1, double, tnlCuda, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 2, float,  tnlCuda, int >;
+#endif
 template class tnlMultiArray< 2, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 2, float,  tnlCuda, long int >;
+#endif
 template class tnlMultiArray< 2, double, tnlCuda, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 3, float,  tnlCuda, int >;
+#endif
 template class tnlMultiArray< 3, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 3, float,  tnlCuda, long int >;
+#endif
 template class tnlMultiArray< 3, double, tnlCuda, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 4, float,  tnlCuda, int >;
+#endif
 template class tnlMultiArray< 4, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 4, float,  tnlCuda, long int >;
+#endif
 template class tnlMultiArray< 4, double, tnlCuda, long int >;
+#endif
 
 #endif
 
diff --git a/src/core/arrays/tnlSharedArray.h b/src/core/arrays/tnlSharedArray.h
index e4a4b0104ac8fc4ad8b265d469aad56a2c9c9c7c..0d1da760fab38c8a64f51373544f71b2b2699e81 100644
--- a/src/core/arrays/tnlSharedArray.h
+++ b/src/core/arrays/tnlSharedArray.h
@@ -19,6 +19,7 @@
 #define TNLSHAREDARRAY_H_
 
 #include <core/tnlObject.h>
+#include <core/tnlCuda.h>
 
 class tnlFile;
 class tnlHost;
@@ -74,24 +75,15 @@ class tnlSharedArray : public tnlObject
 
    void reset();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Index getSize() const;
+   __cuda_callable__ Index getSize() const;
 
    void setElement( const Index i, const Element& x );
 
    Element getElement( Index i ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Element& operator[] ( Index i );
+   __cuda_callable__ Element& operator[] ( Index i );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const Element& operator[] ( Index i ) const;
+   __cuda_callable__ const Element& operator[] ( Index i ) const;
 
    tnlSharedArray< Element, Device, Index >& operator = ( const tnlSharedArray< Element, Device, Index >& array );
 
@@ -106,15 +98,9 @@ class tnlSharedArray : public tnlObject
 
    void setValue( const Element& e );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const Element* getData() const;
+   __cuda_callable__ const Element* getData() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Element* getData();
+   __cuda_callable__ Element* getData();
 
    /*!
     * Returns true if non-zero size is set.
diff --git a/src/core/arrays/tnlSharedArray_impl.cpp b/src/core/arrays/tnlSharedArray_impl.cpp
index 15788dd5d87d245167e5972199592e6b58d234ad..b0c92f0cfeafc61c74d40e5fe66175ad35e1dd9c 100644
--- a/src/core/arrays/tnlSharedArray_impl.cpp
+++ b/src/core/arrays/tnlSharedArray_impl.cpp
@@ -18,17 +18,43 @@
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlSharedArray< float, tnlHost, int >;
+#endif
 template class tnlSharedArray< double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlSharedArray< long double, tnlHost, int >;
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlSharedArray< float, tnlHost, long int >;
+#endif
 template class tnlSharedArray< double, tnlHost, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlSharedArray< long double, tnlHost, long int >;
+#endif
+#endif
 
-#ifdef HAVE_CUDA
+/*#ifdef HAVE_CUDA
+#ifdef INSTANTIATE_FLOAT
 template class tnlSharedArray< float, tnlCuda, int >;
+#endif
 template class tnlSharedArray< double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlSharedArray< long double, tnlCuda, int >;
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlSharedArray< float, tnlCuda, long int >;
+#endif
 template class tnlSharedArray< double, tnlCuda, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlSharedArray< long double, tnlCuda, long int >;
+#endif
 #endif
+#endif*/
 
 #endif
 
diff --git a/src/core/arrays/tnlSharedArray_impl.cu b/src/core/arrays/tnlSharedArray_impl.cu
index 51d2eae92b17f2e597ba3754df9264389ad7c634..010e3ce01bb209093a4056acf182f4f422d3c5c1 100644
--- a/src/core/arrays/tnlSharedArray_impl.cu
+++ b/src/core/arrays/tnlSharedArray_impl.cu
@@ -20,10 +20,23 @@
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
 #ifdef HAVE_CUDA
+#ifdef INSTANTIATE_FLOAT
 template class tnlSharedArray< float, tnlCuda, int >;
+#endif
 template class tnlSharedArray< double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlSharedArray< long double, tnlCuda, int >;
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlSharedArray< float, tnlCuda, long int >;
+#endif
 template class tnlSharedArray< double, tnlCuda, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlSharedArray< long double, tnlCuda, long int >;
+#endif
+#endif
 #endif
 
 #endif
\ No newline at end of file
diff --git a/src/core/arrays/tnlSharedArray_impl.h b/src/core/arrays/tnlSharedArray_impl.h
index 2b298d15ba9d856bff6369e8391f7af48a9bde5b..6fa7292e2d46778591a3bdd37eccab49e40face3 100644
--- a/src/core/arrays/tnlSharedArray_impl.h
+++ b/src/core/arrays/tnlSharedArray_impl.h
@@ -163,9 +163,7 @@ void tnlSharedArray< Element, Device, Index > :: reset()
 template< typename Element,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlSharedArray< Element, Device, Index > :: getSize() const
 {
    return this -> size;
@@ -200,9 +198,7 @@ Element tnlSharedArray< Element, Device, Index > :: getElement( Index i ) const
 template< typename Element,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Element& tnlSharedArray< Element, Device, Index > :: operator[] ( Index i )
 {
    tnlAssert( 0 <= i && i < this -> getSize(),
@@ -216,9 +212,7 @@ Element& tnlSharedArray< Element, Device, Index > :: operator[] ( Index i )
 template< typename Element,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Element& tnlSharedArray< Element, Device, Index > :: operator[] ( Index i ) const
 {
    tnlAssert( 0 <= i && i < this -> getSize(),
@@ -441,15 +435,34 @@ ostream& operator << ( ostream& str, const tnlSharedArray< Element, Device, Inde
 
 // TODO: this does not work with CUDA 5.5 - fix it later
 
-/*extern template class tnlSharedArray< float, tnlHost, int >;
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlSharedArray< float, tnlHost, int >;
+#endif
 extern template class tnlSharedArray< double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlSharedArray< long double, tnlHost, int >;
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlSharedArray< float, tnlHost, long int >;
-extern template class tnlSharedArray< double, tnlHost, long int >;*/
+#endif
+extern template class tnlSharedArray< double, tnlHost, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlSharedArray< long double, tnlHost, long int >;
+#endif
+#endif
+
 
 #ifdef HAVE_CUDA
-/*extern template class tnlSharedArray< float, tnlCuda, int >;
+/*
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlSharedArray< float, tnlCuda, int >;
+#endif
 extern template class tnlSharedArray< double, tnlCuda, int >;
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlSharedArray< float, tnlCuda, long int >;
+#endif
 extern template class tnlSharedArray< double, tnlCuda, long int >;*/
 #endif
 
diff --git a/src/core/arrays/tnlStaticArray1D_impl.h b/src/core/arrays/tnlStaticArray1D_impl.h
index 37150c4c4d0fc0d6556861d2618e42e60ae201cc..170488b655ba13fe968628f860d81ba838b1ed6b 100644
--- a/src/core/arrays/tnlStaticArray1D_impl.h
+++ b/src/core/arrays/tnlStaticArray1D_impl.h
@@ -209,10 +209,16 @@ void tnlStaticArray< 1, Element >::sort()
 #ifndef HAVE_CUDA
 extern template class tnlStaticArray< 1, char >;
 extern template class tnlStaticArray< 1, int >;
+#ifdef INSTANTIATE_LONG_INT
 extern template class tnlStaticArray< 1, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlStaticArray< 1, float >;
+#endif
 extern template class tnlStaticArray< 1, double >;
-//extern template class tnlStaticArray< 1, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlStaticArray< 1, long double >;
+#endif
 #endif
 
 #endif
diff --git a/src/core/arrays/tnlStaticArray2D_impl.h b/src/core/arrays/tnlStaticArray2D_impl.h
index 0f7d7b26d6782924e4fd1b126a2a7c925c078f5a..3e3cd35820ec1c83a86a4ded314d5bd10c5e28f4 100644
--- a/src/core/arrays/tnlStaticArray2D_impl.h
+++ b/src/core/arrays/tnlStaticArray2D_impl.h
@@ -246,10 +246,16 @@ void tnlStaticArray< 2, Element >::sort()
 #ifndef HAVE_CUDA
 extern template class tnlStaticArray< 2, char >;
 extern template class tnlStaticArray< 2, int >;
+#ifdef INSTANTIATE_LONG_INT
 extern template class tnlStaticArray< 2, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlStaticArray< 2, float >;
+#endif
 extern template class tnlStaticArray< 2, double >;
-//extern template class tnlStaticArray< 2, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlStaticArray< 2, long double >;
+#endif
 #endif
 
 #endif
diff --git a/src/core/arrays/tnlStaticArray3D_impl.h b/src/core/arrays/tnlStaticArray3D_impl.h
index 74254d4c76b838df0892614eb4a33c358da49668..5b9755f0281d614177db0e781d2986c57c52030f 100644
--- a/src/core/arrays/tnlStaticArray3D_impl.h
+++ b/src/core/arrays/tnlStaticArray3D_impl.h
@@ -277,10 +277,16 @@ void tnlStaticArray< 3, Element >::sort()
 #ifndef HAVE_CUDA
 extern template class tnlStaticArray< 3, char >;
 extern template class tnlStaticArray< 3, int >;
+#ifdef INSTANTIATE_LONG_INT
 extern template class tnlStaticArray< 3, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlStaticArray< 3, float >;
+#endif
 extern template class tnlStaticArray< 3, double >;
-//extern template class tnlStaticArray< 3, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlStaticArray< 3, long double >;
+#endif
 #endif
 
 #endif
diff --git a/src/core/arrays/tnlStaticArray_impl.cpp b/src/core/arrays/tnlStaticArray_impl.cpp
index 4d9b09efbc0044db090137d712474aa37cf317c8..f16eb80a4eb4bd3dcf929259df2076eadb798e0f 100644
--- a/src/core/arrays/tnlStaticArray_impl.cpp
+++ b/src/core/arrays/tnlStaticArray_impl.cpp
@@ -22,31 +22,55 @@
 
 template class tnlStaticArray< 1, char >;
 template class tnlStaticArray< 1, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlStaticArray< 1, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticArray< 1, float >;
+#endif
 template class tnlStaticArray< 1, double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
 template class tnlStaticArray< 1, long double >;
+#endif
 
 template class tnlStaticArray< 2, char >;
 template class tnlStaticArray< 2, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlStaticArray< 2, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticArray< 2, float >;
+#endif
 template class tnlStaticArray< 2, double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
 template class tnlStaticArray< 2, long double >;
+#endif
 
 template class tnlStaticArray< 3, char >;
 template class tnlStaticArray< 3, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlStaticArray< 3, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticArray< 3, float >;
+#endif
 template class tnlStaticArray< 3, double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
 template class tnlStaticArray< 3, long double >;
+#endif
 
 template class tnlStaticArray< 4, char >;
 template class tnlStaticArray< 4, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlStaticArray< 4, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticArray< 4, float >;
+#endif
 template class tnlStaticArray< 4, double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
 template class tnlStaticArray< 4, long double >;
+#endif
 
 #endif
 #endif
diff --git a/src/core/arrays/tnlStaticArray_impl.cu b/src/core/arrays/tnlStaticArray_impl.cu
index 3cb51866dbc842157d6a1e8d1139e9f0481c16b6..5ae563aab4ed8cb418f1b49e05ef5b5f9bee3d20 100644
--- a/src/core/arrays/tnlStaticArray_impl.cu
+++ b/src/core/arrays/tnlStaticArray_impl.cu
@@ -22,31 +22,55 @@
 
 template class tnlStaticArray< 1, char >;
 template class tnlStaticArray< 1, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlStaticArray< 1, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticArray< 1, float >;
+#endif
 template class tnlStaticArray< 1, double >;
-//template class tnlStaticArray< 1, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlStaticArray< 1, long double >;
+#endif
 
 template class tnlStaticArray< 2, char >;
 template class tnlStaticArray< 2, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlStaticArray< 2, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticArray< 2, float >;
+#endif
 template class tnlStaticArray< 2, double >;
-//template class tnlStaticArray< 2, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlStaticArray< 2, long double >;
+#endif
 
 template class tnlStaticArray< 3, char >;
 template class tnlStaticArray< 3, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlStaticArray< 3, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticArray< 3, float >;
+#endif
 template class tnlStaticArray< 3, double >;
-//template class tnlStaticArray< 3, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlStaticArray< 3, long double >;
+#endif
 
 template class tnlStaticArray< 4, char >;
 template class tnlStaticArray< 4, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlStaticArray< 4, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticArray< 4, float >;
+#endif
 template class tnlStaticArray< 4, double >;
-//template class tnlStaticArray< 4, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlStaticArray< 4, long double >;
+#endif
 
 #endif
 #endif
\ No newline at end of file
diff --git a/src/core/arrays/tnlStaticArray_impl.h b/src/core/arrays/tnlStaticArray_impl.h
index 63afb64be467433395a855b453f863f3c3becb80..08c5606e4f387b0763060ffcb1a05e3b6176d717 100644
--- a/src/core/arrays/tnlStaticArray_impl.h
+++ b/src/core/arrays/tnlStaticArray_impl.h
@@ -222,10 +222,16 @@ ostream& operator << ( ostream& str, const tnlStaticArray< Size, Element >& a )
 #ifndef HAVE_CUDA
 extern template class tnlStaticArray< 4, char >;
 extern template class tnlStaticArray< 4, int >;
+#ifdef INSTANTIATE_LONG_INT
 extern template class tnlStaticArray< 4, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlStaticArray< 4, float >;
+#endif
 extern template class tnlStaticArray< 4, double >;
-//extern template class tnlStaticArray< 4, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlStaticArray< 4, long double >;
+#endif
 #endif
 
 #endif
diff --git a/src/core/cuda/CMakeLists.txt b/src/core/cuda/CMakeLists.txt
index 6406a6a2c318f6fa313af3ea018be4ba34a78aef..b9924873fd76e0837de1209fba6c090a075c380f 100755
--- a/src/core/cuda/CMakeLists.txt
+++ b/src/core/cuda/CMakeLists.txt
@@ -2,7 +2,8 @@ set( headers cuda-prefix-sum.h
              cuda-prefix-sum_impl.h
              cuda-reduction.h             
              cuda-reduction_impl.h
-             reduction-operations.h )
+             reduction-operations.h
+             tnlCublasWrapper.h )
 
 SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/core/cuda ) 
 IF( BUILD_CUDA )
diff --git a/src/core/cuda/cuda-prefix-sum_impl.cu b/src/core/cuda/cuda-prefix-sum_impl.cu
index 36497541e34da0eb35d9252bf7a67754681c191d..74f3e85fb7c75554e37e3f4075aabcf8fa2092bd 100644
--- a/src/core/cuda/cuda-prefix-sum_impl.cu
+++ b/src/core/cuda/cuda-prefix-sum_impl.cu
@@ -27,12 +27,14 @@ template bool cudaPrefixSum( const int size,
                              const enumPrefixSumType prefixSumType );
 
 
+#ifdef INSTANTIATE_FLOAT
 template bool cudaPrefixSum( const int size,
                              const int blockSize,
                              const float *deviceInput,
                              float* deviceOutput,
                              const tnlParallelReductionSum< float, int >& operation,
                              const enumPrefixSumType prefixSumType );
+#endif
 
 template bool cudaPrefixSum( const int size,
                              const int blockSize,
@@ -41,13 +43,16 @@ template bool cudaPrefixSum( const int size,
                              const tnlParallelReductionSum< double, int >& operation,
                              const enumPrefixSumType prefixSumType );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool cudaPrefixSum( const int size,
                              const int blockSize,
                              const long double *deviceInput,
                              long double* deviceOutput,
                              const tnlParallelReductionSum< long double, int >& operation,
                              const enumPrefixSumType prefixSumType );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool cudaPrefixSum( const long int size,
                              const long int blockSize,
                              const int *deviceInput,
@@ -56,12 +61,14 @@ template bool cudaPrefixSum( const long int size,
                              const enumPrefixSumType prefixSumType );
 
 
+#ifdef INSTANTIATE_FLOAT
 template bool cudaPrefixSum( const long int size,
                              const long int blockSize,
                              const float *deviceInput,
                              float* deviceOutput,
                              const tnlParallelReductionSum< float, long int >& operation,
                              const enumPrefixSumType prefixSumType );
+#endif
 
 template bool cudaPrefixSum( const long int size,
                              const long int blockSize,
@@ -70,10 +77,13 @@ template bool cudaPrefixSum( const long int size,
                              const tnlParallelReductionSum< double, long int >& operation,
                              const enumPrefixSumType prefixSumType );
 
-/*template bool cudaPrefixSum( const long int size,
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool cudaPrefixSum( const long int size,
                              const long int blockSize,
                              const long double *deviceInput,
                              long double* deviceOutput,
                              const tnlParallelReductionSum< long double, long int >& operation,
-                             const enumPrefixSumType prefixSumType );*/   
+                             const enumPrefixSumType prefixSumType );
+#endif
+#endif 
 #endif
diff --git a/src/core/cuda/cuda-prefix-sum_impl.h b/src/core/cuda/cuda-prefix-sum_impl.h
index a37b818de944e023d0bc41a9dad1855f4a18b3db..eeb898d4010c8ee7af2419b3da951b7369597fb8 100644
--- a/src/core/cuda/cuda-prefix-sum_impl.h
+++ b/src/core/cuda/cuda-prefix-sum_impl.h
@@ -373,13 +373,16 @@ extern template bool cudaPrefixSum( const int size,
                                     const tnlParallelReductionSum< double, int >& operation,
                                     const enumPrefixSumType prefixSumType );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template bool cudaPrefixSum( const int size,
                                     const int blockSize,
                                     const long double *deviceInput,
                                     long double* deviceOutput,
                                     const tnlParallelReductionSum< long double, int >& operation,
                                     const enumPrefixSumType prefixSumType );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool cudaPrefixSum( const long int size,
                                     const long int blockSize,
                                     const int *deviceInput,
@@ -402,6 +405,7 @@ extern template bool cudaPrefixSum( const long int size,
                                     const tnlParallelReductionSum< double, long int >& operation,
                                     const enumPrefixSumType prefixSumType );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template bool cudaPrefixSum( const long int size,
                                     const long int blockSize,
                                     const long double *deviceInput,
@@ -409,7 +413,9 @@ extern template bool cudaPrefixSum( const long int size,
                                     const tnlParallelReductionSum< long double, long int >& operation,
                                     const enumPrefixSumType prefixSumType );
 #endif
+#endif
 
+#endif
 
 #endif
 
diff --git a/src/core/cuda/cuda-reduction-abs-max_impl.cu b/src/core/cuda/cuda-reduction-abs-max_impl.cu
index 93605031fd1071be207ef28c2055761867c74011..8540fc71a381ba90b2db7c341abe24ff1e60f74a 100644
--- a/src/core/cuda/cuda-reduction-abs-max_impl.cu
+++ b/src/core/cuda/cuda-reduction-abs-max_impl.cu
@@ -52,13 +52,14 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, int > >
                                      const typename tnlParallelReductionAbsMax< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, int > >
                                    ( const tnlParallelReductionAbsMax< long double, int>& operation,
                                      const typename tnlParallelReductionAbsMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< long double, int> :: ResultType& result );
-                                     
+#endif                                     
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, long int > >
                                    ( const tnlParallelReductionAbsMax< char, long int >& operation,
@@ -67,6 +68,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, long int
                                      const typename tnlParallelReductionAbsMax< char, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< char, long int > :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< int, long int > >
                                    ( const tnlParallelReductionAbsMax< int, long int >& operation,
                                      const typename tnlParallelReductionAbsMax< int, long int > :: IndexType size,
@@ -88,11 +90,14 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, long in
                                      const typename tnlParallelReductionAbsMax< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, long int > >
                                    ( const tnlParallelReductionAbsMax< long double, long int>& operation,
                                      const typename tnlParallelReductionAbsMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionAbsMax< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionAbsMax< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 #endif                                     
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-abs-min_impl.cu b/src/core/cuda/cuda-reduction-abs-min_impl.cu
index 812159df5ff33db346c1615a128acc278b938168..629fa37ddcf71fb4adacaf7e7e872fa938fda10c 100644
--- a/src/core/cuda/cuda-reduction-abs-min_impl.cu
+++ b/src/core/cuda/cuda-reduction-abs-min_impl.cu
@@ -52,12 +52,14 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, int > >
                                      const typename tnlParallelReductionAbsMin< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, int > >
                                    ( const tnlParallelReductionAbsMin< long double, int>& operation,
                                      const typename tnlParallelReductionAbsMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< long double, int> :: ResultType& result );
+#endif
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, long int > >
                                    ( const tnlParallelReductionAbsMin< char, long int >& operation,
@@ -66,6 +68,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, long int
                                      const typename tnlParallelReductionAbsMin< char, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< char, long int > :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< int, long int > >
                                    ( const tnlParallelReductionAbsMin< int, long int >& operation,
                                      const typename tnlParallelReductionAbsMin< int, long int > :: IndexType size,
@@ -87,11 +90,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, long in
                                      const typename tnlParallelReductionAbsMin< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, long int > >
                                    ( const tnlParallelReductionAbsMin< long double, long int>& operation,
                                      const typename tnlParallelReductionAbsMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionAbsMin< long double, long int> :: ResultType& result );*/
-                                    
+                                     typename tnlParallelReductionAbsMin< long double, long int> :: ResultType& result );
+#endif
+#endif                                    
 #endif                                     
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-abs-sum_impl.cu b/src/core/cuda/cuda-reduction-abs-sum_impl.cu
index a6a22f16c8f8c2b925ee2ba2eaca16214fb59b13..a023631a03927fed1f0a57c71d671d7b25dcb01f 100644
--- a/src/core/cuda/cuda-reduction-abs-sum_impl.cu
+++ b/src/core/cuda/cuda-reduction-abs-sum_impl.cu
@@ -52,12 +52,14 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, int > >
                                      const typename tnlParallelReductionAbsSum< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, int > >
                                    ( const tnlParallelReductionAbsSum< long double, int>& operation,
                                      const typename tnlParallelReductionAbsSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsSum< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< long double, int> :: ResultType& result );
+#endif
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, long int > >
                                    ( const tnlParallelReductionAbsSum< char, long int >& operation,
@@ -66,6 +68,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, long int
                                      const typename tnlParallelReductionAbsSum< char, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< char, long int > :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< int, long int > >
                                    ( const tnlParallelReductionAbsSum< int, long int >& operation,
                                      const typename tnlParallelReductionAbsSum< int, long int > :: IndexType size,
@@ -87,11 +90,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, long in
                                      const typename tnlParallelReductionAbsSum< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, long int > >
                                    ( const tnlParallelReductionAbsSum< long double, long int>& operation,
                                      const typename tnlParallelReductionAbsSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsSum< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionAbsSum< long double, long int> :: ResultType& result );*/
-
+                                     typename tnlParallelReductionAbsSum< long double, long int> :: ResultType& result );
+#endif
+#endif
 #endif                                     
diff --git a/src/core/cuda/cuda-reduction-and_impl.cu b/src/core/cuda/cuda-reduction-and_impl.cu
index 592c0cd5847c036e70ee1dca05c724f069360ec8..ac71e46e192fef91c6bbd62fc6418aeb435355f6 100644
--- a/src/core/cuda/cuda-reduction-and_impl.cu
+++ b/src/core/cuda/cuda-reduction-and_impl.cu
@@ -51,13 +51,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, int
                                      const typename tnlParallelReductionLogicalAnd< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalAnd< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, int > >
                                    ( const tnlParallelReductionLogicalAnd< long double, int>& operation,
                                      const typename tnlParallelReductionLogicalAnd< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalAnd< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalAnd< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalAnd< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< char, long int > >
                                    ( const tnlParallelReductionLogicalAnd< char, long int >& operation,
                                      const typename tnlParallelReductionLogicalAnd< char, long int > :: IndexType size,
@@ -86,11 +89,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, lon
                                      const typename tnlParallelReductionLogicalAnd< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalAnd< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, long int > >
                                    ( const tnlParallelReductionLogicalAnd< long double, long int>& operation,
                                      const typename tnlParallelReductionLogicalAnd< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalAnd< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalAnd< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionLogicalAnd< long double, long int> :: ResultType& result );*/
-
-#endif                                     
\ No newline at end of file
+                                     typename tnlParallelReductionLogicalAnd< long double, long int> :: ResultType& result );
+#endif
+#endif                                     
+#endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-diff-abs-max_impl.cu b/src/core/cuda/cuda-reduction-diff-abs-max_impl.cu
index 291fdc9d888a2ac71106d733ba20d49a8eaff536..291810ef74385624b4213c0d6afff9b4da009781 100644
--- a/src/core/cuda/cuda-reduction-diff-abs-max_impl.cu
+++ b/src/core/cuda/cuda-reduction-diff-abs-max_impl.cu
@@ -52,13 +52,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< double, int
                                      const typename tnlParallelReductionDiffAbsMax< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMax< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< long double, int > >
                                    ( const tnlParallelReductionDiffAbsMax< long double, int>& operation,
                                      const typename tnlParallelReductionDiffAbsMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMax< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMax< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< char, long int > >
                                    ( const tnlParallelReductionDiffAbsMax< char, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsMax< char, long int > :: IndexType size,
@@ -87,11 +90,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< double, lon
                                      const typename tnlParallelReductionDiffAbsMax< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMax< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< long double, long int > >
                                    ( const tnlParallelReductionDiffAbsMax< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffAbsMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMax< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMax< long double, long int> :: ResultType& result );*/
-
-#endif                                     
\ No newline at end of file
+                                     typename tnlParallelReductionDiffAbsMax< long double, long int> :: ResultType& result );
+#endif
+#endif                        
+#endif             
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-diff-abs-min_impl.cu b/src/core/cuda/cuda-reduction-diff-abs-min_impl.cu
index 14428d5dff3cab5021c55a3168d7d67452d7bf75..d9ce714abde23b2788c9a94a81e03b283860ebf5 100644
--- a/src/core/cuda/cuda-reduction-diff-abs-min_impl.cu
+++ b/src/core/cuda/cuda-reduction-diff-abs-min_impl.cu
@@ -53,13 +53,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< double, int
                                      const typename tnlParallelReductionDiffAbsMin< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMin< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< long double, int > >
                                    ( const tnlParallelReductionDiffAbsMin< long double, int>& operation,
                                      const typename tnlParallelReductionDiffAbsMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMin< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMin< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< char, long int > >
                                    ( const tnlParallelReductionDiffAbsMin< char, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsMin< char, long int > :: IndexType size,
@@ -88,11 +91,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< double, lon
                                      const typename tnlParallelReductionDiffAbsMin< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMin< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< long double, long int > >
                                    ( const tnlParallelReductionDiffAbsMin< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffAbsMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMin< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMin< long double, long int> :: ResultType& result );*/
-
-#endif                                     
\ No newline at end of file
+                                     typename tnlParallelReductionDiffAbsMin< long double, long int> :: ResultType& result );
+#endif
+#endif                                     
+#endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-diff-abs-sum_impl.cu b/src/core/cuda/cuda-reduction-diff-abs-sum_impl.cu
index f033706ef704e81e405654e799954b136348738e..5298d033491f1d23330a05bc5528c211c1335e99 100644
--- a/src/core/cuda/cuda-reduction-diff-abs-sum_impl.cu
+++ b/src/core/cuda/cuda-reduction-diff-abs-sum_impl.cu
@@ -52,13 +52,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< double, int
                                      const typename tnlParallelReductionDiffAbsSum< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsSum< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< long double, int > >
                                    ( const tnlParallelReductionDiffAbsSum< long double, int>& operation,
                                      const typename tnlParallelReductionDiffAbsSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsSum< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsSum< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< char, long int > >
                                    ( const tnlParallelReductionDiffAbsSum< char, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsSum< char, long int > :: IndexType size,
@@ -87,11 +90,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< double, lon
                                      const typename tnlParallelReductionDiffAbsSum< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsSum< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< long double, long int > >
                                    ( const tnlParallelReductionDiffAbsSum< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffAbsSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsSum< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsSum< long double, long int> :: ResultType& result );*/
-
-#endif                                     
\ No newline at end of file
+                                     typename tnlParallelReductionDiffAbsSum< long double, long int> :: ResultType& result );
+#endif
+#endif                                     
+#endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-diff-lp-norm_impl.cu b/src/core/cuda/cuda-reduction-diff-lp-norm_impl.cu
index 1403d34b148a6c553c78454908018e81ea1812bc..2359564477c934f1bcfa62de9e1a155c6875cff3 100644
--- a/src/core/cuda/cuda-reduction-diff-lp-norm_impl.cu
+++ b/src/core/cuda/cuda-reduction-diff-lp-norm_impl.cu
@@ -37,13 +37,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< double, int
                                      const typename tnlParallelReductionDiffLpNorm< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffLpNorm< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< long double, int > >
                                    ( const tnlParallelReductionDiffLpNorm< long double, int>& operation,
                                      const typename tnlParallelReductionDiffLpNorm< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffLpNorm< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffLpNorm< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffLpNorm< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< char, long int > >
                                    ( const tnlParallelReductionDiffLpNorm< char, long int >& operation,
                                      const typename tnlParallelReductionDiffLpNorm< char, long int > :: IndexType size,
@@ -72,13 +75,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< double, lon
                                      const typename tnlParallelReductionDiffLpNorm< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffLpNorm< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< long double, long int > >
                                    ( const tnlParallelReductionDiffLpNorm< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffLpNorm< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffLpNorm< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffLpNorm< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffLpNorm< long double, long int> :: ResultType& result );*/
-
-
-
+                                     typename tnlParallelReductionDiffLpNorm< long double, long int> :: ResultType& result );
+#endif
+#endif
 #endif
diff --git a/src/core/cuda/cuda-reduction-diff-max_impl.cu b/src/core/cuda/cuda-reduction-diff-max_impl.cu
index 76fac28487598502b5ceee8d5830180dba62d9b3..fe91ae6ef6a0f9116df733f392016a588dd9c5e4 100644
--- a/src/core/cuda/cuda-reduction-diff-max_impl.cu
+++ b/src/core/cuda/cuda-reduction-diff-max_impl.cu
@@ -52,13 +52,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< double, int >
                                      const typename tnlParallelReductionDiffMax< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMax< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< long double, int > >
                                    ( const tnlParallelReductionDiffMax< long double, int>& operation,
                                      const typename tnlParallelReductionDiffMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMax< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMax< long double, int> :: ResultType& result );
-                                     
+#endif
+
+#ifdef INSTANTIATE_LONG_INT                                     
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< char, long int > >
                                    ( const tnlParallelReductionDiffMax< char, long int >& operation,
                                      const typename tnlParallelReductionDiffMax< char, long int > :: IndexType size,
@@ -87,11 +90,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< double, long i
                                      const typename tnlParallelReductionDiffMax< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMax< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< long double, long int > >
                                    ( const tnlParallelReductionDiffMax< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMax< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffMax< long double, long int> :: ResultType& result );*/
-
-#endif                                     
\ No newline at end of file
+                                     typename tnlParallelReductionDiffMax< long double, long int> :: ResultType& result );
+#endif
+#endif                                     
+#endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-diff-min_impl.cu b/src/core/cuda/cuda-reduction-diff-min_impl.cu
index fe75190d51657d6d658b2fe4623fbc18b7f6f191..ed13335b8b282352727a8a6687c10ae122196a48 100644
--- a/src/core/cuda/cuda-reduction-diff-min_impl.cu
+++ b/src/core/cuda/cuda-reduction-diff-min_impl.cu
@@ -53,13 +53,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< double, int >
                                      const typename tnlParallelReductionDiffMin< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMin< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< long double, int > >
                                    ( const tnlParallelReductionDiffMin< long double, int>& operation,
                                      const typename tnlParallelReductionDiffMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMin< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMin< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< char, long int > >
                                    ( const tnlParallelReductionDiffMin< char, long int >& operation,
                                      const typename tnlParallelReductionDiffMin< char, long int > :: IndexType size,
@@ -88,11 +91,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< double, long i
                                      const typename tnlParallelReductionDiffMin< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMin< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< long double, long int > >
                                    ( const tnlParallelReductionDiffMin< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMin< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffMin< long double, long int> :: ResultType& result );*/
-
+                                     typename tnlParallelReductionDiffMin< long double, long int> :: ResultType& result );
+#endif
+#endif
 #endif                                     
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-diff-sum_impl.cu b/src/core/cuda/cuda-reduction-diff-sum_impl.cu
index ce79e8cc4b0835b68365168e859c5b51bf8f605a..aa08778ea70f66cc6c6d24b86effa4cf942db1d3 100644
--- a/src/core/cuda/cuda-reduction-diff-sum_impl.cu
+++ b/src/core/cuda/cuda-reduction-diff-sum_impl.cu
@@ -52,13 +52,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< double, int >
                                      const typename tnlParallelReductionDiffSum< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffSum< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< long double, int > >
                                    ( const tnlParallelReductionDiffSum< long double, int>& operation,
                                      const typename tnlParallelReductionDiffSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffSum< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffSum< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< char, long int > >
                                    ( const tnlParallelReductionDiffSum< char, long int >& operation,
                                      const typename tnlParallelReductionDiffSum< char, long int > :: IndexType size,
@@ -87,11 +90,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< double, long i
                                      const typename tnlParallelReductionDiffSum< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffSum< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< long double, long int > >
                                    ( const tnlParallelReductionDiffSum< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffSum< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffSum< long double, long int> :: ResultType& result );*/
-                                    
-#endif                                     
\ No newline at end of file
+                                     typename tnlParallelReductionDiffSum< long double, long int> :: ResultType& result );
+#endif                                    
+#endif                                     
+#endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-equalities_impl.cu b/src/core/cuda/cuda-reduction-equalities_impl.cu
index 8b4fec91d0820ee5c82fdd77e30b3a441adecc93..6bf7f0263055adc7d6deda2f4316de48006ec3f6 100644
--- a/src/core/cuda/cuda-reduction-equalities_impl.cu
+++ b/src/core/cuda/cuda-reduction-equalities_impl.cu
@@ -51,13 +51,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, int
                                      const typename tnlParallelReductionEqualities< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionEqualities< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, int > >
                                    ( const tnlParallelReductionEqualities< long double, int>& operation,
                                      const typename tnlParallelReductionEqualities< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionEqualities< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionEqualities< char, long int > >
                                    ( const tnlParallelReductionEqualities< char, long int >& operation,
                                      const typename tnlParallelReductionEqualities< char, long int > :: IndexType size,
@@ -86,11 +89,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, lon
                                      const typename tnlParallelReductionEqualities< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionEqualities< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, long int > >
                                    ( const tnlParallelReductionEqualities< long double, long int>& operation,
                                      const typename tnlParallelReductionEqualities< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionEqualities< long double, long int> :: ResultType& result );*/
-
-#endif                                     
\ No newline at end of file
+                                     typename tnlParallelReductionEqualities< long double, long int> :: ResultType& result );
+#endif
+#endif                                     
+#endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-inequalities_impl.cu b/src/core/cuda/cuda-reduction-inequalities_impl.cu
index a04537c97d44b606fcc463372e92246c9ddb6caf..828c88af106c9e8965bf64d12bc798aac41a9cae 100644
--- a/src/core/cuda/cuda-reduction-inequalities_impl.cu
+++ b/src/core/cuda/cuda-reduction-inequalities_impl.cu
@@ -51,13 +51,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, i
                                      const typename tnlParallelReductionInequalities< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionInequalities< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, int > >
                                    ( const tnlParallelReductionInequalities< long double, int>& operation,
                                      const typename tnlParallelReductionInequalities< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionInequalities< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionInequalities< char, long int > >
                                    ( const tnlParallelReductionInequalities< char, long int >& operation,
                                      const typename tnlParallelReductionInequalities< char, long int > :: IndexType size,
@@ -86,11 +89,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, l
                                      const typename tnlParallelReductionInequalities< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionInequalities< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, long int > >
                                    ( const tnlParallelReductionInequalities< long double, long int>& operation,
                                      const typename tnlParallelReductionInequalities< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionInequalities< long double, long int> :: ResultType& result );*/
-
+                                     typename tnlParallelReductionInequalities< long double, long int> :: ResultType& result );
+#endif
+#endif
 #endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-lp-norm_impl.cu b/src/core/cuda/cuda-reduction-lp-norm_impl.cu
index ac4fc3a8f6e0e11f86bf078dfa29eabd545bdc58..a5f5d6644cad7c3c9ff11a83e3343f75f16471e4 100644
--- a/src/core/cuda/cuda-reduction-lp-norm_impl.cu
+++ b/src/core/cuda/cuda-reduction-lp-norm_impl.cu
@@ -37,13 +37,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, int > >
                                      const typename tnlParallelReductionLpNorm< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLpNorm< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, int > >
                                    ( const tnlParallelReductionLpNorm< long double, int>& operation,
                                      const typename tnlParallelReductionLpNorm< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLpNorm< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< int, long int > >
                                    ( const tnlParallelReductionLpNorm< int, long int >& operation,
                                      const typename tnlParallelReductionLpNorm< int, long int > :: IndexType size,
@@ -65,11 +68,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, long in
                                      const typename tnlParallelReductionLpNorm< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLpNorm< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, long int > >
                                    ( const tnlParallelReductionLpNorm< long double, long int>& operation,
                                      const typename tnlParallelReductionLpNorm< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionLpNorm< long double, long int> :: ResultType& result );*/
-
-#endif                                     
\ No newline at end of file
+                                     typename tnlParallelReductionLpNorm< long double, long int> :: ResultType& result );
+#endif
+#endif                                     
+#endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-max_impl.cu b/src/core/cuda/cuda-reduction-max_impl.cu
index 63847eb441ca9147270bb58ac63dfcfd74413e45..cba153c81bf8d4a7a0964b62f155217ff10b7a26 100644
--- a/src/core/cuda/cuda-reduction-max_impl.cu
+++ b/src/core/cuda/cuda-reduction-max_impl.cu
@@ -52,13 +52,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionMax< double, int > >
                                      const typename tnlParallelReductionMax< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMax< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, int > >
                                    ( const tnlParallelReductionMax< long double, int>& operation,
                                      const typename tnlParallelReductionMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMax< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMax< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionMax< char, long int > >
                                    ( const tnlParallelReductionMax< char, long int >& operation,
                                      const typename tnlParallelReductionMax< char, long int > :: IndexType size,
@@ -87,11 +90,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionMax< double, long int >
                                      const typename tnlParallelReductionMax< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMax< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, long int > >
                                    ( const tnlParallelReductionMax< long double, long int>& operation,
                                      const typename tnlParallelReductionMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMax< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionMax< long double, long int> :: ResultType& result );*/
-
+                                     typename tnlParallelReductionMax< long double, long int> :: ResultType& result );
+#endif
+#endif
 #endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-min_impl.cu b/src/core/cuda/cuda-reduction-min_impl.cu
index 5ba284339dcc2f19d8c9ba68a2491ef98eb03c57..dc5a1f41407b410e4333f73c70744506814288a3 100644
--- a/src/core/cuda/cuda-reduction-min_impl.cu
+++ b/src/core/cuda/cuda-reduction-min_impl.cu
@@ -52,13 +52,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionMin< double, int > >
                                      const typename tnlParallelReductionMin< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMin< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, int > >
                                    ( const tnlParallelReductionMin< long double, int>& operation,
                                      const typename tnlParallelReductionMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMin< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionMin< char, long int > >
                                    ( const tnlParallelReductionMin< char, long int >& operation,
                                      const typename tnlParallelReductionMin< char, long int > :: IndexType size,
@@ -87,11 +90,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionMin< double, long int >
                                      const typename tnlParallelReductionMin< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMin< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, long int > >
                                    ( const tnlParallelReductionMin< long double, long int>& operation,
                                      const typename tnlParallelReductionMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionMin< long double, long int> :: ResultType& result );*/
-
+                                     typename tnlParallelReductionMin< long double, long int> :: ResultType& result );
+#endif
+#endif
 #endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-or_impl.cu b/src/core/cuda/cuda-reduction-or_impl.cu
index 6128fc48c6152a85c17fb49c906cb5d8358fac34..811ec445fd4c79fc99dd02d023fabdf856c71000 100644
--- a/src/core/cuda/cuda-reduction-or_impl.cu
+++ b/src/core/cuda/cuda-reduction-or_impl.cu
@@ -51,13 +51,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, int
                                      const typename tnlParallelReductionLogicalOr< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalOr< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, int > >
                                    ( const tnlParallelReductionLogicalOr< long double, int>& operation,
                                      const typename tnlParallelReductionLogicalOr< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalOr< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< char, long int > >
                                    ( const tnlParallelReductionLogicalOr< char, long int >& operation,
                                      const typename tnlParallelReductionLogicalOr< char, long int > :: IndexType size,
@@ -86,11 +89,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, long
                                      const typename tnlParallelReductionLogicalOr< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalOr< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, long int > >
                                    ( const tnlParallelReductionLogicalOr< long double, long int>& operation,
                                      const typename tnlParallelReductionLogicalOr< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionLogicalOr< long double, long int> :: ResultType& result );*/
-
-#endif                                     
\ No newline at end of file
+                                     typename tnlParallelReductionLogicalOr< long double, long int> :: ResultType& result );
+#endif
+#endif                                     
+#endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-scalar-product_impl.cu b/src/core/cuda/cuda-reduction-scalar-product_impl.cu
index 1ee2d85af2418032f7bd441acf17d3b77588e270..082d65540ae5f5d514436d847b168a97c9ca4ad7 100644
--- a/src/core/cuda/cuda-reduction-scalar-product_impl.cu
+++ b/src/core/cuda/cuda-reduction-scalar-product_impl.cu
@@ -51,13 +51,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< double,
                                      const typename tnlParallelReductionScalarProduct< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionScalarProduct< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< long double, int > >
                                    ( const tnlParallelReductionScalarProduct< long double, int>& operation,
                                      const typename tnlParallelReductionScalarProduct< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionScalarProduct< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionScalarProduct< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionScalarProduct< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< char, long int > >
                                    ( const tnlParallelReductionScalarProduct< char, long int >& operation,
                                      const typename tnlParallelReductionScalarProduct< char, long int > :: IndexType size,
@@ -86,11 +89,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< double,
                                      const typename tnlParallelReductionScalarProduct< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionScalarProduct< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< long double, long int > >
                                    ( const tnlParallelReductionScalarProduct< long double, long int>& operation,
                                      const typename tnlParallelReductionScalarProduct< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionScalarProduct< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionScalarProduct< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionScalarProduct< long double, long int> :: ResultType& result );*/
-
-#endif                                     
\ No newline at end of file
+                                     typename tnlParallelReductionScalarProduct< long double, long int> :: ResultType& result );
+#endif
+#endif                                     
+#endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-sum_impl.cu b/src/core/cuda/cuda-reduction-sum_impl.cu
index 9cd01bcfd0c3de5a8f612a2205323f4772a567b3..8447ea5f00d8c448362d13181ae70e4574e3c350 100644
--- a/src/core/cuda/cuda-reduction-sum_impl.cu
+++ b/src/core/cuda/cuda-reduction-sum_impl.cu
@@ -52,13 +52,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionSum< double, int > >
                                      const typename tnlParallelReductionSum< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, int > >
                                    ( const tnlParallelReductionSum< long double, int>& operation,
                                      const typename tnlParallelReductionSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionSum< char, long int > >
                                    ( const tnlParallelReductionSum< char, long int >& operation,
                                      const typename tnlParallelReductionSum< char, long int > :: IndexType size,
@@ -87,11 +90,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionSum< double, long int >
                                      const typename tnlParallelReductionSum< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, long int > >
                                    ( const tnlParallelReductionSum< long double, long int>& operation,
                                      const typename tnlParallelReductionSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionSum< long double, long int> :: ResultType& result );*/
-                                     
-#endif                                     
\ No newline at end of file
+                                     typename tnlParallelReductionSum< long double, long int> :: ResultType& result );
+#endif                                     
+#endif                                     
+#endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction_impl.cpp b/src/core/cuda/cuda-reduction_impl.cpp
index 9fd1c74283d16ecc3515f752e47973d8b596bcbb..a36767e388bb265d410f7f77838f5598ec529e23 100644
--- a/src/core/cuda/cuda-reduction_impl.cpp
+++ b/src/core/cuda/cuda-reduction_impl.cpp
@@ -52,13 +52,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionSum< double, int > >
                                      const typename tnlParallelReductionSum< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, int > >
                                    ( const tnlParallelReductionSum< long double, int>& operation,
                                      const typename tnlParallelReductionSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionSum< char, long int > >
                                    ( const tnlParallelReductionSum< char, long int >& operation,
                                      const typename tnlParallelReductionSum< char, long int > :: IndexType size,
@@ -87,12 +90,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionSum< double, long int >
                                      const typename tnlParallelReductionSum< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, long int > >
                                    ( const tnlParallelReductionSum< long double, long int>& operation,
                                      const typename tnlParallelReductionSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Min
@@ -126,13 +132,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionMin< double, int > >
                                      const typename tnlParallelReductionMin< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMin< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, int > >
                                    ( const tnlParallelReductionMin< long double, int>& operation,
                                      const typename tnlParallelReductionMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMin< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionMin< char, long int > >
                                    ( const tnlParallelReductionMin< char, long int >& operation,
                                      const typename tnlParallelReductionMin< char, long int > :: IndexType size,
@@ -161,12 +170,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionMin< double, long int >
                                      const typename tnlParallelReductionMin< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMin< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, long int > >
                                    ( const tnlParallelReductionMin< long double, long int>& operation,
                                      const typename tnlParallelReductionMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMin< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Max
@@ -200,13 +212,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionMax< double, int > >
                                      const typename tnlParallelReductionMax< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMax< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, int > >
                                    ( const tnlParallelReductionMax< long double, int>& operation,
                                      const typename tnlParallelReductionMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMax< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMax< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionMax< char, long int > >
                                    ( const tnlParallelReductionMax< char, long int >& operation,
                                      const typename tnlParallelReductionMax< char, long int > :: IndexType size,
@@ -235,12 +250,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionMax< double, long int >
                                      const typename tnlParallelReductionMax< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMax< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, long int > >
                                    ( const tnlParallelReductionMax< long double, long int>& operation,
                                      const typename tnlParallelReductionMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMax< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMax< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Abs sum
@@ -274,13 +292,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, int > >
                                      const typename tnlParallelReductionAbsSum< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, int > >
                                    ( const tnlParallelReductionAbsSum< long double, int>& operation,
                                      const typename tnlParallelReductionAbsSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsSum< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, long int > >
                                    ( const tnlParallelReductionAbsSum< char, long int >& operation,
                                      const typename tnlParallelReductionAbsSum< char, long int > :: IndexType size,
@@ -309,12 +330,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, long in
                                      const typename tnlParallelReductionAbsSum< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, long int > >
                                    ( const tnlParallelReductionAbsSum< long double, long int>& operation,
                                      const typename tnlParallelReductionAbsSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsSum< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Abs min
@@ -348,13 +372,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, int > >
                                      const typename tnlParallelReductionAbsMin< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, int > >
                                    ( const tnlParallelReductionAbsMin< long double, int>& operation,
                                      const typename tnlParallelReductionAbsMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, long int > >
                                    ( const tnlParallelReductionAbsMin< char, long int >& operation,
                                      const typename tnlParallelReductionAbsMin< char, long int > :: IndexType size,
@@ -383,16 +410,18 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, long in
                                      const typename tnlParallelReductionAbsMin< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, long int > >
                                    ( const tnlParallelReductionAbsMin< long double, long int>& operation,
                                      const typename tnlParallelReductionAbsMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< long double, long int> :: ResultType& result );
+#endif
+#endif
 /****
  * Abs max
  */
-
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, int > >
                                    ( const tnlParallelReductionAbsMax< char, int >& operation,
                                      const typename tnlParallelReductionAbsMax< char, int > :: IndexType size,
@@ -421,13 +450,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, int > >
                                      const typename tnlParallelReductionAbsMax< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, int > >
                                    ( const tnlParallelReductionAbsMax< long double, int>& operation,
                                      const typename tnlParallelReductionAbsMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, long int > >
                                    ( const tnlParallelReductionAbsMax< char, long int >& operation,
                                      const typename tnlParallelReductionAbsMax< char, long int > :: IndexType size,
@@ -456,12 +488,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, long in
                                      const typename tnlParallelReductionAbsMax< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, long int > >
                                    ( const tnlParallelReductionAbsMax< long double, long int>& operation,
                                      const typename tnlParallelReductionAbsMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Logical AND
@@ -494,13 +529,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, int
                                      const typename tnlParallelReductionLogicalAnd< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalAnd< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, int > >
                                    ( const tnlParallelReductionLogicalAnd< long double, int>& operation,
                                      const typename tnlParallelReductionLogicalAnd< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalAnd< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalAnd< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalAnd< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< char, long int > >
                                    ( const tnlParallelReductionLogicalAnd< char, long int >& operation,
                                      const typename tnlParallelReductionLogicalAnd< char, long int > :: IndexType size,
@@ -529,12 +567,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, lon
                                      const typename tnlParallelReductionLogicalAnd< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalAnd< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, long int > >
                                    ( const tnlParallelReductionLogicalAnd< long double, long int>& operation,
                                      const typename tnlParallelReductionLogicalAnd< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalAnd< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalAnd< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalAnd< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Logical OR
@@ -567,13 +608,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, int
                                      const typename tnlParallelReductionLogicalOr< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalOr< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, int > >
                                    ( const tnlParallelReductionLogicalOr< long double, int>& operation,
                                      const typename tnlParallelReductionLogicalOr< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalOr< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< char, long int > >
                                    ( const tnlParallelReductionLogicalOr< char, long int >& operation,
                                      const typename tnlParallelReductionLogicalOr< char, long int > :: IndexType size,
@@ -602,13 +646,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, long
                                      const typename tnlParallelReductionLogicalOr< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalOr< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, long int > >
                                    ( const tnlParallelReductionLogicalOr< long double, long int>& operation,
                                      const typename tnlParallelReductionLogicalOr< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalOr< long double, long int> :: ResultType& result );
-
+#endif
+#endif
 
 /****
  * Lp Norm
@@ -627,13 +673,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, int > >
                                      const typename tnlParallelReductionLpNorm< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLpNorm< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, int > >
                                    ( const tnlParallelReductionLpNorm< long double, int>& operation,
                                      const typename tnlParallelReductionLpNorm< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLpNorm< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< char, long int > >
                                    ( const tnlParallelReductionLpNorm< char, long int >& operation,
                                      const typename tnlParallelReductionLpNorm< char, long int > :: IndexType size,
@@ -662,13 +711,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, long in
                                      const typename tnlParallelReductionLpNorm< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLpNorm< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, long int > >
                                    ( const tnlParallelReductionLpNorm< long double, long int>& operation,
                                      const typename tnlParallelReductionLpNorm< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLpNorm< long double, long int> :: ResultType& result );
-
+#endif
+#endif
 
 /****
  * Equalities
@@ -701,13 +752,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, int
                                      const typename tnlParallelReductionEqualities< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionEqualities< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, int > >
                                    ( const tnlParallelReductionEqualities< long double, int>& operation,
                                      const typename tnlParallelReductionEqualities< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionEqualities< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionEqualities< char, long int > >
                                    ( const tnlParallelReductionEqualities< char, long int >& operation,
                                      const typename tnlParallelReductionEqualities< char, long int > :: IndexType size,
@@ -736,13 +790,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, lon
                                      const typename tnlParallelReductionEqualities< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionEqualities< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, long int > >
                                    ( const tnlParallelReductionEqualities< long double, long int>& operation,
                                      const typename tnlParallelReductionEqualities< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionEqualities< long double, long int> :: ResultType& result );
-
+#endif
+#endif
 
 /****
  * Inequalities
@@ -775,13 +831,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, i
                                      const typename tnlParallelReductionInequalities< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionInequalities< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, int > >
                                    ( const tnlParallelReductionInequalities< long double, int>& operation,
                                      const typename tnlParallelReductionInequalities< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionInequalities< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionInequalities< char, long int > >
                                    ( const tnlParallelReductionInequalities< char, long int >& operation,
                                      const typename tnlParallelReductionInequalities< char, long int > :: IndexType size,
@@ -810,13 +869,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, l
                                      const typename tnlParallelReductionInequalities< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionInequalities< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, long int > >
                                    ( const tnlParallelReductionInequalities< long double, long int>& operation,
                                      const typename tnlParallelReductionInequalities< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionInequalities< long double, long int> :: ResultType& result );
-
+#endif
+#endif
 
 /****
  * ScalarProduct
@@ -849,13 +910,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< double,
                                      const typename tnlParallelReductionScalarProduct< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionScalarProduct< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< long double, int > >
                                    ( const tnlParallelReductionScalarProduct< long double, int>& operation,
                                      const typename tnlParallelReductionScalarProduct< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionScalarProduct< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionScalarProduct< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionScalarProduct< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< char, long int > >
                                    ( const tnlParallelReductionScalarProduct< char, long int >& operation,
                                      const typename tnlParallelReductionScalarProduct< char, long int > :: IndexType size,
@@ -884,12 +948,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< double,
                                      const typename tnlParallelReductionScalarProduct< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionScalarProduct< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< long double, long int > >
                                    ( const tnlParallelReductionScalarProduct< long double, long int>& operation,
                                      const typename tnlParallelReductionScalarProduct< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionScalarProduct< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionScalarProduct< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionScalarProduct< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Diff sum
@@ -923,13 +990,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< double, int >
                                      const typename tnlParallelReductionDiffSum< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffSum< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< long double, int > >
                                    ( const tnlParallelReductionDiffSum< long double, int>& operation,
                                      const typename tnlParallelReductionDiffSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffSum< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffSum< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< char, long int > >
                                    ( const tnlParallelReductionDiffSum< char, long int >& operation,
                                      const typename tnlParallelReductionDiffSum< char, long int > :: IndexType size,
@@ -958,12 +1028,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< double, long i
                                      const typename tnlParallelReductionDiffSum< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffSum< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< long double, long int > >
                                    ( const tnlParallelReductionDiffSum< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffSum< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffSum< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Diff min
@@ -997,13 +1070,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< double, int >
                                      const typename tnlParallelReductionDiffMin< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMin< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< long double, int > >
                                    ( const tnlParallelReductionDiffMin< long double, int>& operation,
                                      const typename tnlParallelReductionDiffMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMin< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMin< long double, int> :: ResultType& result );
+#endif 
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< char, long int > >
                                    ( const tnlParallelReductionDiffMin< char, long int >& operation,
                                      const typename tnlParallelReductionDiffMin< char, long int > :: IndexType size,
@@ -1032,17 +1108,19 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< double, long i
                                      const typename tnlParallelReductionDiffMin< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMin< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< long double, long int > >
                                    ( const tnlParallelReductionDiffMin< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMin< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMin< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Diff max
  */
-
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< char, int > >
                                    ( const tnlParallelReductionDiffMax< char, int >& operation,
                                      const typename tnlParallelReductionDiffMax< char, int > :: IndexType size,
@@ -1071,13 +1149,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< double, int >
                                      const typename tnlParallelReductionDiffMax< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMax< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< long double, int > >
                                    ( const tnlParallelReductionDiffMax< long double, int>& operation,
                                      const typename tnlParallelReductionDiffMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMax< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMax< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< char, long int > >
                                    ( const tnlParallelReductionDiffMax< char, long int >& operation,
                                      const typename tnlParallelReductionDiffMax< char, long int > :: IndexType size,
@@ -1106,17 +1187,19 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< double, long i
                                      const typename tnlParallelReductionDiffMax< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMax< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< long double, long int > >
                                    ( const tnlParallelReductionDiffMax< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMax< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMax< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Diff abs sum
  */
-
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< char, int > >
                                    ( const tnlParallelReductionDiffAbsSum< char, int >& operation,
                                      const typename tnlParallelReductionDiffAbsSum< char, int > :: IndexType size,
@@ -1145,13 +1228,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< double, int
                                      const typename tnlParallelReductionDiffAbsSum< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsSum< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< long double, int > >
                                    ( const tnlParallelReductionDiffAbsSum< long double, int>& operation,
                                      const typename tnlParallelReductionDiffAbsSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsSum< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsSum< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< char, long int > >
                                    ( const tnlParallelReductionDiffAbsSum< char, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsSum< char, long int > :: IndexType size,
@@ -1180,17 +1266,19 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< double, lon
                                      const typename tnlParallelReductionDiffAbsSum< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsSum< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< long double, long int > >
                                    ( const tnlParallelReductionDiffAbsSum< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffAbsSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsSum< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsSum< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Diff abs min
  */
-
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< char, int > >
                                    ( const tnlParallelReductionDiffAbsMin< char, int >& operation,
                                      const typename tnlParallelReductionDiffAbsMin< char, int > :: IndexType size,
@@ -1219,13 +1307,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< double, int
                                      const typename tnlParallelReductionDiffAbsMin< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMin< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< long double, int > >
                                    ( const tnlParallelReductionDiffAbsMin< long double, int>& operation,
                                      const typename tnlParallelReductionDiffAbsMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMin< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMin< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< char, long int > >
                                    ( const tnlParallelReductionDiffAbsMin< char, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsMin< char, long int > :: IndexType size,
@@ -1254,16 +1345,19 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< double, lon
                                      const typename tnlParallelReductionDiffAbsMin< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMin< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< long double, long int > >
                                    ( const tnlParallelReductionDiffAbsMin< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffAbsMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMin< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMin< long double, long int> :: ResultType& result );
+#endif
+#endif
+
 /****
  * Diff abs max
  */
-
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< char, int > >
                                    ( const tnlParallelReductionDiffAbsMax< char, int >& operation,
                                      const typename tnlParallelReductionDiffAbsMax< char, int > :: IndexType size,
@@ -1292,13 +1386,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< double, int
                                      const typename tnlParallelReductionDiffAbsMax< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMax< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< long double, int > >
                                    ( const tnlParallelReductionDiffAbsMax< long double, int>& operation,
                                      const typename tnlParallelReductionDiffAbsMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMax< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMax< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< char, long int > >
                                    ( const tnlParallelReductionDiffAbsMax< char, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsMax< char, long int > :: IndexType size,
@@ -1327,14 +1424,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< double, lon
                                      const typename tnlParallelReductionDiffAbsMax< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMax< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< long double, long int > >
                                    ( const tnlParallelReductionDiffAbsMax< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffAbsMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMax< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMax< long double, long int> :: ResultType& result );
-
-
+#endif
+#endif
 
 /****
  * Diff Lp Norm
@@ -1353,13 +1451,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< double, int
                                      const typename tnlParallelReductionDiffLpNorm< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffLpNorm< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< long double, int > >
                                    ( const tnlParallelReductionDiffLpNorm< long double, int>& operation,
                                      const typename tnlParallelReductionDiffLpNorm< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffLpNorm< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffLpNorm< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffLpNorm< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< char, long int > >
                                    ( const tnlParallelReductionDiffLpNorm< char, long int >& operation,
                                      const typename tnlParallelReductionDiffLpNorm< char, long int > :: IndexType size,
@@ -1388,13 +1489,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< double, lon
                                      const typename tnlParallelReductionDiffLpNorm< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffLpNorm< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< long double, long int > >
                                    ( const tnlParallelReductionDiffLpNorm< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffLpNorm< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffLpNorm< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffLpNorm< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffLpNorm< long double, long int> :: ResultType& result );
-
+#endif
+#endif
 
 #endif /* TEMPLATE_EXPLICIT_INSTANTIATION */
 
diff --git a/src/core/cuda/cuda-reduction_impl.h b/src/core/cuda/cuda-reduction_impl.h
index 368a75a978f04cb3008e1a76a750c099744cbc60..e374454acab270f27bd7d1e0e6ad954c9207bb23 100644
--- a/src/core/cuda/cuda-reduction_impl.h
+++ b/src/core/cuda/cuda-reduction_impl.h
@@ -47,7 +47,7 @@ template< typename Operation >
 __device__ void reduceAligned( const Operation& operation,
                                typename Operation :: IndexType tid,
                                typename Operation :: IndexType  s,
-                               typename Operation :: ResultType* sdata )
+                               volatile typename Operation :: ResultType* sdata )
 {
    if( tid < s )
    {
@@ -67,7 +67,7 @@ __device__ void reduceNonAligned( const Operation& operation,
                                   typename Operation :: IndexType tid,
                                   typename Operation :: IndexType s,
                                   typename Operation :: IndexType n,
-                                  typename Operation :: ResultType* sdata )
+                                  volatile typename Operation :: ResultType* sdata )
 {
    if( tid < s )
    {
@@ -282,8 +282,8 @@ typename Operation :: IndexType reduceOnCudaDevice( const Operation& operation,
    typedef typename Operation :: RealType RealType;
    typedef typename Operation :: ResultType ResultType;
 
-   const IndexType desBlockSize( 512 );
-   const IndexType desGridSize( 2048 );
+   const IndexType desBlockSize( 256 );
+   const IndexType desGridSize( 65536 );
    dim3 blockSize( 0 ), gridSize( 0 );
 
    /***
@@ -293,8 +293,10 @@ typename Operation :: IndexType reduceOnCudaDevice( const Operation& operation,
    IndexType alignedBlockSize = 1;
    while( alignedBlockSize < blockSize. x ) alignedBlockSize <<= 1;
    blockSize. x = alignedBlockSize;
-
-   gridSize. x = Min( ( IndexType ) ( size / blockSize. x + 1 ) / 2, desGridSize );
+   //const IndexType numberOfBlocks = tnlCuda::getNumberOfBlocks( size / 2, blockSize.x );
+   
+   //gridSize. x = Min( ( IndexType ) ( size / blockSize. x + 1 ) / 2, desGridSize );
+   gridSize. x = Min( tnlCuda::getNumberOfBlocks( size / 2, blockSize.x ), desGridSize );
 
    if( ! output &&
        ! tnlArrayOperations< tnlCuda >::allocateMemory( output, :: Max( ( IndexType ) 1, size / desBlockSize ) ) )
@@ -304,49 +306,50 @@ typename Operation :: IndexType reduceOnCudaDevice( const Operation& operation,
    /***
     * Depending on the blockSize we generate appropriate template instance.
     */
-      switch( blockSize. x )
-      {
-         case 512:
-            tnlCUDAReductionKernel< Operation, 512 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 256:
-            tnlCUDAReductionKernel< Operation, 256 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 128:
-            tnlCUDAReductionKernel< Operation, 128 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case  64:
-            tnlCUDAReductionKernel< Operation,  64 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case  32:
-            tnlCUDAReductionKernel< Operation,  32 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case  16:
-            tnlCUDAReductionKernel< Operation,  16 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case   8:
-            tnlCUDAReductionKernel< Operation,   8 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case   4:
-            tnlCUDAReductionKernel< Operation,   4 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case   2:
-            tnlCUDAReductionKernel< Operation,   2 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case   1:
-            tnlAssert( false, cerr << "blockSize should not be 1." << endl );
-         default:
-            tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
-      }
+   switch( blockSize. x )
+   {
+      case 512:
+         tnlCUDAReductionKernel< Operation, 512 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 256:
+         tnlCUDAReductionKernel< Operation, 256 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 128:
+         tnlCUDAReductionKernel< Operation, 128 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case  64:
+         tnlCUDAReductionKernel< Operation,  64 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case  32:
+         tnlCUDAReductionKernel< Operation,  32 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case  16:
+         tnlCUDAReductionKernel< Operation,  16 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+     case   8:
+         tnlCUDAReductionKernel< Operation,   8 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case   4:
+         tnlCUDAReductionKernel< Operation,   4 >
+        <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+        break;
+      case   2:
+         tnlCUDAReductionKernel< Operation,   2 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case   1:
+         tnlAssert( false, cerr << "blockSize should not be 1." << endl );
+      default:
+         tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
+   }
+   checkCudaDevice;
    return gridSize. x;
 }
 #endif
@@ -402,6 +405,8 @@ bool reductionOnCudaDevice( const Operation& operation,
                                         deviceAux1,
                                         ( ResultType* ) 0,
                                         deviceAux2 );
+      if( ! checkCudaDevice )
+          return false;
       Swap( deviceAux1, deviceAux2 );
    }
 
@@ -428,7 +433,7 @@ bool reductionOnCudaDevice( const Operation& operation,
       return false;
    if( deviceAux2 && ! tnlArrayOperations< tnlCuda >::freeMemory( deviceAux2 ) )
       return false;
-   return true;
+   return checkCudaDevice;
 #else
    tnlCudaSupportMissingMessage;;
    return false;
@@ -469,13 +474,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionSum< double, int
                                      const typename tnlParallelReductionSum< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, int > >
                                    ( const tnlParallelReductionSum< long double, int>& operation,
                                      const typename tnlParallelReductionSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionSum< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionSum< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionSum< char, long int > >
                                    ( const tnlParallelReductionSum< char, long int >& operation,
                                      const typename tnlParallelReductionSum< char, long int > :: IndexType size,
@@ -504,12 +512,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionSum< double, lon
                                      const typename tnlParallelReductionSum< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, long int > >
                                    ( const tnlParallelReductionSum< long double, long int>& operation,
                                      const typename tnlParallelReductionSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionSum< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionSum< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Min
@@ -543,13 +554,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionMin< double, int
                                      const typename tnlParallelReductionMin< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMin< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, int > >
                                    ( const tnlParallelReductionMin< long double, int>& operation,
                                      const typename tnlParallelReductionMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionMin< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionMin< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionMin< char, long int > >
                                    ( const tnlParallelReductionMin< char, long int >& operation,
                                      const typename tnlParallelReductionMin< char, long int > :: IndexType size,
@@ -578,12 +592,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionMin< double, lon
                                      const typename tnlParallelReductionMin< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMin< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, long int > >
                                    ( const tnlParallelReductionMin< long double, long int>& operation,
                                      const typename tnlParallelReductionMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionMin< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionMin< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Max
@@ -617,13 +634,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionMax< double, int
                                      const typename tnlParallelReductionMax< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMax< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, int > >
                                    ( const tnlParallelReductionMax< long double, int>& operation,
                                      const typename tnlParallelReductionMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMax< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionMax< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionMax< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionMax< char, long int > >
                                    ( const tnlParallelReductionMax< char, long int >& operation,
                                      const typename tnlParallelReductionMax< char, long int > :: IndexType size,
@@ -652,12 +672,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionMax< double, lon
                                      const typename tnlParallelReductionMax< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMax< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, long int > >
                                    ( const tnlParallelReductionMax< long double, long int>& operation,
                                      const typename tnlParallelReductionMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMax< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionMax< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionMax< long double, long int> :: ResultType& result );
+#endif
+#endif
+
 
 /****
  * Abs sum
@@ -691,13 +715,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double,
                                      const typename tnlParallelReductionAbsSum< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, int > >
                                    ( const tnlParallelReductionAbsSum< long double, int>& operation,
                                      const typename tnlParallelReductionAbsSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsSum< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionAbsSum< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionAbsSum< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, long int > >
                                    ( const tnlParallelReductionAbsSum< char, long int >& operation,
                                      const typename tnlParallelReductionAbsSum< char, long int > :: IndexType size,
@@ -726,12 +753,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double,
                                      const typename tnlParallelReductionAbsSum< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, long int > >
                                    ( const tnlParallelReductionAbsSum< long double, long int>& operation,
                                      const typename tnlParallelReductionAbsSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsSum< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionAbsSum< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionAbsSum< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Abs min
@@ -765,13 +795,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double,
                                      const typename tnlParallelReductionAbsMin< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, int > >
                                    ( const tnlParallelReductionAbsMin< long double, int>& operation,
                                      const typename tnlParallelReductionAbsMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionAbsMin< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionAbsMin< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, long int > >
                                    ( const tnlParallelReductionAbsMin< char, long int >& operation,
                                      const typename tnlParallelReductionAbsMin< char, long int > :: IndexType size,
@@ -800,12 +833,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double,
                                      const typename tnlParallelReductionAbsMin< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, long int > >
                                    ( const tnlParallelReductionAbsMin< long double, long int>& operation,
                                      const typename tnlParallelReductionAbsMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionAbsMin< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionAbsMin< long double, long int> :: ResultType& result );
+#endif
+#endif
+
 /****
  * Abs max
  */
@@ -838,13 +875,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double,
                                      const typename tnlParallelReductionAbsMax< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, int > >
                                    ( const tnlParallelReductionAbsMax< long double, int>& operation,
                                      const typename tnlParallelReductionAbsMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionAbsMax< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionAbsMax< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, long int > >
                                    ( const tnlParallelReductionAbsMax< char, long int >& operation,
                                      const typename tnlParallelReductionAbsMax< char, long int > :: IndexType size,
@@ -873,12 +913,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double,
                                      const typename tnlParallelReductionAbsMax< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, long int > >
                                    ( const tnlParallelReductionAbsMax< long double, long int>& operation,
                                      const typename tnlParallelReductionAbsMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionAbsMax< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionAbsMax< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Logical AND
@@ -911,13 +954,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< doub
                                      const typename tnlParallelReductionLogicalAnd< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalAnd< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, int > >
                                    ( const tnlParallelReductionLogicalAnd< long double, int>& operation,
                                      const typename tnlParallelReductionLogicalAnd< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalAnd< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalAnd< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionLogicalAnd< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionLogicalAnd< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< char, long int > >
                                    ( const tnlParallelReductionLogicalAnd< char, long int >& operation,
                                      const typename tnlParallelReductionLogicalAnd< char, long int > :: IndexType size,
@@ -946,12 +992,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< doub
                                      const typename tnlParallelReductionLogicalAnd< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalAnd< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, long int > >
                                    ( const tnlParallelReductionLogicalAnd< long double, long int>& operation,
                                      const typename tnlParallelReductionLogicalAnd< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalAnd< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalAnd< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionLogicalAnd< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionLogicalAnd< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Logical OR
@@ -984,13 +1033,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< doubl
                                      const typename tnlParallelReductionLogicalOr< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalOr< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, int > >
                                    ( const tnlParallelReductionLogicalOr< long double, int>& operation,
                                      const typename tnlParallelReductionLogicalOr< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionLogicalOr< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionLogicalOr< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< char, long int > >
                                    ( const tnlParallelReductionLogicalOr< char, long int >& operation,
                                      const typename tnlParallelReductionLogicalOr< char, long int > :: IndexType size,
@@ -1019,13 +1071,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< doubl
                                      const typename tnlParallelReductionLogicalOr< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalOr< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, long int > >
                                    ( const tnlParallelReductionLogicalOr< long double, long int>& operation,
                                      const typename tnlParallelReductionLogicalOr< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionLogicalOr< long double, long int> :: ResultType& result );*/
-
+                                     typename tnlParallelReductionLogicalOr< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Lp Norm
@@ -1044,13 +1098,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double,
                                      const typename tnlParallelReductionLpNorm< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLpNorm< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, int > >
                                    ( const tnlParallelReductionLpNorm< long double, int>& operation,
                                      const typename tnlParallelReductionLpNorm< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionLpNorm< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionLpNorm< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< char, long int > >
                                    ( const tnlParallelReductionLpNorm< char, long int >& operation,
                                      const typename tnlParallelReductionLpNorm< char, long int > :: IndexType size,
@@ -1079,13 +1136,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double,
                                      const typename tnlParallelReductionLpNorm< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLpNorm< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, long int > >
                                    ( const tnlParallelReductionLpNorm< long double, long int>& operation,
                                      const typename tnlParallelReductionLpNorm< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionLpNorm< long double, long int> :: ResultType& result );*/
-
+                                     typename tnlParallelReductionLpNorm< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Equalities
@@ -1118,13 +1177,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< doub
                                      const typename tnlParallelReductionEqualities< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionEqualities< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, int > >
                                    ( const tnlParallelReductionEqualities< long double, int>& operation,
                                      const typename tnlParallelReductionEqualities< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionEqualities< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionEqualities< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< char, long int > >
                                    ( const tnlParallelReductionEqualities< char, long int >& operation,
                                      const typename tnlParallelReductionEqualities< char, long int > :: IndexType size,
@@ -1153,13 +1215,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< doub
                                      const typename tnlParallelReductionEqualities< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionEqualities< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, long int > >
                                    ( const tnlParallelReductionEqualities< long double, long int>& operation,
                                      const typename tnlParallelReductionEqualities< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionEqualities< long double, long int> :: ResultType& result );*/
-
+                                     typename tnlParallelReductionEqualities< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Inequalities
@@ -1192,13 +1256,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< do
                                      const typename tnlParallelReductionInequalities< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionInequalities< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, int > >
                                    ( const tnlParallelReductionInequalities< long double, int>& operation,
                                      const typename tnlParallelReductionInequalities< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionInequalities< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionInequalities< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< char, long int > >
                                    ( const tnlParallelReductionInequalities< char, long int >& operation,
                                      const typename tnlParallelReductionInequalities< char, long int > :: IndexType size,
@@ -1227,13 +1294,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< do
                                      const typename tnlParallelReductionInequalities< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionInequalities< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, long int > >
                                    ( const tnlParallelReductionInequalities< long double, long int>& operation,
                                      const typename tnlParallelReductionInequalities< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionInequalities< long double, long int> :: ResultType& result );*/
-
+                                     typename tnlParallelReductionInequalities< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * ScalarProduct
@@ -1266,13 +1335,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< d
                                      const typename tnlParallelReductionScalarProduct< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionScalarProduct< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< long double, int > >
                                    ( const tnlParallelReductionScalarProduct< long double, int>& operation,
                                      const typename tnlParallelReductionScalarProduct< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionScalarProduct< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionScalarProduct< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionScalarProduct< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionScalarProduct< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< char, long int > >
                                    ( const tnlParallelReductionScalarProduct< char, long int >& operation,
                                      const typename tnlParallelReductionScalarProduct< char, long int > :: IndexType size,
@@ -1301,12 +1373,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< d
                                      const typename tnlParallelReductionScalarProduct< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionScalarProduct< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< long double, long int > >
                                    ( const tnlParallelReductionScalarProduct< long double, long int>& operation,
                                      const typename tnlParallelReductionScalarProduct< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionScalarProduct< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionScalarProduct< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionScalarProduct< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionScalarProduct< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Diff sum
@@ -1340,13 +1415,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< double,
                                      const typename tnlParallelReductionDiffSum< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffSum< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< long double, int > >
                                    ( const tnlParallelReductionDiffSum< long double, int>& operation,
                                      const typename tnlParallelReductionDiffSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffSum< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffSum< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionDiffSum< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< char, long int > >
                                    ( const tnlParallelReductionDiffSum< char, long int >& operation,
                                      const typename tnlParallelReductionDiffSum< char, long int > :: IndexType size,
@@ -1375,12 +1453,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< double,
                                      const typename tnlParallelReductionDiffSum< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffSum< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< long double, long int > >
                                    ( const tnlParallelReductionDiffSum< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffSum< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffSum< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionDiffSum< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Diff min
@@ -1414,13 +1495,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< double,
                                      const typename tnlParallelReductionDiffMin< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMin< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< long double, int > >
                                    ( const tnlParallelReductionDiffMin< long double, int>& operation,
                                      const typename tnlParallelReductionDiffMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMin< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffMin< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionDiffMin< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< char, long int > >
                                    ( const tnlParallelReductionDiffMin< char, long int >& operation,
                                      const typename tnlParallelReductionDiffMin< char, long int > :: IndexType size,
@@ -1449,12 +1533,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< double,
                                      const typename tnlParallelReductionDiffMin< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMin< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< long double, long int > >
                                    ( const tnlParallelReductionDiffMin< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMin< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffMin< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionDiffMin< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Diff max
@@ -1488,13 +1575,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< double,
                                      const typename tnlParallelReductionDiffMax< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMax< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< long double, int > >
                                    ( const tnlParallelReductionDiffMax< long double, int>& operation,
                                      const typename tnlParallelReductionDiffMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMax< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffMax< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionDiffMax< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< char, long int > >
                                    ( const tnlParallelReductionDiffMax< char, long int >& operation,
                                      const typename tnlParallelReductionDiffMax< char, long int > :: IndexType size,
@@ -1523,12 +1613,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< double,
                                      const typename tnlParallelReductionDiffMax< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMax< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< long double, long int > >
                                    ( const tnlParallelReductionDiffMax< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMax< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffMax< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionDiffMax< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Diff abs sum
@@ -1562,13 +1655,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< doub
                                      const typename tnlParallelReductionDiffAbsSum< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsSum< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< long double, int > >
                                    ( const tnlParallelReductionDiffAbsSum< long double, int>& operation,
                                      const typename tnlParallelReductionDiffAbsSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsSum< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsSum< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionDiffAbsSum< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< char, long int > >
                                    ( const tnlParallelReductionDiffAbsSum< char, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsSum< char, long int > :: IndexType size,
@@ -1597,12 +1693,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< doub
                                      const typename tnlParallelReductionDiffAbsSum< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsSum< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< long double, long int > >
                                    ( const tnlParallelReductionDiffAbsSum< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffAbsSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsSum< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsSum< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionDiffAbsSum< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Diff abs min
@@ -1636,13 +1735,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< doub
                                      const typename tnlParallelReductionDiffAbsMin< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMin< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< long double, int > >
                                    ( const tnlParallelReductionDiffAbsMin< long double, int>& operation,
                                      const typename tnlParallelReductionDiffAbsMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMin< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMin< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionDiffAbsMin< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< char, long int > >
                                    ( const tnlParallelReductionDiffAbsMin< char, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsMin< char, long int > :: IndexType size,
@@ -1671,12 +1773,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< doub
                                      const typename tnlParallelReductionDiffAbsMin< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMin< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< long double, long int > >
                                    ( const tnlParallelReductionDiffAbsMin< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffAbsMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMin< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMin< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionDiffAbsMin< long double, long int> :: ResultType& result );
+#endif
+#endif
+
 /****
  * Diff abs max
  */
@@ -1709,13 +1815,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< doub
                                      const typename tnlParallelReductionDiffAbsMax< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMax< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< long double, int > >
                                    ( const tnlParallelReductionDiffAbsMax< long double, int>& operation,
                                      const typename tnlParallelReductionDiffAbsMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMax< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMax< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionDiffAbsMax< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< char, long int > >
                                    ( const tnlParallelReductionDiffAbsMax< char, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsMax< char, long int > :: IndexType size,
@@ -1744,13 +1853,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< doub
                                      const typename tnlParallelReductionDiffAbsMax< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMax< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< long double, long int > >
                                    ( const tnlParallelReductionDiffAbsMax< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffAbsMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMax< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMax< long double, long int> :: ResultType& result );*/
-
+                                     typename tnlParallelReductionDiffAbsMax< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 
 /****
@@ -1770,13 +1881,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< doub
                                      const typename tnlParallelReductionDiffLpNorm< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffLpNorm< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< long double, int > >
                                    ( const tnlParallelReductionDiffLpNorm< long double, int>& operation,
                                      const typename tnlParallelReductionDiffLpNorm< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffLpNorm< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffLpNorm< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffLpNorm< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionDiffLpNorm< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< char, long int > >
                                    ( const tnlParallelReductionDiffLpNorm< char, long int >& operation,
                                      const typename tnlParallelReductionDiffLpNorm< char, long int > :: IndexType size,
@@ -1805,15 +1919,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< doub
                                      const typename tnlParallelReductionDiffLpNorm< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffLpNorm< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< long double, long int > >
                                    ( const tnlParallelReductionDiffLpNorm< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffLpNorm< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffLpNorm< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffLpNorm< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffLpNorm< long double, long int> :: ResultType& result );*/
-
-
-
+                                     typename tnlParallelReductionDiffLpNorm< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 #endif /* TEMPLATE_EXPLICIT_INSTANTIATION */
 
diff --git a/src/core/cuda/reduction-operations.h b/src/core/cuda/reduction-operations.h
index f5dad03987b6a010f340ad548bf732b8be2de6c2..6fb2d871599169596530e1d40c1c060a294e4cfa 100644
--- a/src/core/cuda/reduction-operations.h
+++ b/src/core/cuda/reduction-operations.h
@@ -50,6 +50,31 @@ __device__ inline  double tnlCudaMin( const double& a,
    return fmin( a, b );
 }
 
+template< class T > __device__ T tnlCudaMin( volatile const T& a,
+                                             volatile const T& b )
+{
+   return a < b ? a : b;
+}
+
+__device__ inline int tnlCudaMin( volatile const int& a,
+                                  volatile const int& b )
+{
+   return min( a, b );
+}
+
+__device__ inline  float tnlCudaMin( volatile const float& a,
+                                     volatile const float& b )
+{
+   return fminf( a, b );
+}
+
+__device__ inline  double tnlCudaMin( volatile const double& a,
+                                      volatile const double& b )
+{
+   return fmin( a, b );
+}
+
+
 /***
  * This function returns maximum of two numbers stored on the device.
  */
@@ -77,6 +102,30 @@ __device__  inline double tnlCudaMax( const double& a,
    return fmax( a, b );
 }
 
+template< class T > __device__ T tnlCudaMax( volatile const T& a,
+                                             volatile const T& b )
+{
+   return a > b ? a : b;
+}
+
+__device__  inline int tnlCudaMax( volatile const int& a,
+                                   volatile const int& b )
+{
+   return max( a, b );
+}
+
+__device__  inline float tnlCudaMax( volatile const float& a,
+                                     volatile const float& b )
+{
+   return fmaxf( a, b );
+}
+
+__device__  inline double tnlCudaMax( volatile const double& a,
+                                      volatile const double& b )
+{
+   return fmax( a, b );
+}
+
 /***
  * This function returns absolute value of given number on the device.
  */
@@ -105,6 +154,32 @@ __device__  inline long double tnlCudaAbs( const long double& a )
    return fabs( ( double ) a );
 }
 
+__device__  inline int tnlCudaAbs( volatile const int& a )
+{
+   return abs( a );
+}
+
+__device__  inline long int tnlCudaAbs( volatile const long int& a )
+{
+   return abs( a );
+}
+
+__device__  inline float tnlCudaAbs( volatile const float& a )
+{
+   return fabs( a );
+}
+
+__device__  inline double tnlCudaAbs( volatile const double& a )
+{
+   return fabs( a );
+}
+
+__device__  inline long double tnlCudaAbs( volatile const long double& a )
+{
+   return fabs( ( double ) a );
+}
+
+
 template< typename Type1, typename Type2 >
 __device__ Type1 tnlCudaPow( const Type1& x, const Type2& power )
 {
@@ -173,7 +248,7 @@ class tnlParallelReductionSum
 
    __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                   const IndexType idx2,
-                                                  const ResultType* data ) const
+                                                  volatile const ResultType* data ) const
    {
       return data[ idx1 ] + data[ idx2 ];
    };
@@ -261,7 +336,7 @@ class tnlParallelReductionMin
 
    __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                   const IndexType idx2,
-                                                  const ResultType* data ) const
+                                                  volatile const ResultType* data ) const
    {
       return tnlCudaMin( data[ idx1 ], data[ idx2 ] );
    };
@@ -330,7 +405,7 @@ class tnlParallelReductionMax
 
    __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                   const IndexType idx2,
-                                                  const ResultType* data ) const
+                                                  volatile const ResultType* data ) const
    {
       return tnlCudaMax( data[ idx1 ], data[ idx2 ] );
    };
@@ -399,7 +474,7 @@ class tnlParallelReductionAbsSum
 
    __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                   const IndexType idx2,
-                                                  const ResultType* data ) const
+                                                  volatile const ResultType* data ) const
    {
       return data[ idx1 ] + data[ idx2 ];
    };
@@ -468,9 +543,10 @@ class tnlParallelReductionAbsMin
 
    __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                   const IndexType idx2,
-                                                  const ResultType* data ) const
+                                                  volatile const ResultType* data ) const
    {
-      return tnlCudaMin( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
+      volatile ResultType aux = tnlCudaAbs( data[ idx2 ] );
+      return tnlCudaMin( data[ idx1 ],  aux );
    };
 #endif
 };
@@ -537,9 +613,10 @@ class tnlParallelReductionAbsMax
 
    __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                   const IndexType idx2,
-                                                  const ResultType* data ) const
+                                                  volatile const ResultType* data ) const
    {
-      return tnlCudaMax( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
+      volatile ResultType aux = tnlCudaAbs( data[ idx2 ] );
+      return tnlCudaMax( data[ idx1 ], aux );
    };
 #endif
 };
@@ -606,7 +683,7 @@ class tnlParallelReductionLogicalAnd
 
    __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                   const IndexType idx2,
-                                                  const ResultType* data ) const
+                                                  volatile const ResultType* data ) const
    {
       return data[ idx1 ] && data[ idx2 ];
    };
@@ -676,7 +753,7 @@ class tnlParallelReductionLogicalOr
 
    __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                   const IndexType idx2,
-                                                  const ResultType* data ) const
+                                                  volatile const ResultType* data ) const
    {
       return data[ idx1 ] || data[ idx2 ];
    };
@@ -752,7 +829,7 @@ class tnlParallelReductionLpNorm
 
    __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                   const IndexType idx2,
-                                                  const ResultType* data ) const
+                                                  volatile const ResultType* data ) const
    {
       return data[ idx1 ] + data[ idx2 ];
    };
@@ -827,7 +904,7 @@ class tnlParallelReductionEqualities
 
    __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                   const IndexType idx2,
-                                                  const ResultType* data ) const
+                                                  volatile const ResultType* data ) const
    {
       return data[ idx1 ] && data[ idx2 ];
    };
@@ -898,7 +975,7 @@ class tnlParallelReductionInequalities
 
    __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                   const IndexType idx2,
-                                                  const ResultType* data ) const
+                                                  volatile const ResultType* data ) const
    {
       return data[ idx1 ] && data[ idx2 ];
    };
@@ -954,7 +1031,7 @@ class tnlParallelReductionScalarProduct
                                                  const RealType* data3 ) const
    {
       return data1[ idx1 ] +
-             ( data2[ idx2 ] * data2[ idx2] ) +
+             ( data2[ idx2 ] * data3[ idx2] ) +
              ( data2[ idx3 ] * data3[ idx3] );
    };
 
@@ -969,7 +1046,7 @@ class tnlParallelReductionScalarProduct
 
    __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                   const IndexType idx2,
-                                                  const ResultType* data ) const
+                                                  volatile const ResultType* data ) const
    {
       return data[ idx1 ] + data[ idx2 ];
    };
@@ -1039,7 +1116,7 @@ class tnlParallelReductionDiffSum
 
    __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                   const IndexType idx2,
-                                                  const ResultType* data ) const
+                                                  volatile const ResultType* data ) const
    {
       return data[ idx1 ] + data[ idx2 ];
    };
@@ -1110,7 +1187,7 @@ class tnlParallelReductionDiffMin
 
    __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                   const IndexType idx2,
-                                                  const ResultType* data ) const
+                                                  volatile const ResultType* data ) const
    {
       return tnlCudaMin( data[ idx1 ], data[ idx2 ] );
    };
@@ -1182,7 +1259,7 @@ class tnlParallelReductionDiffMax
 
    __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                   const IndexType idx2,
-                                                  const ResultType* data ) const
+                                                  volatile const ResultType* data ) const
    {
       return tnlCudaMax( data[ idx1 ], data[ idx2 ] );
    };
@@ -1254,7 +1331,7 @@ class tnlParallelReductionDiffAbsSum
 
    __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                   const IndexType idx2,
-                                                  const ResultType* data ) const
+                                                  volatile const ResultType* data ) const
    {
       return data[ idx1 ] + data[ idx2 ];
    };
@@ -1327,7 +1404,7 @@ class tnlParallelReductionDiffAbsMin
 
    __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                   const IndexType idx2,
-                                                  const ResultType* data ) const
+                                                  volatile const ResultType* data ) const
    {
       //return tnlCudaMin( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
       return tnlCudaMin( data[ idx1 ], data[ idx2 ] );
@@ -1401,7 +1478,7 @@ class tnlParallelReductionDiffAbsMax
 
    __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                   const IndexType idx2,
-                                                  const ResultType* data ) const
+                                                  volatile const ResultType* data ) const
    {
       //return tnlCudaMax( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
       return tnlCudaMax( data[ idx1 ], data[ idx2 ] );
@@ -1479,7 +1556,7 @@ class tnlParallelReductionDiffLpNorm
 
    __device__ ResultType commonReductionOnDevice( const IndexType idx1,
                                                   const IndexType idx2,
-                                                  const ResultType* data ) const
+                                                  volatile const ResultType* data ) const
    {
       return data[ idx1 ] + data[ idx2 ];
    };
diff --git a/src/core/cuda/tnlCublasWrapper.h b/src/core/cuda/tnlCublasWrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..54d3e861a251605886f37c40ae1ecb0b1571504d
--- /dev/null
+++ b/src/core/cuda/tnlCublasWrapper.h
@@ -0,0 +1,70 @@
+/***************************************************************************
+                          tnlCublasWrapper.h  -  description
+                             -------------------
+    begin                : Apr 7, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#ifndef TNLCUBLASWARPER_H
+#define	TNLCUBLASWARPER_H
+
+#if defined HAVE_CUBLAS && defined HAVE_CUDA
+#include <cublas_v2.h>
+#endif
+
+template< typename Real1, 
+          typename Real2,
+          typename Index >
+class tnlCublasWrapper
+{
+    public:
+        static bool dot( const Real1* v1, const Real2* v2, const Index size, Real1& result)
+        {
+            return false;
+        }        
+};
+
+#if defined HAVE_CUBLAS && defined HAVE_CUDA
+
+template< typename Index >
+class tnlCublasWrapper< float, float, Index >
+{
+    public:
+        static bool dot( const float* v1, const float* v2, const Index size, float& result)
+        {
+
+            cublasHandle_t handle;
+            cublasCreate( &handle );
+            cublasSdot( handle, size, v1, 1, v2, 1, &result );
+            cublasDestroy( handle );
+            return false;
+        }        
+};
+
+template< typename Index >
+class tnlCublasWrapper< double, double, Index >
+{
+    public:
+        static bool dot( const double* v1, const double* v2, const Index size, double& result)
+        {
+            cublasHandle_t handle;
+            cublasCreate( &handle );
+            cublasDdot( handle, size, v1, 1, v2, 1, &result );
+            cublasDestroy( handle );
+            return false;
+        }        
+};
+#endif            
+
+#endif	/* TNLCUBLASWARPER_H */
+
diff --git a/src/core/mfuncs.h b/src/core/mfuncs.h
index 4719ad2fa9a6844700080a011e4010bd158f3353..8f0a87208bdbe0d9d320b4c7664c4d77d6e8d475 100644
--- a/src/core/mfuncs.h
+++ b/src/core/mfuncs.h
@@ -20,29 +20,24 @@
 
 #include <math.h>
 #include <stdlib.h>
+#include <core/tnlCuda.h>
 
 template< typename Type1, typename Type2 >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Type1 Min( const Type1& a, const Type2& b )
 {
    return a < b ? a : b;
 };
 
 template< typename Type1, typename Type2 >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Type1 Max( const Type1& a, const Type2& b )
 {
    return a > b ? a : b;
 };
 
 template< typename Type >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 void Swap( Type& a, Type& b )
 {
    Type tmp( a );
@@ -51,9 +46,7 @@ void Swap( Type& a, Type& b )
 };
 
 template< class T >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 T Sign( const T& a )
 {
    if( a < ( T ) 0 ) return -1;
@@ -62,9 +55,7 @@ T Sign( const T& a )
 };
 
 template< class T >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 T tnlAbs( const T& n )
 {
    if( n < ( T ) 0 )
@@ -99,9 +90,7 @@ inline int roundUpDivision( const int num, const int div )
    return num / div + ( num % div != 0 );
 }
 
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 inline int roundToMultiple( int number, int multiple )
 {
    return multiple*( number/ multiple + ( number % multiple != 0 ) );
diff --git a/src/core/tnlAssert.h b/src/core/tnlAssert.h
index 1f59f11b3bc83f8d2e5ff3610310195ea997a414..3a37eb52f2789986c413f11c0c601fc38caa7183 100644
--- a/src/core/tnlAssert.h
+++ b/src/core/tnlAssert.h
@@ -38,7 +38,7 @@ using namespace std;
            __STRING( ___tnl__assert_condition ),                                                         \
            __FILE__,                                                                                     \
            __LINE__ );                                                                                   \
-    abort();                                                                   \
+                                                              \
    }
 
 #else
diff --git a/src/core/tnlCuda.cu b/src/core/tnlCuda.cu
index ee30490b89728546a9eb1b6d7e594a87f83b9615..9178a3261e48a300267b0f292f06cec68575ba4d 100644
--- a/src/core/tnlCuda.cu
+++ b/src/core/tnlCuda.cu
@@ -391,6 +391,6 @@ bool tnlCuda::checkDevice( const char* file_name, int line )
        break;
 
    }
-   throw EXIT_FAILURE;
+   //throw EXIT_FAILURE;
    return false;
 }
diff --git a/src/core/tnlCuda.h b/src/core/tnlCuda.h
index 79d145e4bf74b21d30277670d17631e3e0c4823e..13e71fe062669f77a98115478463b03e6559e3af 100644
--- a/src/core/tnlCuda.h
+++ b/src/core/tnlCuda.h
@@ -107,6 +107,11 @@ class tnlCuda
 #define tnlCudaSupportMissingMessage \
    std::cerr << "The CUDA support is missing in the source file " << __FILE__ << " at line " << __LINE__ << ". Please set WITH_CUDA=yes in the install script. " << std::endl;
 
+#ifdef HAVE_CUDA
+#define __cuda_callable__ __device__ __host__
+#else
+#define __cuda_callable__
+#endif
 
 // TODO: This would be nice in tnlCuda but C++ standard does not allow it.
 #ifdef HAVE_CUDA
diff --git a/src/core/tnlTimerCPU.cpp b/src/core/tnlTimerCPU.cpp
index e89abc687896a685c874f4f056b2d999635dd719..d618b455e372c9daa16bfdfaa129019adbc395be 100644
--- a/src/core/tnlTimerCPU.cpp
+++ b/src/core/tnlTimerCPU.cpp
@@ -21,10 +21,10 @@ tnlTimerCPU defaultCPUTimer;
 
 tnlTimerCPU :: tnlTimerCPU()
 {
-   Reset();
+   reset();
 }
-//--------------------------------------------------------------------------
-void tnlTimerCPU :: Reset()
+
+void tnlTimerCPU::reset()
 {
 #ifdef HAVE_SYS_RESOURCE_H
    rusage init_usage;
@@ -36,8 +36,8 @@ void tnlTimerCPU :: Reset()
    total_time = 0.0;
    stop_state = false;
 }
-//--------------------------------------------------------------------------
-void tnlTimerCPU :: Stop()
+
+void tnlTimerCPU::stop()
 {
 #ifdef HAVE_SYS_RESOURCE_H
    if( ! stop_state )
@@ -49,8 +49,8 @@ void tnlTimerCPU :: Stop()
    }
 #endif
 }
-//--------------------------------------------------------------------------
-void tnlTimerCPU :: Continue()
+
+void tnlTimerCPU::start()
 {
 #ifdef HAVE_SYS_RESOURCE_H
    rusage init_usage;
@@ -59,12 +59,12 @@ void tnlTimerCPU :: Continue()
 #endif
   stop_state = false;
 }
-//--------------------------------------------------------------------------
-double tnlTimerCPU :: GetTime( int root, MPI_Comm comm )
+
+double tnlTimerCPU::getTime( int root, MPI_Comm comm )
 {
 #ifdef HAVE_SYS_RESOURCE_H
-   Stop();
-   Continue();
+   stop();
+   start();
    double mpi_total_time;
    MPIReduce( total_time, mpi_total_time, 1, MPI_SUM, root, comm );
    return mpi_total_time;
diff --git a/src/core/tnlTimerCPU.h b/src/core/tnlTimerCPU.h
index 098d90ab1822a0254d488e4d2c0f0cdb2e8797f3..c982018b5b1bb0ee6e02085569d7c00c9bff732f 100644
--- a/src/core/tnlTimerCPU.h
+++ b/src/core/tnlTimerCPU.h
@@ -32,13 +32,13 @@ class tnlTimerCPU
 
    tnlTimerCPU();
 
-   void Reset();
+   void reset();
    
-   void Stop();
+   void stop();
 
-   void Continue();
+   void start();
 
-   double GetTime( int root = 0, MPI_Comm = MPI_COMM_WORLD );
+   double getTime( int root = 0, MPI_Comm = MPI_COMM_WORLD );
       
    protected:
 
diff --git a/src/core/tnlTimerRT.cpp b/src/core/tnlTimerRT.cpp
index 8d2fa297e056a4f216ad0b344e2e8a77ce6e4360..7351972dc5cb83697d7d5884170ab74fcd9f392a 100644
--- a/src/core/tnlTimerRT.cpp
+++ b/src/core/tnlTimerRT.cpp
@@ -27,12 +27,12 @@
 
 tnlTimerRT defaultRTTimer;
 
-tnlTimerRT :: tnlTimerRT()
+tnlTimerRT::tnlTimerRT()
 {
-   Reset();
+   reset();
 }
 
-void tnlTimerRT :: Reset()
+void tnlTimerRT::reset()
 {
 #ifdef HAVE_TIME
    struct timeval tp;
@@ -46,7 +46,7 @@ void tnlTimerRT :: Reset()
 
 }
 
-void tnlTimerRT :: Stop()
+void tnlTimerRT::stop()
 {
 #ifdef HAVE_TIME
    if( ! stop_state )
@@ -59,7 +59,7 @@ void tnlTimerRT :: Stop()
 #endif
 }
 
-void tnlTimerRT :: Continue()
+void tnlTimerRT::start()
 {
 #ifdef HAVE_TIME
    struct timeval tp;
@@ -69,11 +69,11 @@ void tnlTimerRT :: Continue()
 #endif
 }
 
-double tnlTimerRT :: GetTime()
+double tnlTimerRT::getTime()
 {
 #ifdef HAVE_TIME
-	Stop();
-	Continue();
+	stop();
+	start();
 	return total_time;
 #endif
  return -1;
diff --git a/src/core/tnlTimerRT.h b/src/core/tnlTimerRT.h
index 31ab6a4f8003c1dcb7e2d4bb26a0495fc591e8d8..7aa6305c6f8f48e8cdf6f25616b3438f0efb09f3 100644
--- a/src/core/tnlTimerRT.h
+++ b/src/core/tnlTimerRT.h
@@ -26,13 +26,13 @@ class tnlTimerRT
 
    tnlTimerRT();
 
-   void Reset();
+   void reset();
 
-   void Stop();
+   void stop();
 
-   void Continue();
+   void start();
 
-   double GetTime();
+   double getTime();
 
    protected:
 
diff --git a/src/core/vectors/tnlMultiVector1D_impl.h b/src/core/vectors/tnlMultiVector1D_impl.h
index 4bb82a17674f8144682eeb46b626d4e3ab39432d..6e1b6a6695f41c38454f24a8493d9ae68b4bbe90 100644
--- a/src/core/vectors/tnlMultiVector1D_impl.h
+++ b/src/core/vectors/tnlMultiVector1D_impl.h
@@ -250,4 +250,32 @@ bool tnlMultiVector< 1, Real, Device, Index > :: load( const tnlString& fileName
    return tnlObject :: load( fileName );
 }
 
+#ifdef TEMPLATE_EXPLICIT_INSTANTIATION
+
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 1, float,  tnlHost, int >;
+#endif
+extern template class tnlMultiVector< 1, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 1, float,  tnlHost, long int >;
+#endif
+extern template class tnlMultiVector< 1, double, tnlHost, long int >;
+#endif
+
+#ifdef HAVE_CUDA
+/*#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 1, float,  tnlCuda, int >;
+#endif
+extern template class tnlMultiVector< 1, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 1, float,  tnlCuda, long int >;
+#endif
+extern template class tnlMultiVector< 1, double, tnlCuda, long int >;
+#endif*/
+#endif
+
+#endif
+
 #endif /* TNLMULTIVECTOR1D_IMPL_H_ */
diff --git a/src/core/vectors/tnlMultiVector2D_impl.h b/src/core/vectors/tnlMultiVector2D_impl.h
index 479d64a31c3216c4f19baf5c48f18ec805dd2306..a964b062e641695bfcc29f30861c4c43b50f35f4 100644
--- a/src/core/vectors/tnlMultiVector2D_impl.h
+++ b/src/core/vectors/tnlMultiVector2D_impl.h
@@ -262,4 +262,33 @@ ostream& operator << ( ostream& str, const tnlMultiVector< 2, Real, Device, Inde
    return str;
 }
 
+#ifdef TEMPLATE_EXPLICIT_INSTANTIATION
+
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 2, float,  tnlHost, int >;
+#endif
+extern template class tnlMultiVector< 2, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 2, float,  tnlHost, long int >;
+#endif
+extern template class tnlMultiVector< 2, double, tnlHost, long int >;
+#endif
+
+#ifdef HAVE_CUDA
+/*#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 2, float,  tnlCuda, int >;
+#endif
+extern template class tnlMultiVector< 2, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 2, float,  tnlCuda, long int >;
+#endif
+extern template class tnlMultiVector< 2, double, tnlCuda, long int >;
+#endif*/
+#endif
+
+#endif
+
+
 #endif /* TNLMULTIVECTOR2D_IMPL_H_ */
diff --git a/src/core/vectors/tnlMultiVector3D_impl.h b/src/core/vectors/tnlMultiVector3D_impl.h
index 50082d3f9e6d4f758947deb78ca09560240f3a38..3a19302c373be1f7b56db3fee36b688aa6aa6a04 100644
--- a/src/core/vectors/tnlMultiVector3D_impl.h
+++ b/src/core/vectors/tnlMultiVector3D_impl.h
@@ -285,4 +285,33 @@ bool tnlMultiVector< 3, Real, Device, Index > :: load( const tnlString& fileName
    return tnlObject :: load( fileName );
 }
 
+#ifdef TEMPLATE_EXPLICIT_INSTANTIATION
+
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 3, float,  tnlHost, int >;
+#endif
+extern template class tnlMultiVector< 3, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 3, float,  tnlHost, long int >;
+#endif
+extern template class tnlMultiVector< 3, double, tnlHost, long int >;
+#endif
+
+#ifdef HAVE_CUDA
+/*#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 3, float,  tnlCuda, int >;
+#endif
+extern template class tnlMultiVector< 3, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 3, float,  tnlCuda, long int >;
+#endif
+extern template class tnlMultiVector< 3, double, tnlCuda, long int >;
+#endif*/
+#endif
+
+#endif
+
+
 #endif /* TNLMULTIVECTOR3D_IMPL_H_ */
diff --git a/src/core/vectors/tnlMultiVector4D_impl.h b/src/core/vectors/tnlMultiVector4D_impl.h
index 416acb45eaf492ee6b22bd267fc893c164ea6073..b843e4838df4691c6f101bdc793e3dad71c7aebe 100644
--- a/src/core/vectors/tnlMultiVector4D_impl.h
+++ b/src/core/vectors/tnlMultiVector4D_impl.h
@@ -306,4 +306,33 @@ bool tnlMultiVector< 4, Real, Device, Index > :: load( const tnlString& fileName
    return tnlObject :: load( fileName );
 }
 
+#ifdef TEMPLATE_EXPLICIT_INSTANTIATION
+
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 4, float,  tnlHost, int >;
+#endif
+extern template class tnlMultiVector< 4, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 4, float,  tnlHost, long int >;
+#endif
+extern template class tnlMultiVector< 4, double, tnlHost, long int >;
+#endif
+
+#ifdef HAVE_CUDA
+/*#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 4, float,  tnlCuda, int >;
+#endif
+extern template class tnlMultiVector< 4, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 4, float,  tnlCuda, long int >;
+#endif
+extern template class tnlMultiVector< 4, double, tnlCuda, long int >;
+#endif*/
+#endif
+
+#endif
+
+
 #endif /* TNLMULTIVECTOR4D_IMPL_H_ */
diff --git a/src/core/vectors/tnlMultiVector_impl.cpp b/src/core/vectors/tnlMultiVector_impl.cpp
index 5dff1b99997c483d257bbced57a5a39cfe9f8903..5920c03ff640ad3500e4e88877a62344c24cb7aa 100644
--- a/src/core/vectors/tnlMultiVector_impl.cpp
+++ b/src/core/vectors/tnlMultiVector_impl.cpp
@@ -19,41 +19,92 @@
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 1, float,  tnlHost, int >;
+#endif
 template class tnlMultiVector< 1, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 1, float,  tnlHost, long int >;
+#endif
 template class tnlMultiVector< 1, double, tnlHost, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 2, float,  tnlHost, int >;
+#endif
 template class tnlMultiVector< 2, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 2, float,  tnlHost, long int >;
+#endif
 template class tnlMultiVector< 2, double, tnlHost, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 3, float,  tnlHost, int >;
+#endif
 template class tnlMultiVector< 3, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 3, float,  tnlHost, long int >;
+#endif
 template class tnlMultiVector< 3, double, tnlHost, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 4, float,  tnlHost, int >;
+#endif
 template class tnlMultiVector< 4, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 4, float,  tnlHost, long int >;
+#endif
 template class tnlMultiVector< 4, double, tnlHost, long int >;
+#endif
 
 #ifdef HAVE_CUDA
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 1, float,  tnlCuda, int >;
+#endif
 template class tnlMultiVector< 1, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 1, float,  tnlCuda, long int >;
+#endif
 template class tnlMultiVector< 1, double, tnlCuda, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 2, float,  tnlCuda, int >;
+#endif
 template class tnlMultiVector< 2, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 2, float,  tnlCuda, long int >;
+#endif
 template class tnlMultiVector< 2, double, tnlCuda, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 3, float,  tnlCuda, int >;
+#endif
 template class tnlMultiVector< 3, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 3, float,  tnlCuda, long int >;
+#endif
 template class tnlMultiVector< 3, double, tnlCuda, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 4, float,  tnlCuda, int >;
+#endif
 template class tnlMultiVector< 4, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 4, float,  tnlCuda, long int >;
+#endif
 template class tnlMultiVector< 4, double, tnlCuda, long int >;
+#endif
 
 #endif
 
diff --git a/src/core/vectors/tnlSharedVector.h b/src/core/vectors/tnlSharedVector.h
index a3d6621a85da8b17fadab11d2065fbfa57723db3..0a1e93ef3492ec0c64da322a841edb331587f0fa 100644
--- a/src/core/vectors/tnlSharedVector.h
+++ b/src/core/vectors/tnlSharedVector.h
@@ -20,7 +20,7 @@
 
 #include <core/arrays/tnlSharedArray.h>
 #include <core/vectors/tnlVector.h>
-#include <functions/tnlFunctionType.h>
+#include <functors/tnlFunctionType.h>
 
 class tnlHost;
 
@@ -125,25 +125,13 @@ class tnlSharedVector : public tnlSharedArray< Real, Device, Index >
                    const Real& alpha = 1.0,
                    const Real& thisMultiplicator = 1.0 );
 
-   //! Computes Y = alpha * X + beta * Y.
+   //! Computes this = thisMultiplicator * this + multiplicator1 * v1 + multiplicator2 * v2.
    template< typename Vector >
-   void alphaXPlusBetaY( const Real& alpha,
-                         const Vector& x,
-                         const Real& beta );
-
-   //! Computes Y = alpha * X + beta * Z
-   template< typename Vector >
-   void alphaXPlusBetaZ( const Real& alpha,
-                         const Vector& x,
-                         const Real& beta,
-                         const Vector& z );
-
-   //! Computes Y = Scalar Alpha X Plus Scalar Beta Z Plus Y
-   template< typename Vector >
-   void alphaXPlusBetaZPlusY( const Real& alpha,
-                              const Vector& x,
-                              const Real& beta,
-                              const Vector& z );
+   void addVectors( const Vector& v1,
+                    const Real& multiplicator1,
+                    const Vector& v2,
+                    const Real& multiplicator2,
+                    const Real& thisMultiplicator = 1.0 );
 
    void computePrefixSum();
 
diff --git a/src/core/vectors/tnlSharedVector_impl.cpp b/src/core/vectors/tnlSharedVector_impl.cpp
index 77a908e3a9939aa7ff9eaee38fa7ebca92fac84a..1993b3d301a7c8934514611be5ec032a7d35d1db 100644
--- a/src/core/vectors/tnlSharedVector_impl.cpp
+++ b/src/core/vectors/tnlSharedVector_impl.cpp
@@ -19,16 +19,42 @@
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlSharedVector< float, tnlHost, int >;
+#endif
 template class tnlSharedVector< double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlSharedVector< long double, tnlHost, int >;
+#endif
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlSharedVector< float, tnlHost, long int >;
+#endif
 template class tnlSharedVector< double, tnlHost, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlSharedVector< long double, tnlHost, long int >;
+#endif
+#endif
 
 #ifdef HAVE_CUDA
+#ifdef INSTANTIATE_FLOAT
 template class tnlSharedVector< float, tnlCuda, int >;
+#endif
 template class tnlSharedVector< double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlSharedVector< long double, tnlCuda, int >;
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlSharedVector< float, tnlCuda, long int >;
+#endif
 template class tnlSharedVector< double, tnlCuda, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlSharedVector< long double, tnlCuda, long int >;
+#endif
+#endif
+
 #endif
 
 #endif
diff --git a/src/core/vectors/tnlSharedVector_impl.h b/src/core/vectors/tnlSharedVector_impl.h
index 3e98e54783639601b37c388ed10ee352196d25e3..e8bf600db8bb7bd58aa01c6508158739e814ad84 100644
--- a/src/core/vectors/tnlSharedVector_impl.h
+++ b/src/core/vectors/tnlSharedVector_impl.h
@@ -307,36 +307,17 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vector >
-void tnlSharedVector< Real, Device, Index > :: alphaXPlusBetaY( const Real& alpha,
-                                                                const Vector& x,
-                                                                const Real& beta )
+void
+tnlSharedVector< Real, Device, Index >::
+addVectors( const Vector& v1,
+            const Real& multiplicator1,
+            const Vector& v2,
+            const Real& multiplicator2,
+            const Real& thisMultiplicator )
 {
-   tnlVectorOperations< Device > :: alphaXPlusBetaY( *this, x, alpha, beta );
+   tnlVectorOperations< Device >::addVectors( *this, v1, multiplicator1, v2, multiplicator2, thisMultiplicator );
 }
 
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Vector >
-void tnlSharedVector< Real, Device, Index > :: alphaXPlusBetaZ( const Real& alpha,
-                                                                const Vector& x,
-                                                                const Real& beta,
-                                                                const Vector& z )
-{
-   tnlVectorOperations< Device > :: alphaXPlusBetaZ( *this, x, alpha, z, beta );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Vector >
-void tnlSharedVector< Real, Device, Index > :: alphaXPlusBetaZPlusY( const Real& alpha,
-                                                                     const Vector& x,
-                                                                     const Real& beta,
-                                                                     const Vector& z )
-{
-   tnlVectorOperations< Device > :: alphaXPlusBetaZPlusY( *this, x, alpha, z, beta );
-}
 template< typename Real,
           typename Device,
           typename Index >
@@ -374,17 +355,43 @@ void tnlSharedVector< Real, Device, Index > :: computeExclusivePrefixSum( const
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlSharedVector< float, tnlHost, int >;
+#endif
 extern template class tnlSharedVector< double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlSharedVector< long double, tnlHost, int >;
+#endif
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlSharedVector< float, tnlHost, long int >;
+#endif
 extern template class tnlSharedVector< double, tnlHost, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlSharedVector< long double, tnlHost, long int >;
+#endif
+#endif
 
 #ifdef HAVE_CUDA
 // TODO: fix this - it does not work with CUDA 5.5
-/*extern template class tnlSharedVector< float, tnlCuda, int >;
+/*
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlSharedVector< float, tnlCuda, int >;
+#endif
 extern template class tnlSharedVector< double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlSharedVector< long double, tnlCuda, int >;
+#endif
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlSharedVector< float, tnlCuda, long int >;
-extern template class tnlSharedVector< double, tnlCuda, long int >;*/
+#endif
+extern template class tnlSharedVector< double, tnlCuda, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlSharedVector< long double, tnlCuda, long int >;
+#endif
+ #endif 
+ */
 #endif
 
 #endif
diff --git a/src/core/vectors/tnlStaticVector1D_impl.h b/src/core/vectors/tnlStaticVector1D_impl.h
index 5fd6030414486a4d3a16de8f2cff013fbf4d91df..048c9b9619f8c2f5f17aadf3891f2a1915b8a87a 100644
--- a/src/core/vectors/tnlStaticVector1D_impl.h
+++ b/src/core/vectors/tnlStaticVector1D_impl.h
@@ -166,9 +166,13 @@ bool tnlStaticVector< 1, Real >::operator >= ( const tnlStaticVector& v ) const
 
 #ifndef HAVE_CUDA
 // TODO: does not work with CUDA
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlStaticVector< 1, float >;
+#endif
 extern template class tnlStaticVector< 1, double >;
-//extern template class tnlStaticVector< 1, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlStaticVector< 1, long double >;
+#endif
 #endif
 
 #endif
diff --git a/src/core/vectors/tnlStaticVector2D_impl.h b/src/core/vectors/tnlStaticVector2D_impl.h
index 81c152dce7423aa1a6421dabf74b12e36b79a5d3..5a780d57fac158ab2c6f60e7c3f28b88c9aa0ff7 100644
--- a/src/core/vectors/tnlStaticVector2D_impl.h
+++ b/src/core/vectors/tnlStaticVector2D_impl.h
@@ -195,9 +195,13 @@ bool tnlStaticVector< 2, Real >::operator >= ( const tnlStaticVector& v ) const
 
 #ifndef HAVE_CUDA
 // TODO: does not work with CUDA
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlStaticVector< 2, float >;
+#endif
 extern template class tnlStaticVector< 2, double >;
-//extern template class tnlStaticVector< 2, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlStaticVector< 2, long double >;
+#endif
 #endif
 
 #endif
diff --git a/src/core/vectors/tnlStaticVector3D_impl.h b/src/core/vectors/tnlStaticVector3D_impl.h
index 50f4078e63aaa5c3b13166b89ea24bfc62d81b27..8772a796f8ee52f34e03fe6dc7b4c59ab77ceac8 100644
--- a/src/core/vectors/tnlStaticVector3D_impl.h
+++ b/src/core/vectors/tnlStaticVector3D_impl.h
@@ -206,9 +206,13 @@ bool tnlStaticVector< 3, Real >::operator >= ( const tnlStaticVector& v ) const
 
 #ifndef HAVE_CUDA
 // TODO: does not work with CUDA
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlStaticVector< 3, float >;
+#endif
 extern template class tnlStaticVector< 3, double >;
-//extern template class tnlStaticVector< 3, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlStaticVector< 3, long double >;
+#endif
 #endif
 
 #endif
diff --git a/src/core/vectors/tnlStaticVector_impl.cpp b/src/core/vectors/tnlStaticVector_impl.cpp
index ab9fe2be93e7b3c7c311659cd965e054e6acc0f4..1b20e1a53aaf6fedf9b0de09d8d1d6691506741d 100644
--- a/src/core/vectors/tnlStaticVector_impl.cpp
+++ b/src/core/vectors/tnlStaticVector_impl.cpp
@@ -20,21 +20,37 @@
 #ifndef HAVE_CUDA
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticVector< 1, float >;
+#endif
 template class tnlStaticVector< 1, double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
 template class tnlStaticVector< 1, long double >;
+#endif
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticVector< 2, float >;
+#endif
 template class tnlStaticVector< 2, double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
 template class tnlStaticVector< 2, long double >;
+#endif
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticVector< 3, float >;
+#endif
 template class tnlStaticVector< 3, double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
 template class tnlStaticVector< 3, long double >;
+#endif
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticVector< 4, float >;
+#endif
 template class tnlStaticVector< 4, double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
 template class tnlStaticVector< 4, long double >;
+#endif
 
 #endif
 #endif
diff --git a/src/core/vectors/tnlStaticVector_impl.cu b/src/core/vectors/tnlStaticVector_impl.cu
index 938d0c47337fb389add036ab68dc7444474e4b16..c8ecd5fe583f9336a6e1e2283d2c4f187e8076fc 100644
--- a/src/core/vectors/tnlStaticVector_impl.cu
+++ b/src/core/vectors/tnlStaticVector_impl.cu
@@ -20,21 +20,37 @@
 #ifdef HAVE_CUDA
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticVector< 1, float >;
+#endif
 template class tnlStaticVector< 1, double >;
-//template class tnlStaticVector< 1, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlStaticVector< 1, long double >;
+#endif
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticVector< 2, float >;
+#endif
 template class tnlStaticVector< 2, double >;
-//template class tnlStaticVector< 2, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlStaticVector< 2, long double >;
+#endif
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticVector< 3, float >;
+#endif
 template class tnlStaticVector< 3, double >;
-//template class tnlStaticVector< 3, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlStaticVector< 3, long double >;
+#endif
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticVector< 4, float >;
+#endif
 template class tnlStaticVector< 4, double >;
-//template class tnlStaticVector< 4, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlStaticVector< 4, long double >;
+#endif
 
 #endif
 #endif
diff --git a/src/core/vectors/tnlStaticVector_impl.h b/src/core/vectors/tnlStaticVector_impl.h
index 208aca2bf923b66688382c502e08d17f759822c2..7adfd45fc5279341605dc6ccb53239b7be97405d 100644
--- a/src/core/vectors/tnlStaticVector_impl.h
+++ b/src/core/vectors/tnlStaticVector_impl.h
@@ -202,9 +202,13 @@ tnlStaticVector< Size, Real > operator * ( const Real& c, const tnlStaticVector<
 
 #ifndef HAVE_CUDA
 // TODO: does not work with CUDA
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlStaticVector< 4, float >;
+#endif
 extern template class tnlStaticVector< 4, double >;
-//extern template class tnlStaticVector< 4, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlStaticVector< 4, long double >;
+#endif
 #endif
 
 #endif
diff --git a/src/core/vectors/tnlVector.h b/src/core/vectors/tnlVector.h
index 3578812a583db6a9bde34c94b9576ca128fce638..544ab1e59836446646901b4249aaa8547d30da9e 100644
--- a/src/core/vectors/tnlVector.h
+++ b/src/core/vectors/tnlVector.h
@@ -19,7 +19,7 @@
 #define TNLVECTOR_H_
 
 #include <core/arrays/tnlArray.h>
-#include <functions/tnlFunctionType.h>
+#include <functors/tnlFunctionType.h>
 
 class tnlHost;
 
@@ -114,31 +114,20 @@ class tnlVector : public tnlArray< Real, Device, Index >
    template< typename Vector >
    Real scalarProduct( const Vector& v );
 
-   //! Computes Y = alpha * X + Y.
+   //! Computes this = thisMultiplicator * this + multiplicator * v.
    template< typename Vector >
    void addVector( const Vector& v,
                    const Real& multiplicator = 1.0,
                    const Real& thisMultiplicator = 1.0 );
 
-   //! Computes Y = alpha * X + beta * Y.
-   template< typename Vector >
-   void alphaXPlusBetaY( const Real& alpha,
-                         const Vector& x,
-                         const Real& beta );
-
-   //! Computes Y = alpha * X + beta * Z
-   template< typename Vector >
-   void alphaXPlusBetaZ( const Real& alpha,
-                         const Vector& x,
-                         const Real& beta,
-                         const Vector& z );
 
-   //! Computes Y = Scalar Alpha X Plus Scalar Beta Z Plus Y
+   //! Computes this = thisMultiplicator * this + multiplicator1 * v1 + multiplicator2 * v2.
    template< typename Vector >
-   void alphaXPlusBetaZPlusY( const Real& alpha,
-                              const Vector& x,
-                              const Real& beta,
-                              const Vector& z );
+   void addVectors( const Vector& v1,
+                    const Real& multiplicator1,
+                    const Vector& v2,
+                    const Real& multiplicator2,
+                    const Real& thisMultiplicator = 1.0 );
 
    void computePrefixSum();
 
diff --git a/src/core/vectors/tnlVectorOperations.h b/src/core/vectors/tnlVectorOperations.h
index b065a2bf97cb3a690433011dcc73cbbf5dc6d5b4..46cdb483ef664c8493b710c9a032f152c80800aa 100644
--- a/src/core/vectors/tnlVectorOperations.h
+++ b/src/core/vectors/tnlVectorOperations.h
@@ -18,7 +18,6 @@
 #ifndef TNLVECTOROPERATIONS_H_
 #define TNLVECTOROPERATIONS_H_
 
-#include <core/tnlCuda.h>
 #include <core/cuda/cuda-reduction.h>
 #include <core/cuda/reduction-operations.h>
 #include <core/tnlHost.h>
@@ -99,6 +98,15 @@ class tnlVectorOperations< tnlHost >
                           const Vector2& v,
                           const typename Vector2::RealType& multiplicator,
                           const typename Vector1::RealType& thisMultiplicator = 1.0 );
+   
+   template< typename Vector1, typename Vector2, typename Vector3 >
+   static void addVectors( Vector1& v,
+                           const Vector2& v1,
+                           const typename Vector2::RealType& multiplicator1,
+                           const Vector3& v2,
+                           const typename Vector3::RealType& multiplicator2,
+                           const typename Vector1::RealType& thisMultiplicator = 1.0 );
+
 
    template< typename Vector1, typename Vector2 >
    static void alphaXPlusBetaY( Vector1& y,
@@ -204,6 +212,15 @@ class tnlVectorOperations< tnlCuda >
                           const Vector2& x,
                           const typename Vector2::RealType& alpha,
                           const typename Vector1::RealType& thisMultiplicator = 1.0 );
+   
+   template< typename Vector1, typename Vector2, typename Vector3 >
+   static void addVectors( Vector1& v,
+                           const Vector2& v1,
+                           const typename Vector2::RealType& multiplicator1,
+                           const Vector3& v2,
+                           const typename Vector3::RealType& multiplicator2,
+                           const typename Vector1::RealType& thisMultiplicator = 1.0 );
+   
 
    template< typename Vector1, typename Vector2 >
    static void alphaXPlusBetaY( Vector1& y,
diff --git a/src/core/vectors/tnlVectorOperationsCuda_impl.h b/src/core/vectors/tnlVectorOperationsCuda_impl.h
index 0f5eb8371407acd4c8b5bac8494364085acc3d69..76c8b586545dd90f0734596d716995908deee6e6 100644
--- a/src/core/vectors/tnlVectorOperationsCuda_impl.h
+++ b/src/core/vectors/tnlVectorOperationsCuda_impl.h
@@ -18,7 +18,9 @@
 #ifndef TNLVECTOROPERATIONSCUDA_IMPL_H_
 #define TNLVECTOROPERATIONSCUDA_IMPL_H_
 
+#include <tnlConfig.h>
 #include <core/cuda/cuda-prefix-sum.h>
+#include <core/cuda/tnlCublasWrapper.h>
 
 template< typename Vector >
 void tnlVectorOperations< tnlCuda >::addElement( Vector& v,
@@ -52,7 +54,7 @@ typename Vector :: RealType tnlVectorOperations< tnlCuda > :: getVectorMax( cons
                           v. getSize(),
                           v. getData(),
                           ( Real* ) 0,
-                          result );
+                          result );   
    return result;
 }
 
@@ -245,9 +247,11 @@ typename Vector1 :: RealType tnlVectorOperations< tnlCuda > :: getVectorDifferen
 }
 
 template< typename Vector1, typename Vector2 >
-typename Vector1 :: RealType tnlVectorOperations< tnlCuda > :: getVectorDifferenceLpNorm( const Vector1& v1,
-                                                               const Vector2& v2,
-                                                               const typename Vector1 :: RealType& p )
+typename Vector1::RealType
+tnlVectorOperations< tnlCuda >::
+getVectorDifferenceLpNorm( const Vector1& v1,
+                           const Vector2& v2,
+                           const typename Vector1 :: RealType& p )
 {
    typedef typename Vector1 :: RealType Real;
    typedef typename Vector1 :: IndexType Index;
@@ -347,6 +351,12 @@ typename Vector1 :: RealType tnlVectorOperations< tnlCuda > :: getScalarProduct(
               cerr << "Vector names are " << v1. getName() << " and " << v2. getName() );
 
    Real result( 0 );
+#if defined HAVE_CUBLAS && defined HAVE_CUDA
+   if( tnlCublasWrapper< typename Vector1::RealType,
+                         typename Vector2::RealType,
+                         typename Vector1::IndexType >::dot( v1.getData(), v1.getData(), v1.getSize(), result ) )
+       return result;
+#endif
    tnlParallelReductionScalarProduct< Real, Index > operation;
    reductionOnCudaDevice( operation,
                           v1. getSize(),
@@ -419,6 +429,87 @@ void tnlVectorOperations< tnlCuda > :: addVector( Vector1& y,
    #endif
 }
 
+#ifdef HAVE_CUDA
+template< typename Real,
+          typename Index >
+__global__ void vectorAddVectorsCudaKernel( Real* v,
+                                            const Real* v1,
+                                            const Real* v2,
+                                            const Index size,
+                                            const Real multiplicator1,
+                                            const Real multiplicator2,
+                                            const Real thisMultiplicator )
+{
+   Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x;
+   const Index maxGridSize = blockDim. x * gridDim. x;
+   if( thisMultiplicator == 1.0 )
+      while( elementIdx < size )
+      {
+         v[ elementIdx ] += multiplicator1 * v1[ elementIdx ] +
+                            multiplicator2 * v2[ elementIdx ];
+         elementIdx += maxGridSize;
+      }
+   else
+      while( elementIdx < size )
+      {
+         v[ elementIdx ] = thisMultiplicator * v[ elementIdx ] +
+                           multiplicator1 * v1[ elementIdx ] +
+                           multiplicator2 * v2[ elementIdx ];
+         elementIdx += maxGridSize;
+      }
+}
+#endif
+
+
+template< typename Vector1,
+          typename Vector2,
+          typename Vector3 >
+void
+tnlVectorOperations< tnlCuda >::
+addVectors( Vector1& v,
+            const Vector2& v1,
+            const typename Vector2::RealType& multiplicator1,
+            const Vector3& v2,
+            const typename Vector3::RealType& multiplicator2,
+            const typename Vector1::RealType& thisMultiplicator )
+{
+   typedef typename Vector1 :: RealType Real;
+   typedef typename Vector1 :: IndexType Index;
+
+   tnlAssert( v.getSize() > 0,
+              cerr << "Vector name is " << v.getName() );
+   tnlAssert( v.getSize() == v1.getSize(),
+              cerr << "Vector names are " << v.getName() << " and " << v1.getName() );
+   tnlAssert( v.getSize() == v2.getSize(),
+              cerr << "Vector names are " << v.getName() << " and " << v2.getName() );
+   tnlAssert( v.getData() != 0, );
+   tnlAssert( v1.getData() != 0, );
+   tnlAssert( v2.getData() != 0, );
+
+   #ifdef HAVE_CUDA
+      dim3 blockSize( 0 ), gridSize( 0 );
+
+      const Index& size = v.getSize();
+      dim3 cudaBlockSize( 256 );
+      dim3 cudaBlocks;
+      cudaBlocks.x = Min( tnlCuda::getMaxGridSize(), tnlCuda::getNumberOfBlocks( size, cudaBlockSize.x ) );      
+
+      vectorAddVectorsCudaKernel<<< cudaBlocks, cudaBlockSize >>>( v.getData(),
+                                                                   v1.getData(),
+                                                                   v2.getData(),
+                                                                   size,
+                                                                   multiplicator1,
+                                                                   multiplicator2,
+                                                                   thisMultiplicator);
+      checkCudaDevice;
+   #else
+      tnlCudaSupportMissingMessage;;
+   #endif
+
+
+}
+
+
 #ifdef HAVE_CUDA
 template< typename Real,
           typename Index >
@@ -626,140 +717,250 @@ void tnlVectorOperations< tnlCuda >::computeExclusivePrefixSum( Vector& v,
  */
 extern template int         tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< int, tnlCuda, int >& v );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< float, tnlCuda, int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< double, tnlCuda, int >& v );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< int, tnlCuda, long int >& v );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< double, tnlCuda, long int >& v );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
 
 /****
  * Min
  */
 extern template int         tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< int, tnlCuda, int >& v );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< float, tnlCuda, int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< double, tnlCuda, int >& v );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< int, tnlCuda, long int >& v );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< double, tnlCuda, long int >& v );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
 
 /****
  * Abs max
  */
 extern template int         tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< int, tnlCuda, int >& v );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< float, tnlCuda, int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< double, tnlCuda, int >& v );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< int, tnlCuda, long int >& v );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< double, tnlCuda, long int >& v );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
 
 /****
  * Abs min
  */
 extern template int         tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< int, tnlCuda, int >& v );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< float, tnlCuda, int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< double, tnlCuda, int >& v );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< int, tnlCuda, long int >& v );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< double, tnlCuda, long int >& v );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
 
 /****
  * Lp norm
  */
 extern template int         tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< int, tnlCuda, int >& v, const int& p );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long int, tnlCuda, int >& v, const long int& p );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< float, tnlCuda, int >& v, const float& p );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< double, tnlCuda, int >& v, const double& p );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long double, tnlCuda, int >& v, const long double& p );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long double, tnlCuda, int >& v, const long double& p );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< int, tnlCuda, long int >& v, const int& p );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long int, tnlCuda, long int >& v, const long int& p );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< float, tnlCuda, long int >& v, const float& p );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< double, tnlCuda, long int >& v, const double& p );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long double, tnlCuda, long int >& v, const long double& p );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long double, tnlCuda, long int >& v, const long double& p );
+#endif
+#endif
 
 /****
  * Sum
  */
 extern template int         tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< int, tnlCuda, int >& v );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< float, tnlCuda, int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< double, tnlCuda, int >& v );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< int, tnlCuda, long int >& v );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< double, tnlCuda, long int >& v );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
 
 /****
  * Difference max
  */
 extern template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference min
  */
 extern template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference abs max
  */
 extern template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference abs min
  */
 extern template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
 
 #endif
 
diff --git a/src/core/vectors/tnlVectorOperationsHost_impl.h b/src/core/vectors/tnlVectorOperationsHost_impl.h
index cd3c2a17a17c1bf933d4bad46e2daa69404964f8..874397dc67def0089a347a577f816cbd787e8656 100644
--- a/src/core/vectors/tnlVectorOperationsHost_impl.h
+++ b/src/core/vectors/tnlVectorOperationsHost_impl.h
@@ -314,7 +314,7 @@ typename Vector1 :: RealType tnlVectorOperations< tnlHost > :: getScalarProduct(
    Real result = 0;
    const Index n = v1. getSize();
    for( Index i = 0; i < n; i ++ )
-      result += v1. getElement( i ) * v2. getElement( i );
+      result += v1[ i ] * v2[ i ];
    return result;
 }
 
@@ -339,26 +339,38 @@ void tnlVectorOperations< tnlHost > :: addVector( Vector1& y,
    else
       for( Index i = 0; i < n; i ++ )
          y[ i ] = thisMultiplicator * y[ i ] + alpha * x[ i ];
-
 }
 
-template< typename Vector1, typename Vector2 >
-void tnlVectorOperations< tnlHost > :: alphaXPlusBetaY( Vector1& y,
-                                                        const Vector2& x,
-                                                        const typename Vector1::RealType& alpha,
-                                                        const typename Vector1::RealType& beta )
+template< typename Vector1,
+          typename Vector2,
+          typename Vector3 >
+void
+tnlVectorOperations< tnlHost >::
+addVectors( Vector1& v,
+            const Vector2& v1,
+            const typename Vector2::RealType& multiplicator1,
+            const Vector3& v2,
+            const typename Vector3::RealType& multiplicator2,
+            const typename Vector1::RealType& thisMultiplicator )
 {
    typedef typename Vector1 :: RealType Real;
    typedef typename Vector1 :: IndexType Index;
 
-   tnlAssert( x. getSize() > 0,
-              cerr << "Vector name is " << x. getName() );
-   tnlAssert( x. getSize() == y. getSize(),
-              cerr << "Vector names are " << x. getName() << " and " << y. getName() );
+   tnlAssert( v.getSize() > 0,
+              cerr << "Vector name is " << v.getName() );
+   tnlAssert( v.getSize() == v1.getSize(),
+              cerr << "Vector names are " << v.getName() << " and " << v1.getName() );
+   tnlAssert( v.getSize() == v2.getSize(),
+              cerr << "Vector names are " << v.getName() << " and " << v2.getName() );
 
-   const Index n = y. getSize();
-   for( Index i = 0; i < n; i ++ )
-      y[ i ] = alpha * x[ i ] + beta * y[ i ];
+   
+   const Index n = v.getSize();
+   if( thisMultiplicator == 1.0 )
+      for( Index i = 0; i < n; i ++ )
+         v[ i ] += multiplicator1 * v1[ i ] + multiplicator2 * v2[ i ];
+   else
+      for( Index i = 0; i < n; i ++ )
+         v[ i ] = thisMultiplicator * v[ i ] + multiplicator1 * v1[ i ] + multiplicator2 * v2[ i ];
 }
 
 
@@ -443,141 +455,251 @@ void tnlVectorOperations< tnlHost >::computeExclusivePrefixSum( Vector& v,
  */
 extern template int         tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< int, tnlHost, int >& v );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< long int, tnlHost, int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< float, tnlHost, int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< double, tnlHost, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< long double, tnlHost, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< int, tnlHost, long int >& v );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< long int, tnlHost, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< float, tnlHost, long int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< double, tnlHost, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< long double, tnlHost, long int >& v );
+#endif
+#endif
 
 /****
  * Min
  */
 extern template int         tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< int, tnlHost, int >& v );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< long int, tnlHost, int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< float, tnlHost, int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< double, tnlHost, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< long double, tnlHost, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< int, tnlHost, long int >& v );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< long int, tnlHost, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< float, tnlHost, long int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< double, tnlHost, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< long double, tnlHost, long int >& v );
+#endif
+#endif
 
 /****
  * Abs max
  */
 extern template int         tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< int, tnlHost, int >& v );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< long int, tnlHost, int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< float, tnlHost, int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< double, tnlHost, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< long double, tnlHost, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< int, tnlHost, long int >& v );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< long int, tnlHost, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< float, tnlHost, long int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< double, tnlHost, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< long double, tnlHost, long int >& v );
+#endif
+#endif
 
 /****
  * Abs min
  */
 extern template int         tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< int, tnlHost, int >& v );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< long int, tnlHost, int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< float, tnlHost, int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< double, tnlHost, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< long double, tnlHost, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< int, tnlHost, long int >& v );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< long int, tnlHost, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< float, tnlHost, long int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< double, tnlHost, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< long double, tnlHost, long int >& v );
+#endif
+#endif
 
 /****
  * Lp norm
  */
 extern template int         tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< int, tnlHost, int >& v, const int& p );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< long int, tnlHost, int >& v, const long int& p );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< float, tnlHost, int >& v, const float& p );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< double, tnlHost, int >& v, const double& p );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< long double, tnlHost, int >& v, const long double& p );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< int, tnlHost, long int >& v, const int& p );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< long int, tnlHost, long int >& v, const long int& p );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< float, tnlHost, long int >& v, const float& p );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< double, tnlHost, long int >& v, const double& p );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< long double, tnlHost, long int >& v, const long double& p );
+#endif
+#endif
 
 /****
  * Sum
  */
 extern template int         tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< int, tnlHost, int >& v );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< long int, tnlHost, int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< float, tnlHost, int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< double, tnlHost, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< long double, tnlHost, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< int, tnlHost, long int >& v );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< long int, tnlHost, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< float, tnlHost, long int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< double, tnlHost, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< long double, tnlHost, long int >& v );
+#endif
+#endif
 
 /****
  * Difference max
  */
 extern template int         tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< int, tnlHost, int >& v1, const tnlVector< int, tnlHost, int >& v2 );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< long int, tnlHost, int >& v1, const tnlVector< long int, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< float, tnlHost, int >& v1,  const tnlVector< float, tnlHost, int >& v2);
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< double, tnlHost, int >& v1, const tnlVector< double, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< long double, tnlHost, int >& v1, const tnlVector< long double, tnlHost, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< int, tnlHost, long int >& v1, const tnlVector< int, tnlHost, long int >& v2 );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< long int, tnlHost, long int >& v1, const tnlVector< long int, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< float, tnlHost, long int >& v1, const tnlVector< float, tnlHost, long int >& v2 );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< double, tnlHost, long int >& v1, const tnlVector< double, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< long double, tnlHost, long int >& v1, const tnlVector< long double, tnlHost, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference min
  */
 extern template int         tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< int, tnlHost, int >& v1, const tnlVector< int, tnlHost, int >& v2 );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< long int, tnlHost, int >& v1, const tnlVector< long int, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< float, tnlHost, int >& v1,  const tnlVector< float, tnlHost, int >& v2);
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< double, tnlHost, int >& v1, const tnlVector< double, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< long double, tnlHost, int >& v1, const tnlVector< long double, tnlHost, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< int, tnlHost, long int >& v1, const tnlVector< int, tnlHost, long int >& v2 );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< long int, tnlHost, long int >& v1, const tnlVector< long int, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< float, tnlHost, long int >& v1, const tnlVector< float, tnlHost, long int >& v2 );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< double, tnlHost, long int >& v1, const tnlVector< double, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< long double, tnlHost, long int >& v1, const tnlVector< long double, tnlHost, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference abs max
  */
 extern template int         tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< int, tnlHost, int >& v1, const tnlVector< int, tnlHost, int >& v2 );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< long int, tnlHost, int >& v1, const tnlVector< long int, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< float, tnlHost, int >& v1,  const tnlVector< float, tnlHost, int >& v2);
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< double, tnlHost, int >& v1, const tnlVector< double, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlHost, int >& v1, const tnlVector< long double, tnlHost, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< int, tnlHost, long int >& v1, const tnlVector< int, tnlHost, long int >& v2 );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< long int, tnlHost, long int >& v1, const tnlVector< long int, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< float, tnlHost, long int >& v1, const tnlVector< float, tnlHost, long int >& v2 );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< double, tnlHost, long int >& v1, const tnlVector< double, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlHost, long int >& v1, const tnlVector< long double, tnlHost, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference abs min
  */
 extern template int         tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< int, tnlHost, int >& v1, const tnlVector< int, tnlHost, int >& v2 );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< long int, tnlHost, int >& v1, const tnlVector< long int, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< float, tnlHost, int >& v1,  const tnlVector< float, tnlHost, int >& v2);
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< double, tnlHost, int >& v1, const tnlVector< double, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlHost, int >& v1, const tnlVector< long double, tnlHost, int >& v2 );
+#endif
+
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< int, tnlHost, long int >& v1, const tnlVector< int, tnlHost, long int >& v2 );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< long int, tnlHost, long int >& v1, const tnlVector< long int, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< float, tnlHost, long int >& v1, const tnlVector< float, tnlHost, long int >& v2 );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< double, tnlHost, long int >& v1, const tnlVector< double, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlHost, long int >& v1, const tnlVector< long double, tnlHost, long int >& v2 );
-
+#endif
+#endif
 
 #endif
 
diff --git a/src/core/vectors/tnlVectorOperations_impl.cpp b/src/core/vectors/tnlVectorOperations_impl.cpp
index 23ee7d01ebd17cbb964fa32d7ec94c2207dfe0cf..dbd2275c17f405ec2733f8fb19018dafd2ced47f 100644
--- a/src/core/vectors/tnlVectorOperations_impl.cpp
+++ b/src/core/vectors/tnlVectorOperations_impl.cpp
@@ -24,282 +24,503 @@
  */
 template int         tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< int, tnlHost, int >& v );
 template long int    tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< long int, tnlHost, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< float, tnlHost, int >& v );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< double, tnlHost, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< long double, tnlHost, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< int, tnlHost, long int >& v );
 template long int    tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< long int, tnlHost, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< float, tnlHost, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< double, tnlHost, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< long double, tnlHost, long int >& v );
+#endif
+#endif
 
 /****
  * Min
  */
 template int         tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< int, tnlHost, int >& v );
 template long int    tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< long int, tnlHost, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< float, tnlHost, int >& v );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< double, tnlHost, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< long double, tnlHost, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< int, tnlHost, long int >& v );
 template long int    tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< long int, tnlHost, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< float, tnlHost, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< double, tnlHost, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< long double, tnlHost, long int >& v );
+#endif
+#endif
 
 /****
  * Abs max
  */
 template int         tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< int, tnlHost, int >& v );
 template long int    tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< long int, tnlHost, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< float, tnlHost, int >& v );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< double, tnlHost, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< long double, tnlHost, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< int, tnlHost, long int >& v );
 template long int    tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< long int, tnlHost, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< float, tnlHost, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< double, tnlHost, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< long double, tnlHost, long int >& v );
+#endif
+#endif
 
 /****
  * Abs min
  */
 template int         tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< int, tnlHost, int >& v );
 template long int    tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< long int, tnlHost, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< float, tnlHost, int >& v );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< double, tnlHost, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< long double, tnlHost, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< int, tnlHost, long int >& v );
 template long int    tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< long int, tnlHost, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< float, tnlHost, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< double, tnlHost, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< long double, tnlHost, long int >& v );
+#endif
+#endif
 
 /****
  * Lp norm
  */
 template int         tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< int, tnlHost, int >& v, const int& p );
 template long int    tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< long int, tnlHost, int >& v, const long int& p );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< float, tnlHost, int >& v, const float& p );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< double, tnlHost, int >& v, const double& p );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< long double, tnlHost, int >& v, const long double& p );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< int, tnlHost, long int >& v, const int& p );
 template long int    tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< long int, tnlHost, long int >& v, const long int& p );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< float, tnlHost, long int >& v, const float& p );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< double, tnlHost, long int >& v, const double& p );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< long double, tnlHost, long int >& v, const long double& p );
+#endif
+#endif
 
 /****
  * Sum
  */
 template int         tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< int, tnlHost, int >& v );
 template long int    tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< long int, tnlHost, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< float, tnlHost, int >& v );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< double, tnlHost, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< long double, tnlHost, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< int, tnlHost, long int >& v );
 template long int    tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< long int, tnlHost, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< float, tnlHost, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< double, tnlHost, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< long double, tnlHost, long int >& v );
+#endif
+#endif
 
 /****
  * Difference max
  */
 template int         tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< int, tnlHost, int >& v1, const tnlVector< int, tnlHost, int >& v2 );
 template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< long int, tnlHost, int >& v1, const tnlVector< long int, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< float, tnlHost, int >& v1,  const tnlVector< float, tnlHost, int >& v2);
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< double, tnlHost, int >& v1, const tnlVector< double, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< long double, tnlHost, int >& v1, const tnlVector< long double, tnlHost, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< int, tnlHost, long int >& v1, const tnlVector< int, tnlHost, long int >& v2 );
 template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< long int, tnlHost, long int >& v1, const tnlVector< long int, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< float, tnlHost, long int >& v1, const tnlVector< float, tnlHost, long int >& v2 );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< double, tnlHost, long int >& v1, const tnlVector< double, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< long double, tnlHost, long int >& v1, const tnlVector< long double, tnlHost, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference min
  */
 template int         tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< int, tnlHost, int >& v1, const tnlVector< int, tnlHost, int >& v2 );
 template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< long int, tnlHost, int >& v1, const tnlVector< long int, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< float, tnlHost, int >& v1,  const tnlVector< float, tnlHost, int >& v2);
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< double, tnlHost, int >& v1, const tnlVector< double, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< long double, tnlHost, int >& v1, const tnlVector< long double, tnlHost, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< int, tnlHost, long int >& v1, const tnlVector< int, tnlHost, long int >& v2 );
 template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< long int, tnlHost, long int >& v1, const tnlVector< long int, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< float, tnlHost, long int >& v1, const tnlVector< float, tnlHost, long int >& v2 );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< double, tnlHost, long int >& v1, const tnlVector< double, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< long double, tnlHost, long int >& v1, const tnlVector< long double, tnlHost, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference abs max
  */
 template int         tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< int, tnlHost, int >& v1, const tnlVector< int, tnlHost, int >& v2 );
 template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< long int, tnlHost, int >& v1, const tnlVector< long int, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< float, tnlHost, int >& v1,  const tnlVector< float, tnlHost, int >& v2);
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< double, tnlHost, int >& v1, const tnlVector< double, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlHost, int >& v1, const tnlVector< long double, tnlHost, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< int, tnlHost, long int >& v1, const tnlVector< int, tnlHost, long int >& v2 );
 template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< long int, tnlHost, long int >& v1, const tnlVector< long int, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< float, tnlHost, long int >& v1, const tnlVector< float, tnlHost, long int >& v2 );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< double, tnlHost, long int >& v1, const tnlVector< double, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlHost, long int >& v1, const tnlVector< long double, tnlHost, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference abs min
  */
 template int         tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< int, tnlHost, int >& v1, const tnlVector< int, tnlHost, int >& v2 );
 template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< long int, tnlHost, int >& v1, const tnlVector< long int, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< float, tnlHost, int >& v1,  const tnlVector< float, tnlHost, int >& v2);
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< double, tnlHost, int >& v1, const tnlVector< double, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlHost, int >& v1, const tnlVector< long double, tnlHost, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< int, tnlHost, long int >& v1, const tnlVector< int, tnlHost, long int >& v2 );
 template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< long int, tnlHost, long int >& v1, const tnlVector< long int, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< float, tnlHost, long int >& v1, const tnlVector< float, tnlHost, long int >& v2 );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< double, tnlHost, long int >& v1, const tnlVector< double, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlHost, long int >& v1, const tnlVector< long double, tnlHost, long int >& v2 );
-
+#endif
+#endif
 
 /****
  * Max
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< int, tnlCuda, int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< float, tnlCuda, int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< int, tnlCuda, long int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
+
 
 /****
  * Min
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< int, tnlCuda, int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< float, tnlCuda, int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< int, tnlCuda, long int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
 
 /****
  * Abs max
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< int, tnlCuda, int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< float, tnlCuda, int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< int, tnlCuda, long int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
 
 /****
  * Abs min
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< int, tnlCuda, int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< float, tnlCuda, int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< int, tnlCuda, long int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
 
 /****
  * Lp norm
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< int, tnlCuda, int >& v, const int& p );
 template long int    tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long int, tnlCuda, int >& v, const long int& p );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< float, tnlCuda, int >& v, const float& p );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< double, tnlCuda, int >& v, const double& p );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long double, tnlCuda, int >& v, const long double& p );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< int, tnlCuda, long int >& v, const int& p );
 template long int    tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long int, tnlCuda, long int >& v, const long int& p );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< float, tnlCuda, long int >& v, const float& p );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< double, tnlCuda, long int >& v, const double& p );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long double, tnlCuda, long int >& v, const long double& p );
+#endif
+#endif
 
 /****
  * Sum
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< int, tnlCuda, int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< float, tnlCuda, int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< int, tnlCuda, long int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
+
 
 /****
  * Difference max
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
+
 
 /****
  * Difference min
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference abs max
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference abs min
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
-
+#endif
+#endif
 
 #endif
 
diff --git a/src/core/vectors/tnlVectorOperations_impl.cu b/src/core/vectors/tnlVectorOperations_impl.cu
index 6ad481eef135afc207929c3cf70f22f7ee99b460..b8af43c24dba48c7b2775a4c27eba77a1a77b6fa 100644
--- a/src/core/vectors/tnlVectorOperations_impl.cu
+++ b/src/core/vectors/tnlVectorOperations_impl.cu
@@ -24,140 +24,253 @@
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< int, tnlCuda, int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< float, tnlCuda, int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< double, tnlCuda, int >& v );
-//template long double tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< int, tnlCuda, long int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< double, tnlCuda, long int >& v );
-//template long double tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
 
 /****
  * Min
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< int, tnlCuda, int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< float, tnlCuda, int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< double, tnlCuda, int >& v );
-//template long double tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< int, tnlCuda, long int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< double, tnlCuda, long int >& v );
-//template long double tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
 
 /****
  * Abs max
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< int, tnlCuda, int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< float, tnlCuda, int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< double, tnlCuda, int >& v );
-//template long double tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< int, tnlCuda, long int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< double, tnlCuda, long int >& v );
-//template long double tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
+
 
 /****
  * Abs min
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< int, tnlCuda, int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< float, tnlCuda, int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< double, tnlCuda, int >& v );
-//template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< int, tnlCuda, long int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< double, tnlCuda, long int >& v );
-//template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
+
 
 /****
  * Lp norm
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< int, tnlCuda, int >& v, const int& p );
 template long int    tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long int, tnlCuda, int >& v, const long int& p );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< float, tnlCuda, int >& v, const float& p );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< double, tnlCuda, int >& v, const double& p );
-//template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long double, tnlCuda, int >& v, const long double& p );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long double, tnlCuda, int >& v, const long double& p );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< int, tnlCuda, long int >& v, const int& p );
 template long int    tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long int, tnlCuda, long int >& v, const long int& p );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< float, tnlCuda, long int >& v, const float& p );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< double, tnlCuda, long int >& v, const double& p );
-//template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long double, tnlCuda, long int >& v, const long double& p );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long double, tnlCuda, long int >& v, const long double& p );
+#endif
+#endif
 
 /****
  * Sum
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< int, tnlCuda, int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< float, tnlCuda, int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< double, tnlCuda, int >& v );
-//template long double tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< int, tnlCuda, long int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< double, tnlCuda, long int >& v );
-//template long double tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
 
 /****
  * Difference max
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
-//template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
-//template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference min
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
-//template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
-//template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference abs max
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
-//template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
-//template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
+
 
 /****
  * Difference abs min
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
-//template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
-//template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
         
 #endif
- 
\ No newline at end of file
+ 
diff --git a/src/core/vectors/tnlVector_impl.cpp b/src/core/vectors/tnlVector_impl.cpp
index e4515ae77448d5f5ca3fe858ef048208b8a0ae54..8d8df49b4937ae9da2c12a8b97b4b568e616124b 100644
--- a/src/core/vectors/tnlVector_impl.cpp
+++ b/src/core/vectors/tnlVector_impl.cpp
@@ -19,12 +19,25 @@
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlVector< float, tnlHost, int >;
 template tnlVector< float, tnlHost, int >& tnlVector< float, tnlHost, int >:: operator = ( const tnlVector< double, tnlHost, int >& vector );
+#endif
+
 
 template class tnlVector< double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlVector< long double, tnlHost, int >;
+#endif
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlVector< float, tnlHost, long int >;
+#endif
 template class tnlVector< double, tnlHost, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlVector< long double, tnlHost, long int >;
+#endif
+#endif
 
 #endif
 
diff --git a/src/core/vectors/tnlVector_impl.cu b/src/core/vectors/tnlVector_impl.cu
index 9daa8dcc9ce9b0b5b5b1a766209e09a520449d65..f5406187ec7ae03f628f98782131a6fe9eef02ca 100644
--- a/src/core/vectors/tnlVector_impl.cu
+++ b/src/core/vectors/tnlVector_impl.cu
@@ -20,10 +20,23 @@
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
 #ifdef HAVE_CUDA
+#ifdef INSTANTIATE_FLOAT
 template class tnlVector< float, tnlCuda, int >;
+#endif
 template class tnlVector< double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlVector< long double, tnlCuda, int >;
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlVector< float, tnlCuda, long int >;
+#endif
 template class tnlVector< double, tnlCuda, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlVector< long double, tnlCuda, long int >;
+#endif
+#endif
 #endif
 
 #endif
diff --git a/src/core/vectors/tnlVector_impl.h b/src/core/vectors/tnlVector_impl.h
index 636a80320344fdde2dc3e0d3beb4af3d00147a80..2845413c4cd7360c4dea5377c301dc96fb0fc98d 100644
--- a/src/core/vectors/tnlVector_impl.h
+++ b/src/core/vectors/tnlVector_impl.h
@@ -145,7 +145,7 @@ template< typename Real,
    template< typename Vector >
 tnlVector< Real, Device, Index >& tnlVector< Real, Device, Index > :: operator -= ( const Vector& vector )
 {
-   alphaXPlusBetaY( -1.0, vector, 1.0 );
+   this->addVector( vector, -1.0 );
    return *this;
 }
 
@@ -155,7 +155,7 @@ template< typename Real,
    template< typename Vector >
 tnlVector< Real, Device, Index >& tnlVector< Real, Device, Index > :: operator += ( const Vector& vector )
 {
-   alphaXPlusBetaY( 1.0, vector, 1.0 );
+   this->addVector( vector );
    return *this;
 }
 
@@ -301,35 +301,15 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vector >
-void tnlVector< Real, Device, Index > :: alphaXPlusBetaY( const Real& alpha,
-                                                          const Vector& x,
-                                                          const Real& beta )
-{
-   tnlVectorOperations< Device > :: alphaXPlusBetaY( *this, x, alpha, beta );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Vector >
-void tnlVector< Real, Device, Index > :: alphaXPlusBetaZ( const Real& alpha,
-                                                          const Vector& x,
-                                                          const Real& beta,
-                                                          const Vector& z )
-{
-   tnlVectorOperations< Device > :: alphaXPlusBetaZ( *this, x, alpha, z, beta );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Vector >
-void tnlVector< Real, Device, Index > :: alphaXPlusBetaZPlusY( const Real& alpha,
-                                                    const Vector& x,
-                                                    const Real& beta,
-                                                    const Vector& z )
-{
-   tnlVectorOperations< Device > :: alphaXPlusBetaZPlusY( *this, x, alpha, z, beta );
+void
+tnlVector< Real, Device, Index >::
+addVectors( const Vector& v1,
+            const Real& multiplicator1,
+            const Vector& v2,
+            const Real& multiplicator2,
+            const Real& thisMultiplicator )
+{
+   tnlVectorOperations< Device >::addVectors( *this, v1, multiplicator1, v2, multiplicator2, thisMultiplicator );
 }
 
 template< typename Real,
@@ -369,19 +349,44 @@ void tnlVector< Real, Device, Index > :: computeExclusivePrefixSum( const IndexT
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlVector< float, tnlHost, int >;
 extern template tnlVector< float, tnlHost, int >& tnlVector< float, tnlHost, int >:: operator = ( const tnlVector< double, tnlHost, int >& vector );
+#endif
 
 extern template class tnlVector< double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlVector< long double, tnlHost, int >;
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlVector< float, tnlHost, long int >;
+#endif
 extern template class tnlVector< double, tnlHost, long int >;
-
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlVector< long double, tnlHost, long int >;
+#endif
+#endif
 
 #ifdef HAVE_CUDA
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlVector< float, tnlCuda, int >;
+#endif
 extern template class tnlVector< double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlVector< long double, tnlCuda, int >;
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlVector< float, tnlCuda, long int >;
+#endif
 extern template class tnlVector< double, tnlCuda, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlVector< long double, tnlCuda, long int >;
+#endif
+#endif
 #endif
 
 #endif
diff --git a/src/functions/CMakeLists.txt b/src/functors/CMakeLists.txt
similarity index 77%
rename from src/functions/CMakeLists.txt
rename to src/functors/CMakeLists.txt
index e259ff2cb349e54812b9e0ec2e3204615c0fc9ff..2b53b1a43d10974ccd9cb3cdaee2c7fc7f37dc74 100755
--- a/src/functions/CMakeLists.txt
+++ b/src/functors/CMakeLists.txt
@@ -2,7 +2,9 @@ ADD_SUBDIRECTORY( initial_conditions )
 
 SET( headers tnlFunctionDiscretizer.h
              tnlFunctionDiscretizer_impl.h
-             tnlFunctionAdapter.h
+             tnlFunctionEnumerator.h
+             tnlFunctionEnumerator_impl.h
+             tnlFunctorAdapter.h
              tnlConstantFunction.h
              tnlConstantFunction_impl.h
              tnlExpBumpFunction.h
@@ -15,19 +17,19 @@ SET( headers tnlFunctionDiscretizer.h
              tnlFunctionType.h
              tnlTestFunction_impl.h )
 
-SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/functions )
+SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/functors )
 set( common_SOURCES
      ${CURRENT_DIR}/tnlTestFunction_impl.cpp )       
 
 IF( BUILD_CUDA )
-   set( tnl_functions_CUDA__SOURCES
+   set( tnl_functors_CUDA__SOURCES
         ${common_SOURCES} 
         ${CURRENT_DIR}/tnlTestFunction_impl.cu
         PARENT_SCOPE )
 ENDIF()    
 
-set( tnl_functions_SOURCES     
+set( tnl_functors_SOURCES     
      ${common_SOURCES}
      PARENT_SCOPE )
         
-INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/functions )
+INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/functors )
diff --git a/src/functions/tnlConstantFunction.h b/src/functors/tnlConstantFunction.h
similarity index 94%
rename from src/functions/tnlConstantFunction.h
rename to src/functors/tnlConstantFunction.h
index bd64f268a4292edd54578ab45c8425cf6ec7365d..df85597ef742b7d5eb9cc9877b99ecb3d6429ba0 100644
--- a/src/functions/tnlConstantFunction.h
+++ b/src/functors/tnlConstantFunction.h
@@ -20,7 +20,7 @@
 
 #include <iostream>
 #include <core/vectors/tnlStaticVector.h>
-#include <functions/tnlFunctionType.h>
+#include <functors/tnlFunctionType.h>
 
 template< int FunctionDimensions,
           typename Real = double >
@@ -55,16 +55,12 @@ class tnlConstantFunction
              int ZDiffOrder = 0,
              typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
    RealType getValue( const Vertex& v,
                       const Real& time = 0.0 ) const;
 
    template< typename Vertex >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
    RealType getValue( const Vertex& v,
                       const Real& time = 0.0 ) const
    {
@@ -93,6 +89,6 @@ class tnlFunctionType< tnlConstantFunction< FunctionDimensions, Real > >
       enum { Type = tnlAnalyticFunction };
 };
 
-#include <functions/tnlConstantFunction_impl.h>
+#include <functors/tnlConstantFunction_impl.h>
 
 #endif /* TNLCONSTANTFUNCTION_H_ */
diff --git a/src/functions/tnlConstantFunction_impl.h b/src/functors/tnlConstantFunction_impl.h
similarity index 100%
rename from src/functions/tnlConstantFunction_impl.h
rename to src/functors/tnlConstantFunction_impl.h
diff --git a/src/functions/tnlExpBumpFunction.h b/src/functors/tnlExpBumpFunction.h
similarity index 88%
rename from src/functions/tnlExpBumpFunction.h
rename to src/functors/tnlExpBumpFunction.h
index 14ed47cedce92cef5a5b85e19795af4517ee4526..1159f1849f3bd4bb911a321f8843c70312fe3efa 100644
--- a/src/functions/tnlExpBumpFunction.h
+++ b/src/functors/tnlExpBumpFunction.h
@@ -20,7 +20,7 @@
 
 #include <config/tnlParameterContainer.h>
 #include <core/vectors/tnlStaticVector.h>
-#include <functions/tnlFunctionType.h>
+#include <functors/tnlFunctionType.h>
 
 template< typename Real >
 class tnlExpBumpFunctionBase
@@ -75,11 +75,8 @@ class tnlExpBumpFunction< 1, Real > : public tnlExpBumpFunctionBase< Real >
                 int ZDiffOrder = 0,
                 typename Vertex = VertexType >
 #endif   
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
-      RealType getValue( const Vertex& v,
-                         const Real& time = 0.0 ) const;
+   __cuda_callable__ RealType getValue( const Vertex& v,
+                                        const Real& time = 0.0 ) const;
 };
 
 template< typename Real >
@@ -106,11 +103,8 @@ class tnlExpBumpFunction< 2, Real > : public tnlExpBumpFunctionBase< Real >
                 int ZDiffOrder = 0,
                 typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
-      RealType getValue( const Vertex& v,
-                         const Real& time = 0.0 ) const;
+   __cuda_callable__ RealType getValue( const Vertex& v,
+                                        const Real& time = 0.0 ) const;
 };
 
 template< typename Real >
@@ -137,11 +131,8 @@ class tnlExpBumpFunction< 3, Real > : public tnlExpBumpFunctionBase< Real >
                 int ZDiffOrder = 0,
                 typename Vertex = VertexType >
 #endif   
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
-      RealType getValue( const Vertex& v,
-                         const Real& time = 0.0 ) const;
+   __cuda_callable__    RealType getValue( const Vertex& v,
+                                           const Real& time = 0.0 ) const;
 };
 
 template< int Dimensions,
@@ -162,7 +153,7 @@ class tnlFunctionType< tnlExpBumpFunction< FunctionDimensions, Real > >
 };
 
 
-#include <functions/tnlExpBumpFunction_impl.h>
+#include <functors/tnlExpBumpFunction_impl.h>
 
 
 #endif /* TNLEXPBUMPFUNCTION_H_ */
diff --git a/src/functions/tnlExpBumpFunction_impl.h b/src/functors/tnlExpBumpFunction_impl.h
similarity index 97%
rename from src/functions/tnlExpBumpFunction_impl.h
rename to src/functors/tnlExpBumpFunction_impl.h
index d00ebae93f1959ee54de7147fc0e455027c279a2..dde9a973aad27eb274131d8ca406e651abfca960 100644
--- a/src/functions/tnlExpBumpFunction_impl.h
+++ b/src/functors/tnlExpBumpFunction_impl.h
@@ -18,7 +18,7 @@
 #ifndef TNLEXPBUMPFUNCTION_IMPL_H_
 #define TNLEXPBUMPFUNCTION_IMPL_H_
 
-#include <functions/tnlExpBumpFunction.h>
+#include <functors/tnlExpBumpFunction.h>
 
 template< typename Real >
 bool
@@ -76,9 +76,7 @@ template< typename Real >
              int YDiffOrder,
              int ZDiffOrder,
              typename Vertex >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlExpBumpFunction< 1, Real >::getValue( const Vertex& v,
                                          const Real& time ) const
@@ -116,9 +114,7 @@ template< typename Real >
              int YDiffOrder,
              int ZDiffOrder,
              typename Vertex >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlExpBumpFunction< 2, Real >::
 getValue( const Vertex& v,
@@ -164,9 +160,7 @@ template< typename Real >
              int YDiffOrder,
              int ZDiffOrder,
              typename Vertex >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlExpBumpFunction< 3, Real >::
 getValue( const Vertex& v,
diff --git a/src/functions/tnlFunctionDiscretizer.h b/src/functors/tnlFunctionDiscretizer.h
similarity index 97%
rename from src/functions/tnlFunctionDiscretizer.h
rename to src/functors/tnlFunctionDiscretizer.h
index dfdef2112fcbcb7ccd91f0dfdf3f299a5e1a5ded..a060328b93c742e695e50a0558b1e31379be156d 100644
--- a/src/functions/tnlFunctionDiscretizer.h
+++ b/src/functors/tnlFunctionDiscretizer.h
@@ -48,6 +48,6 @@ class tnlFunctionDiscretizer
    
 };
 
-#include <functions/tnlFunctionDiscretizer_impl.h>
+#include <functors/tnlFunctionDiscretizer_impl.h>
 
 #endif /* TNLFUNCTIONDISCRETIZER_H_ */
diff --git a/src/functions/tnlFunctionDiscretizer_impl.h b/src/functors/tnlFunctionDiscretizer_impl.h
similarity index 100%
rename from src/functions/tnlFunctionDiscretizer_impl.h
rename to src/functors/tnlFunctionDiscretizer_impl.h
diff --git a/src/functors/tnlFunctionEnumerator.h b/src/functors/tnlFunctionEnumerator.h
new file mode 100644
index 0000000000000000000000000000000000000000..2e25baf478eee26469c56ad03e9e68f6883e6518
--- /dev/null
+++ b/src/functors/tnlFunctionEnumerator.h
@@ -0,0 +1,203 @@
+/***************************************************************************
+                          tnlFunctionEnumerator.h  -  description
+                             -------------------
+    begin                : Mar 5, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+#ifndef SRC_FUNCTIONS_TNLFUNCTIONENUMERATOR_H_
+#define SRC_FUNCTIONS_TNLFUNCTIONENUMERATOR_H_
+
+#include <functors/tnlFunctorAdapter.h>
+
+template< typename Function,
+          typename DofVector >
+class tnlFunctionEnumeratorTraverserUserData
+{
+   public:
+
+      typedef typename DofVector::RealType RealType;
+
+      const RealType *time;
+
+      const Function* function;
+
+      DofVector *u;
+
+      const RealType* functionCoefficient;
+
+      const RealType* dofVectorCoefficient;
+
+      tnlFunctionEnumeratorTraverserUserData( const RealType& time,
+                                              const Function& function,
+                                              DofVector& u,
+                                              const RealType& functionCoefficient,
+                                              const RealType& dofVectorCoefficient )
+      : time( &time ),
+        function( &function ),
+        u( &u ),
+        functionCoefficient( &functionCoefficient ),
+        dofVectorCoefficient( &dofVectorCoefficient )
+      {};
+};
+
+
+template< typename Mesh,
+          typename Function,
+          typename DofVector >
+class tnlFunctionEnumerator
+{
+   public:
+      typedef Mesh MeshType;
+      typedef typename DofVector::RealType RealType;
+      typedef typename DofVector::DeviceType DeviceType;
+      typedef typename DofVector::IndexType IndexType;
+      typedef tnlFunctionEnumeratorTraverserUserData< Function,
+                                                      DofVector > TraverserUserData;
+
+      template< int EntityDimensions >
+      void enumerate( const MeshType& mesh,
+                      const Function& function,
+                      DofVector& u,
+                      const RealType& functionCoefficient = 1.0,
+                      const RealType& dofVectorCoefficient = 0.0,
+                      const RealType& time = 0.0 ) const;
+
+
+      class TraverserEntitiesProcessor
+      {
+         public:
+
+            template< int EntityDimensions >
+#ifdef HAVE_CUDA
+            __host__ __device__
+#endif
+            static void processEntity( const MeshType& mesh,
+                                       TraverserUserData& userData,
+                                       const IndexType index )
+            {
+               typedef tnlFunctorAdapter< MeshType, Function > FunctionAdapter;
+               if( ! *userData.dofVectorCoefficient  )
+                  ( *userData.u )[ index ] =
+                     ( *userData.functionCoefficient ) * FunctionAdapter::getValue( mesh,
+                                                                                    *userData.function,
+                                                                                    index,
+                                                                                    *userData.time );
+               else                                                                                            
+                 ( *userData.u )[ index ] =
+                             ( *userData.dofVectorCoefficient ) * ( *userData.u )[ index ] +
+                             ( *userData.functionCoefficient ) * FunctionAdapter::getValue( mesh,
+                                                                                            *userData.function,
+                                                                                            index,
+                                                                                            *userData.time );
+            }
+
+      };
+
+};
+
+template< int Dimensions,
+          typename Real,
+          typename Device,
+          typename Index,
+          typename Function,
+          typename DofVector >
+class tnlFunctionEnumerator< tnlGrid< Dimensions, Real, Device, Index >,
+                             Function,
+                             DofVector >
+{
+   public:
+
+      typedef tnlGrid< Dimensions, Real, Device, Index > MeshType;
+      typedef typename MeshType::RealType RealType;
+      typedef typename MeshType::DeviceType DeviceType;
+      typedef typename MeshType::IndexType IndexType;
+      typedef typename MeshType::CoordinatesType CoordinatesType;
+      typedef tnlFunctionEnumeratorTraverserUserData< Function,
+                                                      DofVector > TraverserUserData;
+
+      template< int EntityDimensions >
+      void enumerate( const MeshType& mesh,
+                      const Function& function,
+                      DofVector& u,
+                      const RealType& functionCoefficient = 1.0,
+                      const RealType& dofVectorCoefficient = 0.0,
+                      const RealType& time = 0.0 ) const;
+
+      class TraverserEntitiesProcessor
+      {
+         public:
+
+         typedef typename MeshType::VertexType VertexType;
+
+#ifdef HAVE_CUDA
+            __host__ __device__
+#endif
+            static void processCell( const MeshType& mesh,
+                                     TraverserUserData& userData,
+                                     const IndexType index,
+                                     const CoordinatesType& coordinates )
+            {
+               //printf( "Enumerator::processCell mesh =%p \n", &mesh );
+               typedef tnlFunctorAdapter< MeshType, Function > FunctionAdapter;
+               if( ! ( *userData.dofVectorCoefficient ) )
+                  ( *userData.u )[ index ] =
+                     ( *userData.functionCoefficient ) * FunctionAdapter::getValue( mesh,
+                                                                                    *userData.function,
+                                                                                    index,
+                                                                                    coordinates,
+                                                                                    *userData.time );
+               else
+                  ( *userData.u )[ index ] =
+                           ( *userData.dofVectorCoefficient ) * ( *userData.u )[ index ] +
+                           ( *userData.functionCoefficient ) * FunctionAdapter::getValue( mesh,
+                                                                                          *userData.function,
+                                                                                          index,
+                                                                                          coordinates,
+                                                                                          *userData.time );
+
+            }
+
+#ifdef HAVE_CUDA
+            __host__ __device__
+#endif
+            static void processFace( const MeshType& mesh,
+                                     TraverserUserData& userData,
+                                     const IndexType index,
+                                     const CoordinatesType& coordinates )
+            {
+               typedef tnlFunctorAdapter< MeshType, Function > FunctionAdapter;
+               if( ! ( *userData.dofVectorCoefficient ) )
+                  ( *userData.u )[ index ] =
+                     ( *userData.functionCoefficient ) * FunctionAdapter::getValue( mesh,
+                                                                                    *userData.function,
+                                                                                    index,
+                                                                                    coordinates,
+                                                                                    *userData.time );
+               else
+                  ( *userData.u )[ index ] =
+                           ( *userData.dofVectorCoefficient ) * ( *userData.u )[ index ] +
+                           ( *userData.functionCoefficient ) * FunctionAdapter::getValue( mesh,
+                                                                                          *userData.function,
+                                                                                          index,
+                                                                                          coordinates,
+                                                                                          *userData.time );
+            }
+      };
+
+};
+
+#include <functors/tnlFunctionEnumerator_impl.h>
+
+
+
+#endif /* SRC_FUNCTIONS_TNLFUNCTIONENUMERATOR_H_ */
diff --git a/src/functors/tnlFunctionEnumerator_impl.h b/src/functors/tnlFunctionEnumerator_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..4e3d824b9d8aa5c50e2377a3702d9d4081f460b0
--- /dev/null
+++ b/src/functors/tnlFunctionEnumerator_impl.h
@@ -0,0 +1,143 @@
+/***************************************************************************
+                          tnlFunctionEnumerator_impl.h  -  description
+                             -------------------
+    begin                : Mar 5, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+#ifndef SRC_FUNCTIONS_TNLFUNCTIONENUMERATOR_IMPL_H_
+#define SRC_FUNCTIONS_TNLFUNCTIONENUMERATOR_IMPL_H_
+
+#include <functors/tnlFunctionEnumerator.h>
+#include <mesh/tnlTraverser_Grid1D.h>
+#include <mesh/tnlTraverser_Grid2D.h>
+#include <mesh/tnlTraverser_Grid3D.h>
+
+template< typename Mesh,
+          typename Function,
+          typename DofVector >
+   template< int EntityDimensions >
+void
+tnlFunctionEnumerator< Mesh, Function, DofVector >::
+enumerate( const MeshType& mesh,
+           const Function& function,
+           DofVector& u,
+           const RealType& functionCoefficient,
+           const RealType& dofVectorCoefficient,
+           const RealType& time ) const
+
+{
+   if( DeviceType::DeviceType == tnlHostDevice )
+   {
+      TraverserUserData userData( time, function, u, functionCoefficient, dofVectorCoefficient );
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+
+   }
+   if( DeviceType::DeviceType == tnlCudaDevice )
+   {
+      RealType* kernelTime = tnlCuda::passToDevice( time );
+      Function* kernelFunction = tnlCuda::passToDevice( function );
+      DofVector* kernelU = tnlCuda::passToDevice( u );
+      RealType* kernelFunctionCoefficient = tnlCuda::passToDevice( functionCoefficient );
+      RealType* kernelDofVectorCoefficient = tnlCuda::passToDevice( dofVectorCoefficient );
+      TraverserUserData userData( *kernelTime, *kernelFunction, *kernelU, *kernelFunctionCoefficient, *kernelDofVectorCoefficient );
+      checkCudaDevice;
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+
+      checkCudaDevice;
+      tnlCuda::freeFromDevice( kernelTime );
+      tnlCuda::freeFromDevice( kernelFunction );
+      tnlCuda::freeFromDevice( kernelU );
+      tnlCuda::freeFromDevice( kernelFunctionCoefficient );
+      tnlCuda::freeFromDevice( kernelDofVectorCoefficient );
+      checkCudaDevice;
+   }
+}
+
+template< int Dimensions,
+          typename Real,
+          typename Device,
+          typename Index,
+          typename Function,
+          typename DofVector >
+   template< int EntityDimensions >
+void
+tnlFunctionEnumerator< tnlGrid< Dimensions, Real, Device, Index >, Function, DofVector  >::
+enumerate( const tnlGrid< Dimensions, Real, Device, Index >& mesh,
+           const Function& function,
+           DofVector& u,
+           const RealType& functionCoefficient,
+           const RealType& dofVectorCoefficient,
+           const RealType& time ) const
+{
+   if( ( tnlDeviceEnum ) DeviceType::DeviceType == tnlHostDevice )
+   {
+      TraverserUserData userData( time, function, u, functionCoefficient, dofVectorCoefficient );
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+
+   }
+   if( ( tnlDeviceEnum ) DeviceType::DeviceType == tnlCudaDevice )
+   {
+      RealType* kernelTime = tnlCuda::passToDevice( time );
+      Function* kernelFunction = tnlCuda::passToDevice( function );
+      DofVector* kernelU = tnlCuda::passToDevice( u );
+      RealType* kernelFunctionCoefficient = tnlCuda::passToDevice( functionCoefficient );
+      RealType* kernelDofVectorCoefficient = tnlCuda::passToDevice( dofVectorCoefficient );
+      TraverserUserData userData( *kernelTime, *kernelFunction, *kernelU, *kernelFunctionCoefficient, *kernelDofVectorCoefficient );
+      checkCudaDevice;
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+
+      checkCudaDevice;
+      tnlCuda::freeFromDevice( kernelTime );
+      tnlCuda::freeFromDevice( kernelFunction );
+      tnlCuda::freeFromDevice( kernelU );
+      tnlCuda::freeFromDevice( kernelFunctionCoefficient );
+      tnlCuda::freeFromDevice( kernelDofVectorCoefficient );
+      checkCudaDevice;
+   }
+}
+
+
+
+#endif /* SRC_FUNCTIONS_TNLFUNCTIONENUMERATOR_IMPL_H_ */
diff --git a/src/functions/tnlFunctionType.h b/src/functors/tnlFunctionType.h
similarity index 100%
rename from src/functions/tnlFunctionType.h
rename to src/functors/tnlFunctionType.h
diff --git a/src/functions/tnlFunctionAdapter.h b/src/functors/tnlFunctorAdapter.h
similarity index 84%
rename from src/functions/tnlFunctionAdapter.h
rename to src/functors/tnlFunctorAdapter.h
index f687b384b5fd8c56e0f1b7259f3a112012b5e398..3d6a96df58f61a84357cb6927487881816b5ef3f 100644
--- a/src/functions/tnlFunctionAdapter.h
+++ b/src/functors/tnlFunctorAdapter.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          tnlFunctionAdapter.h  -  description
+                          tnlFunctorAdapter.h  -  description
                              -------------------
     begin                : Nov 28, 2014
     copyright            : (C) 2014 by Tomas Oberhuber
@@ -15,16 +15,16 @@
  *                                                                         *
  ***************************************************************************/
 
-#ifndef TNLFUNCTIONADAPTER_H_
-#define TNLFUNCTIONADAPTER_H_
+#ifndef tnlFunctorAdapter_H_
+#define tnlFunctorAdapter_H_
 
-#include <functions/tnlConstantFunction.h>
-#include <functions/tnlFunctionType.h>
+#include <functors/tnlConstantFunction.h>
+#include <functors/tnlFunctionType.h>
 
 template< typename Mesh,
           typename Function,
           int FunctionType = tnlFunctionType< Function >::Type >
-class tnlFunctionAdapter
+class tnlFunctorAdapter
 {
 };
 
@@ -34,7 +34,7 @@ class tnlFunctionAdapter
  */
 template< typename Mesh,
           typename Function >
-class tnlFunctionAdapter< Mesh, Function, tnlGeneralFunction >
+class tnlFunctorAdapter< Mesh, Function, tnlGeneralFunction >
 {
    public:
 
@@ -44,15 +44,13 @@ class tnlFunctionAdapter< Mesh, Function, tnlGeneralFunction >
       typedef typename MeshType::IndexType IndexType;
 
       template< int MeshEntityDimension >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      __cuda_callable__ 
       static RealType getValue( const MeshType& mesh,
                                 const FunctionType& function,
                                 const IndexType index,
                                 const RealType& time = 0.0 )
       {
-         return function.getValue( mesh.template getEntityCenter< MeshEntityDimension >,
+         return function.getValue( mesh, //.template getEntityCenter< MeshEntityDimension >,
                                    index,
                                    time );
       }
@@ -67,7 +65,7 @@ template< int Dimensions,
           typename Device,
           typename Index,
           typename Function >
-class tnlFunctionAdapter< tnlGrid< Dimensions, Real, Device, Index >, Function, tnlGeneralFunction >
+class tnlFunctorAdapter< tnlGrid< Dimensions, Real, Device, Index >, Function, tnlGeneralFunction >
 {
          public:
 
@@ -78,16 +76,14 @@ class tnlFunctionAdapter< tnlGrid< Dimensions, Real, Device, Index >, Function,
       typedef typename MeshType::VertexType VertexType;
       typedef typename MeshType::CoordinatesType CoordinatesType;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      __cuda_callable__
       static RealType getValue( const MeshType& mesh,
                                 const FunctionType& function,
                                 const IndexType index,
                                 const CoordinatesType& coordinates,
                                 const RealType& time = 0.0 )
       {
-         return function.getValue( mesh.template getCellCenter< VertexType >( coordinates ),
+         return function.getValue( mesh, //.template getCellCenter< VertexType >( coordinates ),
                                    index,
                                    time );
       }
@@ -99,7 +95,7 @@ class tnlFunctionAdapter< tnlGrid< Dimensions, Real, Device, Index >, Function,
  */
 template< typename Mesh,
           typename Function >
-class tnlFunctionAdapter< Mesh, Function, tnlDiscreteFunction >
+class tnlFunctorAdapter< Mesh, Function, tnlDiscreteFunction >
 {
    public:
 
@@ -109,9 +105,7 @@ class tnlFunctionAdapter< Mesh, Function, tnlDiscreteFunction >
       typedef typename MeshType::IndexType IndexType;
 
       template< int MeshEntityDimension >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      __cuda_callable__
       static RealType getValue( const MeshType& mesh,
                                 const FunctionType& function,
                                 const IndexType index,
@@ -131,7 +125,7 @@ template< int Dimensions,
           typename Device,
           typename Index,
           typename Function >
-class tnlFunctionAdapter< tnlGrid< Dimensions, Real, Device, Index >, Function, tnlDiscreteFunction >
+class tnlFunctorAdapter< tnlGrid< Dimensions, Real, Device, Index >, Function, tnlDiscreteFunction >
 {
    public:
 
@@ -143,9 +137,7 @@ class tnlFunctionAdapter< tnlGrid< Dimensions, Real, Device, Index >, Function,
       typedef typename MeshType::CoordinatesType CoordinatesType;
 
       //template< int MeshEntityDimension >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      __cuda_callable__
       static RealType getValue( const MeshType& mesh,
                                 const FunctionType& function,
                                 const IndexType index,
@@ -164,7 +156,7 @@ class tnlFunctionAdapter< tnlGrid< Dimensions, Real, Device, Index >, Function,
  */
 template< typename Mesh,
           typename Function >
-class tnlFunctionAdapter< Mesh, Function, tnlAnalyticFunction >
+class tnlFunctorAdapter< Mesh, Function, tnlAnalyticFunction >
 {
    public:
 
@@ -174,9 +166,7 @@ class tnlFunctionAdapter< Mesh, Function, tnlAnalyticFunction >
       typedef typename MeshType::IndexType IndexType;
 
       template< int MeshEntityDimension >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      __cuda_callable__
       static RealType getValue( const MeshType& mesh,
                                 const FunctionType& function,
                                 const IndexType index,
@@ -196,7 +186,7 @@ template< int Dimensions,
           typename Device,
           typename Index,
           typename Function >
-class tnlFunctionAdapter< tnlGrid< Dimensions, Real, Device, Index >, Function, tnlAnalyticFunction >
+class tnlFunctorAdapter< tnlGrid< Dimensions, Real, Device, Index >, Function, tnlAnalyticFunction >
 {
          public:
 
@@ -207,9 +197,7 @@ class tnlFunctionAdapter< tnlGrid< Dimensions, Real, Device, Index >, Function,
       typedef typename MeshType::VertexType VertexType;
       typedef typename MeshType::CoordinatesType CoordinatesType;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      __cuda_callable__
       static RealType getValue( const MeshType& mesh,
                                 const FunctionType& function,
                                 const IndexType index,
@@ -221,15 +209,16 @@ class tnlFunctionAdapter< tnlGrid< Dimensions, Real, Device, Index >, Function,
       }
 };
 
+// TODO: Fix the specializations for the constant function.
+#ifdef UNDEF
 /****
  * Specialization for constant function
  *  - it does not ask the mesh for the mesh entity center
  */
-
 template< typename Mesh,
           int FunctionDimensions,
           typename Real >
-class tnlFunctionAdapter< Mesh, tnlConstantFunction< FunctionDimensions, Real >, tnlAnalyticFunction >
+class tnlFunctorAdapter< Mesh, tnlConstantFunction< FunctionDimensions, Real >, tnlAnalyticFunction >
 {
    public:
 
@@ -240,9 +229,7 @@ class tnlFunctionAdapter< Mesh, tnlConstantFunction< FunctionDimensions, Real >,
       typedef typename MeshType::VertexType VertexType;
 
       template< int MeshEntityDimension >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      __cuda_callable__
       static RealType getValue( const MeshType& mesh,
                                 const FunctionType& function,
                                 const IndexType index,
@@ -261,7 +248,7 @@ template< int Dimensions,
           typename Real,
           typename Device,
           typename Index >
-class tnlFunctionAdapter< tnlGrid< Dimensions, Real, Device, Index >,
+class tnlFunctorAdapter< tnlGrid< Dimensions, Real, Device, Index >,
                           tnlConstantFunction< Dimensions, Real >,
                           tnlAnalyticFunction >
 {
@@ -274,9 +261,7 @@ class tnlFunctionAdapter< tnlGrid< Dimensions, Real, Device, Index >,
       typedef typename MeshType::VertexType VertexType;
       typedef typename MeshType::CoordinatesType CoordinatesType;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      __cuda_callable__
       static RealType getValue( const MeshType& mesh,
                                 const FunctionType& function,
                                 const IndexType index,
@@ -288,6 +273,6 @@ class tnlFunctionAdapter< tnlGrid< Dimensions, Real, Device, Index >,
       }
 };
 
+#endif /* UNDEF */
 
-
-#endif /* TNLFUNCTIONADAPTER_H_ */
+#endif /* tnlFunctorAdapter_H_ */
diff --git a/src/functions/tnlSinBumpsFunction.h b/src/functors/tnlSinBumpsFunction.h
similarity index 95%
rename from src/functions/tnlSinBumpsFunction.h
rename to src/functors/tnlSinBumpsFunction.h
index d3ae9dcea77c452903806e55554426b99d045248..2b3183f8c252605c7501c9e88312d6a207a860d5 100644
--- a/src/functions/tnlSinBumpsFunction.h
+++ b/src/functors/tnlSinBumpsFunction.h
@@ -20,7 +20,7 @@
 
 #include <config/tnlParameterContainer.h>
 #include <core/vectors/tnlStaticVector.h>
-#include <functions/tnlFunctionType.h>
+#include <functors/tnlFunctionType.h>
 
 template< typename Vertex >
 class tnlSinBumpsFunctionBase
@@ -81,9 +81,7 @@ class tnlSinBumpsFunction< 1, Real  > : public tnlSinBumpsFunctionBase< tnlStati
                 int ZDiffOrder = 0,
                 typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       RealType getValue( const Vertex& v,
                          const Real& time = 0.0 ) const;
 };
@@ -114,9 +112,7 @@ class tnlSinBumpsFunction< 2, Real > : public tnlSinBumpsFunctionBase< tnlStatic
                 int ZDiffOrder = 0,
                 typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       RealType getValue( const Vertex& v,
                          const Real& time = 0.0 ) const;
 };
@@ -147,9 +143,7 @@ class tnlSinBumpsFunction< 3, Real > : public tnlSinBumpsFunctionBase< tnlStatic
                 int ZDiffOrder = 0,
                 typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       RealType getValue( const Vertex& v,
                          const Real& time = 0.0 ) const;
 };
@@ -173,7 +167,7 @@ class tnlFunctionType< tnlSinBumpsFunction< FunctionDimensions, Real > >
       enum { Type = tnlAnalyticFunction };
 };
 
-#include <functions/tnlSinBumpsFunction_impl.h>
+#include <functors/tnlSinBumpsFunction_impl.h>
 
 
 #endif /* TNLSINBUMPSFUNCTION_H_ */
diff --git a/src/functions/tnlSinBumpsFunction_impl.h b/src/functors/tnlSinBumpsFunction_impl.h
similarity index 98%
rename from src/functions/tnlSinBumpsFunction_impl.h
rename to src/functors/tnlSinBumpsFunction_impl.h
index 0ccf9f8a095d0fc8249d6b1ed7b19af5562ad8c9..facb1c20187adc7c8952fdaede073b96fa644b73 100644
--- a/src/functions/tnlSinBumpsFunction_impl.h
+++ b/src/functors/tnlSinBumpsFunction_impl.h
@@ -18,7 +18,7 @@
 #ifndef TNLSINBUMPSFUNCTION_IMPL_H_
 #define TNLSINBUMPSFUNCTION_IMPL_H_
 
-#include <functions/tnlSinBumpsFunction.h>
+#include <functors/tnlSinBumpsFunction.h>
 
 template< typename Vertex >
 void tnlSinBumpsFunctionBase< Vertex >::setWaveLength( const Vertex& waveLength )
@@ -81,9 +81,7 @@ template< typename Real >
              int YDiffOrder,
              int ZDiffOrder,
              typename Vertex >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlSinBumpsFunction< 1, Real >::
 getValue( const Vertex& v,
@@ -128,9 +126,7 @@ template< typename Real >
              int YDiffOrder,
              int ZDiffOrder,
              typename Vertex >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlSinBumpsFunction< 2, Real>::
 getValue( const Vertex& v,
@@ -186,9 +182,7 @@ template< typename Real >
              int YDiffOrder,
              int ZDiffOrder,
              typename Vertex >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlSinBumpsFunction< 3, Real >::
 getValue( const Vertex& v,
diff --git a/src/functions/tnlSinWaveFunction.h b/src/functors/tnlSinWaveFunction.h
similarity index 94%
rename from src/functions/tnlSinWaveFunction.h
rename to src/functors/tnlSinWaveFunction.h
index 114ab844faa8d33549a83b309dbf06e470164660..e6d066d80d73a3a7f67fc4495b5f01fe545e6ff6 100644
--- a/src/functions/tnlSinWaveFunction.h
+++ b/src/functors/tnlSinWaveFunction.h
@@ -20,7 +20,7 @@
 
 #include <config/tnlParameterContainer.h>
 #include <core/vectors/tnlStaticVector.h>
-#include <functions/tnlFunctionType.h>
+#include <functors/tnlFunctionType.h>
 
 template< typename Real = double >
 class tnlSinWaveFunctionBase
@@ -74,9 +74,7 @@ class tnlSinWaveFunction< 1, Real > : public tnlSinWaveFunctionBase< Real >
                 int ZDiffOrder = 0,
                 typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       RealType getValue( const Vertex& v,
                          const Real& time = 0.0 ) const;
 
@@ -102,9 +100,7 @@ class tnlSinWaveFunction< 2, Real > : public tnlSinWaveFunctionBase< Real >
                 int ZDiffOrder = 0,
                 typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       RealType getValue( const Vertex& v,
                          const Real& time = 0.0 ) const;
 };
@@ -130,9 +126,7 @@ class tnlSinWaveFunction< 3, Real > : public tnlSinWaveFunctionBase< Real >
                 int ZDiffOrder = 0,
                 typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       RealType getValue( const Vertex& v,
                          const Real& time = 0.0 ) const;
 };
@@ -156,6 +150,6 @@ class tnlFunctionType< tnlSinWaveFunction< FunctionDimensions, Real > >
       enum { Type = tnlAnalyticFunction };
 };
 
-#include <functions/tnlSinWaveFunction_impl.h>
+#include <functors/tnlSinWaveFunction_impl.h>
 
 #endif /* TNLSINWAVEFUNCTION_H_ */
diff --git a/src/functions/tnlSinWaveFunction_impl.h b/src/functors/tnlSinWaveFunction_impl.h
similarity index 98%
rename from src/functions/tnlSinWaveFunction_impl.h
rename to src/functors/tnlSinWaveFunction_impl.h
index 50a36837010c2c7c3e263a2a654800a98d6649f6..60587d514aec5d009c221088d79bca76cb0bd8ad 100644
--- a/src/functions/tnlSinWaveFunction_impl.h
+++ b/src/functors/tnlSinWaveFunction_impl.h
@@ -18,7 +18,7 @@
 #ifndef TNLSINWAVEFUNCTION_IMPL_H_
 #define TNLSINWAVEFUNCTION_IMPL_H_
 
-#include <functions/tnlSinWaveFunction.h>
+#include <functors/tnlSinWaveFunction.h>
 
 template< typename Real >
 tnlSinWaveFunctionBase< Real >::tnlSinWaveFunctionBase()
@@ -81,9 +81,7 @@ template< typename Real >
              int YDiffOrder,
              int ZDiffOrder,
              typename Vertex >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlSinWaveFunction< 1, Real >::
 getValue( const Vertex& v,
@@ -116,9 +114,7 @@ template< typename Real >
              int YDiffOrder,
              int ZDiffOrder,
              typename Vertex >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlSinWaveFunction< 2, Real >::
 getValue( const Vertex& v,
@@ -150,9 +146,7 @@ template< typename Real >
              int YDiffOrder,
              int ZDiffOrder,
              typename Vertex >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlSinWaveFunction< 3, Real >::
 getValue( const Vertex& v,
diff --git a/src/functions/tnlTestFunction.h b/src/functors/tnlTestFunction.h
similarity index 96%
rename from src/functions/tnlTestFunction.h
rename to src/functors/tnlTestFunction.h
index 95f5c189dfff33a71e6881583c4bba50e323f8f0..f6be4aef43342b46e0b40c4fb390e94cff8c8c26 100644
--- a/src/functions/tnlTestFunction.h
+++ b/src/functors/tnlTestFunction.h
@@ -72,9 +72,7 @@ class tnlTestFunction
              int ZDiffOrder = 0,
              typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Real getValue( const Vertex& vertex,
                   const Real& time = 0 ) const;
 
@@ -98,9 +96,7 @@ class tnlTestFunction
              int ZDiffOrder = 0,
              typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Real getTimeDerivative( const Vertex& vertex,
                            const Real& time = 0 ) const;
 
@@ -153,6 +149,6 @@ ostream& operator << ( ostream& str, const tnlTestFunction< FunctionDimensions,
    return f.print( str );
 }
 
-#include <functions/tnlTestFunction_impl.h>
+#include <functors/tnlTestFunction_impl.h>
 
 #endif /* TNLTESTFUNCTION_H_ */
diff --git a/src/functions/tnlTestFunction_impl.cpp b/src/functors/tnlTestFunction_impl.cpp
similarity index 97%
rename from src/functions/tnlTestFunction_impl.cpp
rename to src/functors/tnlTestFunction_impl.cpp
index c8baf09f22da5f5871f785e763fdf4c254110c62..948a11afa1b1593ffa7a8462b489096b82406e21 100644
--- a/src/functions/tnlTestFunction_impl.cpp
+++ b/src/functors/tnlTestFunction_impl.cpp
@@ -18,7 +18,7 @@
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
-#include <functions/tnlTestFunction.h>
+#include <functors/tnlTestFunction.h>
 
 template class tnlTestFunction< 1, float, tnlHost >;
 template class tnlTestFunction< 2, float, tnlHost >;
diff --git a/src/functions/tnlTestFunction_impl.cu b/src/functors/tnlTestFunction_impl.cu
similarity index 97%
rename from src/functions/tnlTestFunction_impl.cu
rename to src/functors/tnlTestFunction_impl.cu
index 69354815a11103d8134d35d27e97e323bc3f314b..b6a154832e2cdc277e06361eb5b3dab1431f4303 100644
--- a/src/functions/tnlTestFunction_impl.cu
+++ b/src/functors/tnlTestFunction_impl.cu
@@ -18,7 +18,7 @@
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 #ifdef HAVE_CUDA
 
-#include <functions/tnlTestFunction.h>
+#include <functors/tnlTestFunction.h>
 
 template class tnlTestFunction< 1, float, tnlCuda >;
 template class tnlTestFunction< 2, float, tnlCuda >;
diff --git a/src/functions/tnlTestFunction_impl.h b/src/functors/tnlTestFunction_impl.h
similarity index 97%
rename from src/functions/tnlTestFunction_impl.h
rename to src/functors/tnlTestFunction_impl.h
index 6991eab5f3bb03ab360c32f3cbca8723b27613e6..23a05de8f82964f242a6b4d1e814a0027b4769f4 100644
--- a/src/functions/tnlTestFunction_impl.h
+++ b/src/functors/tnlTestFunction_impl.h
@@ -19,10 +19,10 @@
 #define TNLTESTFUNCTION_IMPL_H_
 
 #include <core/tnlCuda.h>
-#include <functions/tnlConstantFunction.h>
-#include <functions/tnlExpBumpFunction.h>
-#include <functions/tnlSinBumpsFunction.h>
-#include <functions/tnlSinWaveFunction.h>
+#include <functors/tnlConstantFunction.h>
+#include <functors/tnlExpBumpFunction.h>
+#include <functors/tnlSinBumpsFunction.h>
+#include <functors/tnlSinWaveFunction.h>
 #include <functions/initial_conditions/tnlCylinderFunction.h>
 #include <functions/initial_conditions/tnlFlowerpotFunction.h>
 #include <functions/initial_conditions/tnlTwinsFunction.h>
@@ -259,9 +259,7 @@ template< int FunctionDimensions,
              int YDiffOrder,
              int ZDiffOrder,
              typename Vertex >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlTestFunction< FunctionDimensions, Real, Device >::
 getValue( const Vertex& vertex,
@@ -325,9 +323,7 @@ template< int FunctionDimensions,
              int YDiffOrder,
              int ZDiffOrder,
              typename Vertex >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlTestFunction< FunctionDimensions, Real, Device >::
 getTimeDerivative( const Vertex& vertex,
@@ -353,15 +349,12 @@ getTimeDerivative( const Vertex& vertex,
       case constant:
          return scale * ( ( tnlConstantFunction< Dimensions, Real >* ) function )->
                   getValue< XDiffOrder, YDiffOrder, ZDiffOrder, Vertex >( vertex, time );
-         break;
       case expBump:
          return scale * ( ( tnlExpBumpFunction< Dimensions, Real >* ) function )->
                   getValue< XDiffOrder, YDiffOrder, ZDiffOrder, Vertex >( vertex, time );
-         break;
       case sinBumps:
          return scale * ( ( tnlSinBumpsFunction< Dimensions, Real >* ) function )->
                   getValue< XDiffOrder, YDiffOrder, ZDiffOrder, Vertex >( vertex, time );
-         break;
       case sinWave:
          return scale * ( ( tnlSinWaveFunction< Dimensions, Real >* ) function )->
                   getValue< XDiffOrder, YDiffOrder, ZDiffOrder, Vertex >( vertex, time );
@@ -388,7 +381,6 @@ getTimeDerivative( const Vertex& vertex,
          break;
       default:
          return 0.0;
-         break;
    }
 }
 
@@ -459,7 +451,6 @@ void
 tnlTestFunction< FunctionDimensions, Real, Device >::
 copyFunction( const void* function )
 {
-   cout << "Copy function ********************************* " << endl;
    if( Device::DeviceType == ( int ) tnlHostDevice ) 
    {
       FunctionType* f = new FunctionType;
@@ -481,17 +472,17 @@ tnlTestFunction< FunctionDimensions, Real, Device >::
 printFunction( ostream& str ) const
 {
    FunctionType* f = ( FunctionType* ) this->function;
-   if( Device::DeviceType == ( int ) tnlHostDevice )
+   switch( Device::DeviceType )
    {
-      str << *f;
-      return str;
-   }
-   if( Device::DeviceType == ( int ) tnlCudaDevice )
-   {
-      tnlCuda::print( f, str );
-      return str;
+      case tnlHostDevice:
+         str << *f;
+         return str;
+      case tnlCudaDevice:
+         tnlCuda::print( f, str );
+         return str;
+      default:
+         return str;
    }
-   return str;
 }
 
 template< int FunctionDimensions,
diff --git a/src/legacy/benchmarks/tnlSpmvBenchmarkBase_impl.h b/src/legacy/benchmarks/tnlSpmvBenchmarkBase_impl.h
index 2b1216bb1b82569fd34d7aa255d75026eeab2bab..9066c57d7e5b8b76806346e5651a5232c384210f 100644
--- a/src/legacy/benchmarks/tnlSpmvBenchmarkBase_impl.h
+++ b/src/legacy/benchmarks/tnlSpmvBenchmarkBase_impl.h
@@ -115,7 +115,7 @@ void tnlSpmvBenchmarkBase< Matrix >::runBenchmark( const tnlVector< RealType, De
       iterations ++;
    }
 
-   this -> time = rt_timer. GetTime();
+   this -> time = rt_timer. getTime();
 
    firstErrorOccurence = 0;
    tnlVector< RealType, tnlHost, IndexType > resB( "tnlSpmvBenchmark< Real, Device, Index, Matrix > :: runBenchmark : b" );
diff --git a/src/legacy/benchmarks/tnlSpmvBenchmarkHybridMatrix.h b/src/legacy/benchmarks/tnlSpmvBenchmarkHybridMatrix.h
index ca4b3b5f13f47f8089c813bc015a5228c3692147..8dffd254eedc72d39c48991e462824127a670047 100644
--- a/src/legacy/benchmarks/tnlSpmvBenchmarkHybridMatrix.h
+++ b/src/legacy/benchmarks/tnlSpmvBenchmarkHybridMatrix.h
@@ -109,7 +109,7 @@ void tnlSpmvBenchmarkHybridMatrix< Real, Index > :: runBenchmark( const tnlVecto
       rt_timer. Reset();
 
       this -> iterations = 0;
-      //while( rt_timer. GetTime() < time )
+      //while( rt_timer. getTime() < time )
       {
          for( int i = 0; i < this -> maxIterations; i ++ )
          {
@@ -118,7 +118,7 @@ void tnlSpmvBenchmarkHybridMatrix< Real, Index > :: runBenchmark( const tnlVecto
             this -> iterations ++;
          }
       }
-      this -> time = rt_timer. GetTime();
+      this -> time = rt_timer. getTime();
 
       cusp::array1d< Real, cusp::host_memory > host_b( b );
       host_b = b;
diff --git a/src/legacy/solvers/tnlMatrixSolver.h b/src/legacy/solvers/tnlMatrixSolver.h
index bfdab721b6873af83090482698b4285d4b9aa931..f5351963d3cfdec6be6ef77aa18fe467f324b178 100644
--- a/src/legacy/solvers/tnlMatrixSolver.h
+++ b/src/legacy/solvers/tnlMatrixSolver.h
@@ -122,7 +122,7 @@ void tnlMatrixSolver< Real, Device, Index > :: printOut()
    if( this -> verbosity > 0 )
    {
       int cpu_time = 0;
-      if( this -> cpu_timer ) cpu_time = this -> cpu_timer -> GetTime( 0, this -> solver_comm );
+      if( this -> cpu_timer ) cpu_time = this -> cpu_timer -> getTime( 0, this -> solver_comm );
       if( MPIGetRank() != 0 ) return;
       // TODO: add EST
       //cout << " EST: " << estimated;
@@ -131,7 +131,7 @@ void tnlMatrixSolver< Real, Device, Index > :: printOut()
       if( this -> cpu_timer )
          cout << " CPU: " << setw( 8 ) << cpu_time;
       if( this -> rt_timer )
-         cout << " ELA: " << setw( 8 ) << this -> rt_timer -> GetTime();
+         cout << " ELA: " << setw( 8 ) << this -> rt_timer -> getTime();
       cout << "   \r" << flush;
    }
 };
diff --git a/src/matrices/tnlCSRMatrix.h b/src/matrices/tnlCSRMatrix.h
index 2216ce6b62c434802f37b23b0c9f127d08224865..3212a680bf1424b06c3f01730e889080fe6be334 100644
--- a/src/matrices/tnlCSRMatrix.h
+++ b/src/matrices/tnlCSRMatrix.h
@@ -63,9 +63,7 @@ class tnlCSRMatrix : public tnlSparseMatrix< Real, Device, Index >
 
    void reset();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value );
@@ -73,9 +71,7 @@ class tnlCSRMatrix : public tnlSparseMatrix< Real, Device, Index >
    bool setElement( const IndexType row,
                     const IndexType column,
                     const RealType& value );
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value,
@@ -87,9 +83,7 @@ class tnlCSRMatrix : public tnlSparseMatrix< Real, Device, Index >
                     const RealType& thisElementMultiplicator = 1.0 );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setRowFast( const IndexType row,
                     const IndexType* columnIndexes,
                     const RealType* values,
@@ -101,9 +95,7 @@ class tnlCSRMatrix : public tnlSparseMatrix< Real, Device, Index >
                 const IndexType elements );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addRowFast( const IndexType row,
                     const IndexType* columns,
                     const RealType* values,
@@ -117,36 +109,26 @@ class tnlCSRMatrix : public tnlSparseMatrix< Real, Device, Index >
                 const RealType& thisElementMultiplicator = 1.0 );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    RealType getElementFast( const IndexType row,
                             const IndexType column ) const;
 
    RealType getElement( const IndexType row,
                         const IndexType column ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void getRowFast( const IndexType row,
                     IndexType* columns,
                     RealType* values ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    MatrixRow getRow( const IndexType rowIndex );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const MatrixRow getRow( const IndexType rowIndex ) const;
 
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
@@ -183,9 +165,7 @@ class tnlCSRMatrix : public tnlSparseMatrix< Real, Device, Index >
 
    void setCudaKernelType( const SPMVCudaKernel kernel );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    SPMVCudaKernel getCudaKernelType() const;
 
    void setCudaWarpSize( const int warpSize );
@@ -194,9 +174,7 @@ class tnlCSRMatrix : public tnlSparseMatrix< Real, Device, Index >
 
    void setHybridModeSplit( const IndexType hybridModeSplit );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getHybridModeSplit() const;
 
 #ifdef HAVE_CUDA
diff --git a/src/matrices/tnlCSRMatrix_impl.h b/src/matrices/tnlCSRMatrix_impl.h
index f6a0e8036ca585931ee4164f951eca1a6ce3ac0d..3becfa0a51d938b444dcfe7eccac2b19392d15e5 100644
--- a/src/matrices/tnlCSRMatrix_impl.h
+++ b/src/matrices/tnlCSRMatrix_impl.h
@@ -130,9 +130,7 @@ void tnlCSRMatrix< Real, Device, Index >::reset()
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlCSRMatrix< Real, Device, Index >::setElementFast( const IndexType row,
                                                           const IndexType column,
                                                           const Real& value )
@@ -154,9 +152,7 @@ bool tnlCSRMatrix< Real, Device, Index >::setElement( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlCSRMatrix< Real, Device, Index >::addElementFast( const IndexType row,
                                                           const IndexType column,
                                                           const RealType& value,
@@ -258,9 +254,7 @@ bool tnlCSRMatrix< Real, Device, Index >::addElement( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlCSRMatrix< Real, Device, Index > :: setRowFast( const IndexType row,
                                                         const IndexType* columnIndexes,
                                                         const RealType* values,
@@ -310,9 +304,7 @@ bool tnlCSRMatrix< Real, Device, Index > :: setRow( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlCSRMatrix< Real, Device, Index > :: addRowFast( const IndexType row,
                                                         const IndexType* columns,
                                                         const RealType* values,
@@ -338,9 +330,7 @@ bool tnlCSRMatrix< Real, Device, Index > :: addRow( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real tnlCSRMatrix< Real, Device, Index >::getElementFast( const IndexType row,
                                                           const IndexType column ) const
 {
@@ -377,9 +367,7 @@ Real tnlCSRMatrix< Real, Device, Index >::getElement( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlCSRMatrix< Real, Device, Index >::getRowFast( const IndexType row,
                                                       IndexType* columns,
                                                       RealType* values ) const
@@ -397,9 +385,7 @@ void tnlCSRMatrix< Real, Device, Index >::getRowFast( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlCSRMatrix< Real, Device, Index >::MatrixRow
 tnlCSRMatrix< Real, Device, Index >::
 getRow( const IndexType rowIndex )
@@ -415,9 +401,7 @@ getRow( const IndexType rowIndex )
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlCSRMatrix< Real, Device, Index >::MatrixRow
 tnlCSRMatrix< Real, Device, Index >::
 getRow( const IndexType rowIndex ) const
@@ -434,9 +418,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename Vector::RealType tnlCSRMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row,
                                                                                  const Vector& vector ) const
 {
@@ -591,9 +573,7 @@ void tnlCSRMatrix< Real, Device, Index >::setCudaKernelType( const SPMVCudaKerne
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlCSRMatrix< Real, Device, Index >::SPMVCudaKernel tnlCSRMatrix< Real, Device, Index >::getCudaKernelType() const
 {
    return this->spmvCudaKernel;
@@ -626,9 +606,7 @@ void tnlCSRMatrix< Real, Device, Index >::setHybridModeSplit( const IndexType hy
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlCSRMatrix< Real, Device, Index >::getHybridModeSplit() const
 {
    return this->hybridModeSplit;
@@ -648,7 +626,7 @@ void tnlCSRMatrix< Real, Device, Index >::spmvCudaVectorized( const InVector& in
                                                               const IndexType warpEnd,
                                                               const IndexType inWarpIdx ) const
 {
-   Real* aux = getSharedMemory< Real >();
+   volatile Real* aux = getSharedMemory< Real >();
    for( IndexType row = warpStart; row < warpEnd; row++ )
    {
       aux[ threadIdx.x ] = 0.0;
@@ -672,8 +650,6 @@ void tnlCSRMatrix< Real, Device, Index >::spmvCudaVectorized( const InVector& in
          if( inWarpIdx < 2 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 2 ];
       if( warpSize >= 2 )
          if( inWarpIdx < 1 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 1 ];
-      __syncthreads(); // TODO: I am not sure why - aux must be volatile
-
       if( inWarpIdx == 0 )
          outVector[ row ] = aux[ threadIdx.x ];
    }
diff --git a/src/matrices/tnlChunkedEllpackMatrix.h b/src/matrices/tnlChunkedEllpackMatrix.h
index 3c4a4b7f649ea662d2afca0d0a6d8dad45e57d91..df91f84c19af182fdb9b3bc631df1e4a67908a4d 100644
--- a/src/matrices/tnlChunkedEllpackMatrix.h
+++ b/src/matrices/tnlChunkedEllpackMatrix.h
@@ -94,23 +94,17 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
 
    void setNumberOfChunksInSlice( const IndexType chunksInSlice );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getNumberOfChunksInSlice() const;
 
    void setDesiredChunkSize( const IndexType desiredChunkSize );
 
    IndexType getDesiredChunkSize() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getNumberOfSlices() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value );
@@ -119,9 +113,7 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                     const IndexType column,
                     const RealType& value );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value,
@@ -133,9 +125,7 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                     const RealType& thisElementMultiplicator = 1.0 );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setRowFast( const IndexType row,
                     const IndexType* columnIndexes,
                     const RealType* values,
@@ -147,9 +137,7 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                 const IndexType elements );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addRowFast( const IndexType row,
                     const IndexType* columns,
                     const RealType* values,
@@ -163,18 +151,14 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                 const RealType& thisElementMultiplicator = 1.0 );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    RealType getElementFast( const IndexType row,
                             const IndexType column ) const;
 
    RealType getElement( const IndexType row,
                         const IndexType column ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void getRowFast( const IndexType row,
                     IndexType* columns,
                     RealType* values ) const;
@@ -183,20 +167,14 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                 IndexType* columns,
                 RealType* values ) const;*/
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    MatrixRow getRow( const IndexType rowIndex );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const MatrixRow getRow( const IndexType rowIndex ) const;
 
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
@@ -257,9 +235,7 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                            RealType& value,
                            RealType& thisElementMultiplicator );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addElementToChunkFast( const IndexType sliceOffset,
                                const IndexType chunkIndex,
                                const IndexType chunkSize,
@@ -267,9 +243,7 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                                RealType& value,
                                RealType& thisElementMultiplicator );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void setChunkFast( const IndexType sliceOffset,
                       const IndexType chunkIndex,
                       const IndexType chunkSize,
@@ -290,9 +264,7 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                            const IndexType column,
                            RealType& value ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool getElementInChunkFast( const IndexType sliceOffset,
                                const IndexType chunkIndex,
                                const IndexType chunkSize,
@@ -305,9 +277,7 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                   IndexType* columns,
                   RealType* values ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void getChunkFast( const IndexType sliceOffset,
                       const IndexType chunkIndex,
                       const IndexType chunkSize,
@@ -315,9 +285,7 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                       RealType* values ) const;
 
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    typename Vector::RealType chunkVectorProduct( const IndexType sliceOffset,
                                                  const IndexType chunkIndex,
                                                  const IndexType chunkSize,
diff --git a/src/matrices/tnlChunkedEllpackMatrix_impl.h b/src/matrices/tnlChunkedEllpackMatrix_impl.h
index 12a7e6866c56b03e7446d5966fee369ce8e40794..0cd53e578514fd182773ecfc1b3a0a321b00e4d7 100644
--- a/src/matrices/tnlChunkedEllpackMatrix_impl.h
+++ b/src/matrices/tnlChunkedEllpackMatrix_impl.h
@@ -309,9 +309,7 @@ void tnlChunkedEllpackMatrix< Real, Device, Index >::setNumberOfChunksInSlice( c
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlChunkedEllpackMatrix< Real, Device, Index >::getNumberOfChunksInSlice() const
 {
    return this->chunksInSlice;
@@ -336,9 +334,7 @@ Index tnlChunkedEllpackMatrix< Real, Device, Index >::getDesiredChunkSize() cons
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlChunkedEllpackMatrix< Real, Device, Index >::getNumberOfSlices() const
 {
    return this->numberOfSlices;
@@ -378,9 +374,7 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::operator != ( const tnlChun
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlChunkedEllpackMatrix< Real, Device, Index >::setElementFast( const IndexType row,
                                                                      const IndexType column,
                                                                      const Real& value )
@@ -401,9 +395,7 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::setElement( const IndexType
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlChunkedEllpackMatrix< Real, Device, Index >::addElementFast( const IndexType row,
                                                                      const IndexType _column,
                                                                      const RealType& _value,
@@ -438,9 +430,7 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::addElementFast( const Index
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlChunkedEllpackMatrix< Real, Device, Index >::addElementToChunkFast( const IndexType sliceOffset,
                                                                             const IndexType chunkIndex,
                                                                             const IndexType chunkSize,
@@ -601,9 +591,7 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::addElementToChunk( const In
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlChunkedEllpackMatrix< Real, Device, Index >::setRowFast( const IndexType row,
                                                                  const IndexType* columnIndexes,
                                                                  const RealType* values,
@@ -645,9 +633,7 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::setRowFast( const IndexType
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlChunkedEllpackMatrix< Real, Device, Index >::setChunkFast( const IndexType sliceOffset,
                                                                    const IndexType chunkIndex,
                                                                    const IndexType chunkSize,
@@ -757,9 +743,7 @@ void tnlChunkedEllpackMatrix< Real, Device, Index >::setChunk( const IndexType s
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlChunkedEllpackMatrix< Real, Device, Index > :: addRowFast( const IndexType row,
                                                                    const IndexType* columns,
                                                                    const RealType* values,
@@ -785,9 +769,7 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index > :: addRow( const IndexType r
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real tnlChunkedEllpackMatrix< Real, Device, Index >::getElementFast( const IndexType row,
                                                                      const IndexType column ) const
 {
@@ -809,9 +791,7 @@ Real tnlChunkedEllpackMatrix< Real, Device, Index >::getElementFast( const Index
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlChunkedEllpackMatrix< Real, Device, Index >::getElementInChunkFast( const IndexType sliceOffset,
                                                                             const IndexType chunkIndex,
                                                                             const IndexType chunkSize,
@@ -889,9 +869,7 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::getElementInChunk( const In
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlChunkedEllpackMatrix< Real, Device, Index >::getRowFast( const IndexType row,
                                                                  IndexType* columns,
                                                                  RealType* values ) const
@@ -921,9 +899,7 @@ void tnlChunkedEllpackMatrix< Real, Device, Index >::getRowFast( const IndexType
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlChunkedEllpackMatrix< Real, Device, Index >::getChunkFast( const IndexType sliceOffset,
                                                                    const IndexType chunkIndex,
                                                                    const IndexType chunkSize,
@@ -945,9 +921,7 @@ void tnlChunkedEllpackMatrix< Real, Device, Index >::getChunkFast( const IndexTy
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlChunkedEllpackMatrix< Real, Device, Index >::MatrixRow
 tnlChunkedEllpackMatrix< Real, Device, Index >::
 getRow( const IndexType rowIndex )
@@ -963,9 +937,7 @@ getRow( const IndexType rowIndex )
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlChunkedEllpackMatrix< Real, Device, Index >::MatrixRow
 tnlChunkedEllpackMatrix< Real, Device, Index >::
 getRow( const IndexType rowIndex ) const
@@ -1039,9 +1011,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename Vector::RealType tnlChunkedEllpackMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row,
                                                                                             const Vector& vector ) const
 {
@@ -1072,9 +1042,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename Vector::RealType tnlChunkedEllpackMatrix< Real, Device, Index >::chunkVectorProduct( const IndexType sliceOffset,
                                                                                               const IndexType chunkIndex,
                                                                                               const IndexType chunkSize,
@@ -1326,6 +1294,7 @@ class tnlChunkedEllpackMatrixDeviceDependentCode< tnlHost >
       }
 
       template< typename Index >
+      __cuda_callable__
       static void initChunkTraverse( const Index sliceOffset,
                                      const Index chunkIndex,
                                      const Index chunkSize,
@@ -1385,9 +1354,7 @@ class tnlChunkedEllpackMatrixDeviceDependentCode< tnlCuda >
       }
       
       template< typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      __cuda_callable__
       static void initChunkTraverse( const Index sliceOffset,
                                      const Index chunkIndex,
                                      const Index chunkSize,
diff --git a/src/matrices/tnlDenseMatrix.h b/src/matrices/tnlDenseMatrix.h
index bf86805192a074c4e8808d6cdc318da649ae63c8..563069dd1cbc92f5241e2d4578c90b6da9dbe319 100644
--- a/src/matrices/tnlDenseMatrix.h
+++ b/src/matrices/tnlDenseMatrix.h
@@ -77,9 +77,7 @@ class tnlDenseMatrix : public tnlMatrix< Real, Device, Index >
 
    void setValue( const RealType& v );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value );
@@ -88,9 +86,7 @@ class tnlDenseMatrix : public tnlMatrix< Real, Device, Index >
                     const IndexType column,
                     const RealType& value );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value,
@@ -101,9 +97,7 @@ class tnlDenseMatrix : public tnlMatrix< Real, Device, Index >
                     const RealType& value,
                     const RealType& thisElementMultiplicator = 1.0 );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setRowFast( const IndexType row,
                     const IndexType* columns,
                     const RealType* values,
@@ -114,9 +108,7 @@ class tnlDenseMatrix : public tnlMatrix< Real, Device, Index >
                 const RealType* values,
                 const IndexType elements );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addRowFast( const IndexType row,
                     const IndexType* columns,
                     const RealType* values,
@@ -129,18 +121,14 @@ class tnlDenseMatrix : public tnlMatrix< Real, Device, Index >
                 const IndexType elements,
                 const RealType& thisRowMultiplicator = 1.0 );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Real getElementFast( const IndexType row,
                         const IndexType column ) const;
 
    Real getElement( const IndexType row,
                     const IndexType column ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void getRowFast( const IndexType row,
                     IndexType* columns,
                     RealType* values ) const;
@@ -149,20 +137,14 @@ class tnlDenseMatrix : public tnlMatrix< Real, Device, Index >
                 IndexType* columns,
                 RealType* values ) const;*/
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    MatrixRow getRow( const IndexType rowIndex );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const MatrixRow getRow( const IndexType rowIndex ) const;
 
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
@@ -217,9 +199,7 @@ class tnlDenseMatrix : public tnlMatrix< Real, Device, Index >
 
    protected:
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getElementIndex( const IndexType row,
                               const IndexType column ) const;
 
diff --git a/src/matrices/tnlDenseMatrixRow.h b/src/matrices/tnlDenseMatrixRow.h
index 29686a58e547f0eb9738e370deae29771fb9755d..0882c960b0e59e099ed0d57ed73f5ca17e871c1a 100644
--- a/src/matrices/tnlDenseMatrixRow.h
+++ b/src/matrices/tnlDenseMatrixRow.h
@@ -23,28 +23,20 @@ class tnlDenseMatrixRow
 {
    public:
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       tnlDenseMatrixRow();
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       tnlDenseMatrixRow( Real* values,
                          const Index columns,
                          const Index step );
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       void bind( Real* values,
                  const Index columns,
                  const Index step );
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       void setElement( const Index& elementIndex,
                        const Index& column,
                        const Real& value );
diff --git a/src/matrices/tnlDenseMatrixRow_impl.h b/src/matrices/tnlDenseMatrixRow_impl.h
index f752e26157fac6bdf82ba9f9f8a09df8904cd41b..3ad6af7f2b83533c02016467bbf9f5307b0cf2ca 100644
--- a/src/matrices/tnlDenseMatrixRow_impl.h
+++ b/src/matrices/tnlDenseMatrixRow_impl.h
@@ -19,9 +19,7 @@
 #define TNLDENSEMATRIXROW_IMPL_H_
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 tnlDenseMatrixRow< Real, Index >::
 tnlDenseMatrixRow()
 : values( 0 ),
@@ -31,9 +29,7 @@ tnlDenseMatrixRow()
 }
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 tnlDenseMatrixRow< Real, Index >::
 tnlDenseMatrixRow( Real* values,
                    const Index columns,
@@ -45,9 +41,7 @@ tnlDenseMatrixRow( Real* values,
 }
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlDenseMatrixRow< Real, Index >::
 bind( Real* values,
@@ -60,9 +54,7 @@ bind( Real* values,
 }
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlDenseMatrixRow< Real, Index >::
 setElement( const Index& elementIndex,
diff --git a/src/matrices/tnlDenseMatrix_impl.h b/src/matrices/tnlDenseMatrix_impl.h
index fea62c6a9f3315472e06f380d1dcb19509827001..cfae92dce21368f8ac2784c612de7968f17592b1 100644
--- a/src/matrices/tnlDenseMatrix_impl.h
+++ b/src/matrices/tnlDenseMatrix_impl.h
@@ -141,9 +141,7 @@ void tnlDenseMatrix< Real, Device, Index >::setValue( const Real& value )
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlDenseMatrix< Real, Device, Index >::setElementFast( const IndexType row,
                                                             const IndexType column,
                                                             const RealType& value )
@@ -171,9 +169,7 @@ bool tnlDenseMatrix< Real, Device, Index >::setElement( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlDenseMatrix< Real, Device, Index >::addElementFast( const IndexType row,
                                                             const IndexType column,
                                                             const RealType& value,
@@ -213,9 +209,7 @@ bool tnlDenseMatrix< Real, Device, Index >::addElement( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlDenseMatrix< Real, Device, Index >::setRowFast( const IndexType row,
                                                         const IndexType* columns,
                                                         const RealType* values,
@@ -250,9 +244,7 @@ bool tnlDenseMatrix< Real, Device, Index >::setRow( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlDenseMatrix< Real, Device, Index >::addRowFast( const IndexType row,
                                                         const IndexType* columns,
                                                         const RealType* values,
@@ -292,9 +284,7 @@ bool tnlDenseMatrix< Real, Device, Index >::addRow( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real tnlDenseMatrix< Real, Device, Index >::getElementFast( const IndexType row,
                                                             const IndexType column ) const
 {
@@ -316,9 +306,7 @@ Real tnlDenseMatrix< Real, Device, Index >::getElement( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlDenseMatrix< Real, Device, Index >::getRowFast( const IndexType row,
                                                         IndexType* columns,
                                                         RealType* values ) const
@@ -333,9 +321,7 @@ void tnlDenseMatrix< Real, Device, Index >::getRowFast( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlDenseMatrix< Real, Device, Index >::MatrixRow
 tnlDenseMatrix< Real, Device, Index >::
 getRow( const IndexType rowIndex )
@@ -353,9 +339,7 @@ getRow( const IndexType rowIndex )
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlDenseMatrix< Real, Device, Index >::MatrixRow
 tnlDenseMatrix< Real, Device, Index >::
 getRow( const IndexType rowIndex ) const
@@ -374,9 +358,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename Vector::RealType tnlDenseMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row,
                                                                                    const Vector& vector ) const
 {
diff --git a/src/matrices/tnlEllpackMatrix.h b/src/matrices/tnlEllpackMatrix.h
index 4667c4111d0b619509d8af54c1041cf8d524fe6e..d816c9bdb15ad75ee8044d66345c4f8e02b55134 100644
--- a/src/matrices/tnlEllpackMatrix.h
+++ b/src/matrices/tnlEllpackMatrix.h
@@ -71,9 +71,7 @@ class tnlEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
    bool copyFrom( const Matrix& matrix,
                   const RowLengthsVector& rowLengths );*/
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value );
@@ -82,9 +80,7 @@ class tnlEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                     const IndexType column,
                     const RealType& value );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value,
@@ -96,9 +92,7 @@ class tnlEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                     const RealType& thisElementMultiplicator = 1.0 );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setRowFast( const IndexType row,
                     const IndexType* columnIndexes,
                     const RealType* values,
@@ -110,9 +104,7 @@ class tnlEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                 const IndexType elements );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addRowFast( const IndexType row,
                     const IndexType* columns,
                     const RealType* values,
@@ -125,36 +117,26 @@ class tnlEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                 const IndexType numberOfElements,
                 const RealType& thisElementMultiplicator = 1.0 );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    RealType getElementFast( const IndexType row,
                             const IndexType column ) const;
 
    RealType getElement( const IndexType row,
                         const IndexType column ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void getRowFast( const IndexType row,
                     IndexType* columns,
                     RealType* values ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    MatrixRow getRow( const IndexType rowIndex );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const MatrixRow getRow( const IndexType rowIndex ) const;
 
-template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   template< typename Vector >
+   __cuda_callable__
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
diff --git a/src/matrices/tnlEllpackMatrix_impl.h b/src/matrices/tnlEllpackMatrix_impl.h
index f0f3459f8f450938b239d7b74aed2d8354fd960c..00322b09956823620d3f20b489b51c7527ab403c 100644
--- a/src/matrices/tnlEllpackMatrix_impl.h
+++ b/src/matrices/tnlEllpackMatrix_impl.h
@@ -169,9 +169,7 @@ bool tnlEllpackMatrix< Real, Device, Index >::copyFrom( const Matrix& matrix,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlEllpackMatrix< Real, Device, Index > :: setElementFast( const IndexType row,
                                                                 const IndexType column,
                                                                 const Real& value )
@@ -193,9 +191,7 @@ bool tnlEllpackMatrix< Real, Device, Index > :: setElement( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlEllpackMatrix< Real, Device, Index > :: addElementFast( const IndexType row,
                                                                 const IndexType column,
                                                                 const RealType& value,
@@ -291,9 +287,7 @@ bool tnlEllpackMatrix< Real, Device, Index > :: addElement( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlEllpackMatrix< Real, Device, Index > :: setRowFast( const IndexType row,
                                                             const IndexType* columnIndexes,
                                                             const RealType* values,
@@ -359,9 +353,7 @@ bool tnlEllpackMatrix< Real, Device, Index > :: setRow( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlEllpackMatrix< Real, Device, Index > :: addRowFast( const IndexType row,
                                                             const IndexType* columns,
                                                             const RealType* values,
@@ -388,9 +380,7 @@ bool tnlEllpackMatrix< Real, Device, Index > :: addRow( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real tnlEllpackMatrix< Real, Device, Index >::getElementFast( const IndexType row,
                                                               const IndexType column ) const
 {
@@ -430,9 +420,7 @@ Real tnlEllpackMatrix< Real, Device, Index >::getElement( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlEllpackMatrix< Real, Device, Index >::getRowFast( const IndexType row,
                                                           IndexType* columns,
                                                           RealType* values ) const
@@ -453,13 +441,12 @@ void tnlEllpackMatrix< Real, Device, Index >::getRowFast( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlEllpackMatrix< Real, Device, Index >::MatrixRow
 tnlEllpackMatrix< Real, Device, Index >::
 getRow( const IndexType rowIndex )
 {
+   //printf( "this->rowLengths = %d this = %p \n", this->rowLengths, this );
    IndexType rowBegin = DeviceDependentCode::getRowBegin( *this, rowIndex );
    return MatrixRow( &this->columnIndexes[ rowBegin ],
                      &this->values[ rowBegin ],
@@ -470,13 +457,12 @@ getRow( const IndexType rowIndex )
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlEllpackMatrix< Real, Device, Index >::MatrixRow
 tnlEllpackMatrix< Real, Device, Index >::
 getRow( const IndexType rowIndex ) const
 {
+   //printf( "this->rowLengths = %d this = %p \n", this->rowLengths, this );
    IndexType rowBegin = DeviceDependentCode::getRowBegin( *this, rowIndex );
    return MatrixRow( &this->columnIndexes[ rowBegin ],
                      &this->values[ rowBegin ],
@@ -488,9 +474,7 @@ template< typename Real,
           typename Device,
           typename Index >
   template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename Vector::RealType tnlEllpackMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row,
                                                                                      const Vector& vector ) const
 {
@@ -636,15 +620,16 @@ void tnlEllpackMatrix< Real, Device, Index >::print( ostream& str ) const
    for( IndexType row = 0; row < this->getRows(); row++ )
    {
       str <<"Row: " << row << " -> ";
-      IndexType i( row * this->rowLengths );
-      const IndexType rowEnd( i + this->rowLengths );
+      IndexType i = DeviceDependentCode::getRowBegin( *this, row );
+      const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, row );
+      const IndexType step = DeviceDependentCode::getElementStep( *this );
       while( i < rowEnd &&
              this->columnIndexes.getElement( i ) < this->columns &&
              this->columnIndexes.getElement( i ) != this->getPaddingIndex() )
       {
          const Index column = this->columnIndexes.getElement( i );
          str << " Col:" << column << "->" << this->values.getElement( i ) << "\t";
-         i++;
+         i += step;
       }
       str << endl;
    }
@@ -669,6 +654,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlHost >
 
       template< typename Real,
                 typename Index >
+      __cuda_callable__
       static Index getRowBegin( const tnlEllpackMatrix< Real, Device, Index >& matrix,
                                 const Index row )
       {
@@ -677,6 +663,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlHost >
 
       template< typename Real,
                 typename Index >
+      __cuda_callable__
       static Index getRowEnd( const tnlEllpackMatrix< Real, Device, Index >& matrix,
                                 const Index row )
       {
@@ -685,6 +672,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlHost >
 
       template< typename Real,
                 typename Index >
+      __cuda_callable__
       static Index getElementStep( const tnlEllpackMatrix< Real, Device, Index >& matrix )
       {
          return 1;
@@ -712,9 +700,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda >
 
       template< typename Real,
                 typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       static Index getRowBegin( const tnlEllpackMatrix< Real, Device, Index >& matrix,
                                 const Index row )
       {
@@ -723,9 +709,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda >
 
       template< typename Real,
                 typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       static Index getRowEnd( const tnlEllpackMatrix< Real, Device, Index >& matrix,
                                 const Index row )
       {
@@ -734,9 +718,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda >
 
       template< typename Real,
                 typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       static Index getElementStep( const tnlEllpackMatrix< Real, Device, Index >& matrix )
       {
          return matrix.alignedRows;
@@ -754,7 +736,4 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda >
       }
 };
 
-
-
-
 #endif /* TNLELLPACKMATRIX_IMPL_H_ */
diff --git a/src/matrices/tnlMatrix.h b/src/matrices/tnlMatrix.h
index ad3e9960da5243883653201588e43f1cda415364..2522c5e198b629325980cc386187c2ea0ae7c5b4 100644
--- a/src/matrices/tnlMatrix.h
+++ b/src/matrices/tnlMatrix.h
@@ -55,14 +55,10 @@ class tnlMatrix : public virtual tnlObject
 
    void reset();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getRows() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getColumns() const;
 
    /****
diff --git a/src/matrices/tnlMatrixReader_impl.h b/src/matrices/tnlMatrixReader_impl.h
index 6603692e31132c0de0fef57805c2000f2fcfd11f..699cc23fb3090aca1cb310bf0b72b08a1307bfc1 100644
--- a/src/matrices/tnlMatrixReader_impl.h
+++ b/src/matrices/tnlMatrixReader_impl.h
@@ -126,8 +126,8 @@ bool tnlMatrixReader< Matrix >::verifyMtxFile( std::istream& file,
    long int fileSize = file.tellg();
    if( verbose )
       cout << " Verifying the matrix elements ... " << processedElements << " / " << matrix.getNumberOfMatrixElements()
-           << " -> " << timer.GetTime()
-           << " sec. i.e. " << fileSize / ( timer.GetTime() * ( 1 << 20 ))  << "MB/s." << endl;
+           << " -> " << timer.getTime()
+           << " sec. i.e. " << fileSize / ( timer.getTime() * ( 1 << 20 ))  << "MB/s." << endl;
    return true;
 }
 
@@ -144,8 +144,6 @@ bool tnlMatrixReader< Matrix >::findLineByElement( std::istream& file,
    bool dimensionsLine( false );
    lineNumber = 0;
    tnlTimerRT timer;
-   IndexType currentRow, currentColumn;
-   RealType value;
    while( line.getLine( file ) )
    {
       lineNumber++;
@@ -314,8 +312,8 @@ bool tnlMatrixReader< Matrix >::computeRowLengthsFromMtxFile( std::istream& file
    long int fileSize = file.tellg();
    if( verbose )
       cout << " Counting the matrix elements ... " << numberOfElements / 1000
-           << " thousands  -> " << timer.GetTime()
-           << " sec. i.e. " << fileSize / ( timer.GetTime() * ( 1 << 20 ))  << "MB/s." << endl;
+           << " thousands  -> " << timer.getTime()
+           << " sec. i.e. " << fileSize / ( timer.getTime() * ( 1 << 20 ))  << "MB/s." << endl;
    return true;
 }
 
@@ -357,8 +355,8 @@ bool tnlMatrixReader< Matrix >::readMatrixElementsFromMtxFile( std::istream& fil
    long int fileSize = file.tellg();
    if( verbose )
       cout << " Reading the matrix elements ... " << processedElements << " / " << matrix.getNumberOfMatrixElements()
-              << " -> " << timer.GetTime()
-              << " sec. i.e. " << fileSize / ( timer.GetTime() * ( 1 << 20 ))  << "MB/s." << endl;
+              << " -> " << timer.getTime()
+              << " sec. i.e. " << fileSize / ( timer.getTime() * ( 1 << 20 ))  << "MB/s." << endl;
    return true;
 }
 
diff --git a/src/matrices/tnlMatrixSetter.h b/src/matrices/tnlMatrixSetter.h
index 98c6400f1e9a1c460511cf1cd849a87cddf5deb1..ad6ecb4838f71658b03886731bc1ad9c434a77cf 100644
--- a/src/matrices/tnlMatrixSetter.h
+++ b/src/matrices/tnlMatrixSetter.h
@@ -67,9 +67,7 @@ class tnlMatrixSetter
       public:
 
          template< int EntityDimension >
-#ifdef HAVE_CUDA
-         __device__ __host__
-#endif
+         __cuda_callable__
          static void processEntity( const MeshType& mesh,
                                     TraversalUserData& userData,
                                     const IndexType index )
@@ -85,9 +83,7 @@ class tnlMatrixSetter
       public:
 
          template< int EntityDimensions >
-#ifdef HAVE_CUDA
-         __device__ __host__
-#endif
+         __cuda_callable__
          static void processEntity( const MeshType& mesh,
                                     TraversalUserData& userData,
                                     const IndexType index )
@@ -131,9 +127,7 @@ class tnlMatrixSetter< tnlGrid< Dimensions, Real, Device, Index >,
    {
       public:
 
-#ifdef HAVE_CUDA
-         __device__ __host__
-#endif
+         __cuda_callable__
          static void processCell( const MeshType& mesh,
                                   TraversalUserData& userData,
                                   const IndexType index,
@@ -143,15 +137,25 @@ class tnlMatrixSetter< tnlGrid< Dimensions, Real, Device, Index >,
                      userData.boundaryConditions->getLinearSystemRowLength( mesh, index, coordinates );
          }
 
+         __cuda_callable__
+         static void processFace( const MeshType& mesh,
+                                  TraversalUserData& userData,
+                                  const IndexType index,
+                                  const CoordinatesType& coordinates )
+         {
+             //printf("Matrix setter: Index = %d \n", index );
+            ( *userData.rowLengths )[ index ] =
+                     userData.boundaryConditions->getLinearSystemRowLength( mesh, index, coordinates );
+         }
+         
+
    };
 
    class TraversalInteriorEntitiesProcessor
    {
       public:
 
-#ifdef HAVE_CUDA
-         __device__ __host__
-#endif
+         __cuda_callable__
          static void processCell( const MeshType& mesh,
                                   TraversalUserData& userData,
                                   const IndexType index,
@@ -160,6 +164,18 @@ class tnlMatrixSetter< tnlGrid< Dimensions, Real, Device, Index >,
             ( *userData.rowLengths )[ index ] =
                      userData.differentialOperator->getLinearSystemRowLength( mesh, index, coordinates );
          }
+         
+         __cuda_callable__
+         static void processFace( const MeshType& mesh,
+                                  TraversalUserData& userData,
+                                  const IndexType index,
+                                  const CoordinatesType& coordinates )
+         {
+            // printf("Matrix setter: Index = %d \n", index );
+            ( *userData.rowLengths )[ index ] =
+                     userData.differentialOperator->getLinearSystemRowLength( mesh, index, coordinates );
+         }
+         
 
    };
 
diff --git a/src/matrices/tnlMatrix_impl.h b/src/matrices/tnlMatrix_impl.h
index 72674a61c503abb5afec6a2f232fda848993a08c..f4cb183a2631d91374b97cfac73f1563d67a631d 100644
--- a/src/matrices/tnlMatrix_impl.h
+++ b/src/matrices/tnlMatrix_impl.h
@@ -67,9 +67,7 @@ bool tnlMatrix< Real, Device, Index >::setLike( const tnlMatrix< Real2, Device2,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlMatrix< Real, Device, Index >::getRows() const
 {
    return this->rows;
@@ -78,9 +76,7 @@ Index tnlMatrix< Real, Device, Index >::getRows() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlMatrix< Real, Device, Index >::getColumns() const
 {
    return this->columns;
@@ -249,7 +245,7 @@ void tnlMatrixVectorProductCuda( const Matrix& matrix,
                                  const InVector& inVector,
                                  OutVector& outVector )
 {
-#ifdef HAVE_CUDA
+#ifdef HAVE_CUDA    
    typedef typename Matrix::IndexType IndexType;
    Matrix* kernel_this = tnlCuda::passToDevice( matrix );
    InVector* kernel_inVector = tnlCuda::passToDevice( inVector );
@@ -266,6 +262,7 @@ void tnlMatrixVectorProductCuda( const Matrix& matrix,
                                        kernel_inVector,
                                        kernel_outVector,
                                        gridIdx );
+      checkCudaDevice;
    }
    tnlCuda::freeFromDevice( kernel_this );
    tnlCuda::freeFromDevice( kernel_inVector );
diff --git a/src/matrices/tnlMultidiagonalMatrix.h b/src/matrices/tnlMultidiagonalMatrix.h
index 8c82a66d0bf2af8fd7ac8cf565298caa3384020b..c846e87f0f1d43ecb8a8d914d59cc0d4b1959c7c 100644
--- a/src/matrices/tnlMultidiagonalMatrix.h
+++ b/src/matrices/tnlMultidiagonalMatrix.h
@@ -80,9 +80,7 @@ class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index >
 
    void setValue( const RealType& v );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value );
@@ -91,9 +89,7 @@ class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index >
                     const IndexType column,
                     const RealType& value );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value,
@@ -105,9 +101,7 @@ class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index >
                     const RealType& thisElementMultiplicator = 1.0 );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setRowFast( const IndexType row,
                     const IndexType* columns,
                     const RealType* values,
@@ -119,9 +113,7 @@ class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index >
                 const IndexType numberOfElements );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addRowFast( const IndexType row,
                     const IndexType* columns,
                     const RealType* values,
@@ -134,18 +126,14 @@ class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index >
                 const IndexType numberOfElements,
                 const RealType& thisElementMultiplicator = 1.0 );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    RealType getElementFast( const IndexType row,
                             const IndexType column ) const;
 
    RealType getElement( const IndexType row,
                         const IndexType column ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void getRowFast( const IndexType row,
                     IndexType* columns,
                     RealType* values ) const;
@@ -154,20 +142,14 @@ class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index >
                 IndexType* columns,
                 RealType* values ) const;*/
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    MatrixRow getRow( const IndexType rowIndex );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const MatrixRow getRow( const IndexType rowIndex ) const;
 
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
@@ -207,9 +189,7 @@ class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index >
                          const IndexType column,
                          IndexType& index ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool getElementIndexFast( const IndexType row,
                              const IndexType column,
                              IndexType& index ) const;
diff --git a/src/matrices/tnlMultidiagonalMatrixRow.h b/src/matrices/tnlMultidiagonalMatrixRow.h
index cf5f92e97b59d99d10fff526974bad932d5ec0b9..0ec3b8b19ef7d5370e7dffcd807628ea2ed95501 100644
--- a/src/matrices/tnlMultidiagonalMatrixRow.h
+++ b/src/matrices/tnlMultidiagonalMatrixRow.h
@@ -23,14 +23,10 @@ class tnlMultidiagonalMatrixRow
 {
    public:
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       tnlMultidiagonalMatrixRow();
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       tnlMultidiagonalMatrixRow( Real* values,
                                  Index* diagonals,
                                  const Index maxRowLength,
@@ -38,9 +34,7 @@ class tnlMultidiagonalMatrixRow
                                  const Index columns,
                                  const Index step );
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       void bind( Real* values,
                  Index* diagonals,
                  const Index maxRowLength,
@@ -48,9 +42,7 @@ class tnlMultidiagonalMatrixRow
                  const Index columns,
                  const Index step );
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       void setElement( const Index& elementIndex,
                        const Index& column,
                        const Real& value );
diff --git a/src/matrices/tnlMultidiagonalMatrixRow_impl.h b/src/matrices/tnlMultidiagonalMatrixRow_impl.h
index c9f8135e55faf6f93cf6ded58d0ba7cf13e840aa..3c2864b4e34fa7fcbe0be40d12d618a71c573531 100644
--- a/src/matrices/tnlMultidiagonalMatrixRow_impl.h
+++ b/src/matrices/tnlMultidiagonalMatrixRow_impl.h
@@ -19,9 +19,7 @@
 #define TNLMULTIDIAGONALMATRIXROW_IMPL_H_
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 tnlMultidiagonalMatrixRow< Real, Index >::
 tnlMultidiagonalMatrixRow()
 : values( 0 ),
@@ -34,9 +32,7 @@ tnlMultidiagonalMatrixRow()
 }
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 tnlMultidiagonalMatrixRow< Real, Index >::
 tnlMultidiagonalMatrixRow( Real* values,
                            Index* diagonals,
@@ -54,9 +50,7 @@ tnlMultidiagonalMatrixRow( Real* values,
 }
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlMultidiagonalMatrixRow< Real, Index >::
 bind( Real* values,
@@ -75,9 +69,7 @@ bind( Real* values,
 }
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlMultidiagonalMatrixRow< Real, Index >::
 setElement( const Index& elementIndex,
diff --git a/src/matrices/tnlMultidiagonalMatrix_impl.h b/src/matrices/tnlMultidiagonalMatrix_impl.h
index 9a5b1a135de96714a268a7d181185a771d49a768..39d42f54569c2a450da5e3accc74821e49eaef28 100644
--- a/src/matrices/tnlMultidiagonalMatrix_impl.h
+++ b/src/matrices/tnlMultidiagonalMatrix_impl.h
@@ -222,9 +222,7 @@ void tnlMultidiagonalMatrix< Real, Device, Index >::setValue( const RealType& v
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlMultidiagonalMatrix< Real, Device, Index > :: setElementFast( const IndexType row,
                                                                       const IndexType column,
                                                                       const Real& value )
@@ -254,9 +252,7 @@ bool tnlMultidiagonalMatrix< Real, Device, Index > :: setElement( const IndexTyp
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlMultidiagonalMatrix< Real, Device, Index > :: addElementFast( const IndexType row,
                                                                       const IndexType column,
                                                                       const RealType& value,
@@ -288,9 +284,7 @@ bool tnlMultidiagonalMatrix< Real, Device, Index > :: addElement( const IndexTyp
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlMultidiagonalMatrix< Real, Device, Index > :: setRowFast( const IndexType row,
                                                                   const IndexType* columns,
                                                                   const RealType* values,
@@ -314,9 +308,7 @@ bool tnlMultidiagonalMatrix< Real, Device, Index > :: setRow( const IndexType ro
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlMultidiagonalMatrix< Real, Device, Index > :: addRowFast( const IndexType row,
                                                                   const IndexType* columns,
                                                                   const RealType* values,
@@ -380,9 +372,7 @@ bool tnlMultidiagonalMatrix< Real, Device, Index > :: addRow( const IndexType ro
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real tnlMultidiagonalMatrix< Real, Device, Index >::getElementFast( const IndexType row,
                                                                     const IndexType column ) const
 {
@@ -408,9 +398,7 @@ Real tnlMultidiagonalMatrix< Real, Device, Index >::getElement( const IndexType
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlMultidiagonalMatrix< Real, Device, Index >::getRowFast( const IndexType row,
                                                                 IndexType* columns,
                                                                 RealType* values ) const
@@ -431,9 +419,7 @@ void tnlMultidiagonalMatrix< Real, Device, Index >::getRowFast( const IndexType
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlMultidiagonalMatrix< Real, Device, Index >::MatrixRow
 tnlMultidiagonalMatrix< Real, Device, Index >::
 getRow( const IndexType rowIndex )
@@ -464,9 +450,7 @@ getRow( const IndexType rowIndex )
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlMultidiagonalMatrix< Real, Device, Index >::MatrixRow
 tnlMultidiagonalMatrix< Real, Device, Index >::
 getRow( const IndexType rowIndex ) const
@@ -497,9 +481,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename Vector::RealType tnlMultidiagonalMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row,
                                                                                            const Vector& vector ) const
 {
@@ -710,9 +692,7 @@ bool tnlMultidiagonalMatrix< Real, Device, Index >::getElementIndex( const Index
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlMultidiagonalMatrix< Real, Device, Index >::getElementIndexFast( const IndexType row,
                                                                          const IndexType column,
                                                                          Index& index ) const
@@ -748,6 +728,7 @@ class tnlMultidiagonalMatrixDeviceDependentCode< tnlHost >
       typedef tnlHost Device;
 
       template< typename Index >
+      __cuda_callable__
       static Index getElementIndex( const Index rows,
                                     const Index diagonals,
                                     const Index row,
@@ -777,9 +758,7 @@ class tnlMultidiagonalMatrixDeviceDependentCode< tnlCuda >
       typedef tnlCuda Device;
 
       template< typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       static Index getElementIndex( const Index rows,
                                     const Index diagonals,
                                     const Index row,
diff --git a/src/matrices/tnlSlicedEllpackMatrix.h b/src/matrices/tnlSlicedEllpackMatrix.h
index 6c5bc2c11fa94df593d38572ad22b4a6723d8ba7..b820daecd9baef544d2b381cf99e16cbde23a964 100644
--- a/src/matrices/tnlSlicedEllpackMatrix.h
+++ b/src/matrices/tnlSlicedEllpackMatrix.h
@@ -84,9 +84,7 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
    template< typename Real2, typename Device2, typename Index2 >
    bool operator != ( const tnlSlicedEllpackMatrix< Real2, Device2, Index2 >& matrix ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value );
@@ -95,9 +93,7 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                     const IndexType column,
                     const RealType& value );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value,
@@ -108,9 +104,7 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                     const RealType& value,
                     const RealType& thisElementMultiplicator = 1.0 );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setRowFast( const IndexType row,
                     const IndexType* columnIndexes,
                     const RealType* values,
@@ -121,9 +115,7 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                 const RealType* values,
                 const IndexType elements );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addRowFast( const IndexType row,
                     const IndexType* columns,
                     const RealType* values,
@@ -136,36 +128,26 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                 const IndexType numberOfElements,
                 const RealType& thisElementMultiplicator = 1.0 );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    RealType getElementFast( const IndexType row,
                             const IndexType column ) const;
 
    RealType getElement( const IndexType row,
                         const IndexType column ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void getRowFast( const IndexType row,
                     IndexType* columns,
                     RealType* values ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    MatrixRow getRow( const IndexType rowIndex );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const MatrixRow getRow( const IndexType rowIndex ) const;
 
    template< typename Vector >
-   #ifdef HAVE_CUDA
-      __device__ __host__
-   #endif
+   __cuda_callable__
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
diff --git a/src/matrices/tnlSlicedEllpackMatrix_impl.h b/src/matrices/tnlSlicedEllpackMatrix_impl.h
index 3ac2cbedd7e7d15a35f2ddda574d1f3fb17adb5f..39c4d86bf6593c8da8906724409636b9178be425 100644
--- a/src/matrices/tnlSlicedEllpackMatrix_impl.h
+++ b/src/matrices/tnlSlicedEllpackMatrix_impl.h
@@ -160,9 +160,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::setElementFast( const IndexType row,
                                                                                const IndexType column,
                                                                                const Real& value )
@@ -186,9 +184,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::addElementFast( const IndexType row,
                                                                                const IndexType column,
                                                                                const RealType& value,
@@ -285,9 +281,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize > :: setRowFast( const IndexType row,
                                                                              const IndexType* columnIndexes,
                                                                              const RealType* values,
@@ -356,9 +350,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize > :: addRowFast( const IndexType row,
                                                                              const IndexType* columns,
                                                                              const RealType* values,
@@ -387,9 +379,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::getElementFast( const IndexType row,
                                                                                const IndexType column ) const
 {
@@ -431,9 +421,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::getRowFast( const IndexType row,
                                                                            IndexType* columns,
                                                                            RealType* values ) const
@@ -454,9 +442,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::MatrixRow
 tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::
 getRow( const IndexType rowIndex )
@@ -474,9 +460,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::MatrixRow
 tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::
 getRow( const IndexType rowIndex ) const
@@ -495,9 +479,7 @@ template< typename Real,
           typename Index,
           int SliceSize >
   template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename Vector::RealType tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::rowVectorProduct( const IndexType row,
                                                                                                       const Vector& vector ) const
 {
@@ -724,6 +706,7 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlHost >
       template< typename Real,
                 typename Index,
                 int SliceSize >
+      __cuda_callable__
       static void initRowTraverseFast( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
                                        const Index row,
                                        Index& rowBegin,
@@ -822,9 +805,7 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda >
       template< typename Real,
                 typename Index,
                 int SliceSize >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       static void initRowTraverseFast( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
                                        const Index row,
                                        Index& rowBegin,
@@ -886,6 +867,4 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda >
 
 };
 
-
-
 #endif /* TNLSLICEDELLPACKMATRIX_IMPL_H_ */
diff --git a/src/matrices/tnlSparseMatrix.h b/src/matrices/tnlSparseMatrix.h
index adad7932e4b4fdd11d3de2cfbdd111631be5c573..fa404d7664c3d3b917ad27e55a2cb6b62da214ee 100644
--- a/src/matrices/tnlSparseMatrix.h
+++ b/src/matrices/tnlSparseMatrix.h
@@ -50,9 +50,7 @@ class tnlSparseMatrix : public tnlMatrix< Real, Device, Index >
 
    IndexType getMaxRowLength() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getPaddingIndex() const;
 
    void reset();
diff --git a/src/matrices/tnlSparseMatrixRow.h b/src/matrices/tnlSparseMatrixRow.h
index 74a8506333ec10164856593308e98485e1e273be..7290ffb41ee56299094a6acac25f1651339b8c3a 100644
--- a/src/matrices/tnlSparseMatrixRow.h
+++ b/src/matrices/tnlSparseMatrixRow.h
@@ -24,30 +24,22 @@ class tnlSparseMatrixRow
 {
    public:
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       tnlSparseMatrixRow();
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       tnlSparseMatrixRow( Index* columns,
                           Real* values,
                           const Index length,
                           const Index step );
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       void bind( Index* columns,
                  Real* values,
                  const Index length,
                  const Index step );
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       void setElement( const Index& elementIndex,
                        const Index& column,
                        const Real& value );
diff --git a/src/matrices/tnlSparseMatrixRow_impl.h b/src/matrices/tnlSparseMatrixRow_impl.h
index 296c910cd10989d8b1b5a3e4e5c3c5f690c2a8b7..79f171f158dd1e2362ef42ad163bf77fe5805bb4 100644
--- a/src/matrices/tnlSparseMatrixRow_impl.h
+++ b/src/matrices/tnlSparseMatrixRow_impl.h
@@ -19,9 +19,7 @@
 #define TNLSPARSEMATRIXROW_IMPL_H_
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 tnlSparseMatrixRow< Real, Index >::
 tnlSparseMatrixRow()
 : values( 0 ),
@@ -32,9 +30,7 @@ tnlSparseMatrixRow()
 }
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 tnlSparseMatrixRow< Real, Index >::
 tnlSparseMatrixRow( Index* columns,
                     Real* values,
@@ -48,9 +44,7 @@ tnlSparseMatrixRow( Index* columns,
 }
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlSparseMatrixRow< Real, Index >::
 bind( Index* columns,
@@ -65,9 +59,7 @@ bind( Index* columns,
 }
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlSparseMatrixRow< Real, Index >::
 setElement( const Index& elementIndex,
@@ -77,6 +69,7 @@ setElement( const Index& elementIndex,
    tnlAssert( this->columns, );
    tnlAssert( this->values, );
    tnlAssert( this->step > 0,);
+   //printf( "elementIndex = %d length = %d \n", elementIndex, this->length );
    tnlAssert( elementIndex >= 0 && elementIndex < this->length,
               cerr << "elementIndex = " << elementIndex << " this->length = " << this->length );
 
diff --git a/src/matrices/tnlSparseMatrix_impl.h b/src/matrices/tnlSparseMatrix_impl.h
index 6ec90d0e9414dd62b33c7f3eacf28a75836bdd87..830b54252abb0f7350cf52669578b430319a404d 100644
--- a/src/matrices/tnlSparseMatrix_impl.h
+++ b/src/matrices/tnlSparseMatrix_impl.h
@@ -74,9 +74,7 @@ getMaxRowLength() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlSparseMatrix< Real, Device, Index >::getPaddingIndex() const
 {
    return this->getColumns();
diff --git a/src/matrices/tnlTridiagonalMatrix.h b/src/matrices/tnlTridiagonalMatrix.h
index 25ddda40985582b73e8e9f51558dc2cac16e0268..9a049585b00c01e4da4de4da665570d23b53f129 100644
--- a/src/matrices/tnlTridiagonalMatrix.h
+++ b/src/matrices/tnlTridiagonalMatrix.h
@@ -76,9 +76,7 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
 
    void setValue( const RealType& v );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value );
@@ -87,9 +85,7 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
                     const IndexType column,
                     const RealType& value );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value,
@@ -100,9 +96,7 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
                     const RealType& value,
                     const RealType& thisElementMultiplicator = 1.0 );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setRowFast( const IndexType row,
                     const IndexType* columns,
                     const RealType* values,
@@ -113,9 +107,7 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
                 const RealType* values,
                 const IndexType elements );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addRowFast( const IndexType row,
                     const IndexType* columns,
                     const RealType* values,
@@ -128,36 +120,26 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
                 const IndexType elements,
                 const RealType& thisRowMultiplicator = 1.0 );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    RealType getElementFast( const IndexType row,
                             const IndexType column ) const;
 
    RealType getElement( const IndexType row,
                         const IndexType column ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void getRowFast( const IndexType row,
                     IndexType* columns,
                     RealType* values ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    MatrixRow getRow( const IndexType rowIndex );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const MatrixRow getRow( const IndexType rowIndex ) const;
 
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
@@ -182,9 +164,7 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
 #endif   
 
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void performSORIteration( const Vector& b,
                              const IndexType row,
                              Vector& x,
@@ -202,9 +182,7 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
 
    protected:
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getElementIndex( const IndexType row,
                               const IndexType column ) const;
 
diff --git a/src/matrices/tnlTridiagonalMatrixRow.h b/src/matrices/tnlTridiagonalMatrixRow.h
index 81bac3f54f39d963760347192f7e2029a7d77b8f..4872e67fccb81cb87c0cf668f25402ac8275d8df 100644
--- a/src/matrices/tnlTridiagonalMatrixRow.h
+++ b/src/matrices/tnlTridiagonalMatrixRow.h
@@ -23,30 +23,22 @@ class tnlTridiagonalMatrixRow
 {
    public:
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       tnlTridiagonalMatrixRow();
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       tnlTridiagonalMatrixRow( Real* values,
                                const Index row,
                                const Index columns,
                                const Index step );
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       void bind( Real* values,
                  const Index row,
                  const Index columns,
                  const Index step );
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       void setElement( const Index& elementIndex,
                        const Index& column,
                        const Real& value );
diff --git a/src/matrices/tnlTridiagonalMatrixRow_impl.h b/src/matrices/tnlTridiagonalMatrixRow_impl.h
index b7f364b6117c504b13761847842fe3111713b7bf..c9cd1682fe491c3a6f07e735d28f0541ea9a65f6 100644
--- a/src/matrices/tnlTridiagonalMatrixRow_impl.h
+++ b/src/matrices/tnlTridiagonalMatrixRow_impl.h
@@ -19,9 +19,7 @@
 #define TNLTRIDIAGONALMATRIXROW_IMPL_H_
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 tnlTridiagonalMatrixRow< Real, Index >::
 tnlTridiagonalMatrixRow()
 : values( 0 ),
@@ -32,9 +30,7 @@ tnlTridiagonalMatrixRow()
 }
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 tnlTridiagonalMatrixRow< Real, Index >::
 tnlTridiagonalMatrixRow( Real* values,
                          const Index row,
@@ -48,9 +44,7 @@ tnlTridiagonalMatrixRow( Real* values,
 }
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlTridiagonalMatrixRow< Real, Index >::
 bind( Real* values,
@@ -65,9 +59,7 @@ bind( Real* values,
 }
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlTridiagonalMatrixRow< Real, Index >::
 setElement( const Index& elementIndex,
diff --git a/src/matrices/tnlTridiagonalMatrix_impl.h b/src/matrices/tnlTridiagonalMatrix_impl.h
index 2f379c920b079b998cc37b8de5c224676083adcc..4c0a6fcdbb0b4a55765a277229b2575bd8cdf5df 100644
--- a/src/matrices/tnlTridiagonalMatrix_impl.h
+++ b/src/matrices/tnlTridiagonalMatrix_impl.h
@@ -189,9 +189,7 @@ void tnlTridiagonalMatrix< Real, Device, Index >::setValue( const RealType& v )
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlTridiagonalMatrix< Real, Device, Index >::setElementFast( const IndexType row,
                                                                   const IndexType column,
                                                                   const RealType& value )
@@ -214,9 +212,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setElement( const IndexType ro
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlTridiagonalMatrix< Real, Device, Index >::addElementFast( const IndexType row,
                                                                   const IndexType column,
                                                                   const RealType& value,
@@ -243,9 +239,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::addElement( const IndexType ro
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlTridiagonalMatrix< Real, Device, Index >::setRowFast( const IndexType row,
                                                               const IndexType* columns,
                                                               const RealType* values,
@@ -276,9 +270,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setRow( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlTridiagonalMatrix< Real, Device, Index >::addRowFast( const IndexType row,
                                                               const IndexType* columns,
                                                               const RealType* values,
@@ -329,9 +321,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::addRow( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real tnlTridiagonalMatrix< Real, Device, Index >::getElementFast( const IndexType row,
                                                                   const IndexType column ) const
 {
@@ -354,9 +344,7 @@ Real tnlTridiagonalMatrix< Real, Device, Index >::getElement( const IndexType ro
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlTridiagonalMatrix< Real, Device, Index >::getRowFast( const IndexType row,
                                                               IndexType* columns,
                                                               RealType* values ) const
@@ -377,9 +365,7 @@ void tnlTridiagonalMatrix< Real, Device, Index >::getRowFast( const IndexType ro
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlTridiagonalMatrix< Real, Device, Index >::MatrixRow
 tnlTridiagonalMatrix< Real, Device, Index >::
 getRow( const IndexType rowIndex )
@@ -399,9 +385,7 @@ getRow( const IndexType rowIndex )
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlTridiagonalMatrix< Real, Device, Index >::MatrixRow
 tnlTridiagonalMatrix< Real, Device, Index >::
 getRow( const IndexType rowIndex ) const
@@ -414,9 +398,7 @@ template< typename Real,
           typename Device,
           typename Index >
 template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename Vector::RealType tnlTridiagonalMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row,
                                                                                          const Vector& vector ) const
 {
@@ -547,9 +529,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlTridiagonalMatrix< Real, Device, Index >::performSORIteration( const Vector& b,
                                                                        const IndexType row,
                                                                        Vector& x,
@@ -625,9 +605,7 @@ void tnlTridiagonalMatrix< Real, Device, Index >::print( ostream& str ) const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlTridiagonalMatrix< Real, Device, Index >::getElementIndex( const IndexType row,
                                                                     const IndexType column ) const
 {
@@ -647,6 +625,7 @@ class tnlTridiagonalMatrixDeviceDependentCode< tnlHost >
       typedef tnlHost Device;
 
       template< typename Index >
+      __cuda_callable__
       static Index getElementIndex( const Index rows,
                                     const Index row,
                                     const Index column )
@@ -657,6 +636,7 @@ class tnlTridiagonalMatrixDeviceDependentCode< tnlHost >
       template< typename Vector,
                 typename Index,
                 typename ValuesType  >
+      __cuda_callable__
       static typename Vector::RealType rowVectorProduct( const Index rows,
                                                          const ValuesType& values,
                                                          const Index row,
@@ -695,9 +675,7 @@ class tnlTridiagonalMatrixDeviceDependentCode< tnlCuda >
       typedef tnlCuda Device;
 
       template< typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       static Index getElementIndex( const Index rows,
                                     const Index row,
                                     const Index column )
@@ -708,9 +686,7 @@ class tnlTridiagonalMatrixDeviceDependentCode< tnlCuda >
       template< typename Vector,
                 typename Index,
                 typename ValuesType >
-#ifdef HAVE_CUDA
-      __device__
-#endif
+      __cuda_callable__
       static typename Vector::RealType rowVectorProduct( const Index rows,
                                                          const ValuesType& values,
                                                          const Index row,
diff --git a/src/mesh/tnlGrid1D.h b/src/mesh/tnlGrid1D.h
index 96351cf68bfa14ef8cf9b2c2b014b5e9dbc6616f..30807dc3e813ae3fb24d5bbbe7106376c45548b6 100644
--- a/src/mesh/tnlGrid1D.h
+++ b/src/mesh/tnlGrid1D.h
@@ -49,78 +49,50 @@ class tnlGrid< 1, Real, Device, Index > : public tnlObject
 
    void setDimensions( const CoordinatesType& dimensions );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const CoordinatesType& getDimensions() const;
 
    void setDomain( const VertexType& origin,
                    const VertexType& proportions );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const VertexType& getOrigin() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const VertexType& getProportions() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const VertexType& getCellProportions() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getCellIndex( const CoordinatesType& cellCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    CoordinatesType getCellCoordinates( const Index cellIndex ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getVertexIndex( const CoordinatesType& vertexCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    CoordinatesType getVertexCoordinates( const Index vertexCoordinates ) const;
 
    template< int dx >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getCellNextToCell( const IndexType& cellIndex ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHx() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxSquare() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxSquareInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    RealType getSmallestSpaceStep() const;
 
    /****
@@ -131,9 +103,7 @@ class tnlGrid< 1, Real, Device, Index > : public tnlObject
 #else
    template< typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Vertex getCellCenter( const CoordinatesType& cellCoordinates ) const;
 
 #ifdef HAVE_NOT_CXX11
@@ -141,9 +111,7 @@ class tnlGrid< 1, Real, Device, Index > : public tnlObject
 #else
    template< typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Vertex getCellCenter( const IndexType& cellIndex ) const;
 
 #ifdef HAVE_NOT_CXX11
@@ -151,34 +119,22 @@ class tnlGrid< 1, Real, Device, Index > : public tnlObject
 #else
    template< typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Vertex getVertex( const CoordinatesType& vertexCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getNumberOfCells() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getNumberOfVertices() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool isBoundaryCell( const CoordinatesType& cellCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool isBoundaryCell( const IndexType& cellIndex ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool isBoundaryVertex( const CoordinatesType& vertexCoordinates ) const;
 
    template< typename GridFunction >
diff --git a/src/mesh/tnlGrid1D_impl.h b/src/mesh/tnlGrid1D_impl.h
index 46f0761184c4e1573fa47a1f5688eb911fb2c5a3..61006531c3ce3478c64301de225203d10596c16d 100644
--- a/src/mesh/tnlGrid1D_impl.h
+++ b/src/mesh/tnlGrid1D_impl.h
@@ -109,9 +109,7 @@ void tnlGrid< 1, Real, Device, Index > :: setDimensions( const CoordinatesType&
 template< typename Real,
           typename Device,
           typename Index  >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlGrid< 1, Real, Device, Index >::CoordinatesType&
    tnlGrid< 1, Real, Device, Index > :: getDimensions() const
 {
@@ -132,9 +130,7 @@ void tnlGrid< 1, Real, Device, Index > :: setDomain( const VertexType& origin,
 template< typename Real,
           typename Device,
           typename Index  >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlGrid< 1, Real, Device, Index > :: VertexType& 
   tnlGrid< 1, Real, Device, Index > :: getOrigin() const
 {
@@ -144,9 +140,7 @@ const typename tnlGrid< 1, Real, Device, Index > :: VertexType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlGrid< 1, Real, Device, Index > :: VertexType& 
    tnlGrid< 1, Real, Device, Index > :: getProportions() const
 {
@@ -156,9 +150,7 @@ const typename tnlGrid< 1, Real, Device, Index > :: VertexType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlGrid< 1, Real, Device, Index > :: VertexType& 
    tnlGrid< 1, Real, Device, Index > :: getCellProportions() const
 {
@@ -168,9 +160,7 @@ const typename tnlGrid< 1, Real, Device, Index > :: VertexType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 1, Real, Device, Index > :: getCellIndex( const CoordinatesType& cellCoordinates ) const
 {
    tnlAssert( cellCoordinates.x() >= 0 && cellCoordinates.x() < this->getDimensions().x(),
@@ -183,9 +173,7 @@ Index tnlGrid< 1, Real, Device, Index > :: getCellIndex( const CoordinatesType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlGrid< 1, Real, Device, Index > :: CoordinatesType
 tnlGrid< 1, Real, Device, Index > :: getCellCoordinates( const Index cellIndex ) const
 {
@@ -199,9 +187,7 @@ tnlGrid< 1, Real, Device, Index > :: getCellCoordinates( const Index cellIndex )
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 1, Real, Device, Index > :: getVertexIndex( const CoordinatesType& vertexCoordinates ) const
 {
    tnlAssert( vertexCoordinates.x() >= 0 && vertexCoordinates.x() < this->getDimensions().x() + 1,
@@ -214,9 +200,7 @@ Index tnlGrid< 1, Real, Device, Index > :: getVertexIndex( const CoordinatesType
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlGrid< 1, Real, Device, Index > :: CoordinatesType
 tnlGrid< 1, Real, Device, Index > :: getVertexCoordinates( const Index vertexIndex ) const
 {
@@ -231,9 +215,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< int dx >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 1, Real, Device, Index > :: getCellNextToCell( const IndexType& cellIndex ) const
 {
    tnlAssert( cellIndex + dx >= 0 &&
@@ -248,9 +230,7 @@ Index tnlGrid< 1, Real, Device, Index > :: getCellNextToCell( const IndexType& c
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 1, Real, Device, Index > :: getHx() const
 {
    return this->hx;
@@ -259,9 +239,7 @@ const Real& tnlGrid< 1, Real, Device, Index > :: getHx() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 1, Real, Device, Index > :: getHxSquare() const
 {
    return this->hxSquare;
@@ -270,9 +248,7 @@ const Real& tnlGrid< 1, Real, Device, Index > :: getHxSquare() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 1, Real, Device, Index > :: getHxInverse() const
 {
    return this->hxInverse;
@@ -281,9 +257,7 @@ const Real& tnlGrid< 1, Real, Device, Index > :: getHxInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 1, Real, Device, Index > :: getHxSquareInverse() const
 {
    return this->hxSquareInverse;
@@ -292,9 +266,7 @@ const Real& tnlGrid< 1, Real, Device, Index > :: getHxSquareInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real tnlGrid< 1, Real, Device, Index > :: getSmallestSpaceStep() const
 {
    return this->hx;
@@ -304,9 +276,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vertex >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Vertex tnlGrid< 1, Real, Device, Index >::getCellCenter( const CoordinatesType& cellCoordinates ) const
 {
    tnlAssert( cellCoordinates.x() >= 0 && cellCoordinates.x() < this->getDimensions().x(),
@@ -320,9 +290,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vertex >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Vertex tnlGrid< 1, Real, Device, Index >::getCellCenter( const IndexType& cellIndex ) const
 {
    tnlAssert( cellIndex >= 0 && cellIndex < this->getNumberOfCells(),
@@ -336,9 +304,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vertex >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Vertex tnlGrid< 1, Real, Device, Index >::getVertex( const CoordinatesType& vertexCoordinates ) const
 {
    tnlAssert( vertexCoordinates.x() >= 0 && vertexCoordinates.x() < this->getDimensions().x() + 1,
@@ -351,9 +317,7 @@ Vertex tnlGrid< 1, Real, Device, Index >::getVertex( const CoordinatesType& vert
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 1, Real, Device, Index > :: getNumberOfCells() const
 {
    return this->numberOfCells;
@@ -362,9 +326,7 @@ Index tnlGrid< 1, Real, Device, Index > :: getNumberOfCells() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 1, Real, Device, Index > :: getNumberOfVertices() const
 {
    return this->numberOfVertices;
@@ -373,9 +335,7 @@ Index tnlGrid< 1, Real, Device, Index > :: getNumberOfVertices() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlGrid< 1, Real, Device, Index > :: isBoundaryCell( const CoordinatesType& cellCoordinates ) const
 {
    tnlAssert( cellCoordinates.x() >= 0 && cellCoordinates.x() < this->getDimensions().x(),
@@ -390,9 +350,7 @@ bool tnlGrid< 1, Real, Device, Index > :: isBoundaryCell( const CoordinatesType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool
 tnlGrid< 1, Real, Device, Index >::
 isBoundaryCell( const IndexType& cellIndex ) const
@@ -407,9 +365,7 @@ isBoundaryCell( const IndexType& cellIndex ) const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlGrid< 1, Real, Device, Index > :: isBoundaryVertex( const CoordinatesType& vertexCoordinates ) const
 {
    tnlAssert( vertexCoordinates.x() >= 0 && vertexCoordinates.x() < this->getDimensions().x() + 1,
diff --git a/src/mesh/tnlGrid2D.h b/src/mesh/tnlGrid2D.h
index 0d11cd0e959c3e680cdc8696f384610af7f6d258..286b7f08730884f31a5b50bb9d805a83140de80c 100644
--- a/src/mesh/tnlGrid2D.h
+++ b/src/mesh/tnlGrid2D.h
@@ -49,131 +49,83 @@ class tnlGrid< 2, Real, Device, Index > : public tnlObject
 
    void setDimensions( const CoordinatesType& dimensions );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const CoordinatesType& getDimensions() const;
 
    void setDomain( const VertexType& origin,
                    const VertexType& proportions );
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const VertexType& getOrigin() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const VertexType& getProportions() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const VertexType& getCellProportions() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getCellIndex( const CoordinatesType& cellCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    CoordinatesType getCellCoordinates( const IndexType cellIndex ) const;
 
    template< int nx, int ny >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getFaceIndex( const CoordinatesType& faceCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    CoordinatesType getFaceCoordinates( const Index faceIndex, int& nx, int& ny ) const;
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getVertexIndex( const CoordinatesType& vertexCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    CoordinatesType getVertexCoordinates( const Index vertexIndex ) const;
 
    template< int dx, int dy >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getCellNextToCell( const IndexType& cellIndex ) const;
 
    template< int nx, int ny >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getFaceNextToCell( const IndexType& cellIndex ) const;
 
    template< int nx, int ny >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getCellNextToFace( const IndexType& cellIndex ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHx() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxSquare() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxSquareInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHy() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHySquare() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHyInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHySquareInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxHy() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxHyInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    RealType getSmallestSpaceStep() const;
 
 
@@ -185,9 +137,7 @@ class tnlGrid< 2, Real, Device, Index > : public tnlObject
 #else
    template< typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Vertex getCellCenter( const CoordinatesType& cellCoordinates ) const;
 
 #ifdef HAVE_NOT_CXX11
@@ -195,9 +145,7 @@ class tnlGrid< 2, Real, Device, Index > : public tnlObject
 #else
    template< typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Vertex getCellCenter( const IndexType& cellIndex ) const;
 
 
@@ -206,9 +154,7 @@ class tnlGrid< 2, Real, Device, Index > : public tnlObject
 #else
    template< int nx, int ny, typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Vertex getFaceCenter( const CoordinatesType& faceCoordinates ) const;
 
 #ifdef HAVE_NOT_CXX11
@@ -216,14 +162,10 @@ class tnlGrid< 2, Real, Device, Index > : public tnlObject
 #else
    template< typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Vertex getVertex( const CoordinatesType& vertexCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getNumberOfCells() const;
 
 #ifdef HAVE_NOT_CXX11
@@ -233,35 +175,23 @@ class tnlGrid< 2, Real, Device, Index > : public tnlObject
    template< int nx = 1,
              int ny = 1 >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getNumberOfFaces() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getNumberOfVertices() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool isBoundaryCell( const CoordinatesType& cellCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool isBoundaryCell( const IndexType& cellIndex ) const;
 
    template< int nx, int ny >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool isBoundaryFace( const CoordinatesType& faceCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool isBoundaryVertex( const CoordinatesType& vertexCoordinates ) const;
 
    template< typename GridFunction >
@@ -302,6 +232,7 @@ class tnlGrid< 2, Real, Device, Index > : public tnlObject
 
    protected:
 
+   __cuda_callable__
    void computeSpaceSteps();
 
    CoordinatesType dimensions;
diff --git a/src/mesh/tnlGrid2D_impl.h b/src/mesh/tnlGrid2D_impl.h
index 5ab7dff83755d30377a6f8faa5fb30b4701b1176..46da534eb5b7cfcada12c0078dfb0bb22a30627b 100644
--- a/src/mesh/tnlGrid2D_impl.h
+++ b/src/mesh/tnlGrid2D_impl.h
@@ -76,6 +76,7 @@ tnlString tnlGrid< 2, Real, Device, Index > :: getSerializationTypeVirtual() con
 template< typename Real,
           typename Device,
           typename Index >
+__cuda_callable__
 void tnlGrid< 2, Real, Device, Index > :: computeSpaceSteps()
 {
    if( this->getDimensions().x() > 0 && this->getDimensions().y() > 0 )
@@ -124,9 +125,7 @@ void tnlGrid< 2, Real, Device, Index > :: setDimensions( const CoordinatesType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlGrid< 2, Real, Device, Index >::CoordinatesType&
 tnlGrid< 2, Real, Device, Index > :: getDimensions() const
 {
@@ -147,9 +146,7 @@ void tnlGrid< 2, Real, Device, Index > :: setDomain( const VertexType& origin,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlGrid< 2, Real, Device, Index >::VertexType&
 tnlGrid< 2, Real, Device, Index >::getOrigin() const
 {
@@ -159,9 +156,7 @@ tnlGrid< 2, Real, Device, Index >::getOrigin() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlGrid< 2, Real, Device, Index > :: VertexType&
    tnlGrid< 2, Real, Device, Index > :: getProportions() const
 {
@@ -171,9 +166,7 @@ const typename tnlGrid< 2, Real, Device, Index > :: VertexType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlGrid< 2, Real, Device, Index > :: VertexType&
 tnlGrid< 2, Real, Device, Index > :: getCellProportions() const
 {
@@ -183,9 +176,7 @@ tnlGrid< 2, Real, Device, Index > :: getCellProportions() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 2, Real, Device, Index > :: getCellIndex( const CoordinatesType& cellCoordinates ) const
 {
    tnlAssert( cellCoordinates.x() >= 0 && cellCoordinates.x() < this->getDimensions().x(),
@@ -203,9 +194,7 @@ Index tnlGrid< 2, Real, Device, Index > :: getCellIndex( const CoordinatesType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlGrid< 2, Real, Device, Index >::CoordinatesType
 tnlGrid< 2, Real, Device, Index >::getCellCoordinates( const Index cellIndex ) const
 {
@@ -220,9 +209,7 @@ template< typename Real,
           typename Device,
           typename Index >
 template< int nx, int ny >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 2, Real, Device, Index >::getFaceIndex( const CoordinatesType& faceCoordinates ) const
 {
    tnlStaticAssert( nx >= 0 && ny >= 0 && nx + ny == 1, "Wrong template parameters nx or ny." );
@@ -252,15 +239,13 @@ Index tnlGrid< 2, Real, Device, Index >::getFaceIndex( const CoordinatesType& fa
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 typename tnlGrid< 2, Real, Device, Index >::CoordinatesType
 tnlGrid< 2, Real, Device, Index >::getFaceCoordinates( const Index faceIndex, int& nx, int& ny ) const
 {
-   tnlAssert( faceIndex >= 0 && faceIndex < this->getNumberOfFaces(),
+   tnlAssert( faceIndex >= 0 && faceIndex < ( this->template getNumberOfFaces< 1, 1 >() ),
               cerr << " faceIndex = " << faceIndex
-                   << " this->getNumberOfFaces() = " << this->getNumberOfFaces()
+                   << " this->getNumberOfFaces() = " << ( this->template getNumberOfFaces< 1, 1 >() )
                    << " this->getName() " << this->getName(); );
    if( faceIndex < this->numberOfNxFaces )
    {
@@ -279,9 +264,7 @@ tnlGrid< 2, Real, Device, Index >::getFaceCoordinates( const Index faceIndex, in
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 2, Real, Device, Index > :: getVertexIndex( const CoordinatesType& vertexCoordinates ) const
 {
    tnlAssert( vertexCoordinates.x() >= 0 && vertexCoordinates.x() < this->getDimensions().x() + 1,
@@ -298,9 +281,7 @@ Index tnlGrid< 2, Real, Device, Index > :: getVertexIndex( const CoordinatesType
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlGrid< 2, Real, Device, Index > :: CoordinatesType
 tnlGrid< 2, Real, Device, Index > :: getVertexCoordinates( const Index vertexIndex ) const
 {
@@ -316,9 +297,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< int dx, int dy >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 2, Real, Device, Index >::getCellNextToCell( const IndexType& cellIndex ) const
 {
    const IndexType result = cellIndex + dx + dy * this->getDimensions().x();
@@ -336,9 +315,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< int nx, int ny >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 2, Real, Device, Index >::getFaceNextToCell( const IndexType& cellIndex ) const
 {
    tnlAssert( nx * ny == 0 && nx + ny != 0,
@@ -350,11 +327,11 @@ Index tnlGrid< 2, Real, Device, Index >::getFaceNextToCell( const IndexType& cel
    if( ny )
       result = this->numberOfNxFaces + cellIndex + ( ny + ( ny < 0 ) ) * this->getDimensions().x();
    tnlAssert( result >= 0 &&
-              result < this->getNumberOfFaces(),
+              result < ( this->template getNumberOfFaces< 1, 1 >() ),
               cerr << " cellIndex = " << cellIndex
                    << " nx = " << nx
                    << " ny = " << ny
-                   << " this->getNumberOfCells() = " << this->getNumberOfCells()
+                   << " this->getNumberOfCells() = " << ( this->template getNumberOfCells< 1, 1 >() )
                    << " this->getName() " << this->getName(); );
    return result;
 }
@@ -363,9 +340,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< int nx, int ny >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 2, Real, Device, Index >::getCellNextToFace( const IndexType& faceIndex ) const
 {
    tnlAssert( abs( nx ) + abs( ny ) == 1,
@@ -394,9 +369,7 @@ Index tnlGrid< 2, Real, Device, Index >::getCellNextToFace( const IndexType& fac
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 2, Real, Device, Index > :: getHx() const
 {
    return this->hx;
@@ -405,9 +378,7 @@ const Real& tnlGrid< 2, Real, Device, Index > :: getHx() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 2, Real, Device, Index > :: getHxSquare() const
 {
    return this->hxSquare;
@@ -416,9 +387,7 @@ const Real& tnlGrid< 2, Real, Device, Index > :: getHxSquare() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 2, Real, Device, Index > :: getHxInverse() const
 {
    return this->hxInverse;
@@ -427,9 +396,7 @@ const Real& tnlGrid< 2, Real, Device, Index > :: getHxInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 2, Real, Device, Index > :: getHxSquareInverse() const
 {
    return this->hxSquareInverse;
@@ -438,9 +405,7 @@ const Real& tnlGrid< 2, Real, Device, Index > :: getHxSquareInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 2, Real, Device, Index > :: getHy() const
 {
    return this->hy;
@@ -449,9 +414,7 @@ const Real& tnlGrid< 2, Real, Device, Index > :: getHy() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 2, Real, Device, Index > :: getHySquare() const
 {
    return this->hySquare;
@@ -460,9 +423,7 @@ const Real& tnlGrid< 2, Real, Device, Index > :: getHySquare() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 2, Real, Device, Index > :: getHyInverse() const
 {
    return this->hyInverse;
@@ -471,9 +432,7 @@ const Real& tnlGrid< 2, Real, Device, Index > :: getHyInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 2, Real, Device, Index > :: getHySquareInverse() const
 {
    return this->hySquareInverse;
@@ -482,9 +441,7 @@ const Real& tnlGrid< 2, Real, Device, Index > :: getHySquareInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 2, Real, Device, Index > :: getHxHy() const
 {
    return this->hxhy;
@@ -493,9 +450,7 @@ const Real& tnlGrid< 2, Real, Device, Index > :: getHxHy() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 2, Real, Device, Index > :: getHxHyInverse() const
 {
    return this->hxhyInverse;
@@ -504,9 +459,7 @@ const Real& tnlGrid< 2, Real, Device, Index > :: getHxHyInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real tnlGrid< 2, Real, Device, Index > :: getSmallestSpaceStep() const
 {
    return Min( this->hx, this->hy );
@@ -516,9 +469,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vertex >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Vertex tnlGrid< 2, Real, Device, Index > :: getCellCenter( const CoordinatesType& cellCoordinates ) const
 {
    tnlAssert( cellCoordinates.x() >= 0 && cellCoordinates.x() < this->getDimensions().x(),
@@ -538,9 +489,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vertex >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Vertex tnlGrid< 2, Real, Device, Index >::getCellCenter( const IndexType& cellIndex ) const
 {
    tnlAssert( cellIndex >= 0 && cellIndex < this->getNumberOfCells(),
@@ -554,9 +503,7 @@ template< typename Real,
           typename Device,
           typename Index >
 template< int nx, int ny, typename Vertex >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Vertex tnlGrid< 2, Real, Device, Index > :: getFaceCenter( const CoordinatesType& faceCoordinates ) const
 {
    tnlStaticAssert( nx >= 0 && ny >= 0 && nx + ny == 1, "Wrong template parameters nx or ny." );
@@ -593,9 +540,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vertex >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Vertex tnlGrid< 2, Real, Device, Index >::getVertex( const CoordinatesType& vertexCoordinates ) const
 {
    tnlAssert( vertexCoordinates.x() >= 0 && vertexCoordinates.x() < this->getDimensions().x() + 1,
@@ -614,9 +559,7 @@ Vertex tnlGrid< 2, Real, Device, Index >::getVertex( const CoordinatesType& vert
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 2, Real, Device, Index > :: getNumberOfCells() const
 {
    return this->numberOfCells;
@@ -627,9 +570,7 @@ template< typename Real,
           typename Index >
    template< int nx,
              int ny >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 2, Real, Device, Index > :: getNumberOfFaces() const
 {
    return nx * this->numberOfNxFaces + ny * this->numberOfNyFaces;
@@ -638,9 +579,7 @@ Index tnlGrid< 2, Real, Device, Index > :: getNumberOfFaces() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 2, Real, Device, Index > :: getNumberOfVertices() const
 {
    return this->numberOfVertices;
@@ -649,9 +588,7 @@ Index tnlGrid< 2, Real, Device, Index > :: getNumberOfVertices() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlGrid< 2, Real, Device, Index > :: isBoundaryCell( const CoordinatesType& cellCoordinates ) const
 {
    tnlAssert( cellCoordinates.x() >= 0 && cellCoordinates.x() < this->getDimensions().x(),
@@ -673,9 +610,7 @@ bool tnlGrid< 2, Real, Device, Index > :: isBoundaryCell( const CoordinatesType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool
 tnlGrid< 2, Real, Device, Index >::
 isBoundaryCell( const IndexType& cellIndex ) const
@@ -692,9 +627,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< int nx, int ny >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 bool tnlGrid< 2, Real, Device, Index > :: isBoundaryFace( const CoordinatesType& faceCoordinates ) const
 {
    tnlStaticAssert( nx >= 0 && ny >= 0 && nx + ny == 1, "Wrong template parameters nx or ny." );
@@ -729,9 +662,7 @@ bool tnlGrid< 2, Real, Device, Index > :: isBoundaryFace( const CoordinatesType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlGrid< 2, Real, Device, Index > :: isBoundaryVertex( const CoordinatesType& vertexCoordinates ) const
 {
    tnlAssert( vertexCoordinates.x() >= 0 && vertexCoordinates.x() < this->getDimensions().x() + 1,
diff --git a/src/mesh/tnlGrid3D.h b/src/mesh/tnlGrid3D.h
index fb2d0ac080b16f8bfcd652a6250fc4dfd3e9d48c..9db36a288cd37b1bae31beaaa814fdbb45c78d5b 100644
--- a/src/mesh/tnlGrid3D.h
+++ b/src/mesh/tnlGrid3D.h
@@ -49,181 +49,113 @@ class tnlGrid< 3, Real, Device, Index > : public tnlObject
 
    void setDimensions( const CoordinatesType& );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const CoordinatesType& getDimensions() const;
 
    void setDomain( const VertexType& origin,
                    const VertexType& proportions );
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const VertexType& getOrigin() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const VertexType& getProportions() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const VertexType& getCellProportions() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getCellIndex( const CoordinatesType& cellCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    CoordinatesType getCellCoordinates( const IndexType cellIndex ) const;
 
    template< int nx, int ny, int nz >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getFaceIndex( const CoordinatesType& faceCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    CoordinatesType getFaceCoordinates( const Index faceIndex, int& nx, int& ny, int& nz ) const;
 
    template< int dx, int dy, int dz >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getEdgeIndex( const CoordinatesType& edgeCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    CoordinatesType getEdgeCoordinates( const Index edgeIndex, int& dx, int& dy, int& dz ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getVertexIndex( const CoordinatesType& vertexCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    CoordinatesType getVertexCoordinates( const Index vertexIndex ) const;
 
    template< int dx, int dy, int dz >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getCellNextToCell( const IndexType& cellIndex ) const;
 
    template< int nx, int ny, int nz >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getFaceNextToCell( const IndexType& cellIndex ) const;
 
    template< int nx, int ny, int nz >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getCellNextToFace( const IndexType& cellIndex ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHx() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxSquare() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxSquareInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHy() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHySquare() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHyInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHySquareInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHz() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHzSquare() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHzInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHzSquareInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxHy() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxHz() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHyHz() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxHyInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxHzInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHyHzInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    RealType getSmallestSpaceStep() const;
 
    /****
@@ -234,9 +166,7 @@ class tnlGrid< 3, Real, Device, Index > : public tnlObject
 #else
    template< typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Vertex getCellCenter( const CoordinatesType& cellCoordinates ) const;
 
 #ifdef HAVE_NOT_CXX11
@@ -244,9 +174,7 @@ class tnlGrid< 3, Real, Device, Index > : public tnlObject
 #else
    template< typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Vertex getCellCenter( const IndexType& cellIndex ) const;
 
 #ifdef HAVE_NOT_CXX11
@@ -254,9 +182,7 @@ class tnlGrid< 3, Real, Device, Index > : public tnlObject
 #else
    template< int nx, int ny, int nz, typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Vertex getFaceCenter( const CoordinatesType& faceCoordinates ) const;
 
 #ifdef HAVE_NOT_CXX11
@@ -264,9 +190,7 @@ class tnlGrid< 3, Real, Device, Index > : public tnlObject
 #else
    template< int dx, int dy, int dz, typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Vertex getEdgeCenter( const CoordinatesType& edgeCoordinates ) const;
 
 #ifdef HAVE_NOT_CXX11
@@ -274,14 +198,10 @@ class tnlGrid< 3, Real, Device, Index > : public tnlObject
 #else
    template< typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Vertex getVertex( const CoordinatesType& vertexCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getNumberOfCells() const;
 
 #ifdef HAVE_NOT_CXX11
@@ -293,9 +213,7 @@ class tnlGrid< 3, Real, Device, Index > : public tnlObject
              int ny = 1,
              int nz = 1 >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getNumberOfFaces() const;
 
 #ifdef HAVE_NOT_CXX11
@@ -307,41 +225,27 @@ class tnlGrid< 3, Real, Device, Index > : public tnlObject
              int dy = 1,
              int dz = 1 >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getNumberOfEdges() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getNumberOfVertices() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool isBoundaryCell( const CoordinatesType& cellCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool isBoundaryCell( const IndexType& cellIndex ) const;
 
    template< int nx, int ny, int nz >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool isBoundaryFace( const CoordinatesType& faceCoordinates ) const;
 
    template< int dx, int dy, int dz >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool isBoundaryEdge( const CoordinatesType& edgeCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool isBoundaryVertex( const CoordinatesType& vertexCoordinates ) const;
 
    template< typename GridFunction >
diff --git a/src/mesh/tnlGrid3D_impl.h b/src/mesh/tnlGrid3D_impl.h
index 16daf5783152f2b0a6f4f8a5533cd637a2f7a9d3..147f038e595789a7d6ce4d0d4d4b74aa0fb18a4c 100644
--- a/src/mesh/tnlGrid3D_impl.h
+++ b/src/mesh/tnlGrid3D_impl.h
@@ -154,9 +154,7 @@ void tnlGrid< 3, Real, Device, Index > :: setDimensions( const CoordinatesType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlGrid< 3, Real, Device, Index > :: CoordinatesType&
    tnlGrid< 3, Real, Device, Index > :: getDimensions() const
 {
@@ -177,9 +175,7 @@ void tnlGrid< 3, Real, Device, Index > :: setDomain( const VertexType& origin,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlGrid< 3, Real, Device, Index >::VertexType&
 tnlGrid< 3, Real, Device, Index >::getOrigin() const
 {
@@ -189,9 +185,7 @@ tnlGrid< 3, Real, Device, Index >::getOrigin() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlGrid< 3, Real, Device, Index > :: VertexType&
    tnlGrid< 3, Real, Device, Index > :: getProportions() const
 {
@@ -201,9 +195,7 @@ const typename tnlGrid< 3, Real, Device, Index > :: VertexType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlGrid< 3, Real, Device, Index > :: VertexType&
    tnlGrid< 3, Real, Device, Index > :: getCellProportions() const
 {
@@ -251,9 +243,7 @@ template< typename Real,
           typename Device,
           typename Index >
 template< int nx, int ny, int nz >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 3, Real, Device, Index >::getFaceIndex( const CoordinatesType& faceCoordinates ) const
 {
    tnlStaticAssert( nx >= 0 && ny >= 0 && nz >= 0 && nx + ny + nz == 1, "Wrong template parameters nx or ny or nz." );
@@ -312,9 +302,7 @@ Index tnlGrid< 3, Real, Device, Index >::getFaceIndex( const CoordinatesType& fa
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 typename tnlGrid< 3, Real, Device, Index >::CoordinatesType
 tnlGrid< 3, Real, Device, Index >::getFaceCoordinates( const Index faceIndex, int& nx, int& ny, int& nz ) const
 {
@@ -356,9 +344,7 @@ template< typename Real,
           typename Device,
           typename Index >
 template< int dx, int dy, int dz >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 3, Real, Device, Index > :: getEdgeIndex( const CoordinatesType& edgeCoordinates ) const
 {
    tnlStaticAssert( dx >= 0 && dy >= 0 && dz >= 0 && dx + dy + dz == 1, "Wrong template parameters dx or dy or dz.");
@@ -417,9 +403,7 @@ Index tnlGrid< 3, Real, Device, Index > :: getEdgeIndex( const CoordinatesType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 typename tnlGrid< 3, Real, Device, Index > :: CoordinatesType
 tnlGrid< 3, Real, Device, Index > :: getEdgeCoordinates( const Index edgeIndex, int& dx, int& dy, int& dz ) const
 {
@@ -463,9 +447,7 @@ tnlGrid< 3, Real, Device, Index > :: getEdgeCoordinates( const Index edgeIndex,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 3, Real, Device, Index > :: getVertexIndex( const CoordinatesType& vertexCoordinates ) const
 {
    tnlAssert( vertexCoordinates.x() >= 0 && vertexCoordinates.x() < this->getDimensions().x() + 1,
@@ -487,9 +469,7 @@ Index tnlGrid< 3, Real, Device, Index > :: getVertexIndex( const CoordinatesType
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlGrid< 3, Real, Device, Index > :: CoordinatesType
 tnlGrid< 3, Real, Device, Index > :: getVertexCoordinates( const Index vertexIndex ) const
 {
@@ -509,9 +489,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< int dx, int dy, int dz >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 3, Real, Device, Index > :: getCellNextToCell( const IndexType& cellIndex ) const
 {
    tnlAssert( cellIndex + dx >= 0 &&
@@ -535,9 +513,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< int nx, int ny, int nz >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 3, Real, Device, Index >::getFaceNextToCell( const IndexType& cellIndex ) const
 {
    tnlAssert( nx * ny * nz == 0 && nx + ny + nz != 0,
@@ -566,9 +542,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< int nx, int ny, int nz >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 3, Real, Device, Index >::getCellNextToFace( const IndexType& faceIndex ) const
 {
    tnlAssert( abs( nx ) + abs( ny ) + abs( nz ) == 1,
@@ -616,9 +590,7 @@ Index tnlGrid< 3, Real, Device, Index >::getCellNextToFace( const IndexType& fac
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHx() const
 {
    return this->hx;
@@ -627,9 +599,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHx() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHxSquare() const
 {
    return this->hxSquare;
@@ -638,9 +608,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHxSquare() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHxInverse() const
 {
    return this->hxInverse;
@@ -649,9 +617,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHxInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHxSquareInverse() const
 {
    return this->hxSquareInverse;
@@ -660,9 +626,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHxSquareInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHy() const
 {
    return this->hy;
@@ -671,9 +635,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHy() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHySquare() const
 {
    return this->hySquare;
@@ -682,9 +644,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHySquare() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHyInverse() const
 {
    return this->hyInverse;
@@ -693,9 +653,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHyInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHySquareInverse() const
 {
    return this->hySquareInverse;
@@ -704,9 +662,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHySquareInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHz() const
 {
    return this->hz;
@@ -715,9 +671,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHz() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHzSquare() const
 {
    return this->hzSquare;
@@ -726,9 +680,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHzSquare() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHzInverse() const
 {
    return this->hzInverse;
@@ -737,9 +689,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHzInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHzSquareInverse() const
 {
    return this->hzSquareInverse;
@@ -748,9 +698,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHzSquareInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHxHy() const
 {
    return this->hxhy;
@@ -759,9 +707,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHxHy() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHxHz() const
 {
    return this->hxhz;
@@ -770,9 +716,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHxHz() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHyHz() const
 {
    return this->hyhz;
@@ -781,9 +725,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHyHz() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHxHyInverse() const
 {
    return this->hxhyInverse;
@@ -792,9 +734,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHxHyInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHxHzInverse() const
 {
    return this->hxhzInverse;
@@ -803,9 +743,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHxHzInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHyHzInverse() const
 {
    return this->hyhzInverse;
@@ -814,9 +752,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHyHzInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real tnlGrid< 3, Real, Device, Index > :: getSmallestSpaceStep() const
 {
    return Min( this->hx, Min( this->hy, this->hz ) );
@@ -827,9 +763,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vertex >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Vertex tnlGrid< 3, Real, Device, Index > :: getCellCenter( const CoordinatesType& cellCoordinates ) const
 {
    tnlAssert( cellCoordinates.x() >= 0 && cellCoordinates.x() < this->getDimensions().x(),
@@ -855,9 +789,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vertex >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Vertex tnlGrid< 3, Real, Device, Index >::getCellCenter( const IndexType& cellIndex ) const
 {
    tnlAssert( cellIndex >= 0 && cellIndex < this->getNumberOfCells(),
@@ -871,9 +803,7 @@ template< typename Real,
           typename Device,
           typename Index >
 template< int nx, int ny, int nz, typename Vertex >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Vertex tnlGrid< 3, Real, Device, Index > :: getFaceCenter( const CoordinatesType& faceCoordinates ) const
 {
    tnlStaticAssert( nx >= 0 && ny >= 0 && nz >= 0 && nx + ny + nz == 1, "Wrong template parameters nx or ny or nz." );
@@ -938,9 +868,7 @@ template< typename Real,
           typename Device,
           typename Index >
 template< int dx, int dy, int dz, typename Vertex >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Vertex tnlGrid< 3, Real, Device, Index > :: getEdgeCenter( const CoordinatesType& edgeCoordinates ) const
 {
    tnlStaticAssert( dx >= 0 && dy >= 0 && dz >= 0 && dx + dy + dz == 1, "Wrong template parameters nx or ny or nz." );
@@ -1004,9 +932,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vertex >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Vertex tnlGrid< 3, Real, Device, Index >::getVertex( const CoordinatesType& vertexCoordinates ) const
 {
    tnlAssert( vertexCoordinates.x() >= 0 && vertexCoordinates.x() < this->getDimensions().x() + 1,
@@ -1030,9 +956,7 @@ Vertex tnlGrid< 3, Real, Device, Index >::getVertex( const CoordinatesType& vert
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 3, Real, Device, Index > :: getNumberOfCells() const
 {
    return this->numberOfCells;
@@ -1042,9 +966,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< int nx, int ny, int nz >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 3, Real, Device, Index > :: getNumberOfFaces() const
 {
    return nx * this->numberOfNxFaces +
@@ -1056,9 +978,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< int dx, int dy, int dz >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 3, Real, Device, Index > :: getNumberOfEdges() const
 {
    return dx * this->numberOfDxEdges +
@@ -1069,9 +989,7 @@ Index tnlGrid< 3, Real, Device, Index > :: getNumberOfEdges() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 3, Real, Device, Index > :: getNumberOfVertices() const
 {
    return numberOfVertices;
@@ -1080,9 +998,7 @@ Index tnlGrid< 3, Real, Device, Index > :: getNumberOfVertices() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlGrid< 3, Real, Device, Index > :: isBoundaryCell( const CoordinatesType& cellCoordinates ) const
 {
    tnlAssert( cellCoordinates.x() >= 0 && cellCoordinates.x() < this->getDimensions().x(),
@@ -1109,9 +1025,7 @@ bool tnlGrid< 3, Real, Device, Index > :: isBoundaryCell( const CoordinatesType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool
 tnlGrid< 3, Real, Device, Index >::
 isBoundaryCell( const IndexType& cellIndex ) const
@@ -1127,9 +1041,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< int nx, int ny, int nz >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 bool tnlGrid< 3, Real, Device, Index > :: isBoundaryFace( const CoordinatesType& faceCoordinates ) const
 {
    tnlStaticAssert( nx >= 0 && ny >= 0 && nz >=0 && nx + ny + nz == 1, "Wrong template parameters nx or ny or nz." );
@@ -1185,9 +1097,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< int dx, int dy, int dz >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 bool tnlGrid< 3, Real, Device, Index >::isBoundaryEdge( const CoordinatesType& edgeCoordinates ) const
 {
    tnlStaticAssert( dx >= 0 && dy >= 0 && dz >= 0 && dx + dy + dz == 1, "Wrong template parameters nx or ny or nz." );
@@ -1254,9 +1164,7 @@ bool tnlGrid< 3, Real, Device, Index >::isBoundaryEdge( const CoordinatesType& e
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlGrid< 3, Real, Device, Index > :: isBoundaryVertex( const CoordinatesType& vertexCoordinates ) const
 {
    tnlAssert( vertexCoordinates.x() >= 0 && vertexCoordinates.x() < this->getDimensions().x() + 1,
diff --git a/src/mesh/tnlGrid_impl.cpp b/src/mesh/tnlGrid_impl.cpp
index 28a56bf44788c1a05e5e221f2ee1b1ddacd5aa47..fbf9ba34508088c78cc555c7d4fb4201758c54c4 100644
--- a/src/mesh/tnlGrid_impl.cpp
+++ b/src/mesh/tnlGrid_impl.cpp
@@ -21,32 +21,48 @@
 
 template class tnlGrid< 1, float,  tnlHost, int >;
 template class tnlGrid< 1, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlGrid< 1, float,  tnlHost, long int >;
 template class tnlGrid< 1, double, tnlHost, long int >;
+#endif
+
 template class tnlGrid< 2, float,  tnlHost, int >;
 template class tnlGrid< 2, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlGrid< 2, float,  tnlHost, long int >;
 template class tnlGrid< 2, double, tnlHost, long int >;
+#endif
+
 template class tnlGrid< 3, float,  tnlHost, int >;
 template class tnlGrid< 3, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlGrid< 3, float,  tnlHost, long int >;
 template class tnlGrid< 3, double, tnlHost, long int >;
+#endif
 
 #ifdef HAVE_CUDA
 #endif
 
 template class tnlGrid< 1, float,  tnlCuda, int >;
 template class tnlGrid< 1, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlGrid< 1, float,  tnlCuda, long int >;
 template class tnlGrid< 1, double, tnlCuda, long int >;
+#endif
+
 template class tnlGrid< 2, float,  tnlCuda, int >;
 template class tnlGrid< 2, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlGrid< 2, float,  tnlCuda, long int >;
 template class tnlGrid< 2, double, tnlCuda, long int >;
+#endif
+
 template class tnlGrid< 3, float,  tnlCuda, int >;
 template class tnlGrid< 3, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlGrid< 3, float,  tnlCuda, long int >;
 template class tnlGrid< 3, double, tnlCuda, long int >;
+#endif
 
 #endif
 
diff --git a/src/mesh/tnlTraverser_Grid2D_impl.h b/src/mesh/tnlTraverser_Grid2D_impl.h
index 5e778ec253fe4d71f99291c23bde7cff06dbbaf3..4d8efa2df82898134c560b3c9458f6d71b313dfe 100644
--- a/src/mesh/tnlTraverser_Grid2D_impl.h
+++ b/src/mesh/tnlTraverser_Grid2D_impl.h
@@ -167,6 +167,25 @@ processBoundaryEntities( const GridType& grid,
    /****
     * Traversing boundary vertices
     */
+   CoordinatesType coordinates;
+   const IndexType& xSize = grid.getDimensions().x();
+   const IndexType& ySize = grid.getDimensions().y();
+
+   for( coordinates.x() = 0; coordinates.x() <= xSize; coordinates.x() ++ )
+   {
+      coordinates.y() = 0;
+      EntitiesProcessor::processVertex( grid, userData, grid.getVertexIndex( coordinates ), coordinates );
+      coordinates.y() = ySize;
+      EntitiesProcessor::processVertex( grid, userData, grid.getVertexIndex( coordinates ), coordinates );
+   }
+   for( coordinates.y() = 1; coordinates.y() <= ySize; coordinates.y() ++ )
+   {
+      coordinates.x() = 0;
+      EntitiesProcessor::processVertex( grid, userData, grid.getVertexIndex( coordinates ), coordinates );
+      coordinates.x() = xSize;
+      EntitiesProcessor::processVertex( grid, userData, grid.getVertexIndex( coordinates ), coordinates );
+   }
+   
 }
 
 template< typename Real,
@@ -181,8 +200,20 @@ processInteriorEntities( const GridType& grid,
    /****
     * Traversing interior vertices
     */
-}
+   CoordinatesType coordinates;
+   const IndexType& xSize = grid.getDimensions().x();
+   const IndexType& ySize = grid.getDimensions().y();
 
+#ifdef HAVE_OPENMP
+//#pragma omp parallel for
+#endif
+   for( coordinates.y() = 1; coordinates.y() < ySize; coordinates.y() ++ )
+      for( coordinates.x() = 1; coordinates.x() < xSize; coordinates.x() ++ )
+      {
+         const IndexType index = grid.getVertexIndex( coordinates );
+         EntitiesProcessor::processVertex( grid, userData, index, coordinates );
+      }  
+}
 
 /***
  *
@@ -284,7 +315,7 @@ __global__ void tnlTraverserGrid2DBoundaryFaces( const tnlGrid< 2, Real, tnlCuda
    if( faceCoordinates.x() < grid->getDimensions().x() + nx &&
        faceCoordinates.y() < grid->getDimensions().y() + ny )
    {
-      if( grid->isBoundaryFace( faceCoordinates ) )
+      if( grid->template isBoundaryFace< nx, ny >( faceCoordinates ) )
       {
          //printf( "Processing boundary conditions at %d %d \n", cellCoordinates.x(), cellCoordinates.y() );
          EntitiesProcessor::processFace( *grid,
@@ -320,7 +351,7 @@ __global__ void tnlTraverserGrid2DInteriorFaces( const tnlGrid< 2, Real, tnlCuda
    if( faceCoordinates.x() < grid->getDimensions().x() + nx &&
        faceCoordinates.y() < grid->getDimensions().y() + ny )
    {
-      if( ! grid->isBoundaryFace( faceCoordinates ) )
+      if( ! grid->template isBoundaryFace< nx, ny >( faceCoordinates ) )
       {
          //printf( "Processing interior conditions at %d %d \n", cellCoordinates.x(), cellCoordinates.y() );
          EntitiesProcessor::processFace( *grid,
@@ -331,6 +362,71 @@ __global__ void tnlTraverserGrid2DInteriorFaces( const tnlGrid< 2, Real, tnlCuda
    }
 }
 
+template< typename Real,
+          typename Index,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void tnlTraverserGrid2DBoundaryVertices( const tnlGrid< 2, Real, tnlCuda, Index >* grid,
+                                                    UserData* userData,
+                                                    const Index gridXIdx,
+                                                    const Index gridYIdx )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef tnlGrid< 2, Real, tnlCuda, Index > GridType;
+   typedef typename GridType::CoordinatesType CoordinatesType;
+
+   const IndexType& xSize = grid->getDimensions().x();
+   const IndexType& ySize = grid->getDimensions().y();
+
+   CoordinatesType vertexCoordinates( ( gridXIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x,
+                                      ( gridYIdx * tnlCuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y );
+
+   if( vertexCoordinates.x() <= grid->getDimensions().x() &&
+       vertexCoordinates.y() <= grid->getDimensions().y() )
+   {
+      if( grid->isBoundaryVertex( vertexCoordinates ) )
+      {
+         EntitiesProcessor::processVertex( *grid,
+                                           *userData,
+                                           grid->getVertexIndex( vertexCoordinates ),
+                                           vertexCoordinates );
+      }
+   }
+}
+
+template< typename Real,
+          typename Index,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void tnlTraverserGrid2DInteriorVertices( const tnlGrid< 2, Real, tnlCuda, Index >* grid,
+                                                    UserData* userData,
+                                                    const Index gridXIdx,
+                                                    const Index gridYIdx )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef tnlGrid< 2, Real, tnlCuda, Index > GridType;
+   typedef typename GridType::CoordinatesType CoordinatesType;
+
+   const IndexType& xSize = grid->getDimensions().x();
+   const IndexType& ySize = grid->getDimensions().y();
+
+   CoordinatesType vertexCoordinates( ( gridXIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x,
+                                      ( gridYIdx * tnlCuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y );
+
+   if( vertexCoordinates.x() <= grid->getDimensions().x() &&
+       vertexCoordinates.y() <= grid->getDimensions().y() )
+   {
+      if( ! grid->isBoundaryVertex( vertexCoordinates ) )
+      {
+         EntitiesProcessor::processVertex( *grid,
+                                           *userData,
+                                           grid->getVertexIndex( vertexCoordinates ),
+                                           vertexCoordinates );
+      }
+   }
+}
 
 
 #endif
@@ -368,9 +464,9 @@ processBoundaryEntities( const GridType& grid,
                                          kernelUserData,
                                          gridXIdx,
                                          gridYIdx );
+         checkCudaDevice;
       }
-   cudaThreadSynchronize();
-   checkCudaDevice;
+   cudaThreadSynchronize();   
 #endif
 }
 
@@ -406,8 +502,8 @@ processInteriorEntities( const GridType& grid,
                                          kernelUserData,
                                          gridXIdx,
                                          gridYIdx );
+         checkCudaDevice;
       }
-   checkCudaDevice;
    tnlCuda::freeFromDevice( kernelGrid );
    tnlCuda::freeFromDevice( kernelUserData );
 #endif
@@ -450,9 +546,10 @@ processBoundaryEntities( const GridType& grid,
                                          kernelUserData,
                                          gridXIdx,
                                          gridYIdx );
+         checkCudaDevice;
       }
    cudaThreadSynchronize();
-   checkCudaDevice;
+   
 
    /****
     * < 0, 1 > faces
@@ -470,9 +567,10 @@ processBoundaryEntities( const GridType& grid,
                                          kernelUserData,
                                          gridXIdx,
                                          gridYIdx );
+         checkCudaDevice;
       }
    cudaThreadSynchronize();
-   checkCudaDevice;
+   
 #endif
 
 }
@@ -517,9 +615,10 @@ processInteriorEntities( const GridType& grid,
                                          kernelUserData,
                                          gridXIdx,
                                          gridYIdx );
+         checkCudaDevice;
       }
    cudaThreadSynchronize();
-   checkCudaDevice;
+   
 
    /****
     * < 0, 1 > faces
@@ -537,9 +636,9 @@ processInteriorEntities( const GridType& grid,
                                          kernelUserData,
                                          gridXIdx,
                                          gridYIdx );
+         checkCudaDevice;
       }
    cudaThreadSynchronize();
-   checkCudaDevice;
 #endif
 }
 
@@ -552,9 +651,33 @@ tnlTraverser< tnlGrid< 2, Real, tnlCuda, Index >, 0 >::
 processBoundaryEntities( const GridType& grid,
                          UserData& userData ) const
 {
+#ifdef HAVE_CUDA
    /****
-    * Boundary interior vertices
+    * Traversing boundary vertices    
     */
+   GridType* kernelGrid = tnlCuda::passToDevice( grid );
+   UserData* kernelUserData = tnlCuda::passToDevice( userData );
+
+   dim3 cudaBlockSize( 16, 16 );
+   dim3 cudaBlocks;
+   cudaBlocks.x = tnlCuda::getNumberOfBlocks( grid.getDimensions().x() + 1, cudaBlockSize.x );
+   cudaBlocks.y = tnlCuda::getNumberOfBlocks( grid.getDimensions().y() + 1, cudaBlockSize.y );
+   const IndexType cudaXGrids = tnlCuda::getNumberOfGrids( cudaBlocks.x );
+   const IndexType cudaYGrids = tnlCuda::getNumberOfGrids( cudaBlocks.y );
+
+   for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
+      for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ )
+      {
+         tnlTraverserGrid2DBoundaryVertices< Real, Index, UserData, EntitiesProcessor >
+                                           <<< cudaBlocks, cudaBlockSize >>>
+                                          ( kernelGrid,
+                                            kernelUserData,
+                                            gridXIdx,
+                                            gridYIdx );
+         checkCudaDevice;
+      }
+   cudaThreadSynchronize();   
+#endif
 }
 
 
@@ -567,9 +690,33 @@ tnlTraverser< tnlGrid< 2, Real, tnlCuda, Index >, 0 >::
 processInteriorEntities( const GridType& grid,
                          UserData& userData ) const
 {
+#ifdef HAVE_CUDA
    /****
-    * Traversing interior vertices
+    * Traversing interior vertices    
     */
+   GridType* kernelGrid = tnlCuda::passToDevice( grid );
+   UserData* kernelUserData = tnlCuda::passToDevice( userData );
+
+   dim3 cudaBlockSize( 16, 16 );
+   dim3 cudaBlocks;
+   cudaBlocks.x = tnlCuda::getNumberOfBlocks( grid.getDimensions().x() + 1, cudaBlockSize.x );
+   cudaBlocks.y = tnlCuda::getNumberOfBlocks( grid.getDimensions().y() + 1, cudaBlockSize.y );
+   const IndexType cudaXGrids = tnlCuda::getNumberOfGrids( cudaBlocks.x );
+   const IndexType cudaYGrids = tnlCuda::getNumberOfGrids( cudaBlocks.y );
+
+   for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
+      for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ )
+      {
+         tnlTraverserGrid2DInteriorVertices< Real, Index, UserData, EntitiesProcessor >
+                                           <<< cudaBlocks, cudaBlockSize >>>
+                                          ( kernelGrid,
+                                            kernelUserData,
+                                            gridXIdx,
+                                            gridYIdx );
+         checkCudaDevice;
+      }
+   cudaThreadSynchronize();   
+#endif
 }
 
 
diff --git a/src/mesh/tnlTraverser_Grid3D_impl.h b/src/mesh/tnlTraverser_Grid3D_impl.h
index 9cd074ca5fbad055e2e0f5c6420df935b767835b..739e12cd5e58f3e459911f118af61fb421e76cca 100644
--- a/src/mesh/tnlTraverser_Grid3D_impl.h
+++ b/src/mesh/tnlTraverser_Grid3D_impl.h
@@ -161,6 +161,37 @@ processBoundaryEntities( const GridType& grid,
    /****
     * Traversing boundary vertices
     */
+   CoordinatesType coordinates;
+   const IndexType& xSize = grid.getDimensions().x();
+   const IndexType& ySize = grid.getDimensions().y();
+   const IndexType& zSize = grid.getDimensions().z();
+
+   for( coordinates.y() = 0; coordinates.y() <= ySize; coordinates.y() ++ )
+      for( coordinates.x() = 0; coordinates.x() <= xSize; coordinates.x() ++ )
+      {
+         coordinates.z() = 0;
+         EntitiesProcessor::processVertex( grid, userData, grid.getVertexIndex( coordinates ), coordinates );
+         coordinates.z() = zSize;
+         EntitiesProcessor::processVertex( grid, userData, grid.getVertexIndex( coordinates ), coordinates );
+      }
+
+   for( coordinates.z() = 0; coordinates.z() <= zSize; coordinates.z() ++ )
+      for( coordinates.x() = 0; coordinates.x() <= xSize; coordinates.x() ++ )
+      {
+         coordinates.y() = 0;
+         EntitiesProcessor::processVertex( grid, userData, grid.getVertexIndex( coordinates ), coordinates );
+         coordinates.y() = ySize;
+         EntitiesProcessor::processVertex( grid, userData, grid.getVertexIndex( coordinates ), coordinates );
+      }
+
+   for( coordinates.z() = 0; coordinates.z() <= zSize; coordinates.z() ++ )
+      for( coordinates.y() = 0; coordinates.y() <= ySize; coordinates.y() ++ )
+      {
+         coordinates.x() = 0;
+         EntitiesProcessor::processVertex( grid, userData, grid.getVertexIndex( coordinates ), coordinates );
+         coordinates.x() = xSize;
+         EntitiesProcessor::processVertex( grid, userData, grid.getVertexIndex( coordinates ), coordinates );
+      }
 }
 
 template< typename Real,
@@ -175,6 +206,21 @@ processInteriorEntities( const GridType& grid,
    /****
     * Traversing interior vertices
     */
+   CoordinatesType coordinates;
+   const IndexType& xSize = grid.getDimensions().x();
+   const IndexType& ySize = grid.getDimensions().y();
+   const IndexType& zSize = grid.getDimensions().z();
+
+#ifdef HAVE_OPENMP
+//#pragma omp parallel for
+#endif
+   for( coordinates.z() = 1; coordinates.z() < zSize; coordinates.z() ++ )
+      for( coordinates.y() = 1; coordinates.y() < ySize; coordinates.y() ++ )
+         for( coordinates.x() = 1; coordinates.x() < xSize; coordinates.x() ++ )
+         {
+            const IndexType index = grid.getVertexIndex( coordinates );
+            EntitiesProcessor::processVertex( grid, userData, index, coordinates );
+         }
 }
 
 
@@ -260,6 +306,81 @@ __global__ void tnlTraverserGrid3DInteriorCells( const tnlGrid< 3, Real, tnlCuda
    }
 }
 
+template< typename Real,
+          typename Index,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void tnlTraverserGrid3DBoundaryVertices( const tnlGrid< 3, Real, tnlCuda, Index >* grid,
+                                                    UserData* userData,
+                                                    const Index gridXIdx,
+                                                    const Index gridYIdx,
+                                                    const Index gridZIdx )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef tnlGrid< 3, Real, tnlCuda, Index > GridType;
+   typedef typename GridType::CoordinatesType CoordinatesType;
+
+   const IndexType& xSize = grid->getDimensions().x();
+   const IndexType& ySize = grid->getDimensions().y();
+   const IndexType& zSize = grid->getDimensions().z();
+
+   CoordinatesType vertexCoordinates( ( gridXIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x,
+                                      ( gridYIdx * tnlCuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y,
+                                      ( gridZIdx * tnlCuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z );
+
+   if( vertexCoordinates.x() < grid->getDimensions().x() &&
+       vertexCoordinates.y() < grid->getDimensions().y() &&
+       vertexCoordinates.z() < grid->getDimensions().z() )
+   {
+      if( grid->isBoundaryVertex( vertexCoordinates ) )
+      {
+         EntitiesProcessor::processVertex( *grid,
+                                           *userData,
+                                           grid->getVertexIndex( vertexCoordinates ),
+                                           vertexCoordinates );
+      }
+   }
+}
+
+template< typename Real,
+          typename Index,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void tnlTraverserGrid3DInteriorVertices( const tnlGrid< 3, Real, tnlCuda, Index >* grid,
+                                                    UserData* userData,
+                                                    const Index gridXIdx,
+                                                    const Index gridYIdx,
+                                                    const Index gridZIdx )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef tnlGrid< 3, Real, tnlCuda, Index > GridType;
+   typedef typename GridType::CoordinatesType CoordinatesType;
+
+   const IndexType& xSize = grid->getDimensions().x();
+   const IndexType& ySize = grid->getDimensions().y();
+   const IndexType& zSize = grid->getDimensions().z();
+
+   CoordinatesType vertexCoordinates( ( gridXIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x,
+                                      ( gridYIdx * tnlCuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y,
+                                      ( gridZIdx * tnlCuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z );
+
+   if( vertexCoordinates.x() < grid->getDimensions().x() &&
+       vertexCoordinates.y() < grid->getDimensions().y() &&
+       vertexCoordinates.z() < grid->getDimensions().z() )
+   {
+      if( ! grid->isBoundaryVertex( vertexCoordinates ) )
+      {
+         EntitiesProcessor::processVertex( *grid,
+                                           *userData,
+                                           grid->getVertexIndex( vertexCoordinates ),
+                                           vertexCoordinates );
+      }
+   }
+}
+
+
 #endif
 
 template< typename Real,
@@ -419,6 +540,35 @@ processBoundaryEntities( const GridType& grid,
    /****
     * Traversing boundary vertices
     */
+#ifdef HAVE_CUDA
+   GridType* kernelGrid = tnlCuda::passToDevice( grid );
+   UserData* kernelUserData = tnlCuda::passToDevice( userData );
+
+   dim3 cudaBlockSize( 8, 8, 4 );
+   dim3 cudaBlocks;
+   cudaBlocks.x = tnlCuda::getNumberOfBlocks( grid.getDimensions().x() + 1, cudaBlockSize.x );
+   cudaBlocks.y = tnlCuda::getNumberOfBlocks( grid.getDimensions().y() + 1, cudaBlockSize.y );
+   cudaBlocks.z = tnlCuda::getNumberOfBlocks( grid.getDimensions().z() + 1, cudaBlockSize.z );
+   const IndexType cudaXGrids = tnlCuda::getNumberOfGrids( cudaBlocks.x );
+   const IndexType cudaYGrids = tnlCuda::getNumberOfGrids( cudaBlocks.y );
+   const IndexType cudaZGrids = tnlCuda::getNumberOfGrids( cudaBlocks.z );
+
+   for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
+      for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ )
+         for( IndexType gridZIdx = 0; gridZIdx < cudaZGrids; gridZIdx ++ )
+         {
+            tnlTraverserGrid3DBoundaryVertices< Real, Index, UserData, EntitiesProcessor >
+                                              <<< cudaBlocks, cudaBlockSize >>>
+                                             ( kernelGrid,
+                                               kernelUserData,
+                                               gridXIdx,
+                                               gridYIdx,
+                                               gridZIdx );
+         }
+   cudaThreadSynchronize();
+   checkCudaDevice;
+#endif
+   
 }
 
 template< typename Real,
@@ -433,6 +583,35 @@ processInteriorEntities( const GridType& grid,
    /****
     * Traversing interior vertices
     */
+#ifdef HAVE_CUDA
+   GridType* kernelGrid = tnlCuda::passToDevice( grid );
+   UserData* kernelUserData = tnlCuda::passToDevice( userData );
+
+   dim3 cudaBlockSize( 8, 8, 4 );
+   dim3 cudaBlocks;
+   cudaBlocks.x = tnlCuda::getNumberOfBlocks( grid.getDimensions().x() + 1, cudaBlockSize.x );
+   cudaBlocks.y = tnlCuda::getNumberOfBlocks( grid.getDimensions().y() + 1, cudaBlockSize.y );
+   cudaBlocks.z = tnlCuda::getNumberOfBlocks( grid.getDimensions().z() + 1, cudaBlockSize.z );
+   const IndexType cudaXGrids = tnlCuda::getNumberOfGrids( cudaBlocks.x );
+   const IndexType cudaYGrids = tnlCuda::getNumberOfGrids( cudaBlocks.y );
+   const IndexType cudaZGrids = tnlCuda::getNumberOfGrids( cudaBlocks.z );
+
+   for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
+      for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ )
+         for( IndexType gridZIdx = 0; gridZIdx < cudaZGrids; gridZIdx ++ )
+         {
+            tnlTraverserGrid3DInteriorVertices< Real, Index, UserData, EntitiesProcessor >
+                                              <<< cudaBlocks, cudaBlockSize >>>
+                                             ( kernelGrid,
+                                               kernelUserData,
+                                               gridXIdx,
+                                               gridYIdx,
+                                               gridZIdx );
+         }
+   cudaThreadSynchronize();
+   checkCudaDevice;
+#endif
+   
 }
 
 
diff --git a/src/operators/CMakeLists.txt b/src/operators/CMakeLists.txt
index 00448fe209f6d29bd4dbdb22fb4ed9267b1c2fa7..440ee4020d07fb4c2dd9d2b5777c2556a26c8525 100755
--- a/src/operators/CMakeLists.txt
+++ b/src/operators/CMakeLists.txt
@@ -14,7 +14,9 @@ SET( headers tnlFiniteDifferences.h
              tnlNeumannBoundaryConditions_impl.h
              tnlAnalyticNeumannBoundaryConditions.h
              tnlAnalyticNeumannBoundaryConditions_impl.h
-             tnlExactOperatorEvaluator.h )
+             tnlExactOperatorEvaluator.h
+             tnlOperatorEnumerator.h
+             tnlOperatorEnumerator_impl.h )
              
 SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/operators )
 
diff --git a/src/operators/diffusion/tnlExactLinearDiffusion.h b/src/operators/diffusion/tnlExactLinearDiffusion.h
index 17250df6356a0fa03932fade2a9b694f3077f276..28018f0684c68958eccfb10ca897ad277ac20536 100644
--- a/src/operators/diffusion/tnlExactLinearDiffusion.h
+++ b/src/operators/diffusion/tnlExactLinearDiffusion.h
@@ -18,7 +18,7 @@
 #ifndef TNLEXACTLINEARDIFFUSION_H_
 #define TNLEXACTLINEARDIFFUSION_H_
 
-#include <functions/tnlFunctionType.h>
+#include <functors/tnlFunctionType.h>
 
 template< int Dimensions >
 class tnlExactLinearDiffusion
@@ -38,9 +38,7 @@ class tnlExactLinearDiffusion< 1 >
 #else   
       template< typename Function, typename Vertex, typename Real = typename Vertex::RealType >
 #endif      
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       static Real getValue( const Function& function,
                             const Vertex& v,
                             const Real& time = 0.0 );
@@ -60,9 +58,7 @@ class tnlExactLinearDiffusion< 2 >
 #else   
       template< typename Function, typename Vertex, typename Real = typename Vertex::RealType >
 #endif
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif      
+      __cuda_callable__
       static Real getValue( const Function& function,
                             const Vertex& v,
                             const Real& time = 0.0 );
@@ -82,9 +78,7 @@ class tnlExactLinearDiffusion< 3 >
 #else   
       template< typename Function, typename Vertex, typename Real = typename Vertex::RealType >
 #endif
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       static Real getValue( const Function& function,
                             const Vertex& v,
                             const Real& time = 0.0 );
diff --git a/src/operators/diffusion/tnlExactLinearDiffusion_impl.h b/src/operators/diffusion/tnlExactLinearDiffusion_impl.h
index 19d3dc7663aeb31efe1ee80656dc2ab096526a72..33bd68192b922c91c7b42f7be98028c644b78b9e 100644
--- a/src/operators/diffusion/tnlExactLinearDiffusion_impl.h
+++ b/src/operators/diffusion/tnlExactLinearDiffusion_impl.h
@@ -26,9 +26,7 @@ getType()
 }
 
 template< typename Function, typename Vertex, typename Real >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlExactLinearDiffusion< 1 >::
 getValue( const Function& function,
@@ -46,9 +44,7 @@ getType()
 }
 
 template< typename Function, typename Vertex, typename Real >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlExactLinearDiffusion< 2 >::
 getValue( const Function& function,
@@ -67,9 +63,7 @@ getType()
 }
 
 template< typename Function, typename Vertex, typename Real >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlExactLinearDiffusion< 3 >::
 getValue( const Function& function,
diff --git a/src/operators/diffusion/tnlLinearDiffusion.h b/src/operators/diffusion/tnlLinearDiffusion.h
index 35eebca181a439adf938f69974ca3ad0f77f9d63..5702538a80e59b4101429e2ad2e4d1c8fbe9d47e 100644
--- a/src/operators/diffusion/tnlLinearDiffusion.h
+++ b/src/operators/diffusion/tnlLinearDiffusion.h
@@ -1,3 +1,20 @@
+/***************************************************************************
+                          tnlLinearDiffusion.h  -  description
+                             -------------------
+    begin                : Aug 8, 2014
+    copyright            : (C) 2014 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
 #ifndef TNLLINEARDIFFUSION_H
 #define	TNLLINEARDIFFUSION_H
 
@@ -20,37 +37,32 @@ template< typename MeshReal,
           typename Index >
 class tnlLinearDiffusion< tnlGrid< 1,MeshReal, Device, MeshIndex >, Real, Index >
 {
-   public: 
-   
-   typedef tnlGrid< 1, MeshReal, Device, MeshIndex > MeshType;
-   typedef typename MeshType::CoordinatesType CoordinatesType;
-   typedef Real RealType;
-   typedef Device DeviceType;
-   typedef Index IndexType;
-
-   static tnlString getType();
+   public:    
    
-   template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Real getValue( const MeshType& mesh,
-                  const IndexType cellIndex,
-                  const CoordinatesType& coordinates,
-                  const Vector& u,
-                  const RealType& time ) const;
-
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Index getLinearSystemRowLength( const MeshType& mesh,
-                                   const IndexType& index,
-                                   const CoordinatesType& coordinates ) const;
-
-   template< typename Vector, typename MatrixRow >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      typedef tnlGrid< 1, MeshReal, Device, MeshIndex > MeshType;
+      typedef typename MeshType::CoordinatesType CoordinatesType;
+      typedef Real RealType;
+      typedef Device DeviceType;
+      typedef Index IndexType;
+      enum { Dimensions = MeshType::Dimensions };
+
+      static tnlString getType();
+
+      template< typename Vector >
+      __cuda_callable__
+      Real getValue( const MeshType& mesh,
+                     const IndexType cellIndex,
+                     const CoordinatesType& coordinates,
+                     const Vector& u,
+                     const RealType& time ) const;
+
+      __cuda_callable__
+      Index getLinearSystemRowLength( const MeshType& mesh,
+                                      const IndexType& index,
+                                      const CoordinatesType& coordinates ) const;
+
+      template< typename Vector, typename MatrixRow >
+      __cuda_callable__
       void updateLinearSystem( const RealType& time,
                                const RealType& tau,
                                const MeshType& mesh,
@@ -72,35 +84,30 @@ class tnlLinearDiffusion< tnlGrid< 2, MeshReal, Device, MeshIndex >, Real, Index
 {
    public: 
    
-   typedef tnlGrid< 2, MeshReal, Device, MeshIndex > MeshType;
-   typedef typename MeshType::CoordinatesType CoordinatesType;
-   typedef Real RealType;
-   typedef Device DeviceType;
-   typedef Index IndexType;
-
-   static tnlString getType();
-   
-   template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Real getValue( const MeshType& mesh,
-                  const IndexType cellIndex,
-                  const CoordinatesType& coordinates,
-                  const Vector& u,
-                  const Real& time ) const;
-
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Index getLinearSystemRowLength( const MeshType& mesh,
-                                   const IndexType& index,
-                                   const CoordinatesType& coordinates ) const;
-
-   template< typename Vector, typename MatrixRow >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      typedef tnlGrid< 2, MeshReal, Device, MeshIndex > MeshType;
+      typedef typename MeshType::CoordinatesType CoordinatesType;
+      typedef Real RealType;
+      typedef Device DeviceType;
+      typedef Index IndexType;
+      enum { Dimensions = MeshType::Dimensions };
+
+      static tnlString getType();
+
+      template< typename Vector >
+      __cuda_callable__
+      Real getValue( const MeshType& mesh,
+                     const IndexType cellIndex,
+                     const CoordinatesType& coordinates,
+                     const Vector& u,
+                     const Real& time ) const;
+
+      __cuda_callable__
+      Index getLinearSystemRowLength( const MeshType& mesh,
+                                      const IndexType& index,
+                                      const CoordinatesType& coordinates ) const;
+
+      template< typename Vector, typename MatrixRow >
+      __cuda_callable__
       void updateLinearSystem( const RealType& time,
                                const RealType& tau,
                                const MeshType& mesh,
@@ -121,35 +128,30 @@ class tnlLinearDiffusion< tnlGrid< 3, MeshReal, Device, MeshIndex >, Real, Index
 {
    public: 
    
-   typedef tnlGrid< 3, MeshReal, Device, MeshIndex > MeshType;
-   typedef typename MeshType::CoordinatesType CoordinatesType;
-   typedef Real RealType;
-   typedef Device DeviceType;
-   typedef Index IndexType;
-
-   static tnlString getType();
-   
-   template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Real getValue( const MeshType& mesh,
-                  const IndexType cellIndex,
-                  const CoordinatesType& coordinates,
-                  const Vector& u,
-                  const Real& time ) const;
-
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Index getLinearSystemRowLength( const MeshType& mesh,
-                                   const IndexType& index,
-                                   const CoordinatesType& coordinates ) const;
-
-   template< typename Vector, typename MatrixRow >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      typedef tnlGrid< 3, MeshReal, Device, MeshIndex > MeshType;
+      typedef typename MeshType::CoordinatesType CoordinatesType;
+      typedef Real RealType;
+      typedef Device DeviceType;
+      typedef Index IndexType;
+      enum { Dimensions = MeshType::Dimensions };
+
+      static tnlString getType();
+
+      template< typename Vector >
+      __cuda_callable__
+      Real getValue( const MeshType& mesh,
+                     const IndexType cellIndex,
+                     const CoordinatesType& coordinates,
+                     const Vector& u,
+                     const Real& time ) const;
+
+      __cuda_callable__
+      Index getLinearSystemRowLength( const MeshType& mesh,
+                                      const IndexType& index,
+                                      const CoordinatesType& coordinates ) const;
+
+      template< typename Vector, typename MatrixRow >
+      __cuda_callable__
       void updateLinearSystem( const RealType& time,
                                const RealType& tau,
                                const MeshType& mesh,
diff --git a/src/operators/diffusion/tnlLinearDiffusion_impl.h b/src/operators/diffusion/tnlLinearDiffusion_impl.h
index 87b02fe2f743a344370433caea26b8dca4baab7f..07abc05d24a648cc1fece9151c606cfc24b24052 100644
--- a/src/operators/diffusion/tnlLinearDiffusion_impl.h
+++ b/src/operators/diffusion/tnlLinearDiffusion_impl.h
@@ -1,3 +1,19 @@
+/***************************************************************************
+                          tnlLinearDiffusion_impl.h  -  description
+                             -------------------
+    begin                : Aug 8, 2014
+    copyright            : (C) 2014 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
 
 #ifndef TNLLINEARDIFFUSION_IMP_H
 #define	TNLLINEARDIFFUSION_IMP_H
@@ -26,9 +42,7 @@ template< typename MeshReal,
           typename Real,
           typename Index >
 template< typename Vector >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlLinearDiffusion< tnlGrid< 1, MeshReal, Device, MeshIndex >, Real, Index >::
 getValue( const MeshType& mesh,
@@ -37,7 +51,7 @@ getValue( const MeshType& mesh,
           const Vector& u,
           const Real& time ) const
 {
-   return ( u[ mesh.template getCellNextToCell< - 1 >( cellIndex ) ]
+   return ( u[ mesh.template getCellNextToCell< -1 >( cellIndex ) ]
             - 2.0 * u[ cellIndex ]
             + u[ mesh.template getCellNextToCell< 1 >( cellIndex ) ] ) * mesh.getHxSquareInverse();
 }
@@ -47,9 +61,7 @@ template< typename MeshReal,
           typename MeshIndex,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index
 tnlLinearDiffusion< tnlGrid< 1, MeshReal, Device, MeshIndex >, Real, Index >::
 getLinearSystemRowLength( const MeshType& mesh,
@@ -65,9 +77,7 @@ template< typename MeshReal,
           typename Real,
           typename Index >
    template< typename Vector, typename MatrixRow >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 void
 tnlLinearDiffusion< tnlGrid< 1, MeshReal, Device, MeshIndex >, Real, Index >::
 updateLinearSystem( const RealType& time,
@@ -107,9 +117,7 @@ template< typename MeshReal,
           typename MeshIndex,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index
 tnlLinearDiffusion< tnlGrid< 2, MeshReal, Device, MeshIndex >, Real, Index >::
 getLinearSystemRowLength( const MeshType& mesh,
@@ -126,9 +134,7 @@ template< typename MeshReal,
           typename Real,
           typename Index >
 template< typename Vector >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlLinearDiffusion< tnlGrid< 2, MeshReal, Device, MeshIndex >, Real, Index >::
 getValue( const MeshType& mesh,
@@ -151,9 +157,7 @@ template< typename MeshReal,
           typename Real,
           typename Index >
    template< typename Vector, typename MatrixRow >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 void
 tnlLinearDiffusion< tnlGrid< 2, MeshReal, Device, MeshIndex >, Real, Index >::
 updateLinearSystem( const RealType& time,
@@ -196,9 +200,7 @@ template< typename MeshReal,
           typename Real,
           typename Index >
 template< typename Vector >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlLinearDiffusion< tnlGrid< 3, MeshReal, Device, MeshIndex >, Real, Index >::
 getValue( const MeshType& mesh,
@@ -223,9 +225,7 @@ template< typename MeshReal,
           typename MeshIndex,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index
 tnlLinearDiffusion< tnlGrid< 3, MeshReal, Device, MeshIndex >, Real, Index >::
 getLinearSystemRowLength( const MeshType& mesh,
@@ -241,9 +241,7 @@ template< typename MeshReal,
           typename Real,
           typename Index >
    template< typename Vector, typename MatrixRow >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 void
 tnlLinearDiffusion< tnlGrid< 3, MeshReal, Device, MeshIndex >, Real, Index >::
 updateLinearSystem( const RealType& time,
@@ -267,6 +265,4 @@ updateLinearSystem( const RealType& time,
    matrixRow.setElement( 6, mesh.template getCellNextToCell< 0, 0, 1 >( index ),   -lambdaZ );
 }
 
-
-
 #endif	/* TNLLINEARDIFFUSION_IMP_H */
diff --git a/src/operators/tnlAnalyticDirichletBoundaryConditions.h b/src/operators/tnlAnalyticDirichletBoundaryConditions.h
index 3c0855c48bec35ab2e9ea35310f99005a6e42341..25aa0d383ea569739dd64d8315e1046f58b9bcc2 100644
--- a/src/operators/tnlAnalyticDirichletBoundaryConditions.h
+++ b/src/operators/tnlAnalyticDirichletBoundaryConditions.h
@@ -21,7 +21,7 @@
 
 #include <core/vectors/tnlStaticVector.h>
 #include <config/tnlParameterContainer.h>
-#include <functions/tnlConstantFunction.h>
+#include <functors/tnlConstantFunction.h>
 #include <core/vectors/tnlSharedVector.h>
 
 template< typename Mesh,
@@ -45,50 +45,44 @@ class tnlAnalyticDirichletBoundaryConditions< tnlGrid< Dimensions, MeshReal, Dev
 {
    public:
    
-   typedef tnlGrid< Dimensions, MeshReal, Device, MeshIndex > MeshType;
-   typedef Real RealType;
-   typedef Device DeviceType;
-   typedef Index IndexType;
-   typedef tnlAnalyticDirichletBoundaryConditions< MeshType, Function, Real, Index > ThisType;
+      typedef tnlGrid< Dimensions, MeshReal, Device, MeshIndex > MeshType;
+      typedef Real RealType;
+      typedef Device DeviceType;
+      typedef Index IndexType;
+      typedef tnlAnalyticDirichletBoundaryConditions< MeshType, Function, Real, Index > ThisType;
 
-   typedef tnlSharedVector< RealType, DeviceType, IndexType > SharedVector;
-   typedef tnlVector< RealType, DeviceType, IndexType> DofVectorType;
-   typedef tnlStaticVector< Dimensions, RealType > VertexType;
-   typedef typename MeshType::CoordinatesType CoordinatesType;
+      typedef tnlSharedVector< RealType, DeviceType, IndexType > SharedVector;
+      typedef tnlVector< RealType, DeviceType, IndexType> DofVectorType;
+      typedef tnlStaticVector< Dimensions, RealType > VertexType;
+      typedef typename MeshType::CoordinatesType CoordinatesType;
 
-   static void configSetup( tnlConfigDescription& config,
-                            const tnlString& prefix = "" );
-            
-   bool setup( const tnlParameterContainer& parameters,
-               const tnlString& prefix = "" );
+      static void configSetup( tnlConfigDescription& config,
+                               const tnlString& prefix = "" );
 
-   void setFunction( const Function& function );
+      bool setup( const tnlParameterContainer& parameters,
+                  const tnlString& prefix = "" );
 
-   Function& getFunction();
+      void setFunction( const Function& function );
 
-   const Function& getFunction() const;
+      Function& getFunction();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   void setBoundaryConditions( const RealType& time,
-                               const MeshType& mesh,
-                               const IndexType index,
-                               const CoordinatesType& coordinates,
-                               DofVectorType& u,
-                               DofVectorType& fu ) const;
-
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Index getLinearSystemRowLength( const MeshType& mesh,
-                                   const IndexType& index,
-                                   const CoordinatesType& coordinates ) const;
-
-   template< typename MatrixRow >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      const Function& getFunction() const;
+
+      __cuda_callable__
+      void setBoundaryConditions( const RealType& time,
+                                  const MeshType& mesh,
+                                  const IndexType index,
+                                  const CoordinatesType& coordinates,
+                                  DofVectorType& u,
+                                  DofVectorType& fu ) const;
+
+      __cuda_callable__
+      Index getLinearSystemRowLength( const MeshType& mesh,
+                                      const IndexType& index,
+                                      const CoordinatesType& coordinates ) const;
+
+      template< typename MatrixRow >
+      __cuda_callable__
       void updateLinearSystem( const RealType& time,
                                const MeshType& mesh,
                                const IndexType& index,
@@ -99,7 +93,7 @@ class tnlAnalyticDirichletBoundaryConditions< tnlGrid< Dimensions, MeshReal, Dev
 
    protected:
 
-   Function function;
+      Function function;
 };
 
 template< typename Mesh,
diff --git a/src/operators/tnlAnalyticDirichletBoundaryConditions_impl.h b/src/operators/tnlAnalyticDirichletBoundaryConditions_impl.h
index 2a67471b80ad60574b7d09243ea7a8834f2937dc..f744de07d6578db830d3574cb75a98867c7ddd40 100644
--- a/src/operators/tnlAnalyticDirichletBoundaryConditions_impl.h
+++ b/src/operators/tnlAnalyticDirichletBoundaryConditions_impl.h
@@ -97,9 +97,7 @@ template< int Dimensions,
           typename Function,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlAnalyticDirichletBoundaryConditions< tnlGrid< Dimensions, MeshReal, Device, MeshIndex >, Function, Real, Index >::
 setBoundaryConditions( const RealType& time,
@@ -120,9 +118,7 @@ template< int Dimensions,
           typename Function,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index
 tnlAnalyticDirichletBoundaryConditions< tnlGrid< Dimensions, MeshReal, Device, MeshIndex >, Function, Real, Index >::
 getLinearSystemRowLength( const MeshType& mesh,
@@ -140,9 +136,7 @@ template< int Dimensions,
           typename Real,
           typename Index >
    template< typename MatrixRow >          
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 void
 tnlAnalyticDirichletBoundaryConditions< tnlGrid< Dimensions, MeshReal, Device, MeshIndex >, Function, Real, Index >::
 updateLinearSystem( const RealType& time,
diff --git a/src/operators/tnlAnalyticNeumannBoundaryConditions.h b/src/operators/tnlAnalyticNeumannBoundaryConditions.h
index 7a90a3763d658cbbeb51facb873aa24dc2e9bb28..cdee63045408aa4b91aa9f360203ea31ae18e7db 100644
--- a/src/operators/tnlAnalyticNeumannBoundaryConditions.h
+++ b/src/operators/tnlAnalyticNeumannBoundaryConditions.h
@@ -79,9 +79,7 @@ class tnlAnalyticNeumannBoundaryConditions< tnlGrid< 1, MeshReal, Device, MeshIn
    typedef typename MeshType::CoordinatesType CoordinatesType;
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void setBoundaryConditions( const RealType& time,
                                const MeshType& mesh,
                                const IndexType index,
@@ -89,17 +87,13 @@ class tnlAnalyticNeumannBoundaryConditions< tnlGrid< 1, MeshReal, Device, MeshIn
                                DofVectorType& u,
                                DofVectorType& fu ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getLinearSystemRowLength( const MeshType& mesh,
                                    const IndexType& index,
                                    const CoordinatesType& coordinates ) const;
 
    template< typename MatrixRow >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
       void updateLinearSystem( const RealType& time,
                                const MeshType& mesh,
                                const IndexType& index,
@@ -135,9 +129,7 @@ class tnlAnalyticNeumannBoundaryConditions< tnlGrid< 2, MeshReal, Device, MeshIn
    typedef tnlStaticVector< 2, RealType > VertexType;
    typedef typename MeshType::CoordinatesType CoordinatesType;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__  
    void setBoundaryConditions( const RealType& time,
                                const MeshType& mesh,
                                const IndexType index,
@@ -145,17 +137,13 @@ class tnlAnalyticNeumannBoundaryConditions< tnlGrid< 2, MeshReal, Device, MeshIn
                                DofVectorType& u,
                                DofVectorType& fu ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getLinearSystemRowLength( const MeshType& mesh,
                                    const IndexType& index,
                                    const CoordinatesType& coordinates ) const;
 
    template< typename MatrixRow >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
       void updateLinearSystem( const RealType& time,
                                const MeshType& mesh,
                                const IndexType& index,
@@ -191,9 +179,7 @@ class tnlAnalyticNeumannBoundaryConditions< tnlGrid< 3, MeshReal, Device, MeshIn
    typedef typename MeshType::CoordinatesType CoordinatesType;
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void setBoundaryConditions( const RealType& time,
                                const MeshType& mesh,
                                const IndexType index,
@@ -201,17 +187,13 @@ class tnlAnalyticNeumannBoundaryConditions< tnlGrid< 3, MeshReal, Device, MeshIn
                                DofVectorType& u,
                                DofVectorType& fu ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getLinearSystemRowLength( const MeshType& mesh,
                                    const IndexType& index,
                                    const CoordinatesType& coordinates ) const;
 
    template< typename MatrixRow >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
       void updateLinearSystem( const RealType& time,
                                const MeshType& mesh,
                                const IndexType& index,
diff --git a/src/operators/tnlAnalyticNeumannBoundaryConditions_impl.h b/src/operators/tnlAnalyticNeumannBoundaryConditions_impl.h
index a2e47cbe14b8f11a51719250b3528867962d4bfd..42f15cf38f84a37e1fc8e5dcda505c8b45cfa725 100644
--- a/src/operators/tnlAnalyticNeumannBoundaryConditions_impl.h
+++ b/src/operators/tnlAnalyticNeumannBoundaryConditions_impl.h
@@ -72,9 +72,7 @@ template< typename MeshReal,
           typename Function,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlAnalyticNeumannBoundaryConditions< tnlGrid< 1, MeshReal, Device, MeshIndex >, Function, Real, Index >::
 setBoundaryConditions( const RealType& time,
@@ -98,9 +96,7 @@ template< typename MeshReal,
           typename Function,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index
 tnlAnalyticNeumannBoundaryConditions< tnlGrid< 1, MeshReal, Device, MeshIndex >, Function, Real, Index >::
 getLinearSystemRowLength( const MeshType& mesh,
@@ -117,9 +113,7 @@ template< typename MeshReal,
           typename Real,
           typename Index >
    template< typename MatrixRow >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 void
 tnlAnalyticNeumannBoundaryConditions< tnlGrid< 1, MeshReal, Device, MeshIndex >, Function, Real, Index >::
 updateLinearSystem( const RealType& time,
@@ -155,9 +149,7 @@ template< typename MeshReal,
           typename Function,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlAnalyticNeumannBoundaryConditions< tnlGrid< 2, MeshReal, Device, MeshIndex >, Function, Real, Index >::
 setBoundaryConditions( const RealType& time,
@@ -197,9 +189,7 @@ template< typename MeshReal,
           typename Function,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index
 tnlAnalyticNeumannBoundaryConditions< tnlGrid< 2, MeshReal, Device, MeshIndex >, Function, Real, Index >::
 getLinearSystemRowLength( const MeshType& mesh,
@@ -216,9 +206,7 @@ template< typename MeshReal,
           typename Real,
           typename Index >
    template< typename MatrixRow >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 void
 tnlAnalyticNeumannBoundaryConditions< tnlGrid< 2, MeshReal, Device, MeshIndex >, Function, Real, Index >::
 updateLinearSystem( const RealType& time,
@@ -265,9 +253,7 @@ template< typename MeshReal,
           typename Function,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlAnalyticNeumannBoundaryConditions< tnlGrid< 3, MeshReal, Device, MeshIndex >, Function, Real, Index >::
 setBoundaryConditions( const RealType& time,
@@ -318,9 +304,7 @@ template< typename MeshReal,
           typename Function,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index
 tnlAnalyticNeumannBoundaryConditions< tnlGrid< 3, MeshReal, Device, MeshIndex >, Function, Real, Index >::
 getLinearSystemRowLength( const MeshType& mesh,
@@ -337,9 +321,7 @@ template< typename MeshReal,
           typename Real,
           typename Index >
    template< typename MatrixRow >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 void
 tnlAnalyticNeumannBoundaryConditions< tnlGrid< 3, MeshReal, Device, MeshIndex >, Function, Real, Index >::
 updateLinearSystem( const RealType& time,
diff --git a/src/operators/tnlDirichletBoundaryConditions.h b/src/operators/tnlDirichletBoundaryConditions.h
index ae300ee8762d2c598ed7c4393dcec57cfd893e42..8ece2bddfa0701d74dedf24a9aa4a65a44a632e3 100644
--- a/src/operators/tnlDirichletBoundaryConditions.h
+++ b/src/operators/tnlDirichletBoundaryConditions.h
@@ -59,9 +59,7 @@ class tnlDirichletBoundaryConditions< tnlGrid< Dimensions, MeshReal, Device, Mes
 
    const Vector& getVector() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void setBoundaryConditions( const RealType& time,
                                const MeshType& mesh,
                                const IndexType index,
@@ -69,17 +67,13 @@ class tnlDirichletBoundaryConditions< tnlGrid< Dimensions, MeshReal, Device, Mes
                                DofVectorType& u,
                                DofVectorType& fu ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getLinearSystemRowLength( const MeshType& mesh,
                                    const IndexType& index,
                                    const CoordinatesType& coordinates ) const;
 
    template< typename MatrixRow >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
       void updateLinearSystem( const RealType& time,
                                const MeshType& mesh,
                                const IndexType& index,
diff --git a/src/operators/tnlDirichletBoundaryConditions_impl.h b/src/operators/tnlDirichletBoundaryConditions_impl.h
index fd910a0ba58841654a61e7a582d6b02c025329f5..1fc9b245a8bcb9dd3957eed488ec1fb12b8160db 100644
--- a/src/operators/tnlDirichletBoundaryConditions_impl.h
+++ b/src/operators/tnlDirichletBoundaryConditions_impl.h
@@ -90,9 +90,7 @@ template< int Dimensions,
           typename Vector,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlDirichletBoundaryConditions< tnlGrid< Dimensions, MeshReal, Device, MeshIndex >, Vector, Real, Index >::
 setBoundaryConditions( const RealType& time,
@@ -113,9 +111,7 @@ template< int Dimensions,
           typename Vector,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index
 tnlDirichletBoundaryConditions< tnlGrid< Dimensions, MeshReal, Device, MeshIndex >, Vector, Real, Index >::
 getLinearSystemRowLength( const MeshType& mesh,
@@ -133,9 +129,7 @@ template< int Dimensions,
           typename Real,
           typename Index >
    template< typename MatrixRow >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 void
 tnlDirichletBoundaryConditions< tnlGrid< Dimensions, MeshReal, Device, MeshIndex >, Vector, Real, Index >::
 updateLinearSystem( const RealType& time,
diff --git a/src/operators/tnlNeumannBoundaryConditions.h b/src/operators/tnlNeumannBoundaryConditions.h
index b2b0218a076a7a4b57bd1a193a7f0107d0c759bc..19806026231cded2aac0279ff131eb04b175159f 100644
--- a/src/operators/tnlNeumannBoundaryConditions.h
+++ b/src/operators/tnlNeumannBoundaryConditions.h
@@ -59,9 +59,7 @@ class tnlNeumannBoundaryConditions< tnlGrid< 1, MeshReal, Device, MeshIndex >, V
    typedef tnlStaticVector< 1, RealType > VertexType;
    typedef typename MeshType::CoordinatesType CoordinatesType;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void setBoundaryConditions( const RealType& time,
                                const MeshType& mesh,
                                const IndexType index,
@@ -69,17 +67,13 @@ class tnlNeumannBoundaryConditions< tnlGrid< 1, MeshReal, Device, MeshIndex >, V
                                DofVectorType& u,
                                DofVectorType& fu ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getLinearSystemRowLength( const MeshType& mesh,
                                    const IndexType& index,
                                    const CoordinatesType& coordinates ) const;
 
    template< typename MatrixRow >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
       void updateLinearSystem( const RealType& time,
                                const MeshType& mesh,
                                const IndexType& index,
@@ -114,9 +108,7 @@ class tnlNeumannBoundaryConditions< tnlGrid< 2, MeshReal, Device, MeshIndex >, V
    typedef tnlStaticVector< 2, RealType > VertexType;
    typedef typename MeshType::CoordinatesType CoordinatesType;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void setBoundaryConditions( const RealType& time,
                                const MeshType& mesh,
                                const IndexType index,
@@ -124,17 +116,13 @@ class tnlNeumannBoundaryConditions< tnlGrid< 2, MeshReal, Device, MeshIndex >, V
                                DofVectorType& u,
                                DofVectorType& fu ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getLinearSystemRowLength( const MeshType& mesh,
                                    const IndexType& index,
                                    const CoordinatesType& coordinates ) const;
 
    template< typename MatrixRow >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
       void updateLinearSystem( const RealType& time,
                                const MeshType& mesh,
                                const IndexType& index,
@@ -169,9 +157,7 @@ class tnlNeumannBoundaryConditions< tnlGrid< 3, MeshReal, Device, MeshIndex >, V
    typedef tnlStaticVector< 3, RealType > VertexType;
    typedef typename MeshType::CoordinatesType CoordinatesType;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void setBoundaryConditions( const RealType& time,
                                const MeshType& mesh,
                                const IndexType index,
@@ -179,17 +165,13 @@ class tnlNeumannBoundaryConditions< tnlGrid< 3, MeshReal, Device, MeshIndex >, V
                                DofVectorType& u,
                                DofVectorType& fu ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getLinearSystemRowLength( const MeshType& mesh,
                                    const IndexType& index,
                                    const CoordinatesType& coordinates ) const;
 
    template< typename MatrixRow >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
       void updateLinearSystem( const RealType& time,
                                const MeshType& mesh,
                                const IndexType& index,
diff --git a/src/operators/tnlNeumannBoundaryConditions_impl.h b/src/operators/tnlNeumannBoundaryConditions_impl.h
index c80cad00c99853d5d08febb04154cc3cca198907..333bf09da00094ed9022e913f023054b98858d55 100644
--- a/src/operators/tnlNeumannBoundaryConditions_impl.h
+++ b/src/operators/tnlNeumannBoundaryConditions_impl.h
@@ -50,9 +50,7 @@ template< typename MeshReal,
           typename Vector,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlNeumannBoundaryConditions< tnlGrid< 1, MeshReal, Device, MeshIndex >, Vector, Real, Index >::
 setBoundaryConditions( const RealType& time,
@@ -75,9 +73,7 @@ template< typename MeshReal,
           typename Vector,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index
 tnlNeumannBoundaryConditions< tnlGrid< 1, MeshReal, Device, MeshIndex >, Vector, Real, Index >::
 getLinearSystemRowLength( const MeshType& mesh,
@@ -94,9 +90,7 @@ template< typename MeshReal,
           typename Real,
           typename Index >
    template< typename MatrixRow >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 void
 tnlNeumannBoundaryConditions< tnlGrid< 1, MeshReal, Device, MeshIndex >, Vector, Real, Index >::
 updateLinearSystem( const RealType& time,
@@ -130,9 +124,7 @@ template< typename MeshReal,
           typename Vector,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlNeumannBoundaryConditions< tnlGrid< 2, MeshReal, Device, MeshIndex >, Vector, Real, Index >::
 setBoundaryConditions( const RealType& time,
@@ -171,9 +163,7 @@ template< typename MeshReal,
           typename Vector,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index
 tnlNeumannBoundaryConditions< tnlGrid< 2, MeshReal, Device, MeshIndex >, Vector, Real, Index >::
 getLinearSystemRowLength( const MeshType& mesh,
@@ -190,9 +180,7 @@ template< typename MeshReal,
           typename Real,
           typename Index >
    template< typename MatrixRow >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 void
 tnlNeumannBoundaryConditions< tnlGrid< 2, MeshReal, Device, MeshIndex >, Vector, Real, Index >::
 updateLinearSystem( const RealType& time,
@@ -238,9 +226,7 @@ template< typename MeshReal,
           typename Vector,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlNeumannBoundaryConditions< tnlGrid< 3, MeshReal, Device, MeshIndex >, Vector, Real, Index >::
 setBoundaryConditions( const RealType& time,
@@ -289,9 +275,7 @@ template< typename MeshReal,
           typename Vector,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index
 tnlNeumannBoundaryConditions< tnlGrid< 3, MeshReal, Device, MeshIndex >, Vector, Real, Index >::
 getLinearSystemRowLength( const MeshType& mesh,
@@ -308,9 +292,7 @@ template< typename MeshReal,
           typename Real,
           typename Index >
    template< typename MatrixRow >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 void
 tnlNeumannBoundaryConditions< tnlGrid< 3, MeshReal, Device, MeshIndex >, Vector, Real, Index >::
 updateLinearSystem( const RealType& time,
diff --git a/src/operators/tnlOperatorEnumerator.h b/src/operators/tnlOperatorEnumerator.h
new file mode 100644
index 0000000000000000000000000000000000000000..050a23b72d489d1e977df6fe0b1f3783b5f3e6ef
--- /dev/null
+++ b/src/operators/tnlOperatorEnumerator.h
@@ -0,0 +1,175 @@
+/***************************************************************************
+                          tnlOperatorEnumerator.h  -  description
+                             -------------------
+    begin                : Mar 8, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+#ifndef SRC_OPERATORS_TNLOPERATORENUMERATOR_H_
+#define SRC_OPERATORS_TNLOPERATORENUMERATOR_H_
+
+//#include <_operators/tnlOperatorAdapter.h>
+
+template< typename Operator,
+          typename DofVector >
+class tnlOperatorEnumeratorTraverserUserData
+{
+   public:
+
+      typedef typename DofVector::RealType RealType;
+
+      const RealType *time;
+
+      const Operator* _operator;
+
+      DofVector *u;
+
+      const RealType* _operatorCoefficient;
+
+      const RealType* dofVectorCoefficient;
+
+      tnlOperatorEnumeratorTraverserUserData( const RealType& time,
+                                              const Operator& _operator,
+                                              DofVector& u,
+                                              const RealType& _operatorCoefficient,
+                                              const RealType& dofVectorCoefficient )
+      : time( &time ),
+        _operator( &_operator ),
+        u( &u ),
+        _operatorCoefficient( &_operatorCoefficient ),
+        dofVectorCoefficient( &dofVectorCoefficient )
+      {};
+};
+
+
+template< typename Mesh,
+          typename Operator,
+          typename DofVector >
+class tnlOperatorEnumerator
+{
+   public:
+      typedef Mesh MeshType;
+      typedef typename DofVector::RealType RealType;
+      typedef typename DofVector::DeviceType DeviceType;
+      typedef typename DofVector::IndexType IndexType;
+      typedef tnlOperatorEnumeratorTraverserUserData< Operator,
+                                                      DofVector > TraverserUserData;
+
+      template< int EntityDimensions >
+      void enumerate( const MeshType& mesh,
+                      const Operator& _operator,
+                      DofVector& u,
+                      const RealType& _operatorCoefficient = 1.0,
+                      const RealType& dofVectorCoefficient = 0.0,
+                      const RealType& time = 0.0 ) const;
+
+
+      class TraverserEntitiesProcessor
+      {
+         public:
+
+            template< int EntityDimensions >
+#ifdef HAVE_CUDA
+            __host__ __device__
+#endif
+            static void processEntity( const MeshType& mesh,
+                                       TraverserUserData& userData,
+                                       const IndexType index )
+            {
+               //typedef tnlOperatorAdapter< MeshType, Operator > OperatorAdapter;
+               ( *userData.u )[ index ] =
+                        ( *userData.dofVectorCoefficient ) * ( *userData.u )[ index ] +
+                        ( *userData._operatorCoefficient ) * userData._operator ->getValue( mesh,
+                                                                                            index,
+                                                                                            *userData.time );
+            }
+
+      };
+
+};
+
+template< int Dimensions,
+          typename Real,
+          typename Device,
+          typename Index,
+          typename Operator,
+          typename DofVector >
+class tnlOperatorEnumerator< tnlGrid< Dimensions, Real, Device, Index >,
+                             Operator,
+                             DofVector >
+{
+   public:
+
+      typedef tnlGrid< Dimensions, Real, Device, Index > MeshType;
+      typedef typename MeshType::RealType RealType;
+      typedef typename MeshType::DeviceType DeviceType;
+      typedef typename MeshType::IndexType IndexType;
+      typedef typename MeshType::CoordinatesType CoordinatesType;
+      typedef tnlOperatorEnumeratorTraverserUserData< Operator,
+                                                      DofVector > TraverserUserData;
+
+      template< int EntityDimensions >
+      void enumerate( const MeshType& mesh,
+                      const Operator& _operator,
+                      DofVector& u,
+                      const RealType& _operatorCoefficient = 1.0,
+                      const RealType& dofVectorCoefficient = 0.0,
+                      const RealType& time = 0.0 ) const;
+
+      class TraverserEntitiesProcessor
+      {
+         public:
+
+         typedef typename MeshType::VertexType VertexType;
+
+#ifdef HAVE_CUDA
+            __host__ __device__
+#endif
+            static void processCell( const MeshType& mesh,
+                                     TraverserUserData& userData,
+                                     const IndexType index,
+                                     const CoordinatesType& coordinates )
+            {
+               //printf( "Enumerator::processCell mesh =%p \n", &mesh );
+               //typedef tnlOperatorAdapter< MeshType, Operator > OperatorAdapter;
+               ( *userData.u )[ index ] =
+                        ( *userData.dofVectorCoefficient ) * ( *userData.u )[ index ] +
+                        ( *userData._operatorCoefficient ) * userData._operator->getValue( mesh,
+                                                                                           index,
+                                                                                           coordinates,
+                                                                                           *userData.time );
+
+            }
+
+#ifdef HAVE_CUDA
+            __host__ __device__
+#endif
+            static void processFace( const MeshType& mesh,
+                                     TraverserUserData& userData,
+                                     const IndexType index,
+                                     const CoordinatesType& coordinates )
+            {
+               //typedef tnlOperatorAdapter< MeshType, Operator > OperatorAdapter;
+               ( *userData.u )[ index ] =
+                        ( *userData.dofVectorCoefficient ) * ( *userData.u )[ index ] +
+                        ( *userData._operatorCoefficient ) * userData._operator->getValue( mesh,
+                                                                                           index,
+                                                                                           coordinates,
+                                                                                           *userData.time );
+            }
+      };
+
+};
+
+#include <operators/tnlOperatorEnumerator_impl.h>
+
+#endif /* SRC_OPERATORS_TNLOPERATORENUMERATOR_H_ */
diff --git a/src/operators/tnlOperatorEnumerator_impl.h b/src/operators/tnlOperatorEnumerator_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..ffe3f21a7a7235cd0c63c545400dd2bf86afa901
--- /dev/null
+++ b/src/operators/tnlOperatorEnumerator_impl.h
@@ -0,0 +1,141 @@
+/***************************************************************************
+                          tnlOperatorEnumerator_impl.h  -  description
+                             -------------------
+    begin                : Mar 8, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+#ifndef SRC_OPERATORS_TNLOPERATORENUMERATOR_IMPL_H_
+#define SRC_OPERATORS_TNLOPERATORENUMERATOR_IMPL_H_
+
+#include <operators/tnlOperatorEnumerator.h>
+#include <mesh/tnlTraverser_Grid1D.h>
+#include <mesh/tnlTraverser_Grid2D.h>
+#include <mesh/tnlTraverser_Grid3D.h>
+
+template< typename Mesh,
+          typename Operator,
+          typename DofVector >
+   template< int EntityDimensions >
+void
+tnlOperatorEnumerator< Mesh, Operator, DofVector >::
+enumerate( const MeshType& mesh,
+           const Operator& _operator,
+           DofVector& u,
+           const RealType& _operatorCoefficient,
+           const RealType& dofVectorCoefficient,
+           const RealType& time ) const
+
+{
+   if( DeviceType::DeviceType == tnlHostDevice )
+   {
+      TraverserUserData userData( time, _operator, u, _operatorCoefficient, dofVectorCoefficient );
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+
+   }
+   if( DeviceType::DeviceType == tnlCudaDevice )
+   {
+      RealType* kernelTime = tnlCuda::passToDevice( time );
+      Operator* kernelOperator = tnlCuda::passToDevice( _operator );
+      DofVector* kernelU = tnlCuda::passToDevice( u );
+      RealType* kernelOperatorCoefficient = tnlCuda::passToDevice( _operatorCoefficient );
+      RealType* kernelDofVectorCoefficient = tnlCuda::passToDevice( dofVectorCoefficient );
+      TraverserUserData userData( *kernelTime, *kernelOperator, *kernelU, *kernelOperatorCoefficient, *kernelDofVectorCoefficient );
+      checkCudaDevice;
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+
+      checkCudaDevice;
+      tnlCuda::freeFromDevice( kernelTime );
+      tnlCuda::freeFromDevice( kernelOperator );
+      tnlCuda::freeFromDevice( kernelU );
+      tnlCuda::freeFromDevice( kernelOperatorCoefficient );
+      tnlCuda::freeFromDevice( kernelDofVectorCoefficient );
+      checkCudaDevice;
+   }
+}
+
+template< int Dimensions,
+          typename Real,
+          typename Device,
+          typename Index,
+          typename Operator,
+          typename DofVector >
+   template< int EntityDimensions >
+void
+tnlOperatorEnumerator< tnlGrid< Dimensions, Real, Device, Index >, Operator, DofVector  >::
+enumerate( const tnlGrid< Dimensions, Real, Device, Index >& mesh,
+           const Operator& _operator,
+           DofVector& u,
+           const RealType& _operatorCoefficient,
+           const RealType& dofVectorCoefficient,
+           const RealType& time ) const
+{
+   if( DeviceType::DeviceType == tnlHostDevice )
+   {
+      TraverserUserData userData( time, _operator, u, _operatorCoefficient, dofVectorCoefficient );
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+
+   }
+   if( DeviceType::DeviceType == tnlCudaDevice )
+   {
+      RealType* kernelTime = tnlCuda::passToDevice( time );
+      Operator* kernelOperator = tnlCuda::passToDevice( _operator );
+      DofVector* kernelU = tnlCuda::passToDevice( u );
+      RealType* kernelOperatorCoefficient = tnlCuda::passToDevice( _operatorCoefficient );
+      RealType* kernelDofVectorCoefficient = tnlCuda::passToDevice( dofVectorCoefficient );
+      TraverserUserData userData( *kernelTime, *kernelOperator, *kernelU, *kernelOperatorCoefficient, *kernelDofVectorCoefficient );
+      checkCudaDevice;
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+
+      checkCudaDevice;
+      tnlCuda::freeFromDevice( kernelTime );
+      tnlCuda::freeFromDevice( kernelOperator );
+      tnlCuda::freeFromDevice( kernelU );
+      tnlCuda::freeFromDevice( kernelOperatorCoefficient );
+      tnlCuda::freeFromDevice( kernelDofVectorCoefficient );
+      checkCudaDevice;
+   }
+}
+
+#endif /* SRC_OPERATORS_TNLOPERATORENUMERATOR_IMPL_H_ */
diff --git a/src/problems/tnlHeatEquationEocRhs.h b/src/problems/tnlHeatEquationEocRhs.h
index 4133915989f7f129eca6911e2787a3f35f349c0a..c7cd20d881dd902bb45dbd820fe880bd9b49a025 100644
--- a/src/problems/tnlHeatEquationEocRhs.h
+++ b/src/problems/tnlHeatEquationEocRhs.h
@@ -18,7 +18,7 @@
 #ifndef TNLHEATEQUATIONEOCRHS_H_
 #define TNLHEATEQUATIONEOCRHS_H_
 
-#include <functions/tnlFunctionType.h>
+#include <functors/tnlFunctionType.h>
 
 template< typename ExactOperator,
           typename TestFunction >
@@ -39,9 +39,7 @@ class tnlHeatEquationEocRhs
 
       template< typename Vertex,
                 typename Real >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       Real getValue( const Vertex& vertex,
                      const Real& time ) const
       {
diff --git a/src/problems/tnlHeatEquationProblem.h b/src/problems/tnlHeatEquationProblem.h
index d6f5375e8db0207dd92007834531b6be7a2644d0..f1356515c047961790afb9f8d44dc6e5749656f9 100644
--- a/src/problems/tnlHeatEquationProblem.h
+++ b/src/problems/tnlHeatEquationProblem.h
@@ -20,6 +20,7 @@
 
 #include <problems/tnlPDEProblem.h>
 #include <operators/diffusion/tnlLinearDiffusion.h>
+#include <matrices/tnlEllpackMatrix.h>
 
 template< typename Mesh,
           typename BoundaryCondition,
@@ -37,9 +38,11 @@ class tnlHeatEquationProblem : public tnlPDEProblem< Mesh,
       typedef typename Mesh::DeviceType DeviceType;
       typedef typename DifferentialOperator::IndexType IndexType;
       typedef tnlPDEProblem< Mesh, RealType, DeviceType, IndexType > BaseType;
+      typedef tnlEllpackMatrix< RealType, DeviceType, IndexType > MatrixType;
 
       using typename BaseType::MeshType;
       using typename BaseType::DofVectorType;
+      using typename BaseType::MeshDependentDataType;
 
       static tnlString getTypeStatic();
 
@@ -53,17 +56,17 @@ class tnlHeatEquationProblem : public tnlPDEProblem< Mesh,
       bool setInitialCondition( const tnlParameterContainer& parameters,
                                 const MeshType& mesh,
                                 DofVectorType& dofs,
-                                DofVectorType& auxDofs );
+                                MeshDependentDataType& meshDependentData );
 
-      template< typename MatrixType >
+      template< typename Matrix >
       bool setupLinearSystem( const MeshType& mesh,
-                              MatrixType& matrix );
+                              Matrix& matrix );
 
       bool makeSnapshot( const RealType& time,
                          const IndexType& step,
                          const MeshType& mesh,
                          DofVectorType& dofs,
-                         DofVectorType& auxDofs );
+                         MeshDependentDataType& meshDependentData );
 
       IndexType getDofs( const MeshType& mesh ) const;
 
@@ -74,15 +77,16 @@ class tnlHeatEquationProblem : public tnlPDEProblem< Mesh,
                            const RealType& tau,
                            const MeshType& mesh,
                            DofVectorType& _u,
+                           MeshDependentDataType& meshDependentData,
                            DofVectorType& _fu );
 
-      template< typename MatrixType >
+      template< typename Matrix >
       void assemblyLinearSystem( const RealType& time,
                                  const RealType& tau,
                                  const MeshType& mesh,
                                  DofVectorType& dofs,
-                                 DofVectorType& auxDofs,
-                                 MatrixType& matrix,
+                                 MeshDependentDataType& meshDependentData,
+                                 Matrix& matrix,
                                  DofVectorType& rightHandSide );
 
 
diff --git a/src/problems/tnlHeatEquationProblem_impl.h b/src/problems/tnlHeatEquationProblem_impl.h
index 0a79298ba0fc3df6436de9370d2eb2a0be4d0b18..021c4cff6e03041434688274d5a7bd710cd1977f 100644
--- a/src/problems/tnlHeatEquationProblem_impl.h
+++ b/src/problems/tnlHeatEquationProblem_impl.h
@@ -24,6 +24,7 @@
 #include <core/tnlLogger.h>
 #include <solvers/pde/tnlExplicitUpdater.h>
 #include <solvers/pde/tnlLinearSystemAssembler.h>
+#include <solvers/pde/tnlBackwardTimeDiscretisation.h>
 
 
 template< typename Mesh,
@@ -108,7 +109,7 @@ tnlHeatEquationProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOper
 setInitialCondition( const tnlParameterContainer& parameters,
                      const MeshType& mesh,
                      DofVectorType& dofs,
-                     DofVectorType& auxiliaryDofs )
+                     MeshDependentDataType& meshDependentData )
 {
    this->bindDofs( mesh, dofs );
    const tnlString& initialConditionFile = parameters.getParameter< tnlString >( "initial-condition" );
@@ -124,14 +125,14 @@ template< typename Mesh,
           typename BoundaryCondition,
           typename RightHandSide,
           typename DifferentialOperator >
-   template< typename MatrixType >          
+   template< typename Matrix >          
 bool
 tnlHeatEquationProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::
 setupLinearSystem( const MeshType& mesh,
-                   MatrixType& matrix )
+                   Matrix& matrix )
 {
    const IndexType dofs = this->getDofs( mesh );
-   typedef typename MatrixType::RowLengthsVector RowLengthsVectorType;
+   typedef typename Matrix::RowLengthsVector RowLengthsVectorType;
    RowLengthsVectorType rowLengths;
    if( ! rowLengths.setSize( dofs ) )
       return false;
@@ -157,7 +158,7 @@ makeSnapshot( const RealType& time,
               const IndexType& step,
               const MeshType& mesh,
               DofVectorType& dofs,
-              DofVectorType& auxiliaryDofs )
+              MeshDependentDataType& meshDependentData )
 {
    cout << endl << "Writing output at time " << time << " step " << step << "." << endl;
 
@@ -180,6 +181,7 @@ getExplicitRHS( const RealType& time,
                 const RealType& tau,
                 const MeshType& mesh,
                 DofVectorType& u,
+                MeshDependentDataType& meshDependentData,
                 DofVectorType& fu )
 {
    /****
@@ -212,18 +214,24 @@ template< typename Mesh,
           typename BoundaryCondition,
           typename RightHandSide,
           typename DifferentialOperator >
-    template< typename MatrixType >          
+    template< typename Matrix >          
 void
 tnlHeatEquationProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::
 assemblyLinearSystem( const RealType& time,
                       const RealType& tau,
                       const MeshType& mesh,
                       DofVectorType& u,
-                      DofVectorType& auxDofs,
-                      MatrixType& matrix,
+                      MeshDependentDataType& meshDependentData,
+                      Matrix& matrix,
                       DofVectorType& b )
 {
-   tnlLinearSystemAssembler< Mesh, DofVectorType, DifferentialOperator, BoundaryCondition, RightHandSide, MatrixType > systemAssembler;
+   tnlLinearSystemAssembler< Mesh,
+                             DofVectorType,
+                             DifferentialOperator,
+                             BoundaryCondition,
+                             RightHandSide,
+                             tnlBackwardTimeDiscretisation,
+                             Matrix > systemAssembler;
    systemAssembler.template assembly< Mesh::Dimensions >( time,
                                                           tau,
                                                           mesh,
@@ -237,6 +245,26 @@ assemblyLinearSystem( const RealType& time,
    cout << endl << b << endl;
    cout << endl << u << endl;
    abort();*/
+   /*cout << "Matrix multiplication test ..." << endl;
+   tnlVector< RealType, DeviceType, IndexType > y;
+   y.setLike( u );
+   tnlTimerRT timer;
+   timer.reset();
+   timer.start();
+   for( int i = 0; i < 100; i++ )
+      matrix.vectorProduct( u, y );
+   timer.stop();
+   cout << "The time is " << timer.getTime();
+   cout << "Scalar product test ..." << endl;
+   timer.reset();
+   RealType a;
+   timer.start();
+   for( int i = 0; i < 100; i++ )
+      a = y.scalarProduct( u );
+   timer.stop();
+   cout << "The time is " << timer.getTime();
+   cout << endl;
+   abort();*/
 }
 
 #endif /* TNLHEATEQUATIONPROBLEM_IMPL_H_ */
diff --git a/src/problems/tnlPDEProblem.h b/src/problems/tnlPDEProblem.h
index eb56959ee0ead58c1cdb48167c43645fe5d48acc..8c5931cdb74549034ad0d5d915eb22ee86ae5b88 100644
--- a/src/problems/tnlPDEProblem.h
+++ b/src/problems/tnlPDEProblem.h
@@ -37,6 +37,7 @@ class tnlPDEProblem : public tnlProblem< Real, Device, Index >
       typedef Mesh MeshType;
       typedef tnlVector< RealType, DeviceType, IndexType> DofVectorType;
       typedef tnlCSRMatrix< RealType, DeviceType, IndexType > MatrixType;
+      typedef tnlVector< RealType, DeviceType, IndexType > MeshDependentDataType;
 
       /****
        * This means that the time stepper will be set from the command line arguments.
@@ -50,23 +51,23 @@ class tnlPDEProblem : public tnlProblem< Real, Device, Index >
       void writeProlog( tnlLogger& logger,
                         const tnlParameterContainer& parameters ) const;
 
-      typename tnlPDEProblem< Mesh, Real, Device, Index >::IndexType getAuxiliaryDofs( const MeshType& mesh ) const;
+      bool setMeshDependentData( const MeshType& mesh,
+                                 MeshDependentDataType& meshDependentData );
 
-      void bindAuxiliaryDofs( const MeshType& mesh,
-                              DofVectorType& auxiliaryDofs );
+      void bindMeshDependentData( const MeshType& mesh,
+                                  MeshDependentDataType& meshDependentData );
 
       bool preIterate( const RealType& time,
                        const RealType& tau,
                        const MeshType& mesh,
                        DofVectorType& dofs,
-                       DofVectorType& auxDofs );
+                       MeshDependentDataType& meshDependentData );
 
       bool postIterate( const RealType& time,
                         const RealType& tau,
                         const MeshType& mesh,
                         DofVectorType& dofs,
-                        DofVectorType& auxDofs );
-
+                        MeshDependentDataType& meshDependentData );
 
       tnlSolverMonitor< typename tnlPDEProblem< Mesh, Real, Device, Index >::RealType, typename tnlPDEProblem< Mesh, Real, Device, Index >::IndexType >* getSolverMonitor();
 
diff --git a/src/problems/tnlPDEProblem_impl.h b/src/problems/tnlPDEProblem_impl.h
index 33e05b13dd535aac8b2e3f091f5fe4772f94e9e9..9954e2b1f4294c348fd4f197f7e3dfb6e08f273d 100644
--- a/src/problems/tnlPDEProblem_impl.h
+++ b/src/problems/tnlPDEProblem_impl.h
@@ -58,14 +58,15 @@ template< typename Mesh,
           typename Real,
           typename Device,
           typename Index >
-typename tnlPDEProblem< Mesh, Real, Device, Index >::IndexType
+bool
 tnlPDEProblem< Mesh, Real, Device, Index >::
-getAuxiliaryDofs( const MeshType& mesh ) const
+setMeshDependentData( const MeshType& mesh,
+                      MeshDependentDataType& meshDependentData )
 {
    /****
-    * Set-up DOFs and supporting grid functions which will not appear in the discrete solver
+    * Set-up auxiliary data depending on the numerical mesh
     */
-   return 0;
+   return true;
 }
 
 template< typename Mesh,
@@ -74,8 +75,8 @@ template< typename Mesh,
           typename Index >
 void
 tnlPDEProblem< Mesh, Real, Device, Index >::
-bindAuxiliaryDofs( const MeshType& mesh,
-                   DofVectorType& auxiliaryDofs )
+bindMeshDependentData( const MeshType& mesh,
+                       MeshDependentDataType& meshDependentData )
 {
 }
 
diff --git a/src/solvers/CMakeLists.txt b/src/solvers/CMakeLists.txt
index cd08601fdbdeaa3478c1da158037a21d095d8bc9..befbe5500a3fb236123bae2e3c4130577217de98 100755
--- a/src/solvers/CMakeLists.txt
+++ b/src/solvers/CMakeLists.txt
@@ -6,8 +6,8 @@ ADD_SUBDIRECTORY( preconditioners )
 
 SET( headers tnlIterativeSolver.h
              tnlIterativeSolver_impl.h
-             tnlConfigTags.h
-             tnlFastBuildConfig.h
+             tnlBuildConfigTags.h
+             tnlFastBuildConfigTag.h
              tnlMeshTypeResolver.h
              tnlMeshTypeResolver_impl.h
              tnlSolver.h
diff --git a/src/solvers/linear/krylov/tnlBICGStabSolver_impl.h b/src/solvers/linear/krylov/tnlBICGStabSolver_impl.h
index d6c521c7b64405e8d8a2285d7c03a4ae61686878..4359537441f2dbfafbf77936f11fc3ac0a92ae3c 100644
--- a/src/solvers/linear/krylov/tnlBICGStabSolver_impl.h
+++ b/src/solvers/linear/krylov/tnlBICGStabSolver_impl.h
@@ -50,7 +50,7 @@ tnlBICGStabSolver< Matrix, Preconditioner >::
 configSetup( tnlConfigDescription& config,
              const tnlString& prefix )
 {
-   tnlIterativeSolver< RealType, IndexType >::configSetup( config, prefix );
+   //tnlIterativeSolver< RealType, IndexType >::configSetup( config, prefix );
 }
 
 template< typename Matrix,
@@ -142,7 +142,7 @@ bool tnlBICGStabSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vect
       /****
        * s_j = r_j - alpha_j * A p_j
        */
-      s. alphaXPlusBetaZ( 1.0, r, -alpha, Ap );
+      s.addVectors( r, 1.0, Ap, -alpha );
 
       /****
        * omega_j = ( A s_j, s_j ) / ( A s_j, A s_j )
@@ -164,12 +164,12 @@ bool tnlBICGStabSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vect
       /****
        * x_{j+1} = x_j + alpha_j * p_j + omega_j * s_j
        */
-      x. alphaXPlusBetaZPlusY( alpha, p, omega, s );
+      x.addVectors( p, alpha, s, omega );
       
       /****
        * r_{j+1} = s_j - omega_j * A * s_j
        */
-      r. alphaXPlusBetaZ( 1.0, s, -omega, As );
+      r.addVectors( s, 1.0, As, -omega );
 
       /****
        * beta = alpha_j / omega_j * ( r_{j+1}, r^ast_0 ) / ( r_j, r^ast_0 )
diff --git a/src/solvers/linear/krylov/tnlCGSolver_impl.h b/src/solvers/linear/krylov/tnlCGSolver_impl.h
index 70271b6561ac8586759b267f73c250ff58bbf528..ae599af1bbb4e6e51307bee5b9cf6d7df1a92efd 100644
--- a/src/solvers/linear/krylov/tnlCGSolver_impl.h
+++ b/src/solvers/linear/krylov/tnlCGSolver_impl.h
@@ -40,7 +40,7 @@ tnlCGSolver< Matrix, Preconditioner >::
 configSetup( tnlConfigDescription& config,
              const tnlString& prefix )
 {
-   tnlIterativeSolver< RealType, IndexType >::configSetup( config, prefix );
+   //tnlIterativeSolver< RealType, IndexType >::configSetup( config, prefix );
 }
 
 template< typename Matrix,
diff --git a/src/solvers/linear/krylov/tnlGMRESSolver_impl.h b/src/solvers/linear/krylov/tnlGMRESSolver_impl.h
index 011e7e466178036ebb2299659f4a21dafed80a95..cbf970a2297203cbd94c2f2f4f8d44ff7c3e2e26 100644
--- a/src/solvers/linear/krylov/tnlGMRESSolver_impl.h
+++ b/src/solvers/linear/krylov/tnlGMRESSolver_impl.h
@@ -55,8 +55,8 @@ tnlGMRESSolver< Matrix, Preconditioner >::
 configSetup( tnlConfigDescription& config,
              const tnlString& prefix )
 {
-   tnlIterativeSolver< RealType, IndexType >::configSetup( config, prefix );
-   config.addEntry< int >( prefix + "gmres-restarting", "Number of iterations after which the GMRES restarts.", 10 );
+   //tnlIterativeSolver< RealType, IndexType >::configSetup( config, prefix );
+   config.addEntry< int >( prefix + "gmres-restarting", "Number of iterations after which the GMRES restarts.", 10 );   
 }
 
 template< typename Matrix,
@@ -68,6 +68,7 @@ setup( const tnlParameterContainer& parameters,
 {
    tnlIterativeSolver< RealType, IndexType >::setup( parameters, prefix );
    this->setRestarting( parameters.getParameter< int >( "gmres-restarting" ) );
+   return true;
 }
 
 template< typename Matrix,
@@ -105,10 +106,11 @@ bool tnlGMRESSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vector&
            << ". Please set some positive value using the SetRestarting method." << endl;
       return false;
    }
-   if( ! setSize( matrix -> getRows(), restarting ) ) return false;
-
-
-   IndexType i, j = 1, k, l;
+   if( ! setSize( matrix -> getRows(), restarting ) )
+   {
+       cerr << "I am not able to allocate enough memory for the GMRES solver. You may try to decrease the restarting parameter." << endl;
+       return false;
+   }
 
    IndexType _size = size;
 
@@ -132,7 +134,7 @@ bool tnlGMRESSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vector&
       normb = _M_tmp. lpNorm( ( RealType ) 2.0 );
 
       matrix -> vectorProduct( x, _M_tmp );
-      _M_tmp. alphaXPlusBetaY( ( RealType ) 1.0, b, -1.0 );
+      _M_tmp.addVector( b, ( RealType ) 1.0, -1.0 );
       /*for( i = 0; i < size; i ++ )
          M_tmp[ i ] = b[ i ] - M_tmp[ i ];*/
 
@@ -141,11 +143,16 @@ bool tnlGMRESSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vector&
    }
    else
    {
-      matrix -> vectorProduct( x, _r );
+      matrix -> vectorProduct( x, _r );      
       normb = b. lpNorm( ( RealType ) 2.0 );
-      _r. alphaXPlusBetaY( ( RealType ) 1.0, b, -1.0 );
+      _r. addVector( b, ( RealType ) 1.0, -1.0 );
       beta = _r. lpNorm( ( RealType ) 2.0 );
+      //cout << "x = " << x << endl;
    }
+   
+    //cout << "norm b = " << normb << endl;
+    //cout << " beta = " << beta << endl;
+
 
    if( normb == 0.0 ) normb = 1.0;
 
@@ -153,13 +160,13 @@ bool tnlGMRESSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vector&
    this->setResidue( beta / normb );
 
    tnlSharedVector< RealType, DeviceType, IndexType > vi;
-   vi. setName( "tnlGMRESSolver::vi" );
+   //vi. setName( "tnlGMRESSolver::vi" );
    tnlSharedVector< RealType, DeviceType, IndexType > vk;
-   vk. setName( "tnlGMRESSolver::vk" );
+   //vk. setName( "tnlGMRESSolver::vk" );
    while( this->nextIteration() )
    {
       const IndexType m = restarting;
-      for( i = 0; i < m + 1; i ++ )
+      for( IndexType i = 0; i < m + 1; i ++ )
          H[ i ] = s[ i ] = cs[ i ] = sn[ i ] = 0.0;
 
       /****
@@ -181,7 +188,7 @@ bool tnlGMRESSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vector&
       /****
        * Starting m-loop
        */
-      for( i = 0; i < m && this->getIterations() <= this->getMaxIterations(); i++ )
+      for( IndexType i = 0; i < m && this->nextIteration(); i++ )
       {
          vi. bind( &( _v. getData()[ i * size ] ), size );
          /****
@@ -194,37 +201,49 @@ bool tnlGMRESSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vector&
          }
          else
              matrix -> vectorProduct( vi, _w );
-
-         for( k = 0; k <= i; k++ )
-         {
-            vk. bind( &( _v. getData()[ k * _size ] ), _size );
-            /***
-             * H_{k,i} = ( w, v_k )
-             */
-            RealType H_k_i = _w. scalarProduct( vk );
-            H[ k + i * ( m + 1 ) ] = H_k_i;
-
-            /****
-             * w = w - H_{k,i} v_k
-             */
-            _w. addVector( vk, -H_k_i );
-         }
+         
+         //cout << " i = " << i << " vi = " << vi << endl;
+
+         for( IndexType k = 0; k <= i; k++ )
+            H[ k + i * ( m + 1 ) ] = 0.0;
+         for( IndexType l = 0; l < 2; l++ )
+            for( IndexType k = 0; k <= i; k++ )
+            {
+               vk. bind( &( _v. getData()[ k * _size ] ), _size );
+               /***
+                * H_{k,i} = ( w, v_k )
+                */
+               RealType H_k_i = _w. scalarProduct( vk );
+               H[ k + i * ( m + 1 ) ] += H_k_i;           
+
+               /****
+                * w = w - H_{k,i} v_k
+                */
+               _w. addVector( vk, -H_k_i );
+
+               //cout << "H_ki = " << H_k_i << endl;
+               //cout << "w = " << _w << endl;
+            }
          /***
           * H_{i+1,i} = |w|
           */
          RealType normw = _w. lpNorm( ( RealType ) 2.0 );
          H[ i + 1 + i * ( m + 1 ) ] = normw;
 
+         //cout << "normw = " << normw << endl;
+         
          /***
           * v_{i+1} = w / |w|
           */
          vi. bind( &( _v. getData()[ ( i + 1 ) * size ] ), size );
          vi. addVector( _w, ( RealType ) 1.0 / normw );
+         
+         //cout << "vi = " << vi << endl;
 
          /****
           * Applying the Givens rotations
           */
-         for( k = 0; k < i; k++ )
+         for( IndexType k = 0; k < i; k++ )
             applyPlaneRotation( H[ k + i * ( m + 1 )],
                                 H[ k + 1 + i * ( m + 1 ) ],
                                 cs[ k ],
@@ -246,13 +265,13 @@ bool tnlGMRESSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vector&
          this->setResidue( fabs( s[ i + 1 ] ) / normb );
          this->refreshSolverMonitor();
 
-         if( this->getResidue() < this->getConvergenceResidue() )
+         /*if( this->getResidue() < this->getConvergenceResidue() )
          {
             update( i, m, _H, _s, _v, x );
             return true;
          }
          if( ! this->nextIteration() )
-            return false;
+            return false;*/
       }
       update( m - 1, m, _H, _s, _v, x );
 
@@ -263,19 +282,24 @@ bool tnlGMRESSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vector&
       if( preconditioner )
       {
          matrix -> vectorProduct( x, _M_tmp );
-         for( i = 0; i < _size; i ++ )
+         for( IndexType i = 0; i < _size; i ++ )
             M_tmp[ i ] = b[ i ] - M_tmp[ i ];
          //preconditioner -> solve( M_tmp, r );
-         for( i = 0; i < _size; i ++ )
+         for( IndexType i = 0; i < _size; i ++ )
             beta += r[ i ] * r[ i ];
       }
       else
       {
          matrix -> vectorProduct( x, _r );
-         _r. alphaXPlusBetaY( ( RealType ) 1.0, b, -1.0 );
+         _r.addVector( b, ( RealType ) 1.0, -1.0 );
          beta = _r. lpNorm( ( RealType ) 2.0 );
       }
       this->setResidue( beta / normb );
+
+      //cout << " x = " << x << endl;
+      //cout << " beta = " << beta << endl;
+      //cout << "residue = " << beta / normb << endl;
+
    }
    this->refreshSolverMonitor();
    return this->checkConvergence();
diff --git a/src/solvers/linear/krylov/tnlTFQMRSolver_impl.h b/src/solvers/linear/krylov/tnlTFQMRSolver_impl.h
index cc3c51bf2bd9a085702ded5398168741948c6de2..b9f5fefea64720287e51b7b1d2ed7cbaf60c8144 100644
--- a/src/solvers/linear/krylov/tnlTFQMRSolver_impl.h
+++ b/src/solvers/linear/krylov/tnlTFQMRSolver_impl.h
@@ -49,7 +49,7 @@ tnlTFQMRSolver< Matrix, Preconditioner >::
 configSetup( tnlConfigDescription& config,
              const tnlString& prefix )
 {
-   tnlIterativeSolver< RealType, IndexType >::configSetup( config, prefix );
+   //tnlIterativeSolver< RealType, IndexType >::configSetup( config, prefix );
 }
 
 template< typename Matrix,
@@ -59,7 +59,7 @@ tnlTFQMRSolver< Matrix, Preconditioner >::
 setup( const tnlParameterContainer& parameters,
        const tnlString& prefix )
 {
-   tnlIterativeSolver< RealType, IndexType >::setup( parameters, prefix );
+   return tnlIterativeSolver< RealType, IndexType >::setup( parameters, prefix );
 }
 
 template< typename Matrix,
@@ -99,7 +99,7 @@ bool tnlTFQMRSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vector&
    }
    else*/
    {
-      r. alphaXPlusBetaY( -1.0, b, -1.0 );
+      r. addVector( b, -1.0, -1.0 );
       w = u = r;
       matrix -> vectorProduct( u, v );
       d. setValue( 0.0 );
@@ -127,7 +127,7 @@ bool tnlTFQMRSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vector&
       w.addVector( Au, -alpha );
       //cerr << "alpha = " << alpha << endl;
       //cerr << "theta * theta / alpha * eta = " << theta * theta / alpha * eta << endl;
-      d. alphaXPlusBetaY( 1.0, u, theta * theta / alpha * eta );
+      d. addVector( u, 1.0, theta * theta / alpha * eta );
       theta = w. lpNorm( 2.0 ) / tau;
       const RealType c = sqrt( 1.0 + theta * theta );
       tau = tau * theta * c;
@@ -143,7 +143,7 @@ bool tnlTFQMRSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vector&
          Au.addVector( v, beta );
          u.addVector( w, 1.0, beta );
          matrix -> vectorProduct( u, Au_new );
-         v.alphaXPlusBetaZ( 1.0, Au_new, beta, Au );
+         v.addVectors( Au_new, 1.0, Au, beta );
       }
       
       //this -> setResidue( residue );
diff --git a/src/solvers/linear/stationary/tnlSORSolver_impl.h b/src/solvers/linear/stationary/tnlSORSolver_impl.h
index 4de51ef56fabd9c8508b486a5bf9f4eff2443bb6..419e190d55462ec8da4eb0d40b18f1f4c032e285 100644
--- a/src/solvers/linear/stationary/tnlSORSolver_impl.h
+++ b/src/solvers/linear/stationary/tnlSORSolver_impl.h
@@ -39,7 +39,7 @@ tnlSORSolver< Matrix, Preconditioner >::
 configSetup( tnlConfigDescription& config,
              const tnlString& prefix )
 {
-   tnlIterativeSolver< RealType, IndexType >::configSetup( config, prefix );
+   //tnlIterativeSolver< RealType, IndexType >::configSetup( config, prefix );
    config.addEntry< double >( prefix + "sor-omega", "Relaxation parameter of the SOR method.", 1.0 );
 }
 
diff --git a/src/solvers/ode/tnlEulerSolver_impl.h b/src/solvers/ode/tnlEulerSolver_impl.h
index 6e7881cb940b3aa34169d46627d704461c7a6488..2522596e58c280c4f66307d238b7e487963ff9f7 100644
--- a/src/solvers/ode/tnlEulerSolver_impl.h
+++ b/src/solvers/ode/tnlEulerSolver_impl.h
@@ -48,7 +48,7 @@ template< typename Problem >
 void tnlEulerSolver< Problem > :: configSetup( tnlConfigDescription& config,
                                                const tnlString& prefix )
 {
-   tnlExplicitSolver< Problem >::configSetup( config, prefix );
+   //tnlExplicitSolver< Problem >::configSetup( config, prefix );
    config.addEntry< double >( prefix + "euler-cfl", "Coefficient C in the Courant–Friedrichs–Lewy condition.", 0.0 );
 };
 
diff --git a/src/solvers/ode/tnlExplicitSolver_impl.h b/src/solvers/ode/tnlExplicitSolver_impl.h
index 11a49ae9f2d231dd2ea110cb68b1da3896014615..ad35537d4a4bca62ab60fc60157e55b83797a2c1 100644
--- a/src/solvers/ode/tnlExplicitSolver_impl.h
+++ b/src/solvers/ode/tnlExplicitSolver_impl.h
@@ -41,7 +41,7 @@ tnlExplicitSolver< Problem >::
 configSetup( tnlConfigDescription& config,
              const tnlString& prefix )
 {
-   tnlIterativeSolver< typename Problem::RealType, typename Problem::IndexType >::configSetup( config, prefix );
+   //tnlIterativeSolver< typename Problem::RealType, typename Problem::IndexType >::configSetup( config, prefix );
 }
 
 template< typename Problem >
diff --git a/src/solvers/ode/tnlMersonSolver_impl.h b/src/solvers/ode/tnlMersonSolver_impl.h
index 88d69280c3f6e6bfb6532203fe9f424627970d4a..95241e39acf21bc1868a09115d3a50fdf7b1bac5 100644
--- a/src/solvers/ode/tnlMersonSolver_impl.h
+++ b/src/solvers/ode/tnlMersonSolver_impl.h
@@ -111,7 +111,7 @@ template< typename Problem >
 void tnlMersonSolver< Problem > :: configSetup( tnlConfigDescription& config,
                                                 const tnlString& prefix )
 {
-   tnlExplicitSolver< Problem >::configSetup( config, prefix );
+   //tnlExplicitSolver< Problem >::configSetup( config, prefix );
    config.addEntry< double >( prefix + "merson-adaptivity", "Time step adaptivity controlling coefficient (the smaller the more precise the computation is, zero means no adaptivity).", 1.0e-4 );
 };
 
diff --git a/src/solvers/ode/tnlODESolverMonitor_impl.h b/src/solvers/ode/tnlODESolverMonitor_impl.h
index 9a104aa018c51916e0035cd6daadcce6b2fcfab3..04a8f669b0d27156e963e049f3aee9f4b12a1a6d 100644
--- a/src/solvers/ode/tnlODESolverMonitor_impl.h
+++ b/src/solvers/ode/tnlODESolverMonitor_impl.h
@@ -41,7 +41,7 @@ void tnlODESolverMonitor< RealType, IndexType> :: refresh()
        /*double flops = ( double ) tnl_flops_counter. getFlops();
        if( flops )
        {
-         cout << " GFLOPS:  " << setw( 8 ) << 1.0e-9 * flops / rt_timer -> GetTime();
+         cout << " GFLOPS:  " << setw( 8 ) << 1.0e-9 * flops / rt_timer -> getTime();
        }*/
        cout << "   \r" << flush;
     }
diff --git a/src/solvers/pde/CMakeLists.txt b/src/solvers/pde/CMakeLists.txt
index 80a4500775ce37e32e6c62b2292b06e80bde0c55..6521acc8b569c435f86b6f1597275dceee6c9db1 100755
--- a/src/solvers/pde/CMakeLists.txt
+++ b/src/solvers/pde/CMakeLists.txt
@@ -7,6 +7,8 @@ SET( headers tnlPDESolver.h
              tnlSemiImplicitTimeStepper.h
              tnlSemiImplicitTimeStepper_impl.h
              tnlLinearSystemAssembler.h
-             tnlLinearSystemAssembler_impl.h  )
+             tnlLinearSystemAssembler_impl.h
+             tnlBackwardTimeDiscretisation.h
+             tnlNoTimeDiscretisation.h  )
              
 INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/solvers/pde )
\ No newline at end of file
diff --git a/src/solvers/pde/tnlBackwardTimeDiscretisation.h b/src/solvers/pde/tnlBackwardTimeDiscretisation.h
new file mode 100644
index 0000000000000000000000000000000000000000..17a247a9f0a84f60f7c8d74918c6f9dcc5612e56
--- /dev/null
+++ b/src/solvers/pde/tnlBackwardTimeDiscretisation.h
@@ -0,0 +1,44 @@
+/***************************************************************************
+                          tnlBackwardTimeDiscretisation.h  -  description
+                             -------------------
+    begin                : Apr 4, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+
+#ifndef TNLBACKWARDTIMEDISCRETISATION_H
+#define	TNLBACKWARDTIMEDISCRETISATION_H
+
+#include <core/tnlCuda.h>
+
+class tnlBackwardTimeDiscretisation
+{
+    public:        
+        
+        template< typename RealType,
+                  typename IndexType,
+                  typename MatrixType >
+        __cuda_callable__ static void applyTimeDiscretisation( MatrixType& matrix,
+                                                               RealType& b,
+                                                               const IndexType index,
+                                                               const RealType& u,
+                                                               const RealType& tau,
+                                                               const RealType& rhs )
+        {
+            b += u + tau * rhs;
+            matrix.addElementFast( index, index, 1.0, 1.0 );
+        };
+};
+
+#endif	/* TNLBACKWARDTIMEDISCRETISATION_H */
+
diff --git a/src/solvers/pde/tnlExplicitTimeStepper.h b/src/solvers/pde/tnlExplicitTimeStepper.h
index 1f4bf0af0fede03b042dfaf61d395c0301a18f26..9d3dc11a2754673c08fc008b6ebeabb0caa354fd 100644
--- a/src/solvers/pde/tnlExplicitTimeStepper.h
+++ b/src/solvers/pde/tnlExplicitTimeStepper.h
@@ -21,6 +21,8 @@
 #include <solvers/ode/tnlODESolverMonitor.h>
 #include <config/tnlConfigDescription.h>
 #include <config/tnlParameterContainer.h>
+#include <core/tnlTimerRT.h>
+#include <core/tnlLogger.h>
 
 
 template< typename Problem,
@@ -36,6 +38,7 @@ class tnlExplicitTimeStepper
    typedef typename Problem::IndexType IndexType;
    typedef typename Problem::MeshType MeshType;
    typedef typename ProblemType::DofVectorType DofVectorType;
+   typedef typename ProblemType::MeshDependentDataType MeshDependentDataType;
 
    tnlExplicitTimeStepper();
 
@@ -61,12 +64,14 @@ class tnlExplicitTimeStepper
                const RealType& stopTime,
                const MeshType& mesh,
                DofVectorType& dofVector,
-               DofVectorType& auxiliaryDofVector );
+               MeshDependentDataType& meshDependentData );
 
    void getExplicitRHS( const RealType& time,
                         const RealType& tau,
                         DofVectorType& _u,
                         DofVectorType& _fu );
+   
+   bool writeEpilog( tnlLogger& logger );
 
    protected:
 
@@ -78,7 +83,9 @@ class tnlExplicitTimeStepper
 
    RealType timeStep;
 
-   DofVectorType* auxiliaryDofs;
+   MeshDependentDataType* meshDependentData;
+   
+   tnlTimerRT explicitUpdaterTimer;
 };
 
 #include <solvers/pde/tnlExplicitTimeStepper_impl.h>
diff --git a/src/solvers/pde/tnlExplicitTimeStepper_impl.h b/src/solvers/pde/tnlExplicitTimeStepper_impl.h
index 7f8359d0a3fcd14cfe9a7f76ff42f56d3324917d..e47e0c562eeaa1f0040abc80e4763897330dc04c 100644
--- a/src/solvers/pde/tnlExplicitTimeStepper_impl.h
+++ b/src/solvers/pde/tnlExplicitTimeStepper_impl.h
@@ -18,6 +18,9 @@
 #ifndef TNLEXPLICITTIMESTEPPER_IMPL_H_
 #define TNLEXPLICITTIMESTEPPER_IMPL_H_
 
+#include "tnlExplicitTimeStepper.h"
+
+
 template< typename Problem,
           template < typename OdeProblem > class OdeSolver >
 tnlExplicitTimeStepper< Problem, OdeSolver >::
@@ -53,6 +56,7 @@ bool
 tnlExplicitTimeStepper< Problem, OdeSolver >::
 init( const MeshType& mesh )
 {
+   this->explicitUpdaterTimer.reset();
    return true;
 }
 
@@ -106,7 +110,7 @@ solve( const RealType& time,
        const RealType& stopTime,
        const MeshType& mesh,
        DofVectorType& dofVector,
-       DofVectorType& auxiliaryDofVector )
+       MeshDependentDataType& meshDependentData )
 {
    tnlAssert( this->odeSolver, );
    this->odeSolver->setTau( this -> timeStep );
@@ -116,33 +120,37 @@ solve( const RealType& time,
    if( this->odeSolver->getMinIterations() )
       this->odeSolver->setMaxTau( ( stopTime - time ) / ( typename OdeSolver< Problem >::RealType ) this->odeSolver->getMinIterations() );
    this->mesh = &mesh;
-   this->auxiliaryDofs = &auxiliaryDofVector;
+   this->meshDependentData = &meshDependentData;
    return this->odeSolver->solve( dofVector );
 }
 
 template< typename Problem,
           template < typename OdeProblem > class OdeSolver >
-void tnlExplicitTimeStepper< Problem, OdeSolver >::getExplicitRHS( const RealType& time,
-                                                                   const RealType& tau,
-                                                                   DofVectorType& u,
-                                                                   DofVectorType& fu )
+void
+tnlExplicitTimeStepper< Problem, OdeSolver >::
+getExplicitRHS( const RealType& time,
+                const RealType& tau,
+                DofVectorType& u,
+                DofVectorType& fu )
 {
    if( ! this->problem->preIterate( time,
                                     tau,
                                     *( this->mesh),
                                     u,
-                                    *( this->auxiliaryDofs ) ) )
+                                    *( this->meshDependentData ) ) )
    {
       cerr << endl << "Preiteration failed." << endl;
       return;
       //return false; // TODO: throw exception
    }
-   this->problem->getExplicitRHS( time, tau, *( this->mesh ), u, fu );
+   this->explicitUpdaterTimer.start();
+   this->problem->getExplicitRHS( time, tau, *( this->mesh ), u, fu, *( this->meshDependentData ) );
+   this->explicitUpdaterTimer.stop();
    if( ! this->problem->postIterate( time,
                                      tau,
                                      *( this->mesh ),
                                      u,
-                                     *( this->auxiliaryDofs ) ) )
+                                     *( this->meshDependentData ) ) )
    {
       cerr << endl << "Postiteration failed." << endl;
       return;
@@ -150,4 +158,14 @@ void tnlExplicitTimeStepper< Problem, OdeSolver >::getExplicitRHS( const RealTyp
    }
 }
 
+template< typename Problem,
+          template < typename OdeProblem > class OdeSolver >
+bool
+tnlExplicitTimeStepper< Problem, OdeSolver >::
+writeEpilog( tnlLogger& logger )
+{
+   logger.writeParameter< double >( "Explicit update computation time:", this->explicitUpdaterTimer.getTime() );
+   return true;
+}
+
 #endif /* TNLEXPLICITTIMESTEPPER_IMPL_H_ */
diff --git a/src/solvers/pde/tnlExplicitUpdater.h b/src/solvers/pde/tnlExplicitUpdater.h
index e77536157aa3b2b58fc53cc9b2fde430c61222d1..de64bd1563afba9109e1178bbb84dc1a0533da04 100644
--- a/src/solvers/pde/tnlExplicitUpdater.h
+++ b/src/solvers/pde/tnlExplicitUpdater.h
@@ -18,14 +18,14 @@
 #ifndef TNLEXPLICITUPDATER_H_
 #define TNLEXPLICITUPDATER_H_
 
-#include <functions/tnlFunctionAdapter.h>
+#include <functors/tnlFunctorAdapter.h>
 
 template< typename Real,
           typename DofVector,
           typename DifferentialOperator,
           typename BoundaryConditions,
           typename RightHandSide >
-class tnlExplicitUpdaterTraversalUserData
+class tnlExplicitUpdaterTraverserUserData
 {
    public:
 
@@ -39,7 +39,7 @@ class tnlExplicitUpdaterTraversalUserData
 
       DofVector *u, *fu;
 
-      tnlExplicitUpdaterTraversalUserData( const Real& time,
+      tnlExplicitUpdaterTraverserUserData( const Real& time,
                                            const DifferentialOperator& differentialOperator,
                                            const BoundaryConditions& boundaryConditions,
                                            const RightHandSide& rightHandSide,
@@ -67,11 +67,11 @@ class tnlExplicitUpdater
       typedef typename DofVector::RealType RealType;
       typedef typename DofVector::DeviceType DeviceType;
       typedef typename DofVector::IndexType IndexType;
-      typedef tnlExplicitUpdaterTraversalUserData< RealType,
+      typedef tnlExplicitUpdaterTraverserUserData< RealType,
                                                    DofVector,
                                                    DifferentialOperator,
                                                    BoundaryConditions,
-                                                   RightHandSide > TraversalUserData;
+                                                   RightHandSide > TraverserUserData;
 
       template< int EntityDimensions >
       void update( const RealType& time,
@@ -82,7 +82,7 @@ class tnlExplicitUpdater
                    DofVector& u,
                    DofVector& fu ) const;
 
-      class TraversalBoundaryEntitiesProcessor
+      class TraverserBoundaryEntitiesProcessor
       {
          public:
 
@@ -91,7 +91,7 @@ class tnlExplicitUpdater
             __host__ __device__
 #endif
             static void processEntity( const MeshType& mesh,
-                                       TraversalUserData& userData,
+                                       TraverserUserData& userData,
                                        const IndexType index )
             {
                userData.boundaryConditions->setBoundaryConditions( *userData.time,
@@ -103,7 +103,7 @@ class tnlExplicitUpdater
 
       };
 
-      class TraversalInteriorEntitiesProcessor
+      class TraverserInteriorEntitiesProcessor
       {
          public:
 
@@ -112,14 +112,14 @@ class tnlExplicitUpdater
             __host__ __device__
 #endif
             static void processEntity( const MeshType& mesh,
-                                       TraversalUserData& userData,
+                                       TraverserUserData& userData,
                                        const IndexType index )
             {
                (* userData.fu )[ index ] = userData.differentialOperator->getValue( mesh,
                                                                                     index,
                                                                                     *userData.u,
                                                                                     *userData.time );
-               typedef tnlFunctionAdapter< MeshType, RightHandSide > FunctionAdapter;
+               typedef tnlFunctorAdapter< MeshType, RightHandSide > FunctionAdapter;
                ( *userData.fu )[ index ] += FunctionAdapter::getValue( mesh,
                                                                        *userData.rightHandSide,
                                                                        index,
@@ -151,11 +151,11 @@ class tnlExplicitUpdater< tnlGrid< Dimensions, Real, Device, Index >,
       typedef typename MeshType::DeviceType DeviceType;
       typedef typename MeshType::IndexType IndexType;
       typedef typename MeshType::CoordinatesType CoordinatesType;
-      typedef tnlExplicitUpdaterTraversalUserData< RealType,
+      typedef tnlExplicitUpdaterTraverserUserData< RealType,
                                                    DofVector,
                                                    DifferentialOperator,
                                                    BoundaryConditions,
-                                                   RightHandSide > TraversalUserData;
+                                                   RightHandSide > TraverserUserData;
       
       template< int EntityDimensions >
       void update( const RealType& time,
@@ -166,7 +166,7 @@ class tnlExplicitUpdater< tnlGrid< Dimensions, Real, Device, Index >,
                    DofVector& u,
                    DofVector& fu ) const;
 
-      class TraversalBoundaryEntitiesProcessor
+      class TraverserBoundaryEntitiesProcessor
       {
          public:
 
@@ -178,7 +178,7 @@ class tnlExplicitUpdater< tnlGrid< Dimensions, Real, Device, Index >,
             __host__ __device__
 #endif
             static void processCell( const MeshType& mesh,
-                                     TraversalUserData& userData,
+                                     TraverserUserData& userData,
                                      const IndexType index,
                                      const CoordinatesType& coordinates )
             {
@@ -194,7 +194,7 @@ class tnlExplicitUpdater< tnlGrid< Dimensions, Real, Device, Index >,
             __host__ __device__
 #endif
             static void processFace( const MeshType& mesh,
-                                     TraversalUserData& userData,
+                                     TraverserUserData& userData,
                                      const IndexType index,
                                      const CoordinatesType& coordinates )
             {
@@ -209,7 +209,7 @@ class tnlExplicitUpdater< tnlGrid< Dimensions, Real, Device, Index >,
 
       };
 
-      class TraversalInteriorEntitiesProcessor
+      class TraverserInteriorEntitiesProcessor
       {
          public:
 
@@ -219,7 +219,7 @@ class tnlExplicitUpdater< tnlGrid< Dimensions, Real, Device, Index >,
             __host__ __device__
 #endif
             static void processCell( const MeshType& mesh,
-                                     TraversalUserData& userData,
+                                     TraverserUserData& userData,
                                      const IndexType index,
                                      const CoordinatesType& coordinates )
             {
@@ -229,7 +229,7 @@ class tnlExplicitUpdater< tnlGrid< Dimensions, Real, Device, Index >,
                                                                                    *userData.u,
                                                                                    *userData.time );
 
-               typedef tnlFunctionAdapter< MeshType, RightHandSide > FunctionAdapter;
+               typedef tnlFunctorAdapter< MeshType, RightHandSide > FunctionAdapter;
                ( * userData.fu )[ index ] += FunctionAdapter::getValue( mesh,
                                                                         *userData.rightHandSide,
                                                                         index,
@@ -241,7 +241,7 @@ class tnlExplicitUpdater< tnlGrid< Dimensions, Real, Device, Index >,
             __host__ __device__
 #endif
             static void processFace( const MeshType& mesh,
-                                     TraversalUserData& userData,
+                                     TraverserUserData& userData,
                                      const IndexType index,
                                      const CoordinatesType& coordinates )
             {
@@ -251,7 +251,7 @@ class tnlExplicitUpdater< tnlGrid< Dimensions, Real, Device, Index >,
                                                                                    *userData.u,
                                                                                    *userData.time );
 
-               typedef tnlFunctionAdapter< MeshType, RightHandSide > FunctionAdapter;
+               typedef tnlFunctorAdapter< MeshType, RightHandSide > FunctionAdapter;
                ( * userData.fu )[ index ] += FunctionAdapter::getValue( mesh,
                                                                         *userData.rightHandSide,
                                                                         index,
diff --git a/src/solvers/pde/tnlExplicitUpdater_impl.h b/src/solvers/pde/tnlExplicitUpdater_impl.h
index 172ea8700831ec1c7833d44331a87c6cdc27864b..35f80644ca7f3bec21172bd3ab12819410e8f432 100644
--- a/src/solvers/pde/tnlExplicitUpdater_impl.h
+++ b/src/solvers/pde/tnlExplicitUpdater_impl.h
@@ -40,14 +40,14 @@ update( const RealType& time,
 {
    if( DeviceType::DeviceType == tnlHostDevice )
    {
-      TraversalUserData userData( time, differentialOperator, boundaryConditions, rightHandSide, u, fu );
-      tnlTraverser< MeshType, EntityDimensions > meshTraversal;
-      meshTraversal.template processBoundaryEntities< TraversalUserData,
-                                                      TraversalBoundaryEntitiesProcessor >
+      TraverserUserData userData( time, differentialOperator, boundaryConditions, rightHandSide, u, fu );
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserBoundaryEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
-      meshTraversal.template processInteriorEntities< TraversalUserData,
-                                                      TraversalInteriorEntitiesProcessor >
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserInteriorEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
 
@@ -60,15 +60,15 @@ update( const RealType& time,
       RightHandSide* kernelRightHandSide = tnlCuda::passToDevice( rightHandSide );
       DofVector* kernelU = tnlCuda::passToDevice( u );
       DofVector* kernelFu = tnlCuda::passToDevice( fu );
-      TraversalUserData userData( *kernelTime, *kernelDifferentialOperator, *kernelBoundaryConditions, *kernelRightHandSide, *kernelU, *kernelFu );
+      TraverserUserData userData( *kernelTime, *kernelDifferentialOperator, *kernelBoundaryConditions, *kernelRightHandSide, *kernelU, *kernelFu );
       checkCudaDevice;
-      tnlTraverser< MeshType, EntityDimensions > meshTraversal;
-      meshTraversal.template processBoundaryEntities< TraversalUserData,
-                                                      TraversalBoundaryEntitiesProcessor >
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserBoundaryEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
-      meshTraversal.template processInteriorEntities< TraversalUserData,
-                                                      TraversalInteriorEntitiesProcessor >
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserInteriorEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
 
@@ -105,14 +105,14 @@ update( const RealType& time,
 
    if( ( tnlDeviceEnum ) DeviceType::DeviceType == tnlHostDevice )
    {
-      TraversalUserData userData( time, differentialOperator, boundaryConditions, rightHandSide, u, fu );
-      tnlTraverser< MeshType, EntityDimensions > meshTraversal;
-      meshTraversal.template processBoundaryEntities< TraversalUserData,
-                                                      TraversalBoundaryEntitiesProcessor >
+      TraverserUserData userData( time, differentialOperator, boundaryConditions, rightHandSide, u, fu );
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserBoundaryEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
-      meshTraversal.template processInteriorEntities< TraversalUserData,
-                                                      TraversalInteriorEntitiesProcessor >
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserInteriorEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
 
@@ -127,14 +127,14 @@ update( const RealType& time,
       DofVector* kernelU = tnlCuda::passToDevice( u );
       DofVector* kernelFu = tnlCuda::passToDevice( fu );
       checkCudaDevice;
-      TraversalUserData userData( *kernelTime, *kernelDifferentialOperator, *kernelBoundaryConditions, *kernelRightHandSide, *kernelU, *kernelFu );
-      tnlTraverser< MeshType, EntityDimensions > meshTraversal;
-      meshTraversal.template processBoundaryEntities< TraversalUserData,
-                                                      TraversalBoundaryEntitiesProcessor >
+      TraverserUserData userData( *kernelTime, *kernelDifferentialOperator, *kernelBoundaryConditions, *kernelRightHandSide, *kernelU, *kernelFu );
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserBoundaryEntitiesProcessor >
                                                    ( mesh,
                                                      userData );
-      meshTraversal.template processInteriorEntities< TraversalUserData,
-                                                      TraversalInteriorEntitiesProcessor >
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserInteriorEntitiesProcessor >
                                                    ( mesh,
                                                      userData );
 
diff --git a/src/solvers/pde/tnlLinearSystemAssembler.h b/src/solvers/pde/tnlLinearSystemAssembler.h
index f26d10aec384ad84b6d66e97636b568ea4908014..4ac6c99d3efa5d0abd9f07b44931360e983f37cd 100644
--- a/src/solvers/pde/tnlLinearSystemAssembler.h
+++ b/src/solvers/pde/tnlLinearSystemAssembler.h
@@ -18,7 +18,7 @@
 #ifndef TNLLINEARSYSTEMASSEMBLER_H_
 #define TNLLINEARSYSTEMASSEMBLER_H_
 
-#include <functions/tnlFunctionAdapter.h>
+#include <functors/tnlFunctorAdapter.h>
 
 template< typename Real,
           typename DofVector,
@@ -26,7 +26,7 @@ template< typename Real,
           typename BoundaryConditions,
           typename RightHandSide,
           typename Matrix >
-class tnlLinearSystemAssemblerTraversalUserData
+class tnlLinearSystemAssemblerTraverserUserData
 {
    public:
       typedef Matrix MatrixType;
@@ -48,7 +48,7 @@ class tnlLinearSystemAssemblerTraversalUserData
 
       const Real* timeDiscretisationCoefficient;
 
-      tnlLinearSystemAssemblerTraversalUserData( const Real& time,
+      tnlLinearSystemAssemblerTraverserUserData( const Real& time,
                                                  const Real& tau,
                                                  const Real& timeDiscretisationCoefficient,
                                                  const DifferentialOperator& differentialOperator,
@@ -78,6 +78,7 @@ template< typename Mesh,
           typename DifferentialOperator,
           typename BoundaryConditions,
           typename RightHandSide,
+          typename TimeDiscretisation,
           typename Matrix >
 class tnlLinearSystemAssembler
 {
@@ -87,12 +88,12 @@ class tnlLinearSystemAssembler
    typedef typename DofVector::DeviceType DeviceType;
    typedef typename DofVector::IndexType IndexType;
    typedef Matrix MatrixType;
-   typedef tnlLinearSystemAssemblerTraversalUserData< RealType,
+   typedef tnlLinearSystemAssemblerTraverserUserData< RealType,
                                                       DofVector,
                                                       DifferentialOperator,
                                                       BoundaryConditions,
                                                       RightHandSide,
-                                                      MatrixType > TraversalUserData;
+                                                      MatrixType > TraverserUserData;
 
    template< int EntityDimensions >
    void assembly( const RealType& time,
@@ -105,7 +106,7 @@ class tnlLinearSystemAssembler
                   MatrixType& matrix,
                   DofVector& b ) const;
 
-   class TraversalBoundaryEntitiesProcessor
+   class TraverserBoundaryEntitiesProcessor
    {
       public:
 
@@ -114,21 +115,21 @@ class tnlLinearSystemAssembler
          __host__ __device__
 #endif
          static void processEntity( const MeshType& mesh,
-                                    TraversalUserData& userData,
+                                    TraverserUserData& userData,
                                     const IndexType index )
          {
             typename MatrixType::MatrixRow matrixRow = userData.matrix->getRow( index );
-            userData.boundaryConditions->updateLinearSystem( *userData.time,
-                                                            mesh,
-                                                            index,
-                                                            *userData.u,
-                                                            *userData.b,
-                                                            matrixRow );
+            userData.boundaryConditions->updateLinearSystem( *userData.time + *userData.tau,
+                                                             mesh,
+                                                             index,
+                                                             *userData.u,
+                                                             *userData.b,
+                                                             matrixRow );
          }
 
    };
 
-   class TraversalInteriorEntitiesProcessor
+   class TraverserInteriorEntitiesProcessor
    {
       public:
 
@@ -137,15 +138,15 @@ class tnlLinearSystemAssembler
          __host__ __device__
 #endif
          static void processEntity( const MeshType& mesh,
-                                    TraversalUserData& userData,
+                                    TraverserUserData& userData,
                                     const IndexType index )
          {
-            typedef tnlFunctionAdapter< MeshType, RightHandSide > FunctionAdapter;
-            ( *userData.b )[ index ] = ( *userData.u )[ index ] +
+            typedef tnlFunctorAdapter< MeshType, RightHandSide > FunctionAdapter;
+            ( *userData.b )[ index ] = 0.0;/*( *userData.u )[ index ] +
                      ( *userData.tau ) * FunctionAdapter::getValue( mesh,
                                                                     *userData.rightHandSide,
                                                                     index,
-                                                                    *userData.time );
+                                                                    *userData.time );*/
 
             typename MatrixType::MatrixRow matrixRow = userData.matrix->getRow( index );
             userData.differentialOperator->updateLinearSystem( *userData.time,
@@ -155,7 +156,18 @@ class tnlLinearSystemAssembler
                                                                *userData.u,
                                                                *userData.b,
                                                                matrixRow );
-            userData.matrix->addElement( index, index, 1.0, 1.0 );
+            //userData.matrix->addElement( index, index, 1.0, 1.0 );
+            const RealType& rhs = FunctionAdapter::getValue( mesh,
+                                                             *userData.rightHandSide,
+                                                             index,
+                                                             *userData.time );
+            TimeDiscretisation::applyTimeDiscretisation( *userData.matrix,
+                                                         ( *userData.b )[ index ],
+                                                         index,
+                                                         ( *userData.u )[ index ],
+                                                         ( *userData.tau ),
+                                                         rhs );
+            
          }
    };
 };
@@ -168,12 +180,14 @@ template< int Dimensions,
           typename DifferentialOperator,
           typename BoundaryConditions,
           typename RightHandSide,
+          typename TimeDiscretisation,
           typename Matrix >
 class tnlLinearSystemAssembler< tnlGrid< Dimensions, Real, Device, Index >,
                                 DofVector,
                                 DifferentialOperator,
                                 BoundaryConditions,
                                 RightHandSide,
+                                TimeDiscretisation,
                                 Matrix >
 {
    public:
@@ -183,12 +197,12 @@ class tnlLinearSystemAssembler< tnlGrid< Dimensions, Real, Device, Index >,
    typedef typename DofVector::IndexType IndexType;
    typedef Matrix MatrixType;
    typedef typename MeshType::CoordinatesType CoordinatesType;
-   typedef tnlLinearSystemAssemblerTraversalUserData< RealType,
+   typedef tnlLinearSystemAssemblerTraverserUserData< RealType,
                                                       DofVector,
                                                       DifferentialOperator,
                                                       BoundaryConditions,
                                                       RightHandSide,
-                                                      MatrixType > TraversalUserData;
+                                                      MatrixType > TraverserUserData;
 
    tnlLinearSystemAssembler()
    : timeDiscretisationCoefficient( 1.0 ){}
@@ -212,7 +226,7 @@ class tnlLinearSystemAssembler< tnlGrid< Dimensions, Real, Device, Index >,
       this->timeDiscretisationCoefficient = c;
    }
 
-   class TraversalBoundaryEntitiesProcessor
+   class TraverserBoundaryEntitiesProcessor
    {
       public:
 
@@ -220,13 +234,14 @@ class tnlLinearSystemAssembler< tnlGrid< Dimensions, Real, Device, Index >,
          __host__ __device__
 #endif
          static void processCell( const MeshType& mesh,
-                                  TraversalUserData& userData,
+                                  TraverserUserData& userData,
                                   const IndexType index,
                                   const CoordinatesType& coordinates )
          {
             //printf( "index = %d \n", index );
+             ( *userData.b )[ index ] = 0.0;
             typename MatrixType::MatrixRow matrixRow = userData.matrix->getRow( index );
-            userData.boundaryConditions->updateLinearSystem( *userData.time,
+            userData.boundaryConditions->updateLinearSystem( *userData.time + *userData.tau,
                                                              mesh,
                                                              index,
                                                              coordinates,
@@ -239,11 +254,13 @@ class tnlLinearSystemAssembler< tnlGrid< Dimensions, Real, Device, Index >,
          __host__ __device__
 #endif
          static void processFace( const MeshType& mesh,
-                                  TraversalUserData& userData,
+                                  TraverserUserData& userData,
                                   const IndexType index,
                                   const CoordinatesType& coordinates )
          {
             //printf( "index = %d \n", index );
+            // printf("Matrix assembler: Index = %d \n", index );
+            ( *userData.b )[ index ] = 0.0;
             typename MatrixType::MatrixRow matrixRow = userData.matrix->getRow( index );
-            userData.boundaryConditions->updateLinearSystem( *userData.time,
+            userData.boundaryConditions->updateLinearSystem( *userData.time + *userData.tau,
                                                              mesh,
@@ -252,12 +269,13 @@ class tnlLinearSystemAssembler< tnlGrid< Dimensions, Real, Device, Index >,
                                                              *userData.u,
                                                              *userData.b,
                                                              matrixRow );
+            //printf( "BC: index = %d, b = %f \n", index, ( *userData.b )[ index ] );
          }
 
 
    };
 
-   class TraversalInteriorEntitiesProcessor
+   class TraverserInteriorEntitiesProcessor
    {
       public:
 
@@ -270,18 +288,18 @@ class tnlLinearSystemAssembler< tnlGrid< Dimensions, Real, Device, Index >,
          __host__ __device__
 #endif
          static void processCell( const MeshType& mesh,
-                                  TraversalUserData& userData,
+                                  TraverserUserData& userData,
                                   const IndexType index,
                                   const CoordinatesType& coordinates )
          {
             //printf( "index = %d \n", index );
-            typedef tnlFunctionAdapter< MeshType, RightHandSide > FunctionAdapter;
-            ( *userData.b )[ index ] = ( *userData.timeDiscretisationCoefficient) * ( *userData.u )[ index ] +
+            typedef tnlFunctorAdapter< MeshType, RightHandSide > FunctionAdapter;
+            ( *userData.b )[ index ] = 0.0; /*( *userData.timeDiscretisationCoefficient) * ( *userData.u )[ index ] +
                                   ( *userData.tau ) * FunctionAdapter::getValue( mesh,
                                                              *userData.rightHandSide,
                                                              index,
                                                              coordinates,
-                                                             *userData.time );
+                                                             *userData.time );*/
             
             typename MatrixType::MatrixRow matrixRow = userData.matrix->getRow( index );
             userData.differentialOperator->updateLinearSystem( *userData.time,
@@ -292,29 +310,43 @@ class tnlLinearSystemAssembler< tnlGrid< Dimensions, Real, Device, Index >,
                                                                *userData.u,
                                                                *userData.b,
                                                                matrixRow );
-            if( *userData.timeDiscretisationCoefficient != 0.0 )
+            /*if( *userData.timeDiscretisationCoefficient != 0.0 )
                userData.matrix->addElementFast( index,
                                                 index,
                                                 *userData.timeDiscretisationCoefficient,
-                                                1.0 );
+                                                1.0 );*/
+            
+            const RealType& rhs = FunctionAdapter::getValue( mesh,
+                                                             *userData.rightHandSide,
+                                                             index,
+                                                             coordinates,
+                                                             *userData.time );
+            TimeDiscretisation::applyTimeDiscretisation( *userData.matrix,
+                                                         ( *userData.b )[ index ],
+                                                         index,
+                                                         ( *userData.u )[ index ],
+                                                         ( *userData.tau ),
+                                                         rhs );
+            //printf( "IC: index = %d, b = %f \n", index, ( *userData.b )[ index ] );
          }
 
 #ifdef HAVE_CUDA
          __host__ __device__
 #endif
          static void processFace( const MeshType& mesh,
-                                  TraversalUserData& userData,
+                                  TraverserUserData& userData,
                                   const IndexType index,
                                   const CoordinatesType& coordinates )
          {
             //printf( "index = %d \n", index );
-            typedef tnlFunctionAdapter< MeshType, RightHandSide > FunctionAdapter;
-            ( *userData.b )[ index ] = ( *userData.timeDiscretisationCoefficient) * ( *userData.u )[ index ] +
+            // printf("Matrix assembler: Index = %d \n", index );
+            typedef tnlFunctorAdapter< MeshType, RightHandSide > FunctionAdapter;
+            ( *userData.b )[ index ] = 0.0; /*( *userData.timeDiscretisationCoefficient) * ( *userData.u )[ index ] +
                                   ( *userData.tau ) * FunctionAdapter::getValue( mesh,
                                                              *userData.rightHandSide,
                                                              index,
                                                              coordinates,
-                                                             *userData.time );
+                                                             *userData.time );*/
 
             typename MatrixType::MatrixRow matrixRow = userData.matrix->getRow( index );
             userData.differentialOperator->updateLinearSystem( *userData.time,
@@ -325,11 +357,23 @@ class tnlLinearSystemAssembler< tnlGrid< Dimensions, Real, Device, Index >,
                                                                *userData.u,
                                                                *userData.b,
                                                                matrixRow );
-            if( *userData.timeDiscretisationCoefficient != 0.0 )
+            /*if( *userData.timeDiscretisationCoefficient != 0.0 )
                userData.matrix->addElementFast( index,
                                                 index,
                                                 *userData.timeDiscretisationCoefficient,
-                                                1.0 );
+                                                1.0 );*/
+            
+            const RealType& rhs = FunctionAdapter::getValue( mesh,
+                                                             *userData.rightHandSide,
+                                                             index,
+                                                             coordinates,
+                                                             *userData.time );
+            TimeDiscretisation::applyTimeDiscretisation( *userData.matrix,
+                                                         ( *userData.b )[ index ],
+                                                         index,
+                                                         ( *userData.u )[ index ],
+                                                         ( *userData.tau ),
+                                                         rhs );
 
          }
    };
diff --git a/src/solvers/pde/tnlLinearSystemAssembler_impl.h b/src/solvers/pde/tnlLinearSystemAssembler_impl.h
index 26ca2dd13c3ebe5e3ad8f34f7c56eb2499a7b1aa..a884e636311136b1b0f56b0bdf1c3ffa492c2862 100644
--- a/src/solvers/pde/tnlLinearSystemAssembler_impl.h
+++ b/src/solvers/pde/tnlLinearSystemAssembler_impl.h
@@ -27,10 +27,11 @@ template< typename Mesh,
           typename DifferentialOperator,
           typename BoundaryConditions,
           typename RightHandSide,
+          typename TimeDiscretisation,
           typename Matrix >
    template< int EntityDimensions >
 void
-tnlLinearSystemAssembler< Mesh, DofVector, DifferentialOperator, BoundaryConditions, RightHandSide, Matrix >::
+tnlLinearSystemAssembler< Mesh, DofVector, DifferentialOperator, BoundaryConditions, RightHandSide, TimeDiscretisation, Matrix >::
 assembly( const RealType& time,
           const RealType& tau,
           const Mesh& mesh,
@@ -43,21 +44,21 @@ assembly( const RealType& time,
 {
    const IndexType maxRowLength = matrix.getMaxRowLength();
    tnlAssert( maxRowLength > 0, );
-   typedef typename TraversalUserData::RowValuesType RowValuesType;
-   typedef typename TraversalUserData::RowColumnsType RowColumnsType;
+   typedef typename TraverserUserData::RowValuesType RowValuesType;
+   typedef typename TraverserUserData::RowColumnsType RowColumnsType;
    RowValuesType values;
    RowColumnsType columns;
 
    if( DeviceType::DeviceType == tnlHostDevice )
    {
-      TraversalUserData userData( time, tau, differentialOperator, boundaryConditions, rightHandSide, u, matrix, b );
-      tnlTraverser< MeshType, EntityDimensions > meshTraversal;
-      meshTraversal.template processBoundaryEntities< TraversalUserData,
-                                                      TraversalBoundaryEntitiesProcessor >
+      TraverserUserData userData( time, tau, differentialOperator, boundaryConditions, rightHandSide, u, matrix, b );
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserBoundaryEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
-      meshTraversal.template processInteriorEntities< TraversalUserData,
-                                                      TraversalInteriorEntitiesProcessor >
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserInteriorEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
    }
@@ -71,15 +72,15 @@ assembly( const RealType& time,
       DofVector* kernelU = tnlCuda::passToDevice( u );
       DofVector* kernelB = tnlCuda::passToDevice( b );
       MatrixType* kernelMatrix = tnlCuda::passToDevice( matrix );
-      TraversalUserData userData( *kernelTime, *kernelTau, *kernelDifferentialOperator, *kernelBoundaryConditions, *kernelRightHandSide, *kernelU, *kernelMatrix, *kernelB );
+      TraverserUserData userData( *kernelTime, *kernelTau, *kernelDifferentialOperator, *kernelBoundaryConditions, *kernelRightHandSide, *kernelU, *kernelMatrix, *kernelB );
       checkCudaDevice;
-      tnlTraverser< MeshType, EntityDimensions > meshTraversal;
-      meshTraversal.template processBoundaryEntities< TraversalUserData,
-                                                      TraversalBoundaryEntitiesProcessor >
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserBoundaryEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
-      meshTraversal.template processInteriorEntities< TraversalUserData,
-                                                      TraversalInteriorEntitiesProcessor >
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserInteriorEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
 
@@ -104,10 +105,11 @@ template< int Dimensions,
           typename DifferentialOperator,
           typename BoundaryConditions,
           typename RightHandSide,
+          typename TimeDiscretisation,
           typename Matrix >
    template< int EntityDimensions >
 void
-tnlLinearSystemAssembler< tnlGrid< Dimensions, Real, Device, Index >, DofVector, DifferentialOperator, BoundaryConditions, RightHandSide, Matrix >::
+tnlLinearSystemAssembler< tnlGrid< Dimensions, Real, Device, Index >, DofVector, DifferentialOperator, BoundaryConditions, RightHandSide, TimeDiscretisation, Matrix >::
 assembly( const RealType& time,
           const RealType& tau,
           const tnlGrid< Dimensions, Real, Device, Index >& mesh,
@@ -123,7 +125,7 @@ assembly( const RealType& time,
 
    if( ( tnlDeviceEnum ) DeviceType::DeviceType == tnlHostDevice )
    {
-      TraversalUserData userData( time,
+      TraverserUserData userData( time,
                                   tau,
                                   this->timeDiscretisationCoefficient,
                                   differentialOperator,
@@ -132,13 +134,13 @@ assembly( const RealType& time,
                                   u,
                                   matrix,
                                   b );
-      tnlTraverser< MeshType, EntityDimensions > meshTraversal;
-      meshTraversal.template processBoundaryEntities< TraversalUserData,
-                                                      TraversalBoundaryEntitiesProcessor >
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserBoundaryEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
-      meshTraversal.template processInteriorEntities< TraversalUserData,
-                                                      TraversalInteriorEntitiesProcessor >
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserInteriorEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
    }
@@ -154,7 +156,7 @@ assembly( const RealType& time,
       DofVector* kernelU = tnlCuda::passToDevice( u );
       DofVector* kernelB = tnlCuda::passToDevice( b );
       MatrixType* kernelMatrix = tnlCuda::passToDevice( matrix );
-      TraversalUserData userData( *kernelTime,
+      TraverserUserData userData( *kernelTime,
                                   *kernelTau,
                                   *kernelTimeDiscretisationCoefficient,
                                   *kernelDifferentialOperator,
@@ -164,13 +166,13 @@ assembly( const RealType& time,
                                   *kernelMatrix,
                                   *kernelB );
       checkCudaDevice;
-      tnlTraverser< MeshType, EntityDimensions > meshTraversal;
-      meshTraversal.template processBoundaryEntities< TraversalUserData,
-                                                      TraversalBoundaryEntitiesProcessor >
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserBoundaryEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
-      meshTraversal.template processInteriorEntities< TraversalUserData,
-                                                      TraversalInteriorEntitiesProcessor >
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserInteriorEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
 
diff --git a/src/solvers/pde/tnlNoTimeDiscretisation.h b/src/solvers/pde/tnlNoTimeDiscretisation.h
new file mode 100644
index 0000000000000000000000000000000000000000..27965ea03706c74da4cd1512ebfc1f732032bca3
--- /dev/null
+++ b/src/solvers/pde/tnlNoTimeDiscretisation.h
@@ -0,0 +1,42 @@
+/***************************************************************************
+                          tnlNoTimeDiscretisation.h  -  description
+                             -------------------
+    begin                : Apr 4, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#ifndef TNLNOTIMEDISCRETISATION_H
+#define	TNLNOTIMEDISCRETISATION_H
+
+#include <core/tnlCuda.h>
+
+class tnlNoTimeDiscretisation
+{
+    public:
+        
+        template< typename RealType,
+                  typename IndexType,
+                  typename MatrixType >
+        __cuda_callable__ static void applyTimeDiscretisation( MatrixType& matrix,
+                                                               RealType& b,
+                                                               const IndexType index,
+                                                               const RealType& u,
+                                                               const RealType& tau,
+                                                               const RealType& rhs )
+        {
+            b += rhs;
+        };
+};
+
+#endif	/* TNLNOTIMEDISCRETISATION_H */
+
diff --git a/src/solvers/pde/tnlPDESolver.h b/src/solvers/pde/tnlPDESolver.h
index d72f56fb726446e66a549143caa13fc48b12962f..87c1f18e5b36a0814455703c9d689a54004f043b 100644
--- a/src/solvers/pde/tnlPDESolver.h
+++ b/src/solvers/pde/tnlPDESolver.h
@@ -22,6 +22,7 @@
 #include <config/tnlConfigDescription.h>
 #include <config/tnlParameterContainer.h>
 #include <solvers/tnlSolverMonitor.h>
+#include <core/tnlLogger.h>
 
 template< typename Problem,
           typename TimeStepper >
@@ -29,71 +30,78 @@ class tnlPDESolver : public tnlObject
 {
    public:
 
-   typedef typename TimeStepper::RealType RealType;
-   typedef typename TimeStepper::DeviceType DeviceType;
-   typedef typename TimeStepper::IndexType IndexType;
-   typedef Problem ProblemType;
-   typedef typename ProblemType::MeshType MeshType;
-   typedef typename ProblemType::DofVectorType DofVectorType;
-   
-   tnlPDESolver();
+      typedef typename TimeStepper::RealType RealType;
+      typedef typename TimeStepper::DeviceType DeviceType;
+      typedef typename TimeStepper::IndexType IndexType;
+      typedef Problem ProblemType;
+      typedef typename ProblemType::MeshType MeshType;
+      typedef typename ProblemType::DofVectorType DofVectorType;
+      typedef typename ProblemType::MeshDependentDataType MeshDependentDataType;
 
-   static void configSetup( tnlConfigDescription& config,
-                            const tnlString& prefix = "" );
+      tnlPDESolver();
 
-   bool setup( const tnlParameterContainer& parameters,
-              const tnlString& prefix = "" );
+      static void configSetup( tnlConfigDescription& config,
+                               const tnlString& prefix = "" );
 
-   bool writeProlog( tnlLogger& logger,
-                     const tnlParameterContainer& parameters );
+      bool setup( const tnlParameterContainer& parameters,
+                 const tnlString& prefix = "" );
 
-   void setTimeStepper( TimeStepper& timeStepper );
+      bool writeProlog( tnlLogger& logger,
+                        const tnlParameterContainer& parameters );
 
-   void setProblem( ProblemType& problem );
+      void setTimeStepper( TimeStepper& timeStepper );
 
-   bool setFinalTime( const RealType& finalT );
+      void setProblem( ProblemType& problem );
 
-   const RealType& getFinalTine() const;
+      void setInitialTime( const RealType& initialT );
 
-   bool setTimeStep( const RealType& timeStep );
+      const RealType& getInitialTime() const;
 
-   const RealType& getTimeStep() const;
+      bool setFinalTime( const RealType& finalT );
 
-   bool setTimeStepOrder( const RealType& timeStepOrder );
+      const RealType& getFinalTime() const;
 
-   const RealType& getTimeStepOrder() const;
+      bool setTimeStep( const RealType& timeStep );
 
-   bool setSnapshotPeriod( const RealType& period );
-   
-   const RealType& getSnapshotPeriod() const;
+      const RealType& getTimeStep() const;
 
-   void setIoRtTimer( tnlTimerRT& ioRtTimer);
+      bool setTimeStepOrder( const RealType& timeStepOrder );
 
-   void setComputeRtTimer( tnlTimerRT& computeRtTimer );
+      const RealType& getTimeStepOrder() const;
 
-   void setIoCpuTimer( tnlTimerCPU& ioCpuTimer );
+      bool setSnapshotPeriod( const RealType& period );
 
-   void setComputeCpuTimer( tnlTimerCPU& computeCpuTimer );
+      const RealType& getSnapshotPeriod() const;
 
-   bool solve();
+      void setIoRtTimer( tnlTimerRT& ioRtTimer);
+
+      void setComputeRtTimer( tnlTimerRT& computeRtTimer );
+
+      void setIoCpuTimer( tnlTimerCPU& ioCpuTimer );
+
+      void setComputeCpuTimer( tnlTimerCPU& computeCpuTimer );
+
+      bool solve();
+
+      bool writeEpilog( tnlLogger& logger ) const;
 
    protected:
 
-   MeshType mesh;
+      MeshType mesh;
 
-   DofVectorType dofs;
+      DofVectorType dofs;
 
-   DofVectorType auxiliaryDofs;
+      MeshDependentDataType meshDependentData;
 
-   TimeStepper* timeStepper;
+      TimeStepper* timeStepper;
 
-   RealType finalTime, snapshotPeriod, timeStep, timeStepOrder;
+      RealType initialTime, finalTime, snapshotPeriod, timeStep, timeStepOrder;
 
-   ProblemType* problem;
+      ProblemType* problem;
 
-   tnlTimerRT *ioRtTimer, *computeRtTimer;
+      tnlTimerRT *ioRtTimer, *computeRtTimer;
 
-   tnlTimerCPU *ioCpuTimer, *computeCpuTimer;
+      tnlTimerCPU *ioCpuTimer, *computeCpuTimer;
 
 };
 
diff --git a/src/solvers/pde/tnlPDESolver_impl.h b/src/solvers/pde/tnlPDESolver_impl.h
index d8d1f3c1f9c408dfa9ac8609647abe78c4fe6754..3b5282e7410d5813f540180bf1965e35babf6ab0 100644
--- a/src/solvers/pde/tnlPDESolver_impl.h
+++ b/src/solvers/pde/tnlPDESolver_impl.h
@@ -23,6 +23,7 @@ template< typename Problem,
 tnlPDESolver< Problem, TimeStepper >::
 tnlPDESolver()
 : timeStepper( 0 ),
+  initialTime( 0.0 ),
   finalTime( 0.0 ),
   snapshotPeriod( 0.0 ),
   timeStep( 1.0 ),
@@ -34,7 +35,7 @@ tnlPDESolver()
   computeCpuTimer( 0 )
 {
    this->dofs.setName( "dofs" );
-   this->auxiliaryDofs.setName( "auxiliaryDofs" );
+   this->meshDependentData.setName( "meshDependentData" );
 }
 
 template< typename Problem,
@@ -46,6 +47,7 @@ configSetup( tnlConfigDescription& config,
 {
    config.addEntry< tnlString >( prefix + "initial-condition", "File name with the initial condition.", "init.tnl" );
    config.addRequiredEntry< double >( prefix + "final-time", "Stop time of the time dependent problem." );
+   config.addEntry< double >( prefix + "initial-time", "Initial time of the time dependent problem.", 0 );
    config.addRequiredEntry< double >( prefix + "snapshot-period", "Time period for writing the problem status.");
    config.addEntry< double >( "time-step", "The time step for the time discretisation.", 1.0 );
    config.addEntry< double >( "time-step-order", "The time step is set to time-step*pow( space-step, time-step-order).", 0.0 );
@@ -77,8 +79,7 @@ setup( const tnlParameterContainer& parameters,
     */
    tnlAssert( problem->getDofs( this->mesh ) != 0, );
    cout << "Allocating dofs ... ";
-   if( ! this->dofs.setSize( problem->getDofs( this->mesh ) ) ||
-       ! this->auxiliaryDofs.setSize( problem->getAuxiliaryDofs( this->mesh ) ) )
+   if( ! this->dofs.setSize( problem->getDofs( this->mesh ) ) )
    {
       cerr << endl;
       cerr << "I am not able to allocate DOFs (degrees of freedom)." << endl;
@@ -86,17 +87,20 @@ setup( const tnlParameterContainer& parameters,
    }
    cout << " [ OK ]" << endl;
    this->dofs.setValue( 0.0 );
-   if( this->auxiliaryDofs.getSize() != 0 )
-      this->auxiliaryDofs.setValue( 0.0 );
-   this->problem->bindDofs( mesh, this->dofs );
-   this->problem->bindAuxiliaryDofs( mesh, this->auxiliaryDofs );
+   this->problem->bindDofs( this->mesh, this->dofs );
+   
+   /****
+    * Set mesh dependent data
+    */
+   this->problem->setMeshDependentData( this->mesh, this->meshDependentData );
+   this->problem->bindMeshDependentData( this->mesh, this->meshDependentData );
    
    /***
     * Set-up the initial condition
     */
    cout << "Setting up the initial condition ... ";
    typedef typename Problem :: DofVectorType DofVectorType;
-   if( ! this->problem->setInitialCondition( parameters, mesh, this->dofs, this->auxiliaryDofs ) )
+   if( ! this->problem->setInitialCondition( parameters, this->mesh, this->dofs, this->meshDependentData ) )
       return false;
    cout << " [ OK ]" << endl;
 
@@ -104,6 +108,7 @@ setup( const tnlParameterContainer& parameters,
     * Initialize the time discretisation
     */
    this->setFinalTime( parameters.getParameter< double >( "final-time" ) );
+   this->setInitialTime( parameters.getParameter< double >( "initial-time" ) );
    this->setSnapshotPeriod( parameters.getParameter< double >( "snapshot-period" ) );
    this->setTimeStep( parameters.getParameter< double >( "time-step") );
    this->setTimeStepOrder( parameters.getParameter< double >( "time-step-order" ) );
@@ -124,6 +129,7 @@ writeProlog( tnlLogger& logger,
    logger.writeSeparator();
    logger.writeParameter< tnlString >( "Time discretisation:", "time-discretisation", parameters );
    logger.writeParameter< double >( "Initial time step:", "time-step", parameters );
+   logger.writeParameter< double >( "Initial time:", "initial-time", parameters );
    logger.writeParameter< double >( "Final time:", "final-time", parameters );
    logger.writeParameter< double >( "Snapshot period:", "snapshot-period", parameters );
    const tnlString& solverName = parameters. getParameter< tnlString >( "discrete-solver" );
@@ -163,15 +169,34 @@ setProblem( ProblemType& problem )
    this->problem = &problem;
 }
 
+template< typename Problem,
+          typename TimeStepper >
+void
+tnlPDESolver< Problem, TimeStepper >::
+setInitialTime( const RealType& initialTime )
+{
+   this->initialTime = initialTime;
+}
+
+template< typename Problem,
+          typename TimeStepper >
+const typename TimeStepper :: RealType&
+tnlPDESolver< Problem, TimeStepper >::
+getInitialTime() const
+{
+   return this->initialTime;
+}
+
+
 template< typename Problem,
           typename TimeStepper >
 bool
 tnlPDESolver< Problem, TimeStepper >::
 setFinalTime( const RealType& finalTime )
 {
-   if( finalTime <= 0 )
+   if( finalTime <= this->initialTime )
    {
-      cerr << "Final time for tnlPDESolver must be positive value." << endl;
+      cerr << "Final time for tnlPDESolver must be larger than the initial time, which is now " << this->initialTime << "." << endl;
       return false;
    }
    this->finalTime = finalTime;
@@ -182,7 +207,7 @@ template< typename Problem,
           typename TimeStepper >
 const typename TimeStepper :: RealType&
 tnlPDESolver< Problem, TimeStepper >::
-getFinalTine() const
+getFinalTime() const
 {
    return this->finalTime;
 }
@@ -287,7 +312,9 @@ void tnlPDESolver< Problem, TimeStepper > :: setComputeCpuTimer( tnlTimerCPU& co
 }
 
 template< typename Problem, typename TimeStepper >
-bool tnlPDESolver< Problem, TimeStepper > :: solve()
+bool
+tnlPDESolver< Problem, TimeStepper >::
+solve()
 {
    tnlAssert( timeStepper != 0,
               cerr << "No time stepper was set in tnlPDESolver with name " << this -> getName() );
@@ -299,47 +326,56 @@ bool tnlPDESolver< Problem, TimeStepper > :: solve()
       cerr << "No snapshot tau was set in tnlPDESolver " << this -> getName() << "." << endl;
       return false;
    }
-   RealType t( 0.0 );
+   RealType t( this->initialTime );
    IndexType step( 0 );
-   IndexType allSteps = ceil( this->finalTime / this->snapshotPeriod );
-   this->timeStepper->setProblem( * ( this->problem ) );
-   this->timeStepper->init( mesh );
-   this->problem->bindDofs( mesh, this->dofs );
-   this->problem->bindAuxiliaryDofs( mesh, this->auxiliaryDofs );
+   IndexType allSteps = ceil( ( this->finalTime - this->initialTime ) / this->snapshotPeriod );
 
-   if( ! this->problem->makeSnapshot( t, step, mesh, this->dofs, this->auxiliaryDofs ) )
+   if( ! this->problem->makeSnapshot( t, step, mesh, this->dofs, this->meshDependentData ) )
    {
       cerr << "Making the snapshot failed." << endl;
       return false;
    }
-   timeStepper->setTimeStep( this->timeStep * pow( mesh.getSmallestSpaceStep(), this->timeStepOrder ) );
+
+   /****
+    * Initialize the time stepper
+    */
+   this->timeStepper->setProblem( * ( this->problem ) );
+   this->timeStepper->init( this->mesh );
+   this->timeStepper->setTimeStep( this->timeStep * pow( mesh.getSmallestSpaceStep(), this->timeStepOrder ) );
    while( step < allSteps )
    {
       RealType tau = Min( this -> snapshotPeriod,
                           this -> finalTime - t );
-      if( ! this->timeStepper->solve( t, t + tau, mesh, this->dofs, this->auxiliaryDofs ) )
+      if( ! this->timeStepper->solve( t, t + tau, mesh, this->dofs, this->meshDependentData ) )
          return false;
       step ++;
       t += tau;
 
-      this->ioRtTimer->Continue();
-      this->ioCpuTimer->Continue();
-      this->computeRtTimer->Stop();
-      this->computeCpuTimer->Stop();
+      this->ioRtTimer->start();
+      this->ioCpuTimer->start();
+      this->computeRtTimer->stop();
+      this->computeCpuTimer->stop();
 
-      if( ! this->problem->makeSnapshot( t, step, mesh, this->dofs, this->auxiliaryDofs ) )
+      if( ! this->problem->makeSnapshot( t, step, mesh, this->dofs, this->meshDependentData ) )
       {
          cerr << "Making the snapshot failed." << endl;
          return false;
       }
 
-      this-> ioRtTimer->Stop();
-      this-> ioCpuTimer->Stop();
-      this-> computeRtTimer->Continue();
-      this-> computeCpuTimer->Continue();
-
+      this-> ioRtTimer->stop();
+      this-> ioCpuTimer->stop();
+      this-> computeRtTimer->start();
+      this-> computeCpuTimer->start();
    }
    return true;
 }
 
+template< typename Problem, typename TimeStepper >
+bool
+tnlPDESolver< Problem, TimeStepper >::
+writeEpilog( tnlLogger& logger ) const
+{
+   return this->timeStepper->writeEpilog( logger );
+}
+
 #endif /* TNLPDESOLVER_IMPL_H_ */
diff --git a/src/solvers/pde/tnlSemiImplicitTimeStepper.h b/src/solvers/pde/tnlSemiImplicitTimeStepper.h
index 1e2a71477f83b5d736bb8d3337e2f9ac7dac0316..ab23a123dd6639794142e0dad15bc25206a41bb8 100644
--- a/src/solvers/pde/tnlSemiImplicitTimeStepper.h
+++ b/src/solvers/pde/tnlSemiImplicitTimeStepper.h
@@ -18,6 +18,9 @@
 #ifndef TNLSEMIIMPLICITTIMESTEPPER_H_
 #define TNLSEMIIMPLICITTIMESTEPPER_H_
 
+#include <core/tnlTimerRT.h>
+#include <core/tnlLogger.h>
+
 template< typename Problem,
           typename LinearSystemSolver >
 class tnlSemiImplicitTimeStepper
@@ -30,6 +33,7 @@ class tnlSemiImplicitTimeStepper
    typedef typename Problem::IndexType IndexType;
    typedef typename Problem::MeshType MeshType;
    typedef typename ProblemType::DofVectorType DofVectorType;
+   typedef typename ProblemType::MeshDependentDataType MeshDependentDataType;
    typedef LinearSystemSolver LinearSystemSolverType;
    typedef typename ProblemType::MatrixType MatrixType;
 
@@ -59,7 +63,9 @@ class tnlSemiImplicitTimeStepper
                const RealType& stopTime,
                const MeshType& mesh,
                DofVectorType& dofVector,
-               DofVectorType& auxiliaryDofVector );
+               MeshDependentDataType& meshDependentData );
+   
+   bool writeEpilog( tnlLogger& logger );
 
    protected:
 
@@ -73,6 +79,8 @@ class tnlSemiImplicitTimeStepper
 
    RealType timeStep;
 
+   tnlTimerRT linearSystemAssemblerTimer, linearSystemSolverTimer;
+   
    bool verbose;
 };
 
diff --git a/src/solvers/pde/tnlSemiImplicitTimeStepper_impl.h b/src/solvers/pde/tnlSemiImplicitTimeStepper_impl.h
index 7aafb69904916a5d3011d72f2fb9999482b55636..26559d13d66015a2e0832de8070d8a821444b3dd 100644
--- a/src/solvers/pde/tnlSemiImplicitTimeStepper_impl.h
+++ b/src/solvers/pde/tnlSemiImplicitTimeStepper_impl.h
@@ -73,6 +73,8 @@ init( const MeshType& mesh )
    }
    if( ! this->rightHandSide.setSize( this->matrix.getRows() ) )
       return false;
+   this->linearSystemAssemblerTimer.reset();
+   this->linearSystemSolverTimer.reset();
    return true;
 }
 
@@ -134,7 +136,7 @@ solve( const RealType& time,
        const RealType& stopTime,
        const MeshType& mesh,
        DofVectorType& dofVector,
-       DofVectorType& auxiliaryDofVector )
+       MeshDependentDataType& meshDependentData )
 {
    tnlAssert( this->problem != 0, );
    RealType t = time;
@@ -147,34 +149,38 @@ solve( const RealType& time,
                                        currentTau,
                                        mesh,
                                        dofVector,
-                                       auxiliaryDofVector ) )
+                                       meshDependentData ) )
       {
          cerr << endl << "Preiteration failed." << endl;
          return false;
       }
       if( verbose )
          cout << "                                                                  Assembling the linear system ... \r" << flush;
+      this->linearSystemAssemblerTimer.start();
       this->problem->assemblyLinearSystem( t,
                                            currentTau,
                                            mesh,
                                            dofVector,
-                                           auxiliaryDofVector,
+                                           meshDependentData,
                                            this->matrix,
                                            this->rightHandSide );
+      this->linearSystemAssemblerTimer.stop();
       if( verbose )
          cout << "                                                                  Solving the linear system for time " << t << "             \r" << flush;
+      this->linearSystemSolverTimer.start();
       if( ! this->linearSystemSolver->template solve< DofVectorType, tnlLinearResidueGetter< MatrixType, DofVectorType > >( this->rightHandSide, dofVector ) )
       {
          cerr << endl << "The linear system solver did not converge." << endl;
          return false;
       }
+      this->linearSystemSolverTimer.stop();
       //if( verbose )
       //   cout << endl;
       if( ! this->problem->postIterate( t,
                                         currentTau,
                                         mesh,
                                         dofVector,
-                                        auxiliaryDofVector ) )
+                                        meshDependentData ) )
       {
          cerr << endl << "Postiteration failed." << endl;
          return false;
@@ -184,4 +190,15 @@ solve( const RealType& time,
    return true;
 }
 
+template< typename Problem,
+          typename LinearSystemSolver >
+bool
+tnlSemiImplicitTimeStepper< Problem, LinearSystemSolver >::
+writeEpilog( tnlLogger& logger )
+{
+   logger.writeParameter< double >( "Linear system assembler time:", this->linearSystemAssemblerTimer.getTime() );
+   logger.writeParameter< double >( "Linear system solver time:", this->linearSystemSolverTimer.getTime() );
+   return true;
+}
+
 #endif /* TNLSEMIIMPLICITTIMESTEPPER_IMPL_H_ */
diff --git a/src/solvers/tnlConfigTags.h b/src/solvers/tnlBuildConfigTags.h
similarity index 99%
rename from src/solvers/tnlConfigTags.h
rename to src/solvers/tnlBuildConfigTags.h
index 9f1794dcc0652a99c3340a931b1f592ed8687ecb..5b094846fbe5899d9f72cb184da920f2f3bd76bc 100644
--- a/src/solvers/tnlConfigTags.h
+++ b/src/solvers/tnlBuildConfigTags.h
@@ -20,7 +20,7 @@
 
 #include <mesh/tnlGrid.h>
 
-class tnlDefaultConfigTag{};
+class tnlDefaultBuildConfigTag{};
 
 /****
  * All devices are enabled by default. Those which are not available
diff --git a/src/solvers/tnlDummyProblem.h b/src/solvers/tnlDummyProblem.h
index 73ca61fa789b619018eb59e1737b9c9bbd5e5291..0d5f226d379a5d6224b15ad1939a8d4ffebf6702 100644
--- a/src/solvers/tnlDummyProblem.h
+++ b/src/solvers/tnlDummyProblem.h
@@ -34,6 +34,7 @@ class tnlDummyProblem
       typedef Index IndexType;
       typedef tnlVector< Real, Device, Index > DofVectorType;
       typedef tnlGrid< 1, Real, Device, Index > MeshType;
+      typedef DofVectorType MeshDependentDataType;
 };
 
 
diff --git a/src/solvers/tnlFastBuildConfig.h b/src/solvers/tnlFastBuildConfigTag.h
similarity index 98%
rename from src/solvers/tnlFastBuildConfig.h
rename to src/solvers/tnlFastBuildConfigTag.h
index b8bd739351d7637e7860f323598becede0f55850..23b603cf7a24e19262c6b9b0dba8dc02319b93b0 100644
--- a/src/solvers/tnlFastBuildConfig.h
+++ b/src/solvers/tnlFastBuildConfigTag.h
@@ -18,7 +18,7 @@
 #ifndef TNLFASTBUILDCONFIG_H_
 #define TNLFASTBUILDCONFIG_H_
 
-#include <solvers/tnlConfigTags.h>
+#include <solvers/tnlBuildConfigTags.h>
 
 class tnlFastBuildConfig
 {
diff --git a/src/solvers/tnlIterativeSolverMonitor_impl.h b/src/solvers/tnlIterativeSolverMonitor_impl.h
index 803fad0ed734cb50ae1ebc97bb07e8b2f0e31ad3..ecfc2225d2e36c29a935a850e8980408bdf27dc3 100644
--- a/src/solvers/tnlIterativeSolverMonitor_impl.h
+++ b/src/solvers/tnlIterativeSolverMonitor_impl.h
@@ -79,20 +79,20 @@ void tnlIterativeSolverMonitor< Real, Index > :: refresh()
 template< typename Real, typename Index>
 void tnlIterativeSolverMonitor< Real, Index > :: resetTimers()
 {
-   cpuTimer. Reset();
-   rtTimer. Reset();
+   cpuTimer.reset();
+   rtTimer.reset();
 }
 
 template< typename Real, typename Index>
 double tnlIterativeSolverMonitor< Real, Index > :: getCPUTime()
 {
-   return cpuTimer. GetTime();
+   return cpuTimer.getTime();
 }
 
 template< typename Real, typename Index>
 double tnlIterativeSolverMonitor< Real, Index > :: getRealTime()
 {
-   return rtTimer. GetTime();
+   return rtTimer.getTime();
 }
 
 
diff --git a/src/solvers/tnlIterativeSolver_impl.h b/src/solvers/tnlIterativeSolver_impl.h
index 0331a974ebd1ab31cc845f021ad79fc126d090c4..f6503520c3f22a66d55fd2e5c5942a51db2dca00 100644
--- a/src/solvers/tnlIterativeSolver_impl.h
+++ b/src/solvers/tnlIterativeSolver_impl.h
@@ -54,6 +54,7 @@ bool tnlIterativeSolver< Real, Index> :: setup( const tnlParameterContainer& par
    this->setConvergenceResidue( parameters.getParameter< double >( "convergence-residue" ) );
    this->setDivergenceResidue( parameters.getParameter< double >( "divergence-residue" ) );
    this->setRefreshRate( parameters.getParameter< int >( "refresh-rate" ) );
+   return true;
 }
 
 template< typename Real, typename Index >
@@ -100,27 +101,11 @@ bool tnlIterativeSolver< Real, Index> :: nextIteration()
          solverMonitor->refresh();
    }
 
-   if( std::isnan( this->getResidue() ) )
-   {
-      //cerr << endl << "RES is Nan" << endl;
-      return false;
-   }
-   if(( this->getResidue() > this->getDivergenceResidue() &&
-         this->getIterations() > this->minIterations ) )
-   {
-      ///cerr << endl << "RES is over the divergence residue." << endl;
+   if( std::isnan( this->getResidue() ) || 
+       this->getIterations() > this->getMaxIterations()  ||
+       ( this->getResidue() > this->getDivergenceResidue() && this->getIterations() > this->minIterations ) ||
+       ( this->getResidue() < this->getConvergenceResidue() && this->getIterations() > this->minIterations ) ) 
       return false;
-   }
-   if( this->getIterations() > this->getMaxIterations() )
-   {
-      //cerr << endl << "Max. iterations exceeded." << endl;
-      return false;
-   }
-   if( this->getResidue() < this->getConvergenceResidue() )
-   {
-      //cerr << endl << "The solver has. converged." <<  endl;
-      return false;
-   }
    return true;
 }
 
diff --git a/src/solvers/tnlSolver.h b/src/solvers/tnlSolver.h
index c99bd3af92ab7621534801539b0d3656f27069ab..9e4040d87a48621acbe7750bb4f2164063c73c11 100644
--- a/src/solvers/tnlSolver.h
+++ b/src/solvers/tnlSolver.h
@@ -18,15 +18,15 @@
 #ifndef TNLSOLVER_H_
 #define TNLSOLVER_H_
 
-#include <solvers/tnlConfigTags.h>
+#include <solvers/tnlBuildConfigTags.h>
 
 template< template< typename Real, typename Device, typename Index, typename MeshType, typename ConfigTag, typename SolverStarter > class ProblemSetter,
           template< typename ConfTag > class ProblemConfig,
-          typename ConfigTag = tnlDefaultConfigTag >
+          typename ConfigTag = tnlDefaultBuildConfigTag >
 class tnlSolver
 {
    public:
-   bool run( int argc, char* argv[] );
+   static bool run( int argc, char* argv[] );
 
    protected:
 };
diff --git a/src/solvers/tnlSolverConfig_impl.h b/src/solvers/tnlSolverConfig_impl.h
index 0e814f49064494f32f784de79257db4665d036dd..329b0335918660845018b304f8191b543e7b302f 100644
--- a/src/solvers/tnlSolverConfig_impl.h
+++ b/src/solvers/tnlSolverConfig_impl.h
@@ -19,7 +19,7 @@
 #define TNLSOLVERCONFIG_IMPL_H_
 
 #include <tnlConfig.h>
-#include <solvers/tnlConfigTags.h>
+#include <solvers/tnlBuildConfigTags.h>
 #include <solvers/tnlDummyProblem.h>
 #include <solvers/pde/tnlExplicitTimeStepper.h>
 #include <solvers/pde/tnlPDESolver.h>
@@ -100,7 +100,7 @@ bool tnlSolverConfig< ConfigTag, ProblemConfig >::configSetup( tnlConfigDescript
    }
    config.addRequiredEntry< tnlString >( "discrete-solver", "The solver of the discretised problem:" );
    if( tnlConfigTagTimeDiscretisation< ConfigTag, tnlExplicitTimeDiscretisationTag >::enabled )
-   {
+   {      
       if( tnlConfigTagExplicitSolver< ConfigTag, tnlExplicitEulerSolverTag >::enabled )
          config.addEntryEnum( "euler" );
       if( tnlConfigTagExplicitSolver< ConfigTag, tnlExplicitMersonSolverTag >::enabled )
@@ -117,9 +117,16 @@ bool tnlSolverConfig< ConfigTag, ProblemConfig >::configSetup( tnlConfigDescript
       if( tnlConfigTagSemiImplicitSolver< ConfigTag, tnlSemiImplicitSORSolverTag >::enabled )
          config.addEntryEnum( "sor" );
    }
+   if( tnlConfigTagTimeDiscretisation< ConfigTag, tnlExplicitTimeDiscretisationTag >::enabled ||
+       tnlConfigTagTimeDiscretisation< ConfigTag, tnlSemiImplicitTimeDiscretisationTag >::enabled )
+   {
+      config.addDelimiter( " === Iterative solvers parameters === " );
+      tnlIterativeSolver< double, int >::configSetup( config );
+   }
    if( tnlConfigTagTimeDiscretisation< ConfigTag, tnlExplicitTimeDiscretisationTag >::enabled )
    {
       config.addDelimiter( " === Explicit solvers parameters === " );
+      tnlExplicitSolver< tnlDummyProblem< double, tnlHost, int > >::configSetup( config );
       if( tnlConfigTagExplicitSolver< ConfigTag, tnlExplicitEulerSolverTag >::enabled )
          tnlEulerSolver< tnlDummyProblem< double, tnlHost, int > >::configSetup( config );
 
@@ -128,7 +135,7 @@ bool tnlSolverConfig< ConfigTag, ProblemConfig >::configSetup( tnlConfigDescript
    }
    if( tnlConfigTagTimeDiscretisation< ConfigTag, tnlSemiImplicitTimeDiscretisationTag >::enabled )
    {
-      config.addDelimiter( " === Semi-implicit solvers parameters === " );
+      config.addDelimiter( " === Semi-implicit solvers parameters === " );      
       typedef tnlCSRMatrix< double, tnlHost, int > MatrixType;
       if( tnlConfigTagSemiImplicitSolver< ConfigTag, tnlSemiImplicitCGSolverTag >::enabled )
          tnlCGSolver< MatrixType >::configSetup( config );
diff --git a/src/solvers/tnlSolverInitiator.h b/src/solvers/tnlSolverInitiator.h
index 3a27cf53574aa383ee79f9036775e732d8139181..525d304c9d2bfea36396c2b0e35cbfae65903527 100644
--- a/src/solvers/tnlSolverInitiator.h
+++ b/src/solvers/tnlSolverInitiator.h
@@ -20,7 +20,7 @@
 
 #include <core/tnlObject.h>
 #include <config/tnlParameterContainer.h>
-#include <solvers/tnlConfigTags.h>
+#include <solvers/tnlBuildConfigTags.h>
 
 template< template< typename Real, typename Device, typename Index, typename MeshType, typename ConfigTag, typename SolverStarter > class ProblemSetter,
           typename ConfigTag >
diff --git a/src/solvers/tnlSolverInitiator_impl.h b/src/solvers/tnlSolverInitiator_impl.h
index 05e281e1bc72a7d1fd2433d91195c405da5a0026..0876bd27527879a2a9cf4faaa071fc5f5bfa90df 100644
--- a/src/solvers/tnlSolverInitiator_impl.h
+++ b/src/solvers/tnlSolverInitiator_impl.h
@@ -17,7 +17,7 @@
 
 #include <config/tnlParameterContainer.h>
 #include <solvers/tnlMeshTypeResolver.h>
-#include <solvers/tnlConfigTags.h>
+#include <solvers/tnlBuildConfigTags.h>
 #include <solvers/linear/stationary/tnlSORSolver.h>
 #include <solvers/linear/krylov/tnlCGSolver.h>
 #include <solvers/linear/krylov/tnlBICGStabSolver.h>
diff --git a/src/solvers/tnlSolverStarter.h b/src/solvers/tnlSolverStarter.h
index 150fd3d83bd75bc5142a1c625ebeae04d43dd9a9..a2f604bedc3b21d29e34b58d967b5708ee354abb 100644
--- a/src/solvers/tnlSolverStarter.h
+++ b/src/solvers/tnlSolverStarter.h
@@ -33,7 +33,8 @@ class tnlSolverStarter
    template< typename Problem >
    static bool run( const tnlParameterContainer& parameters );
 
-   bool writeEpilog( ostream& str );
+   template< typename Solver >
+   bool writeEpilog( ostream& str, const Solver& solver );
 
    template< typename Problem, typename TimeStepper >
    bool runPDESolver( Problem& problem,
diff --git a/src/solvers/tnlSolverStarter_impl.h b/src/solvers/tnlSolverStarter_impl.h
index 98135409449424796cfa927adbd3f91924b1d8fb..09e61c33e25d96b0d63cbe30891d21b274f7a296 100644
--- a/src/solvers/tnlSolverStarter_impl.h
+++ b/src/solvers/tnlSolverStarter_impl.h
@@ -516,17 +516,17 @@ bool tnlSolverStarter< ConfigTag > :: runPDESolver( Problem& problem,
                                                     const tnlParameterContainer& parameters,
                                                     TimeStepper& timeStepper )
 {
-   this->totalCpuTimer. Reset();
-   this->totalRtTimer. Reset();
+   this->totalCpuTimer.reset();
+   this->totalRtTimer.reset();
 
    /****
     * Set-up the PDE solver
     */
    tnlPDESolver< Problem, TimeStepper > solver;
    solver.setProblem( problem );
+   solver.setTimeStepper( timeStepper );
    if( ! solver.setup( parameters ) )
       return false;
-   solver.setTimeStepper( timeStepper );
 
    /****
     * Write a prolog
@@ -560,16 +560,16 @@ bool tnlSolverStarter< ConfigTag > :: runPDESolver( Problem& problem,
    /****
     * Set-up timers
     */
-   this->computeRtTimer. Reset();
-   this->computeCpuTimer. Reset();
-   this->ioRtTimer. Reset();
-   this->ioRtTimer. Stop();
-   this->ioCpuTimer. Reset();
-   this->ioCpuTimer. Stop();
-   solver.setComputeRtTimer( this -> computeRtTimer );
-   solver.setComputeCpuTimer( this -> computeCpuTimer );
-   solver.setIoRtTimer( this -> ioRtTimer );
-   solver.setIoCpuTimer( this -> ioCpuTimer );
+   this->computeRtTimer.reset();
+   this->computeCpuTimer.reset();
+   this->ioRtTimer.reset();
+   this->ioRtTimer.stop();
+   this->ioCpuTimer.reset();
+   this->ioCpuTimer.stop();
+   solver.setComputeRtTimer( this->computeRtTimer );
+   solver.setComputeCpuTimer( this->computeCpuTimer );
+   solver.setIoRtTimer( this->ioRtTimer );
+   solver.setIoCpuTimer( this->ioCpuTimer );
 
    /****
     * Start the solver
@@ -597,16 +597,16 @@ bool tnlSolverStarter< ConfigTag > :: runPDESolver( Problem& problem,
    /****
     * Stop timers
     */
-   this->computeRtTimer.Stop();
-   this->computeCpuTimer.Stop();
-   this->totalCpuTimer.Stop();
-   this->totalRtTimer.Stop();
+   this->computeRtTimer.stop();
+   this->computeCpuTimer.stop();
+   this->totalCpuTimer.stop();
+   this->totalRtTimer.stop();
 
    /****
     * Write an epilog
     */
    if( verbose )
-      writeEpilog( cout );
+      writeEpilog( cout, solver );
    if( haveLogFile )
    {
       fstream logFile;
@@ -618,7 +618,7 @@ bool tnlSolverStarter< ConfigTag > :: runPDESolver( Problem& problem,
       }
       else
       {
-         writeEpilog( logFile );
+         writeEpilog( logFile, solver );
          logFile.close();
       }
    }
@@ -626,18 +626,21 @@ bool tnlSolverStarter< ConfigTag > :: runPDESolver( Problem& problem,
 }
 
 template< typename ConfigTag >
-bool tnlSolverStarter< ConfigTag > :: writeEpilog( ostream& str )
+   template< typename Solver >
+bool tnlSolverStarter< ConfigTag > :: writeEpilog( ostream& str, const Solver& solver  )
 {
    tnlLogger logger( logWidth, str );
    logger.writeCurrentTime( "Finished at:" );
-   logger.writeParameter< double >( "IO Real Time:", this -> ioRtTimer. GetTime() );
-   logger.writeParameter< double >( "IO CPU Time:", this -> ioCpuTimer. GetTime() );
-   logger.writeParameter< double >( "Compute Real Time:", this -> computeRtTimer. GetTime() );
-   logger.writeParameter< double >( "Compute CPU Time:", this -> computeCpuTimer. GetTime() );
-   logger.writeParameter< double >( "Total Real Time:", this -> totalRtTimer. GetTime() );
-   logger.writeParameter< double >( "Total CPU Time:", this -> totalCpuTimer. GetTime() );
+   if( ! solver.writeEpilog( logger ) )
+      return false;
+   logger.writeParameter< double >( "IO Real Time:", this -> ioRtTimer. getTime() );
+   logger.writeParameter< double >( "IO CPU Time:", this -> ioCpuTimer. getTime() );
+   logger.writeParameter< double >( "Compute Real Time:", this -> computeRtTimer. getTime() );
+   logger.writeParameter< double >( "Compute CPU Time:", this -> computeCpuTimer. getTime() );
+   logger.writeParameter< double >( "Total Real Time:", this -> totalRtTimer. getTime() );
+   logger.writeParameter< double >( "Total CPU Time:", this -> totalCpuTimer. getTime() );
    char buf[ 256 ];
-   sprintf( buf, "%f %%", 100 * ( ( double ) this -> totalCpuTimer. GetTime() ) / this -> totalRtTimer. GetTime() );
+   sprintf( buf, "%f %%", 100 * ( ( double ) this -> totalCpuTimer. getTime() ) / this -> totalRtTimer. getTime() );
    logger.writeParameter< char* >( "CPU usage:", buf );
    logger.writeSeparator();
    return true;
diff --git a/src/tnl-benchmarks.h b/src/tnl-benchmarks.h
index 3eaa5152fa08d52f5280e35a0c499536e00ddf79..9bb1727a4d9227153676c0035999d87a11c8c9ea 100644
--- a/src/tnl-benchmarks.h
+++ b/src/tnl-benchmarks.h
@@ -45,7 +45,7 @@ bool transferBenchmark( const int size,
    for( int i = 0; i < cycles; i ++ )
       if( ! host_vector2. copyFrom( host_vector ) )
          return false;
-   double time = timer. GetTime();
+   double time = timer. getTime();
    double giga_byte = ( double ) ( 1 << 30 );
    host_to_host_band_width = bytes / giga_byte / time;
 
@@ -55,7 +55,7 @@ bool transferBenchmark( const int size,
    for( int i = 0; i < cycles; i ++ )
       if( ! device_vector. copyFrom( host_vector ) )
          return false;
-   time = timer. GetTime();
+   time = timer. getTime();
    host_to_device_band_width = bytes / giga_byte / time;
 
    cout << "Transfering " << bytes / mega_byte << " MB from HOST to DEVICE took " << time << " seconds. Bandwidth is " << host_to_device_band_width << " GB/s." << endl;
@@ -64,7 +64,7 @@ bool transferBenchmark( const int size,
    for( int i = 0; i < cycles; i ++ )
       if( ! host_vector2. copyFrom( device_vector ) )
          return false;
-   time = timer. GetTime();
+   time = timer. getTime();
    device_to_host_band_width = bytes / giga_byte / time;
 
    cout << "Transfering " << bytes / mega_byte << " MB from DEVICE to HOST took " << time << " seconds. Bandwidth is " << device_to_host_band_width << " GB/s." << endl;
@@ -74,7 +74,7 @@ bool transferBenchmark( const int size,
       if( ! device_vector2. copyFrom( device_vector ) )
          return false;
 
-   time = timer. GetTime();
+   time = timer. getTime();
 
    // Since we read and write tha data back we process twice as many bytes.
    bytes *= 2;
@@ -230,7 +230,7 @@ void reductionBenchmark( const int size,
 
       }
    }
-   const double time = timer. GetTime();
+   const double time = timer. getTime();
    double giga_byte = ( double ) ( 1 << 30 );
    long int mega_byte = 1 << 20;
    long int bytes_reduced = size * sizeof( T ) * reducing_cycles * 3;
diff --git a/tests/benchmarks/share/tnl-matrix-solvers-benchmark.cfg.desc b/tests/benchmarks/share/tnl-matrix-solvers-benchmark.cfg.desc
deleted file mode 100644
index 8e8280574470d15a71513a8cd49958af2ffdcafa..0000000000000000000000000000000000000000
--- a/tests/benchmarks/share/tnl-matrix-solvers-benchmark.cfg.desc
+++ /dev/null
@@ -1,19 +0,0 @@
-group IO
-{
-   string input-file(!)         [Input binary file name.];
-   string input-mtx-file("")    [Input mtx file name.];
-   string log-file("")          [Log file name.];
-   string matrix-stats-file("") [File for matrix statistics like size number of non-zero elements.];
-   real stop-time(3.0)          [How many seconds shell we iterate SpMV.];
-   integer verbose(1)           [Verbose mode.];
-},[Arguments describing input and output data.];
-
-group solver
-{
-   string device("host")         [On what device the solver will run. Can be host or cuda.];
-   string solver-name(!)         [Set matrix solver for benchmarking. It can be sor, cg, bicgstab, tfqmr, gmres. ];
-   string solver-class("tnl")    [Choose other library of solvers. It can be tnl or petsc.];
-   real max-residue(1.0e-6)      [Set what residue we want to achieve.];
-   integer gmres-restarting(20)  [Set restarting for GMRES method.];   
-   real sor-omega(1.0)           [Omega parameter for the SOR method. Can be 0--2.];
-},[Arguments describing the solver.];
\ No newline at end of file
diff --git a/tests/benchmarks/share/tnl-sparse-matrix-benchmark.cfg.desc b/tests/benchmarks/share/tnl-sparse-matrix-benchmark.cfg.desc
deleted file mode 100644
index 9a3bde6ec250122dcb96fc681f8bd4f276bc5b9b..0000000000000000000000000000000000000000
--- a/tests/benchmarks/share/tnl-sparse-matrix-benchmark.cfg.desc
+++ /dev/null
@@ -1,12 +0,0 @@
-group IO
-{
-   string input-mtx-file(!)     [Input mtx file name.];
-   string input-file("")        [Input binary file name.];
-   string pdf-file("")          [PDF file with matrix pattern.];
-   string log-file("")          [Log file name.];
-   string precision("double")   [Precision of the arithmetics.];
-   real stop-time(3.0)          [How many seconds shell we iterate SpMV.];
-   integer max-iterations(100)  [Maximum number of SpMV repetitions.];
-   bool format-test( no )       [Turn on/off test of matrix formats.]; 
-   integer verbose(1)           [Verbose mode.];
-},[Arguments describing input and output data.];
\ No newline at end of file
diff --git a/tests/benchmarks/tnl-benchmark-spmv.h b/tests/benchmarks/tnl-benchmark-spmv.h
index eea8a954293f229d081f0a2e223802f4581df7e6..2f360605a64acfa4789ab6bfe1a05cd3722c9350 100644
--- a/tests/benchmarks/tnl-benchmark-spmv.h
+++ b/tests/benchmarks/tnl-benchmark-spmv.h
@@ -245,17 +245,17 @@ double benchmarkMatrix( const Matrix& matrix,
                         fstream& logFile )
 {
    tnlTimerRT timer;
-   timer.Reset();
+   timer.reset();
    double time( 0.0 );
    int iterations( 0 );
    while( time < stopTime )
    {
       matrix.vectorProduct( x, b );
 #ifdef HAVE_CUDA
-      if( Matrix::DeviceType::DeviceType == tnlCudaDevice )
+      if( ( tnlDeviceEnum ) Matrix::DeviceType::DeviceType == tnlCudaDevice )
          cudaThreadSynchronize();
 #endif
-      time = timer.GetTime();
+      time = timer.getTime();
       iterations++;
    }
    const double gflops = computeGflops( nonzeroElements, iterations, time );
diff --git a/tests/benchmarks/tnl-benchmarks.h b/tests/benchmarks/tnl-benchmarks.h
index bb40f95948086bcd9cd5ff99cb36eb6a786c7024..4379a74939dbb5e938fb7acabda028a5a66e05d8 100644
--- a/tests/benchmarks/tnl-benchmarks.h
+++ b/tests/benchmarks/tnl-benchmarks.h
@@ -48,7 +48,7 @@ bool transferBenchmark( const int size,
    for( int i = 0; i < cycles; i ++ )
       host_vector2 = host_vector;
 
-   double time = timer. GetTime();
+   double time = timer. getTime();
    double giga_byte = ( double ) ( 1 << 30 );
    host_to_host_band_width = bytes / giga_byte / time;
 
@@ -58,7 +58,7 @@ bool transferBenchmark( const int size,
    for( int i = 0; i < cycles; i ++ )
       device_vector = host_vector;
 
-   time = timer. GetTime();
+   time = timer. getTime();
    host_to_device_band_width = bytes / giga_byte / time;
 
    cout << "Transfering " << bytes / mega_byte << " MB from HOST to DEVICE took " << time << " seconds. Bandwidth is " << host_to_device_band_width << " GB/s." << endl;
@@ -67,7 +67,7 @@ bool transferBenchmark( const int size,
    for( int i = 0; i < cycles; i ++ )
       host_vector2 = device_vector;
 
-   time = timer. GetTime();
+   time = timer. getTime();
    device_to_host_band_width = bytes / giga_byte / time;
 
    cout << "Transfering " << bytes / mega_byte << " MB from DEVICE to HOST took " << time << " seconds. Bandwidth is " << device_to_host_band_width << " GB/s." << endl;
@@ -77,7 +77,7 @@ bool transferBenchmark( const int size,
       device_vector2 = device_vector;
 
 
-   time = timer. GetTime();
+   time = timer. getTime();
 
    // Since we read and write tha data back we process twice as many bytes.
    bytes *= 2;
@@ -241,7 +241,7 @@ void reductionBenchmark( const int size,
 
       }
    }
-   const double time = timer. GetTime();
+   const double time = timer. getTime();
    double giga_byte = ( double ) ( 1 << 30 );
    long int mega_byte = 1 << 20;
    long int bytes_reduced = size * sizeof( T ) * reducing_cycles * 3;
diff --git a/tests/unit-tests/operators/diffusion/tnlLinearDiffusionTest.cpp b/tests/unit-tests/operators/diffusion/tnlLinearDiffusionTest.cpp
index 687e485c2a8da7798dae95f3f253229d4a5a18d3..e20eba5283b58569a24c19308968151359cb5158 100644
--- a/tests/unit-tests/operators/diffusion/tnlLinearDiffusionTest.cpp
+++ b/tests/unit-tests/operators/diffusion/tnlLinearDiffusionTest.cpp
@@ -25,7 +25,7 @@
 #include <operators/diffusion/tnlLinearDiffusion.h>
 #include <operators/diffusion/tnlExactLinearDiffusion.h>
 #include "../tnlPDEOperatorEocTestResult.h"
-#include <functions/tnlExpBumpFunction.h>
+#include <functors/tnlExpBumpFunction.h>
 
 template< int Dimensions,
           typename Real,
diff --git a/tests/unit-tests/operators/diffusion/tnlLinearDiffusionTest.cu b/tests/unit-tests/operators/diffusion/tnlLinearDiffusionTest.cu
index 912bd9214cddb49e63db55f8a16900751435a1b5..081b142f4b65bf1ee08593d678023a41fc8a6e6d 100644
--- a/tests/unit-tests/operators/diffusion/tnlLinearDiffusionTest.cu
+++ b/tests/unit-tests/operators/diffusion/tnlLinearDiffusionTest.cu
@@ -25,7 +25,7 @@
 #include <operators/diffusion/tnlLinearDiffusion.h>
 #include <operators/diffusion/tnlExactLinearDiffusion.h>
 #include "../tnlPDEOperatorEocTestResult.h"
-#include <functions/tnlExpBumpFunction.h>
+#include <functors/tnlExpBumpFunction.h>
 
 template< int Dimensions,
           typename Real,
diff --git a/tests/unit-tests/operators/tnlPDEOperatorEocTestSetter.h b/tests/unit-tests/operators/tnlPDEOperatorEocTestSetter.h
index 16542c201dd6c1bcc4348d91e762425fdc0b630c..69d1c31843e98eb34c7dfb34734b5a6ff12ffd58 100644
--- a/tests/unit-tests/operators/tnlPDEOperatorEocTestSetter.h
+++ b/tests/unit-tests/operators/tnlPDEOperatorEocTestSetter.h
@@ -19,7 +19,7 @@
 #define TNLPDEOPERATOREOCTESTSETTER_H_
 
 #include <mesh/tnlGrid.h>
-#include <functions/tnlExpBumpFunction.h>
+#include <functors/tnlExpBumpFunction.h>
 
 template< typename ApproximateOperator,
           typename ExactOperator,
diff --git a/tests/unit-tests/tnlApproximationError.h b/tests/unit-tests/tnlApproximationError.h
index b39741257d69173c08e21c58183b3d458eee3bc3..74c3fb2f7004f64b9f1f0ab174c21c10c4c383a9 100644
--- a/tests/unit-tests/tnlApproximationError.h
+++ b/tests/unit-tests/tnlApproximationError.h
@@ -19,7 +19,7 @@
 #define TNLAPPROXIMATIONERROR_H_
 
 #include <mesh/tnlGrid.h>
-#include <functions/tnlConstantFunction.h>
+#include <functors/tnlConstantFunction.h>
 #include <operators/tnlAnalyticDirichletBoundaryConditions.h>
 #include <solvers/pde/tnlExplicitUpdater.h>
 
diff --git a/tests/unit-tests/tnlApproximationError_impl.h b/tests/unit-tests/tnlApproximationError_impl.h
index 28b1936677892656276f74c5b1b3937e82b047ec..ca9d6a5de9da802dc23b1b8f564778e635d9540a 100644
--- a/tests/unit-tests/tnlApproximationError_impl.h
+++ b/tests/unit-tests/tnlApproximationError_impl.h
@@ -20,7 +20,7 @@
 
 #include <mesh/tnlTraverser.h>
 #include <core/vectors/tnlVector.h>
-#include <functions/tnlFunctionDiscretizer.h>
+#include <functors/tnlFunctionDiscretizer.h>
 #include <matrices/tnlCSRMatrix.h>
 #include <matrices/tnlMatrixSetter.h>
 #include <solvers/pde/tnlLinearSystemAssembler.h>
diff --git a/tnlConfig.h.in b/tnlConfig.h.in
index aa39826c80236408ac0fb633acbed44b33fe0bfe..cb4ad9f48950fa688e2be72cb5e8c91c400bc1ba 100644
--- a/tnlConfig.h.in
+++ b/tnlConfig.h.in
@@ -1,4 +1,4 @@
-@HAVE_LIBBZ2@
+@HAVE_CUBLAS@
 
 @HAVE_CUSP@
 
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 8bc1a507b458a254953bad38bb050d76c03d1c45..6575ba2f79b96d113ace39a78737ee986fe6dae4 100755
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -1,2 +1,13 @@
 add_subdirectory (src)
-add_subdirectory (share)
\ No newline at end of file
+add_subdirectory (share)
+add_subdirectory (tnl-quickstart)
+
+CONFIGURE_FILE( "tnl-compile.in" "${PROJECT_TOOLS_PATH}/tnl-compile" @ONLY )
+CONFIGURE_FILE( "tnl-link.in" "${PROJECT_TOOLS_PATH}/tnl-link" @ONLY )
+CONFIGURE_FILE( "tnl-bindir.in" "${PROJECT_TOOLS_PATH}/tnl-bindir" @ONLY )
+
+INSTALL( FILES ${PROJECT_TOOLS_PATH}/tnl-compile 
+               ${PROJECT_TOOLS_PATH}/tnl-link
+               ${PROJECT_TOOLS_PATH}/tnl-bindir
+         DESTINATION bin
+         PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE )
\ No newline at end of file
diff --git a/tools/src/CMakeLists.txt b/tools/src/CMakeLists.txt
index 44921b44f264a9fd9b6adbe4ed56199fc49c89a1..a14f6bae5dfacc943469fc814aa345f75fa8ed58 100755
--- a/tools/src/CMakeLists.txt
+++ b/tools/src/CMakeLists.txt
@@ -43,6 +43,11 @@ target_link_libraries( tnl-functions-benchmark${debugExt} tnl${debugExt}-${tnlVe
 ADD_EXECUTABLE(tnl-curve2gnuplot${debugExt} ${tnlcurve2gnuplotsources})
 target_link_libraries (tnl-curve2gnuplot${debugExt} tnl${debugExt}-${tnlVersion} )
 
+IF( BUILD_CUDA )
+    CUDA_ADD_EXECUTABLE( tnl-cuda-arch${debugExt} tnl-cuda-arch.cu
+                         OPTIONS ${CUDA_ADD_EXECUTABLE_OPTIONS} )
+    SET_TARGET_PROPERTIES( tnl-cuda-arch${debugExt} PROPERTIES CUDA_COMPILE_FLAGS "${CXX_OPTIMIZE_FLAGS}" )
+ENDIF()
 #ADD_EXECUTABLE( tnl-matrix-convert${debugExt} ${tnlmatrixconvertsources} )
 #target_link_libraries( tnl-matrix-convert${debugExt} tnl${debugExt}-${tnlVersion} )
 
@@ -56,6 +61,12 @@ INSTALL( TARGETS tnl-init${debugExt}
          RUNTIME DESTINATION bin
          PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE )
          
+IF( BUILD_CUDA )
+   INSTALL( TARGETS tnl-cuda-arch${debugExt}
+            RUNTIME DESTINATION bin
+            PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE )
+ENDIF()
+
 INSTALL( FILES tnl-err2eoc
                tnl-time-series2png
                tnl-eoc-test-log
diff --git a/tools/src/functions-benchmark.h b/tools/src/functions-benchmark.h
index 45e8d3a0ebbcaea887bb5639e5db94af2c20dddf..a9f79a0d1026a979cd42dec69154b057f096d654 100644
--- a/tools/src/functions-benchmark.h
+++ b/tools/src/functions-benchmark.h
@@ -37,7 +37,7 @@ template< typename REAL > void benchmarkAddition( long int loops )
       a4 += REAL( 0.1 );
    }
 
-   double cpu_time = cpu_timer. GetTime();
+   double cpu_time = cpu_timer. getTime();
    cout << " ( " << a1 + a2 + a3 + a4 << " ) " <<  cpu_time << "secs. " << 4.0 * ( ( double ) loops ) / cpu_time * 1.0e-9 << " GFLOPS." << endl;
 }
 
@@ -61,7 +61,7 @@ template< typename REAL > void benchmarkMultiplication( const long int loops )
       }
    }
 
-   double cpu_time = cpu_timer. GetTime();
+   double cpu_time = cpu_timer. getTime();
    cout << " ( " << a1 * a2 * a3 * a4 << " ) " <<  cpu_time << "secs. " << 4.0 * ( ( double ) loops ) / cpu_time * 1.0e-9 << " GFLOPS." << endl;
 }
 
@@ -83,7 +83,7 @@ template< typename REAL > void benchmarkDivision( long int loops )
       if( a1 < REAL( 0.01 ) ) a1 = a2 = a3 = a4 = REAL( 1.0e9 );
    }
 
-   double cpu_time = cpu_timer. GetTime();
+   double cpu_time = cpu_timer. getTime();
    cout << " ( " << a1 / a2 / a3 / a4 << " ) " << cpu_time << "secs. " << 4.0 * ( ( double ) loops / 2 ) / cpu_time * 1.0e-9 << " GFLOPS." << endl;
 }
 
@@ -105,7 +105,7 @@ template< typename REAL > void benchmarkSqrt( long int loops )
       if( a1 < REAL( 100.0 ) ) a1 = a2 = a3 = a4 = REAL( 1.0e9 );
    }
 
-   double cpu_time = cpu_timer. GetTime();
+   double cpu_time = cpu_timer. getTime();
    cout << " ( " << a1 + a2 + a3 + a4 << " ) " << cpu_time << "secs. " << 4.0 * ( ( double ) loops / 2 ) / cpu_time * 1.0e-9 << " GFLOPS." << endl;
 }
 
@@ -126,7 +126,7 @@ template< typename REAL > void benchmarkSin( long int loops )
       a4 = sin( a4 );
    }
 
-   double cpu_time = cpu_timer. GetTime();
+   double cpu_time = cpu_timer. getTime();
    cout << " ( " << a1 + a2 + a3 + a4 << " ) " << cpu_time << "secs. " << 4.0 * ( ( double ) loops ) / cpu_time * 1.0e-9 << " GFLOPS." << endl;
 }
 
@@ -148,7 +148,7 @@ template< typename REAL > void benchmarkExp( long int loops )
       if( a1 > REAL( 1.0e9 ) ) a1 = a2 = a3 = a4 = REAL( 1.1 );
    }
 
-   double cpu_time = cpu_timer. GetTime();
+   double cpu_time = cpu_timer. getTime();
    cout << " ( " << a1 + a2 + a3 + a4 << " ) " << cpu_time << "secs. " << 4.0 * ( ( double ) loops) / cpu_time * 1.0e-9 << " GFLOPS." << endl;
 }
 
@@ -170,7 +170,7 @@ template< typename REAL > void benchmarkPow( long int loops )
       if( a1 < REAL( 1.0 ) ) a1 = a2 = a3 = a4 = REAL( 1.0e9 );
    }
 
-   double cpu_time = cpu_timer. GetTime();
+   double cpu_time = cpu_timer. getTime();
    cout << " ( " << a1 + a2 + a3 + a4 << " ) " << cpu_time << "secs. " << 4.0 * ( ( double ) loops) / cpu_time * 1.0e-9 << " GFLOPS." << endl;
 }
 
diff --git a/tools/src/tnl-cuda-arch.cu b/tools/src/tnl-cuda-arch.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b6bbe8d4dc843cce931c8f7fcdb9d2233cda6407
--- /dev/null
+++ b/tools/src/tnl-cuda-arch.cu
@@ -0,0 +1,21 @@
+#include <stdio.h> 
+
+int main() {
+    int num_devices;
+    cudaGetDeviceCount( &num_devices );
+    for( int i = 0; i < num_devices; i++ ) {
+        cudaDeviceProp prop;
+        cudaGetDeviceProperties( &prop, i );
+
+        int compute_minor = prop.minor;
+        // sm_21 is the only 'real' architecture that does not have 'virtual' counterpart
+        if( prop.major == 2 )
+            compute_minor = 0;
+
+        if( i > 0 )
+            printf(" ");
+        printf( "-gencode arch=compute_%d%d,code=sm_%d%d",
+                prop.major, compute_minor, prop.major, prop.minor );
+    }
+    printf("\n");
+}
diff --git a/tools/src/tnl-err2eoc b/tools/src/tnl-err2eoc
index b262fae9e1df3c0e27a4193203ae28912d153f9f..4582ef2efbbd1aa60027406f69586e7529f1cfc3 100644
--- a/tools/src/tnl-err2eoc
+++ b/tools/src/tnl-err2eoc
@@ -12,7 +12,7 @@ refinement = 2
 i = 0
 while i < len( arguments ):
    if arguments[ i ] == "--refinement":
-      refinement = arguments[ i + 1 ]
+      refinement = float( arguments[ i + 1 ] )
       i = i + 2
       continue
    if arguments[ i ] == "--output-file":
diff --git a/tools/src/tnl-init.cpp b/tools/src/tnl-init.cpp
index 6cff727f12f7b04acc766720010dc44c8d828ea9..75147ebcad4e168beb8235962fc10f574a105c43 100644
--- a/tools/src/tnl-init.cpp
+++ b/tools/src/tnl-init.cpp
@@ -21,7 +21,7 @@
 #include <debug/tnlDebug.h>
 #include <config/tnlConfigDescription.h>
 #include <config/tnlParameterContainer.h>
-#include <functions/tnlTestFunction.h>
+#include <functors/tnlTestFunction.h>
 #include <mesh/tnlDummyMesh.h>
 #include <mesh/tnlGrid.h>
 
diff --git a/tools/src/tnl-init.h b/tools/src/tnl-init.h
index f9f4aa17d661156c199c6de917857b17f5683389..642271498e13584d6b0e01653217681b02e8ec76 100644
--- a/tools/src/tnl-init.h
+++ b/tools/src/tnl-init.h
@@ -21,8 +21,8 @@
 #include <config/tnlParameterContainer.h>
 #include <core/vectors/tnlVector.h>
 #include <mesh/tnlGrid.h>
-#include <functions/tnlFunctionDiscretizer.h>
-#include <functions/tnlTestFunction.h>
+#include <functors/tnlFunctionDiscretizer.h>
+#include <functors/tnlTestFunction.h>
 #include <operators/tnlFiniteDifferences.h>
 #include <core/mfilename.h>
 
diff --git a/tools/tnl-bindir.in b/tools/tnl-bindir.in
new file mode 100644
index 0000000000000000000000000000000000000000..2ce6738087bc081246b24e7790426e652bf4ec36
--- /dev/null
+++ b/tools/tnl-bindir.in
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+
+echo @CMAKE_INSTALL_PREFIX@/bin
\ No newline at end of file
diff --git a/tools/tnl-compile.in b/tools/tnl-compile.in
new file mode 100644
index 0000000000000000000000000000000000000000..f471dc6c38583790225c87c60ba1fd30140a0572
--- /dev/null
+++ b/tools/tnl-compile.in
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+DEBUG_FLAGS="-DNDEBUG" # -march=native"
+CUDA_FLAGS=""
+CXX_STD_FLAGS="-std=c++11"
+
+for option in "$@"
+do
+    case $option in
+        --cuda                  ) CUDA_FLAGS="-DHAVE_CUDA -DHAVE_NOT_CXX11 `tnl-cuda-arch`"
+                                  CXX_STD_FLAGS="" ;;
+        --debug                 ) DEBUG_FLAGS="-g -O0"
+    esac
+done
+
+echo -I@CMAKE_INSTALL_PREFIX@/include/tnl-@tnlVersion@ ${CUDA_FLAGS} ${CXX_STD_FLAGS} ${DEBUG_FLAGS}
\ No newline at end of file
diff --git a/tools/tnl-link.in b/tools/tnl-link.in
new file mode 100644
index 0000000000000000000000000000000000000000..52806a77bd008f9681bdc9a24974fb2b45f05ea3
--- /dev/null
+++ b/tools/tnl-link.in
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+
+DEBUG=""
+
+for option in "$@"
+do
+    case $option in
+        --debug                  ) DEBUG="-dbg"
+    esac
+done
+
+echo -L@CMAKE_INSTALL_PREFIX@/lib -ltnl${DEBUG}-@tnlVersion@
\ No newline at end of file
diff --git a/tools/tnl-quickstart/CMakeLists.txt b/tools/tnl-quickstart/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1a403b4bcfa9d1b644120456b40a799df138da77
--- /dev/null
+++ b/tools/tnl-quickstart/CMakeLists.txt
@@ -0,0 +1,4 @@
+INSTALL( FILES tnl-quickstart
+               tnl-quickstart.py
+         DESTINATION bin
+         PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE )
\ No newline at end of file
diff --git a/tools/tnl-quickstart/tnl-quickstart b/tools/tnl-quickstart/tnl-quickstart
new file mode 100644
index 0000000000000000000000000000000000000000..eefb6910f5f32d63f3fc2c6fb1342f21aa190c79
--- /dev/null
+++ b/tools/tnl-quickstart/tnl-quickstart
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash 
+
+PYTHON=`which python3`
+TNL_BINDIR=`tnl-bindir`
+
+if test x${PYTHON} = x;
+then 
+    echo "It seems that Python 3 is not installed on your system."
+    echo "You may install it as follows:"
+    echo ""
+    echo "In Ubuntu: sudo apt-get install python3"
+    echo "In OpenSuse:"
+    echo "In CentOS:"
+else
+${PYTHON} ${TNL_BINDIR}/tnl-quickstart.py
+fi
+
diff --git a/tools/tnl-quickstart/tnl-quickstart.py b/tools/tnl-quickstart/tnl-quickstart.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1052b00630b23dd079511175835d14b4ef4d762
--- /dev/null
+++ b/tools/tnl-quickstart/tnl-quickstart.py
@@ -0,0 +1,764 @@
+#! /usr/bin/python
+
+# To change this license header, choose License Headers in Project Properties.
+# To change this template file, choose Tools | Templates
+# and open the template in the editor.
+
+__author__ = "oberhuber"
+__date__ = "$May 6, 2015 8:40:59 PM$"
+
+
+def generateMakefile( problemBaseName ):
+    file = open( "Makefile", "w" )
+    file.write( "# Uncomment the following line to enable CUDA\n" )
+    file.write( "#WITH_CUDA = yes\n" )
+    file.write( "\n" ) 
+    file.write( "TARGET = " + problemBaseName + "\n")
+    file.write( "INSTALL_DIR = ${HOME}/local\n" )
+    file.write( "\n" )
+    file.write( "LDFLAGS = $(shell tnl-link )\n" )
+    file.write( "\n" )
+    file.write( "ifdef WITH_CUDA\n" )
+    file.write( "   CXX = nvcc\n" )     
+    file.write( "   CXX_FLAGS = $(shell tnl-compile --cuda)\n" )    
+    file.write( "else\n" )                                  
+    file.write( "   CXX = g++\n" ) 
+    file.write( "   CXX_FLAGS = $(shell tnl-compile)\n" )    
+    file.write( "endif\n" )                              
+    file.write( "\n" )
+    file.write( "SOURCES = " + problemBaseName + ".cpp\n" )
+    file.write( "HEADERS = " + problemBaseName + ".h\n" )
+    file.write( "OBJECTS = " + problemBaseName + ".o\n" ) 
+    file.write( "DIST = $(SOURCES) $(CUDA_SOURCES) $(HEADERS) Makefile\n" ) 
+    file.write( "\n" ) 
+    file.write( "ifdef WITH_CUDA\n" )
+    file.write( "   OBJECTS = " + problemBaseName + "-cuda.o\n" )     
+    file.write( "endif\n" )     
+    file.write( "\n" )     
+    file.write( "all: $(TARGET)\n" ) 
+    file.write( "\n" ) 
+    file.write( "clean:\n" ) 
+    file.write( "\t rm -f *.o" ) 
+    file.write( "\n" ) 
+    file.write( "dist: $(DIST)" ) 
+    file.write( "\t tar zcvf $(TARGET).tgz $(DIST)\n" ) 
+    file.write( "\n" ) 
+    file.write( "$(TARGET): $(OBJECTS)\n" ) 
+    file.write( "\t$(CXX) -o $@ $< $(LDFLAGS)\n" ) 
+    file.write( "\n" ) 
+    file.write( "%.o: %.cpp\n" ) 
+    file.write( "\t $(CXX) $(CPPFLAGS) $(CXX_FLAGS) -c -o $@ $<" ) 
+    file.write( "\n" ) 
+    file.write( "%.o: %.cu\n" ) 
+    file.write( "\t $(CXX) $(CPPFLAGS) $(CXX_FLAGS) -c -o $@ $<" )     
+    file.close()
+
+def generateMain( problemName, problemBaseName, operatorName ):
+    file = open( problemBaseName + ".h", "w" )
+    file.write( "#include <tnlConfig.h>\n" )
+    file.write( "#include <solvers/tnlSolver.h>\n" )
+    file.write( "#include <solvers/tnlConfigTags.h>\n" )
+    file.write( "#include <solvers/tnlFastBuildConfig.h>\n" )    
+    file.write( "#include <operators/tnlAnalyticDirichletBoundaryConditions.h>\n" )
+    file.write( "#include <operators/tnlDirichletBoundaryConditions.h>\n" )
+    file.write( "#include <operators/tnlAnalyticNeumannBoundaryConditions.h>\n" )
+    file.write( "#include <operators/tnlNeumannBoundaryConditions.h>\n" )
+    file.write( "#include <functors/tnlConstantFunction.h>\n" )
+    file.write( "#include \"" + problemBaseName + "Problem.h\"\n" )
+    file.write( "#include \"" + operatorName + ".h\"\n" )
+    file.write( "#include \"" + problemBaseName + "Rhs.h\"\n" )    
+    file.write( "\n" )
+    file.write( "typedef tnlFastBuildConfig BuildConfig;\n" )    
+    file.write( "\n" )    
+    file.write( "/****\n" )    
+    file.write( " * Uncomment the following (and comment the previous line) for the complete build.\n" )    
+    file.write( " * This will include support for all floating point precisions, all indexing types\n" )    
+    file.write( " * and more solvers. You may then choose between them from the command line.\n" )    
+    file.write( " * The compile time may, however, take tens of minutes or even several hours,\n" )    
+    file.write( " * especially if CUDA is enabled. Use this, if you want, only for the final build,\n" )        
+    file.write( " * not in the development phase.\n" )    
+    file.write( " */\n" )    
+    file.write( "//typedef tnlDefaultConfigTag BuildConfig;\n" )    
+    file.write( "\n" )
+    file.write( "template< typename ConfigTag >" )
+    file.write( "class " + problemBaseName + "Config\n" )
+    file.write( "{\n" )
+    file.write( "   public:\n" )
+    file.write( "      static void configSetup( tnlConfigDescription & config )\n" )
+    file.write( "      {\n" )
+    file.write( "         config.addDelimiter( \"" + problemName + " settings:\" );\n" )
+    file.write( "         config.addEntry< tnlString >( \"boundary-conditions-type\", \"Choose the boundary conditions type.\", \"dirichlet\");\n" )
+    file.write( "            config.addEntryEnum< tnlString >( \"dirichlet\" );\n" )
+    file.write( "            config.addEntryEnum< tnlString >( \"neumann\" );\n" )
+    file.write( "         config.addEntry< double >( \"boundary-conditions-constant\", \"This sets a value in case of the constant boundary conditions.\" );\n" )
+    file.write( "\n" )
+    file.write( "         /****\n" )
+    file.write( "          * Add definition of your solver command line arguments.\n" )
+    file.write( "          */\n" )
+    file.write( "\n" )
+    file.write( "      }\n" )
+    file.write( "};\n" )
+    file.write( "\n" )
+    file.write( "template< typename Real,\n" )
+    file.write( "          typename Device,\n" )
+    file.write( "          typename Index,\n" )
+    file.write( "          typename MeshType,\n" )
+    file.write( "          typename ConfigTag,\n" )
+    file.write( "          typename SolverStarter >\n" )
+    file.write( "class " + problemBaseName + "Setter\n" )
+    file.write( "{\n" )
+    file.write( "   public:\n" )
+    file.write( "\n" )
+    file.write( "      typedef Real RealType;\n" )
+    file.write( "      typedef Device DeviceType;\n" )
+    file.write( "      typedef Index IndexType;\n" )
+    file.write( "\n" )
+    file.write( "      static bool run( const tnlParameterContainer & parameters )\n" )
+    file.write( "      {\n" )
+    file.write( "          enum { Dimensions = MeshType::Dimensions };\n" )
+    file.write( "          typedef " + operatorName + "< MeshType, Real, Index > ApproximateOperator;\n" )
+    file.write( "          typedef " + problemBaseName + "Rhs RightHandSide;\n" )    
+    file.write( "          typedef tnlStaticVector < MeshType::Dimensions, Real > Vertex;\n" )
+    file.write( "\n" )
+    file.write( "         /****\n" )
+    file.write( "          * Resolve the template arguments of your solver here.\n" )
+    file.write( "          * The following code is for the Dirichlet and the Neumann boundary conditions.\n" )
+    file.write( "          * Both can be constant or defined as discrete values of tnlVector.\n" )    
+    file.write( "          */\n" )    
+    file.write( "          tnlString boundaryConditionsType = parameters.getParameter< tnlString >( \"boundary-conditions-type\" );\n" )
+    file.write( "          if( parameters.checkParameter( \"boundary-conditions-constant\" ) )\n" )
+    file.write( "          {\n" )
+    file.write( "             typedef tnlConstantFunction< Dimensions, Real > ConstantFunction;\n" )
+    file.write( "             if( boundaryConditionsType == \"dirichlet\" )\n" )
+    file.write( "             {\n" )
+    file.write( "                typedef tnlAnalyticDirichletBoundaryConditions< MeshType, ConstantFunction, Real, Index > BoundaryConditions;\n" )
+    file.write( "                typedef " + problemBaseName + "Problem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Problem;\n" )
+    file.write( "                SolverStarter solverStarter;\n" )
+    file.write( "                return solverStarter.template run< Problem >( parameters );\n" )
+    file.write( "             }\n" )
+    file.write( "             typedef tnlAnalyticNeumannBoundaryConditions< MeshType, ConstantFunction, Real, Index > BoundaryConditions;\n" )
+    file.write( "             typedef " + problemBaseName + "Problem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Problem;\n" )
+    file.write( "             SolverStarter solverStarter;\n" )
+    file.write( "             return solverStarter.template run< Problem >( parameters );\n" )
+    file.write( "          }\n" )
+    file.write( "          typedef tnlVector< Real, Device, Index > VectorType;\n" )
+    file.write( "          if( boundaryConditionsType == \"dirichlet\" )\n" )
+    file.write( "          {\n" )
+    file.write( "             typedef tnlDirichletBoundaryConditions< MeshType, VectorType, Real, Index > BoundaryConditions;\n" )
+    file.write( "             typedef " + problemBaseName + "Problem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Problem;\n" )
+    file.write( "             SolverStarter solverStarter;\n" )
+    file.write( "             return solverStarter.template run< Problem >( parameters );\n" )
+    file.write( "          }\n" )
+    file.write( "          typedef tnlNeumannBoundaryConditions< MeshType, VectorType, Real, Index > BoundaryConditions;\n" )
+    file.write( "          typedef " + problemBaseName + "Problem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Problem;\n" )
+    file.write( "          SolverStarter solverStarter;\n" )
+    file.write( "          return solverStarter.template run< Problem >( parameters );\n" )
+    file.write( "      }\n" )
+    file.write( "\n" )
+    file.write( "};\n" )
+    file.write( "\n" )
+    file.write( "int main( int argc, char* argv[] )\n" )
+    file.write( "{\n" )
+    file.write( "   tnlSolver< " + problemBaseName + "Setter, " + problemBaseName + "Config, BuildConfig > solver;\n" )
+    file.write( "   if( ! solver. run( argc, argv ) )\n" )
+    file.write( "      return EXIT_FAILURE;\n" )
+    file.write( "   return EXIT_SUCCESS;\n" )
+    file.write( "}\n" )
+    file.write( "\n" )
+    file.close()
+    file = open( problemBaseName + ".cpp", "w")
+    file.write( "#include \"" + problemBaseName + ".h\"\n")
+    file.close();
+    file = open( problemBaseName + "-cuda.cu", "w")
+    file.write( "#include \"" + problemBaseName + ".h\"\n")
+    file.close()
+    
+def generateProblem( problemName, problemBaseName ):
+    file = open( problemBaseName + "Problem.h", "w" )
+    file.write( "#ifndef " + problemBaseName + "PROBLEM_H_\n" )
+    file.write( "#define " + problemBaseName + "PROBLEM_H_\n" )
+    file.write( "\n" )
+    file.write( "#include <problems/tnlPDEProblem.h>\n")
+    file.write( "\n" )
+    file.write( "template< typename Mesh,\n" )
+    file.write( "          typename BoundaryCondition,\n" )
+    file.write( "          typename RightHandSide,\n" )
+    file.write( "           typename DifferentialOperator >\n" )
+    file.write( "class " + problemBaseName + "Problem:\n" )
+    file.write( "   public tnlPDEProblem< Mesh,\n" )
+    file.write( "                         typename DifferentialOperator::RealType,\n" )
+    file.write( "                         typename Mesh::DeviceType,\n" )
+    file.write( "                         typename DifferentialOperator::IndexType >\n" )
+    file.write( "{\n" )
+    file.write( "   public:\n" )
+    file.write( "\n" )
+    file.write( "      typedef typename DifferentialOperator::RealType RealType;\n" )
+    file.write( "      typedef typename Mesh::DeviceType DeviceType;\n" )
+    file.write( "      typedef typename DifferentialOperator::IndexType IndexType;\n" )
+    file.write( "      typedef tnlPDEProblem< Mesh, RealType, DeviceType, IndexType > BaseType;\n" )
+    file.write( "\n" )
+    file.write( "      using typename BaseType::MeshType;\n" )
+    file.write( "      using typename BaseType::DofVectorType;\n" )
+    file.write( "      using typename BaseType::MeshDependentDataType;\n" )
+    file.write( "\n" )
+    file.write( "      static tnlString getTypeStatic();\n" )
+    file.write( "\n" )
+    file.write( "      tnlString getPrologHeader() const;\n" )
+    file.write( "\n" )
+    file.write( "      void writeProlog( tnlLogger& logger,\n" )
+    file.write( "                        const tnlParameterContainer& parameters ) const;\n" )
+    file.write( "\n" )
+    file.write( "      bool setup( const tnlParameterContainer& parameters );\n" )
+    file.write( "\n" )
+    file.write( "      bool setInitialCondition( const tnlParameterContainer& parameters,\n" )
+    file.write( "                                const MeshType& mesh,\n" )
+    file.write( "                                DofVectorType& dofs,\n" )
+    file.write( "                                MeshDependentDataType& meshDependentData );\n" )
+    file.write( "\n" )
+    file.write( "      template< typename Matrix >\n" )
+    file.write( "      bool setupLinearSystem( const MeshType& mesh,\n" )
+    file.write( "                              Matrix& matrix );\n" )
+    file.write( "\n" )
+    file.write( "      bool makeSnapshot( const RealType& time,\n" )
+    file.write( "                         const IndexType& step,\n" )
+    file.write( "                         const MeshType& mesh,\n" )
+    file.write( "                         DofVectorType& dofs,\n" )
+    file.write( "                         MeshDependentDataType& meshDependentData );\n" )
+    file.write( "\n" )
+    file.write( "      IndexType getDofs( const MeshType& mesh ) const;\n" )
+    file.write( "\n" )
+    file.write( "      void bindDofs( const MeshType& mesh,\n" )
+    file.write( "                     DofVectorType& dofs );\n" )
+    file.write( "\n" )
+    file.write( "      void getExplicitRHS( const RealType& time,\n" )
+    file.write( "                           const RealType& tau,\n" )
+    file.write( "                           const MeshType& mesh,\n" )
+    file.write( "                           DofVectorType& _u,\n" )
+    file.write( "                           DofVectorType& _fu );\n" )
+    file.write( "\n" )
+    file.write( "      template< typename Matrix >\n" )
+    file.write( "      void assemblyLinearSystem( const RealType& time,\n" )
+    file.write( "                                 const RealType& tau,\n" )
+    file.write( "                                 const MeshType& mesh,\n" )
+    file.write( "                                 DofVectorType& dofs,\n" )
+    file.write( "                                 DofVectorType& auxDofs,\n" )
+    file.write( "                                 Matrix& matrix,\n" )
+    file.write( "                                 DofVectorType& rightHandSide );\n" )
+    file.write( "\n" )
+    file.write( "   protected:\n" )
+    file.write( "\n" )    
+    file.write( "      DifferentialOperator differentialOperator;\n" )
+    file.write( "      BoundaryCondition boundaryCondition;\n" )
+    file.write( "      RightHandSide rightHandSide;\n" )
+    file.write( "};\n" )
+    file.write( "\n" )
+    file.write( "#include \"" + problemBaseName + "Problem_impl.h\"\n" )
+    file.write( "\n" )
+    file.write( "#endif /* " + problemBaseName + "PROBLEM_H_ */\n" )
+    file.close()
+                                 
+    file = open( problemBaseName + "Problem_impl.h", "w" )
+    file.write( "#ifndef " + problemBaseName + "PROBLEM_IMPL_H_\n" )
+    file.write( "#define " + problemBaseName + "PROBLEM_IMPL_H_\n" )
+    file.write( "\n" )
+    file.write( "#include <core/mfilename.h>\n" )
+    file.write( "#include <matrices/tnlMatrixSetter.h>\n" )
+    file.write( "#include <solvers/pde/tnlExplicitUpdater.h>\n" )
+    file.write( "#include <solvers/pde/tnlLinearSystemAssembler.h>\n" )
+    file.write( "#include <solvers/pde/tnlBackwardTimeDiscretisation.h>\n" )
+    file.write( "\n" )
+    file.write( "template< typename Mesh,\n" )
+    file.write( "          typename BoundaryCondition,\n" )
+    file.write( "          typename RightHandSide,\n" )
+    file.write( "          typename DifferentialOperator >\n" )
+    file.write( "tnlString\n" )
+    file.write( problemBaseName + "Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::\n" )
+    file.write( "getTypeStatic()\n" )
+    file.write( "{\n" )
+    file.write( "   return tnlString( \"" + problemBaseName + "Problem< \" ) + Mesh :: getTypeStatic() + \" >\";\n" )
+    file.write( "}\n" )
+    file.write( "\n" )
+    file.write( "template< typename Mesh,\n" )
+    file.write( "          typename BoundaryCondition,\n" )
+    file.write( "          typename RightHandSide,\n" )
+    file.write( "          typename DifferentialOperator >\n" )
+    file.write( "tnlString\n" )
+    file.write( problemBaseName + "Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::\n" )
+    file.write( "getPrologHeader() const\n" )
+    file.write( "{\n" )    
+    file.write( "   return tnlString( \"" + problemName + "\" );\n" )
+    file.write( "}\n" )
+    file.write( "\n" )
+    file.write( "template< typename Mesh,\n" )
+    file.write( "          typename BoundaryCondition,\n" )
+    file.write( "          typename RightHandSide,\n" )
+    file.write( "          typename DifferentialOperator >\n" )
+    file.write( "void\n" )
+    file.write( problemBaseName + "Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::\n" )    
+    file.write( "writeProlog( tnlLogger& logger, const tnlParameterContainer& parameters ) const\n" )
+    file.write( "{\n" )
+    file.write( "   /****\n" )
+    file.write( "    * Add data you want to have in the computation report (log) as follows:\n" )
+    file.write( "    * logger.writeParameter< double >( \"Parameter description\", parameter );\n" )
+    file.write( "    */\n" )
+    file.write( "}\n" )
+    file.write( "\n" )
+    file.write( "template< typename Mesh,\n" )
+    file.write( "          typename BoundaryCondition,\n" )
+    file.write( "          typename RightHandSide,\n" )
+    file.write( "          typename DifferentialOperator >\n" )
+    file.write( "bool\n" )
+    file.write( problemBaseName + "Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::\n" )        
+    file.write( "setup( const tnlParameterContainer& parameters )\n" )
+    file.write( "{\n" )
+    file.write( "   if( ! this->boundaryCondition.setup( parameters, \"boundary-conditions-\" ) ||\n" )
+    file.write( "       ! this->rightHandSide.setup( parameters, \"right-hand-side-\" ) )\n" )
+    file.write( "      return false;\n" )
+    file.write( "   return true;\n" )
+    file.write( "}\n" )
+    file.write( "\n" )
+    file.write( "template< typename Mesh,\n" )
+    file.write( "          typename BoundaryCondition,\n" )
+    file.write( "          typename RightHandSide,\n" )
+    file.write( "          typename DifferentialOperator >\n" )
+    file.write( "typename " + problemBaseName + "Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::IndexType\n" )
+    file.write( problemBaseName + "Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::\n" )            
+    file.write( "getDofs( const MeshType& mesh ) const\n" )
+    file.write( "{\n" )
+    file.write( "   /****\n" )
+    file.write( "    * Return number of DOFs (degrees of freedom) i.e. number\n" )
+    file.write( "    * of unknowns to be resolved by the main solver.\n" )
+    file.write( "    */\n" )
+    file.write( "   return mesh.getNumberOfCells();\n" )
+    file.write( "}\n" )
+    file.write( "\n" )
+    file.write( "template< typename Mesh,\n" )
+    file.write( "          typename BoundaryCondition,\n" )
+    file.write( "          typename RightHandSide,\n" )
+    file.write( "          typename DifferentialOperator >\n" )
+    file.write( "void\n" )
+    file.write( problemBaseName + "Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::\n" )            
+    file.write( "bindDofs( const MeshType& mesh,\n" )
+    file.write( "          DofVectorType& dofVector )\n" )    
+    file.write( "{\n" )
+    file.write( "}\n" )
+    file.write( "\n" )
+    file.write( "template< typename Mesh,\n" )
+    file.write( "          typename BoundaryCondition,\n" )
+    file.write( "          typename RightHandSide,\n" )
+    file.write( "          typename DifferentialOperator >\n" )
+    file.write( "bool\n" )
+    file.write( problemBaseName + "Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::\n" )            
+    file.write( "setInitialCondition( const tnlParameterContainer& parameters,\n" )    
+    file.write( "                     const MeshType& mesh,\n" )
+    file.write( "                     DofVectorType& dofs,\n" )
+    file.write( "                     MeshDependentDataType& meshDependentData )\n" )
+    file.write( "{\n" )
+    file.write( "   const tnlString& initialConditionFile = parameters.getParameter< tnlString >( \"initial-condition\" );\n" )
+    file.write( "   if( ! dofs.load( initialConditionFile ) )\n" )
+    file.write( "   {\n" )
+    file.write( "      cerr << \"I am not able to load the initial condition from the file \" << initialConditionFile << \".\" << endl;\n" )
+    file.write( "      return false;\n" )
+    file.write( "   }\n" )
+    file.write( "   return true; \n" )
+    file.write( "}\n" )
+    file.write( "\n" )
+    file.write( "template< typename Mesh,\n" )
+    file.write( "          typename BoundaryCondition,\n" )
+    file.write( "          typename RightHandSide,\n" )
+    file.write( "          typename DifferentialOperator >\n" )
+    file.write( "   template< typename Matrix >\n" )
+    file.write( "bool\n" )
+    file.write( problemBaseName + "Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::\n" )                
+    file.write( "setupLinearSystem( const MeshType& mesh,\n" )
+    file.write( "                   Matrix& matrix )\n" )
+    file.write( "{\n" )
+    file.write( "   const IndexType dofs = this->getDofs( mesh );\n" )
+    file.write( "   typedef typename Matrix::RowLengthsVector RowLengthsVectorType;\n" )
+    file.write( "   RowLengthsVectorType rowLengths;\n" )
+    file.write( "   if( ! rowLengths.setSize( dofs ) )\n" )
+    file.write( "      return false;\n" )
+    file.write( "   tnlMatrixSetter< MeshType, DifferentialOperator, BoundaryCondition, RowLengthsVectorType > matrixSetter;\n" )
+    file.write( "   matrixSetter.template getRowLengths< Mesh::Dimensions >( mesh,\n" )
+    file.write( "                                                            differentialOperator,\n" )
+    file.write( "                                                            boundaryCondition,\n" )
+    file.write( "                                                            rowLengths );\n" )
+    file.write( "   matrix.setDimensions( dofs, dofs );\n" )
+    file.write( "   if( ! matrix.setRowLengths( rowLengths ) )\n" )
+    file.write( "      return false;\n" )
+    file.write( "   return true;\n" )
+    file.write( "}\n" )
+    file.write( "\n" )
+    file.write( "template< typename Mesh,\n" )
+    file.write( "          typename BoundaryCondition,\n" )
+    file.write( "          typename RightHandSide,\n" )
+    file.write( "          typename DifferentialOperator >\n" )
+    file.write( "bool\n" )
+    file.write( problemBaseName + "Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::\n" )                    
+    file.write( "makeSnapshot( const RealType& time,\n" )
+    file.write( "              const IndexType& step,\n" )
+    file.write( "              const MeshType& mesh,\n" )
+    file.write( "              DofVectorType& dofs,\n" )
+    file.write( "              MeshDependentDataType& meshDependentData )\n" )
+    file.write( "{\n" )
+    file.write( "   cout << endl << \"Writing output at time \" << time << \" step \" << step << \".\" << endl;\n" )
+    file.write( "   this->bindDofs( mesh, dofs );\n" )
+    file.write( "   tnlString fileName;\n" )
+    file.write( "   FileNameBaseNumberEnding( \"u-\", step, 5, \".tnl\", fileName );\n" )
+    file.write( "   if( ! dofs.save( fileName ) )\n" )
+    file.write( "      return false;\n" )
+    file.write( "   return true;\n" )
+    file.write( "}\n" )
+    file.write( "\n" )
+    file.write( "template< typename Mesh,\n" )
+    file.write( "          typename BoundaryCondition,\n" )
+    file.write( "          typename RightHandSide,\n" )
+    file.write( "          typename DifferentialOperator >\n" )
+    file.write( "void\n" )
+    file.write( problemBaseName + "Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::\n" )                    
+    file.write( "getExplicitRHS( const RealType& time,\n" )
+    file.write( "                const RealType& tau,\n" )
+    file.write( "                const MeshType& mesh,\n" )
+    file.write( "                DofVectorType& u,\n" )
+    file.write( "                DofVectorType& fu )\n" )
+    file.write( "{\n" )
+    file.write( "   /****\n" )
+    file.write( "    * If you use an explicit solver like tnlEulerSolver or tnlMersonSolver, you\n" )
+    file.write( "    * need to implement this method. Compute the right-hand side of\n" )
+    file.write( "    *\n" )
+    file.write( "    *   d/dt u(x) = fu( x, u )\n" )
+    file.write( "    *\n" )
+    file.write( "    * You may use supporting mesh dependent data if you need.\n" )
+    file.write( "    */\n" )
+    file.write( "\n" )
+    file.write( "   this->bindDofs( mesh, u );\n" )
+    file.write( "   tnlExplicitUpdater< Mesh, DofVectorType, DifferentialOperator, BoundaryCondition, RightHandSide > explicitUpdater;\n" )
+    file.write( "   explicitUpdater.template update< Mesh::Dimensions >( time,\n" )
+    file.write( "                                                        mesh,\n" )
+    file.write( "                                                        this->differentialOperator,\n" )
+    file.write( "                                                        this->boundaryCondition,\n" )
+    file.write( "                                                        this->rightHandSide,\n" )
+    file.write( "                                                        u,\n" )
+    file.write( "                                                        fu );\n" )
+    file.write( "}\n" )
+    file.write( "\n" )
+    file.write( "template< typename Mesh,\n" )
+    file.write( "          typename BoundaryCondition,\n" )
+    file.write( "          typename RightHandSide,\n" )
+    file.write( "          typename DifferentialOperator >\n" )
+    file.write( "   template< typename Matrix >\n" )
+    file.write( "void\n" )
+    file.write( problemBaseName + "Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::\n" )                
+    file.write( "assemblyLinearSystem( const RealType& time,\n" )
+    file.write( "                      const RealType& tau,\n" )
+    file.write( "                      const MeshType& mesh,\n" )
+    file.write( "                      DofVectorType& u,\n" )
+    file.write( "                      DofVectorType& auxDofs,\n" )
+    file.write( "                      Matrix& matrix,\n" )
+    file.write( "                      DofVectorType& b )\n" )
+    file.write( "{\n" )
+    file.write( "   tnlLinearSystemAssembler< Mesh,\n" )
+    file.write( "                             DofVectorType,\n" )
+    file.write( "                             DifferentialOperator,\n" )
+    file.write( "                             BoundaryCondition,\n" )
+    file.write( "                             RightHandSide,\n" )
+    file.write( "                             tnlBackwardTimeDiscretisation,\n" )
+    file.write( "                             Matrix > systemAssembler;\n" )
+    file.write( "   systemAssembler.template assembly< Mesh::Dimensions >( time,\n" )
+    file.write( "                                                          tau,\n" )
+    file.write( "                                                          mesh,\n" )
+    file.write( "                                                          this->differentialOperator,\n" )
+    file.write( "                                                          this->boundaryCondition,\n" )
+    file.write( "                                                          this->rightHandSide,\n" )
+    file.write( "                                                          u,\n" )
+    file.write( "                                                          matrix,\n" )
+    file.write( "                                                          b );\n" )
+    file.write( "}\n" )
+    file.write( "\n" )
+    file.write( "#endif /* " + problemBaseName + "PROBLEM_IMPL_H_ */\n" )
+    file.close()
+
def generateOperatorGridSpecializationHeader( file, operatorName, dimensions ):
    """
    Write the C++ declaration of the operator class specialized for tnlGrid.

    The generated class declares getType(), getValue() (explicit form),
    getLinearSystemRowLength() and updateLinearSystem() (implicit form).

    :param file: open, writable file object receiving the generated C++ code
    :param operatorName: name of the generated operator class (C++ identifier)
    :param dimensions: grid dimension as a string ("1", "2" or "3")
    """
    file.write( "template< typename MeshReal,\n" )
    file.write( "          typename Device,\n" )
    file.write( "          typename MeshIndex,\n" )
    file.write( "          typename Real,\n" )
    file.write( "          typename Index >\n" )
    # Fixed: missing space after the comma ("N,MeshReal") to match the
    # "tnlGrid< N, MeshReal, ... >" spelling used everywhere else in the
    # generated sources (typedef below and the _impl.h definitions).
    file.write( "class " + operatorName + "< tnlGrid< " + dimensions + ", MeshReal, Device, MeshIndex >, Real, Index >\n" )
    file.write( "{\n" )
    file.write( "   public:\n" )
    file.write( "      typedef tnlGrid< " + dimensions + ", MeshReal, Device, MeshIndex > MeshType;\n" )
    file.write( "      typedef typename MeshType::CoordinatesType CoordinatesType;\n" )
    file.write( "      typedef Real RealType;\n" )
    file.write( "      typedef Device DeviceType;\n" )
    file.write( "      typedef Index IndexType;\n" )
    file.write( "      enum { Dimensions = MeshType::Dimensions };\n" )
    file.write( "\n" )
    file.write( "      static tnlString getType();\n" )
    file.write( "\n" )
    # Explicit form of the operator (used by explicit time solvers).
    file.write( "      template< typename Vector >\n" )
    file.write( "      __cuda_callable__\n" )
    file.write( "      Real getValue( const MeshType& mesh,\n" )
    file.write( "                     const IndexType cellIndex,\n" )
    file.write( "                     const CoordinatesType& coordinates,\n" )
    file.write( "                     const Vector& u,\n" )
    file.write( "                     const RealType& time ) const;\n" )
    file.write( "\n" )
    # Implicit form: linear-system row setup (used by implicit time solvers).
    file.write( "      __cuda_callable__\n" )
    file.write( "      Index getLinearSystemRowLength( const MeshType& mesh,\n" )
    file.write( "                                      const IndexType& index,\n" )
    file.write( "                                      const CoordinatesType& coordinates ) const;\n" )
    file.write( "\n" )
    file.write( "      template< typename Vector, typename MatrixRow >\n" )
    file.write( "      __cuda_callable__\n" )
    file.write( "      void updateLinearSystem( const RealType& time,\n" )
    file.write( "                               const RealType& tau,\n" )
    file.write( "                               const MeshType& mesh,\n" )
    file.write( "                               const IndexType& index,\n" )
    file.write( "                               const CoordinatesType& coordinates,\n" )
    file.write( "                               Vector& u,\n" )
    file.write( "                               Vector& b,\n" )
    file.write( "                               MatrixRow& matrixRow ) const;\n" )
    file.write( "};\n" )
    file.write( "\n" )
+    
def generateOperatorGridSpecializationImplementation( file, operatorName, dimensions ):
    """
    Write the C++ definitions of the operator's tnlGrid specialization:
    getType(), getValue(), getLinearSystemRowLength() and updateLinearSystem().
    The generated bodies implement the Laplace operator discretized by finite
    differences as an example the user is expected to replace.

    :param file: open, writable file object receiving the generated C++ code
    :param operatorName: name of the generated operator class (C++ identifier)
    :param dimensions: grid dimension as a string ("1", "2" or "3")
    """
    # ---- getType() ----
    file.write( "template< typename MeshReal,\n" )
    file.write( "          typename Device,\n" )
    file.write( "          typename MeshIndex,\n" )
    file.write( "          typename Real,\n" )
    file.write( "          typename Index >\n" )
    file.write( "tnlString\n" )
    file.write( operatorName + "< tnlGrid< " + dimensions + ", MeshReal, Device, MeshIndex >, Real, Index >::\n" )
    file.write( "getType()\n" )
    file.write( "{\n" )
    file.write( "   return tnlString( \"" + operatorName + "< \" ) +\n" )
    file.write( "          MeshType::getType() + \", \" +\n" )
    file.write( "          ::getType< Real >() + \", \" +\n" )
    file.write( "          ::getType< Index >() + \" >\";\n" )
    file.write( "}\n" )
    file.write( "\n" )
    # ---- getValue(): explicit finite-difference Laplacian ----
    file.write( "template< typename MeshReal,\n" )
    file.write( "          typename Device,\n" )
    file.write( "          typename MeshIndex,\n" )
    file.write( "          typename Real,\n" )
    file.write( "          typename Index >\n" )
    file.write( "template< typename Vector >\n" )
    file.write( "__cuda_callable__\n" )
    file.write( "Real\n" )
    file.write( operatorName + "< tnlGrid< " + dimensions + ", MeshReal, Device, MeshIndex >, Real, Index >::\n" )
    file.write( "getValue( const MeshType& mesh,\n" )
    file.write( "          const IndexType cellIndex,\n" )
    file.write( "          const CoordinatesType& coordinates,\n" )
    file.write( "          const Vector& u,\n" )
    file.write( "          const Real& time ) const\n" )
    file.write( "{\n" )
    file.write( "   /****\n" )
    file.write( "    * Implement your explicit form of the differential operator here.\n" )
    file.write( "    * The following example is the Laplace operator approximated \n" )
    file.write( "    * by the Finite difference method.\n" )
    file.write( "    */\n" )
    file.write( "\n" )
    # The dimension cases are mutually exclusive -> elif chain.
    if dimensions == "1":
        file.write( "   return ( u[ mesh.template getCellNextToCell< -1 >( cellIndex ) ]\n" )
        file.write( "            - 2.0 * u[ cellIndex ]\n" )
        file.write( "            + u[ mesh.template getCellNextToCell< 1 >( cellIndex ) ] ) * mesh.getHxSquareInverse();\n" )
    elif dimensions == "2":
        file.write( "   return ( u[ mesh.template getCellNextToCell< -1, 0 >( cellIndex ) ]\n" )
        file.write( "            - 2.0 * u[ cellIndex ]\n" )
        file.write( "            + u[ mesh.template getCellNextToCell< 1, 0 >( cellIndex ) ] ) * mesh.getHxSquareInverse() +\n" )
        file.write( "           ( u[ mesh.template getCellNextToCell< 0, -1 >( cellIndex ) ]\n" )
        file.write( "             - 2.0 * u[ cellIndex ]\n" )
        file.write( "             + u[ mesh.template getCellNextToCell< 0, 1 >( cellIndex ) ] ) * mesh.getHySquareInverse();\n" )
    elif dimensions == "3":
        file.write( "   return ( u[ mesh.template getCellNextToCell< -1, 0, 0 >( cellIndex ) ]\n" )
        file.write( "            - 2.0 * u[ cellIndex ]\n" )
        file.write( "            + u[ mesh.template getCellNextToCell< 1, 0, 0 >( cellIndex ) ] ) * mesh.getHxSquareInverse() +\n" )
        file.write( "          ( u[ mesh.template getCellNextToCell< 0, -1, 0 >( cellIndex ) ]\n" )
        file.write( "            - 2.0 * u[ cellIndex ]\n" )
        file.write( "            + u[ mesh.template getCellNextToCell< 0, 1, 0 >( cellIndex ) ] ) * mesh.getHySquareInverse() +\n" )
        file.write( "          ( u[ mesh.template getCellNextToCell< 0, 0, -1 >( cellIndex ) ]\n" )
        file.write( "            - 2.0 * u[ cellIndex ]\n" )
        file.write( "            + u[ mesh.template getCellNextToCell< 0, 0, 1 >( cellIndex ) ] ) * mesh.getHzSquareInverse();\n" )
    file.write( "}\n" )
    file.write( "\n" )
    # ---- getLinearSystemRowLength() ----
    file.write( "template< typename MeshReal,\n" )
    file.write( "          typename Device,\n" )
    file.write( "          typename MeshIndex,\n" )
    file.write( "          typename Real,\n" )
    file.write( "          typename Index >\n" )
    file.write( "__cuda_callable__\n" )
    file.write( "Index\n" )
    file.write( operatorName + "< tnlGrid< " + dimensions + ", MeshReal, Device, MeshIndex >, Real, Index >::\n" )
    file.write( "getLinearSystemRowLength( const MeshType& mesh,\n" )
    file.write( "                          const IndexType& index,\n" )
    file.write( "                          const CoordinatesType& coordinates ) const\n" )
    file.write( "{\n" )
    file.write( "   /****\n" )
    file.write( "    * Return a number of non-zero elements in a line (associated with given grid element) of\n" )
    file.write( "    * the linear system.\n" )
    file.write( "    * The following example is the Laplace operator approximated \n" )
    file.write( "    * by the Finite difference method.\n" )
    file.write( "    */\n" )
    file.write( "\n" )
    file.write( "   return 2*Dimensions + 1;\n" )
    file.write( "}\n" )
    file.write( "\n" )
    # ---- updateLinearSystem(): matrix-row stencil for implicit solvers ----
    file.write( "template< typename MeshReal,\n" )
    file.write( "          typename Device,\n" )
    file.write( "          typename MeshIndex,\n" )
    file.write( "          typename Real,\n" )
    file.write( "          typename Index >\n" )
    file.write( "   template< typename Vector, typename MatrixRow >\n" )
    file.write( "__cuda_callable__\n" )
    file.write( "void\n" )
    file.write( operatorName + "< tnlGrid< " + dimensions + ", MeshReal, Device, MeshIndex >, Real, Index >::\n" )
    file.write( "updateLinearSystem( const RealType& time,\n" )
    file.write( "                    const RealType& tau,\n" )
    file.write( "                    const MeshType& mesh,\n" )
    file.write( "                    const IndexType& index,\n" )
    file.write( "                    const CoordinatesType& coordinates,\n" )
    file.write( "                    Vector& u,\n" )
    file.write( "                    Vector& b,\n" )
    file.write( "                    MatrixRow& matrixRow ) const\n" )
    file.write( "{\n" )
    file.write( "   /****\n" )
    file.write( "    * Setup the non-zero elements of the linear system here.\n" )
    # Fixed typo in the generated comment: "appriximated" -> "approximated".
    file.write( "    * The following example is the Laplace operator approximated \n" )
    file.write( "    * by the Finite difference method.\n" )
    file.write( "    */\n" )
    file.write( "\n" )
    if dimensions == "1":
        file.write( "   const RealType lambdaX = tau * mesh.getHxSquareInverse();\n" )
        file.write( "   matrixRow.setElement( 0, mesh.template getCellNextToCell< -1 >( index ),     - lambdaX );\n" )
        file.write( "   matrixRow.setElement( 1, index,                             2.0 * lambdaX );\n" )
        file.write( "   matrixRow.setElement( 2, mesh.template getCellNextToCell< 1 >( index ),       - lambdaX );\n" )
    elif dimensions == "2":
        file.write( "   const RealType lambdaX = tau * mesh.getHxSquareInverse();\n" )
        file.write( "   const RealType lambdaY = tau * mesh.getHySquareInverse();\n" )
        file.write( "   matrixRow.setElement( 0, mesh.template getCellNextToCell< 0, -1 >( index ), -lambdaY );\n" )
        file.write( "   matrixRow.setElement( 1, mesh.template getCellNextToCell< -1, 0 >( index ), -lambdaX );\n" )
        file.write( "   matrixRow.setElement( 2, index,                                             2.0 * ( lambdaX + lambdaY ) );\n" )
        file.write( "   matrixRow.setElement( 3, mesh.template getCellNextToCell< 1, 0 >( index ),   -lambdaX );\n" )
        file.write( "   matrixRow.setElement( 4, mesh.template getCellNextToCell< 0, 1 >( index ),   -lambdaY );\n" )
    elif dimensions == "3":
        file.write( "   const RealType lambdaX = tau * mesh.getHxSquareInverse();\n" )
        file.write( "   const RealType lambdaY = tau * mesh.getHySquareInverse();\n" )
        file.write( "   const RealType lambdaZ = tau * mesh.getHzSquareInverse();\n" )
        file.write( "   matrixRow.setElement( 0, mesh.template getCellNextToCell< 0, 0, -1 >( index ), -lambdaZ );\n" )
        file.write( "   matrixRow.setElement( 1, mesh.template getCellNextToCell< 0, -1, 0 >( index ), -lambdaY );\n" )
        file.write( "   matrixRow.setElement( 2, mesh.template getCellNextToCell< -1, 0, 0 >( index ), -lambdaX );\n" )
        file.write( "   matrixRow.setElement( 3, index,                             2.0 * ( lambdaX + lambdaY + lambdaZ ) );\n" )
        file.write( "   matrixRow.setElement( 4, mesh.template getCellNextToCell< 1, 0, 0 >( index ),   -lambdaX );\n" )
        file.write( "   matrixRow.setElement( 5, mesh.template getCellNextToCell< 0, 1, 0 >( index ),   -lambdaY );\n" )
        file.write( "   matrixRow.setElement( 6, mesh.template getCellNextToCell< 0, 0, 1 >( index ),   -lambdaZ );\n" )
    file.write( "}\n" )
    file.write( "\n" )
+    
def generateOperator( operatorName ):
    """
    Generate <operatorName>.h and <operatorName>_impl.h containing the
    generic (empty) operator class template plus its tnlGrid specializations
    for 1D, 2D and 3D grids.

    :param operatorName: name of the generated operator class (C++ identifier)
    """
    # Header file: generic template and declarations of the specializations.
    # 'with' guarantees the file is closed even if a write fails.
    with open( operatorName + ".h", "w" ) as file:
        file.write( "#ifndef " + operatorName + "_H\n" )
        file.write( "#define " + operatorName + "_H\n" )
        file.write( "\n" )
        file.write( "#include <core/vectors/tnlVector.h>\n" )
        file.write( "#include <mesh/tnlGrid.h>\n" )
        file.write( "\n" )
        file.write( "template< typename Mesh,\n" )
        file.write( "          typename Real = typename Mesh::RealType,\n" )
        file.write( "          typename Index = typename Mesh::IndexType >\n" )
        file.write( "class " + operatorName + "\n" )
        file.write( "{\n" )
        file.write( "};\n" )
        file.write( "\n" )
        generateOperatorGridSpecializationHeader( file, operatorName, "1" )
        generateOperatorGridSpecializationHeader( file, operatorName, "2" )
        generateOperatorGridSpecializationHeader( file, operatorName, "3" )
        file.write( "\n" )
        file.write( "#include \"" + operatorName + "_impl.h\"\n" )
        file.write( "\n" )
        file.write( "#endif\t/* " + operatorName + "_H */\n" )
    # Implementation file: definitions of the specializations per dimension.
    with open( operatorName + "_impl.h", "w" ) as file:
        file.write( "#ifndef " + operatorName + "_IMPL_H\n" )
        file.write( "#define " + operatorName + "_IMPL_H\n" )
        file.write( "\n" )
        file.write( "/****\n" )
        file.write( " * 1D problem\n" )
        file.write( " */\n" )
        generateOperatorGridSpecializationImplementation( file, operatorName, "1" )
        file.write( "/****\n" )
        file.write( " * 2D problem\n" )
        file.write( " */\n" )
        generateOperatorGridSpecializationImplementation( file, operatorName, "2" )
        file.write( "/****\n" )
        file.write( " * 3D problem\n" )
        file.write( " */\n" )
        generateOperatorGridSpecializationImplementation( file, operatorName, "3" )
        # Fixed: the closing comment used to say "<name>IMPL_H", which did not
        # match the "<name>_IMPL_H" include guard defined above.
        file.write( "#endif\t/* " + operatorName + "_IMPL_H */\n" )
        file.write( "\n" )
+    
def generateRhs( problemBaseName ):
    """
    Generate <problemBaseName>Rhs.h defining the right-hand-side functor of
    the problem; the generated getValue() stub returns 0.0 for the user to
    replace with a real source term.

    :param problemBaseName: base name of the generated C++ class and header
    """
    # 'with' guarantees the file is closed even if a write fails.
    with open( problemBaseName + "Rhs.h", "w" ) as file:
        file.write( "#ifndef " + problemBaseName + "RHS_H_\n" )
        file.write( "#define " + problemBaseName + "RHS_H_\n" )
        file.write( "\n" )
        file.write( "class " + problemBaseName + "Rhs\n" )
        file.write( "{\n" )
        file.write( "   public:\n" )
        file.write( "      bool setup( const tnlParameterContainer& parameters,\n" )
        file.write( "                  const tnlString& prefix = \"\" )\n" )
        file.write( "      {\n" )
        file.write( "         return true;\n" )
        file.write( "      }\n" )
        file.write( "\n" )
        file.write( "      template< typename Mesh,\n" )
        file.write( "                typename Index,\n" )
        file.write( "                typename Real >\n" )
        file.write( "      __cuda_callable__\n" )
        file.write( "      Real getValue( const Mesh& mesh,\n" )
        file.write( "                     const Index& index,\n" )
        file.write( "                     const Real& time ) const\n" )
        file.write( "      {\n" )
        file.write( "         typedef typename Mesh::VertexType VertexType;\n" )
        file.write( "         VertexType v = mesh.template getCellCenter< VertexType >( index );\n" )
        file.write( "         return 0.0;\n" )
        file.write( "      };\n" )
        file.write( "};\n" )
        file.write( "\n" )
        file.write( "#endif /* " + problemBaseName + "RHS_H_ */\n" )
+
def generateRunScript( problemBaseName, executableName = None ):
    """
    Generate an executable bash script 'run-<problemBaseName>' that sets up
    a 2D grid, creates a sin-wave initial condition and runs the solver.

    :param problemBaseName: base name used in the generated script file name
    :param executableName: name of the solver binary invoked by the script;
                           when None, falls back to the module-level global
                           'problemName' (the original behaviour, kept for
                           backward compatibility with existing callers)
    """
    import os
    if executableName is None:
        # The original code silently depended on the global 'problemName'
        # set by the interactive prompts at module level; keep that as the
        # default so the one-argument call still works.
        executableName = problemName
    scriptName = "run-" + problemBaseName
    with open( scriptName, "w" ) as file:
        file.write( "#!/usr/bin/env bash\n" )
        file.write( "\n" )
        file.write( "tnl-grid-setup --dimensions 2 \\\n" )
        file.write( "               --origin-x 0.0 \\\n" )
        file.write( "               --origin-y 0.0 \\\n" )
        file.write( "               --proportions-x 1.0 \\\n" )
        file.write( "               --proportions-y 1.0 \\\n" )
        file.write( "               --size-x 100 \\\n" )
        file.write( "               --size-y 100\n" )
        file.write( "\n" )
        file.write( "tnl-init --test-function sin-wave \\\n" )
        file.write( "         --output-file init.tnl\n" )
        file.write( "./" + executableName + " --time-discretisation explicit \\\n" )
        file.write( "              --discrete-solver merson \\\n" )
        file.write( "              --snapshot-period 0.01 \\\n" )
        file.write( "              --final-time 1.0\n" )
        file.write( "\n" )
        file.write( "tnl-view --mesh mesh.tnl --input-files *tnl     \n" )
    # The script starts with a shebang and is meant to be run directly,
    # so mark it executable.
    os.chmod( scriptName, 0o755 )
+    
+print( "TNL Quickstart -- solver generator")
+print( "----------------------------------")
+problemName = input( "Problam name:" )
+problemBaseName = input( "Problem class base name (base name acceptable in C++ code):" )
+operatorName = input( "Operator name:")
+generateMakefile( problemBaseName )
+generateMain( problemName, problemBaseName, operatorName )
+generateProblem( problemName, problemBaseName )
+generateOperator( operatorName )
+generateRhs( problemBaseName )
+generateRunScript( problemBaseName )
\ No newline at end of file