diff --git a/.cproject b/.cproject
deleted file mode 100644
index 2e42104373e8b53683590425ef365ed94e058198..0000000000000000000000000000000000000000
--- a/.cproject
+++ /dev/null
@@ -1,66 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
-	<storageModule moduleId="org.eclipse.cdt.core.settings">
-		<cconfiguration id="cdt.managedbuild.toolchain.gnu.base.2134791445">
-			<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.toolchain.gnu.base.2134791445" moduleId="org.eclipse.cdt.core.settings" name="Default">
-				<externalSettings/>
-				<extensions>
-					<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
-					<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
-					<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
-					<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
-					<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
-					<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
-				</extensions>
-			</storageModule>
-			<storageModule moduleId="cdtBuildSystem" version="4.0.0">
-				<configuration buildProperties="" description="" id="cdt.managedbuild.toolchain.gnu.base.2134791445" name="Default" parent="org.eclipse.cdt.build.core.emptycfg">
-					<folderInfo id="cdt.managedbuild.toolchain.gnu.base.2134791445.1799170915" name="/" resourcePath="">
-						<toolChain id="cdt.managedbuild.toolchain.gnu.base.1762402759" name="cdt.managedbuild.toolchain.gnu.base" superClass="cdt.managedbuild.toolchain.gnu.base">
-							<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.base.840154721" name="Debug Platform" osList="linux,hpux,aix,qnx" superClass="cdt.managedbuild.target.gnu.platform.base"/>
-							<builder id="cdt.managedbuild.target.gnu.builder.base.573876641" keepEnvironmentInBuildfile="false" managedBuildOn="false" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.base">
-								<outputEntries>
-									<entry flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="outputPath" name=""/>
-								</outputEntries>
-							</builder>
-							<tool id="cdt.managedbuild.tool.gnu.archiver.base.958650307" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
-							<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.base.1237330740" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.base">
-								<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1856262209" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
-							</tool>
-							<tool id="cdt.managedbuild.tool.gnu.c.compiler.base.315517693" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.base">
-								<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.950045328" superClass="cdt.managedbuild.tool.gnu.c.compiler.input"/>
-							</tool>
-							<tool id="cdt.managedbuild.tool.gnu.c.linker.base.1786292703" name="GCC C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.base"/>
-							<tool id="cdt.managedbuild.tool.gnu.cpp.linker.base.1133414639" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.base">
-								<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.692614544" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
-									<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
-									<additionalInput kind="additionalinput" paths="$(LIBS)"/>
-								</inputType>
-							</tool>
-							<tool id="cdt.managedbuild.tool.gnu.assembler.base.336757143" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.base">
-								<inputType id="cdt.managedbuild.tool.gnu.assembler.input.1329383797" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
-							</tool>
-						</toolChain>
-					</folderInfo>
-					<sourceEntries>
-						<entry excluding="tnlLinearDiffusionTest.cu" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
-					</sourceEntries>
-				</configuration>
-			</storageModule>
-			<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
-		</cconfiguration>
-	</storageModule>
-	<storageModule moduleId="cdtBuildSystem" version="4.0.0">
-		<project id="tnl.null.219466708" name="tnl"/>
-	</storageModule>
-	<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
-	<storageModule moduleId="scannerConfiguration">
-		<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
-		<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.base.2134791445;cdt.managedbuild.toolchain.gnu.base.2134791445.1799170915;cdt.managedbuild.tool.gnu.c.compiler.base.315517693;cdt.managedbuild.tool.gnu.c.compiler.input.950045328">
-			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
-		</scannerConfigBuildInfo>
-		<scannerConfigBuildInfo instanceId="cdt.managedbuild.toolchain.gnu.base.2134791445;cdt.managedbuild.toolchain.gnu.base.2134791445.1799170915;cdt.managedbuild.tool.gnu.cpp.compiler.base.1237330740;cdt.managedbuild.tool.gnu.cpp.compiler.input.1856262209">
-			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
-		</scannerConfigBuildInfo>
-	</storageModule>
-</cproject>
diff --git a/.gitignore b/.gitignore
index 95228a33e88d9365ddb32d498eea03a54192e3f0..26b082851c1968ea8a0896a009be495c73f884b5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,44 +1,8 @@
-
-# /
-/build
-/missing
-/Makefile.in
-/ltmain.sh
-/install-sh
-/depcomp
-/configure
-/config.*
-/aclocal.m4
-/m4
-/autom4te.cache
+.settings
+/nbproject
 /Debug
 /Release
-.settings
-
-.settings
-
-# /src/
-/src/Makefile.in
-
-# /src/core/
-/src/core/Makefile.in
-
-# /src/debug/
-/src/debug/Makefile.in
-
-# /src/diff/
-/src/diff/Makefile.in
-
-# /src/matrix/
-/src/matrix/*.in
-
-# /tools/
-/tools/Makefile.in
-
-# /tools/share/
-/tools/share/Makefile.in
-
-# /tools/src/
-/tools/src/Makefile.in
 /Testing
 /CMakeLists.txt.user
+/doc/_build
+/Build
diff --git a/AUTHORS b/AUTHORS
index 6c6172690fe420ea75aae907a6882c5612c8b076..c3a12fe499b7478498ddf2be897558e71884f731 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,5 +1,11 @@
 Oberhuber Tomas <tomas.oberhuber@fjfi.cvut.cz>
 Zabka Vitezslav <zabkavit@fjfi.cvut.cz>
+Vladimir Klement
+Tomáš Sobotík
+Ondřej Székely
+Jiří Kafka
+Libor Bakajsa
+Jakub Klinkovský
 Vacata Jan
 Heller Martin
 Novotny Matej
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d343d7ff6caa8defdd9897ef041e6502beee288a..e974244f955532becedeb0250912eae1a0ce6053 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,3 +1,17 @@
+###############################################################################
+#                      Cmake project script for TNL
+#                             -------------------
+#    begin               : Dec 8, 2010
+#    copyright           : (C) 2010 by Tomas Oberhuber et al.
+#    email               : tomas.oberhuber@fjfi.cvut.cz
+#
+###############################################################################
+#
+# Authors:
+# Tomas Oberhuber
+# Vladimir Klement
+# Jakub Klinkovsky
+
 cmake_minimum_required( VERSION 2.8.10 )
 
 project( tnl )
@@ -15,23 +29,22 @@ include( UseCodeCoverage )
 if( CMAKE_BUILD_TYPE STREQUAL "Debug")
     set( PROJECT_BUILD_PATH ${PROJECT_SOURCE_DIR}/Debug/src )
     set( PROJECT_TESTS_PATH ${PROJECT_SOURCE_DIR}/Debug/tests )
+    set( PROJECT_TOOLS_PATH ${PROJECT_SOURCE_DIR}/Debug/tools )
     set( LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/Debug/lib )
     set( EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/Debug/bin )
     set( debugExt -dbg )
-    AddCompilerFlag( "-g" )
+    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g ")
+    #AddCompilerFlag( "-g" )
 else()
     set( PROJECT_BUILD_PATH ${PROJECT_SOURCE_DIR}/Release/src )
     set( PROJECT_TESTS_PATH ${PROJECT_SOURCE_DIR}/Release/tests )
+    set( PROJECT_TOOLS_PATH ${PROJECT_SOURCE_DIR}/Release/tools )
     set( LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/Release/lib)
     set( EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/Release/bin)
-    OptimizeForArchitecture()
-    AddCompilerFlag( "-O3 -DNDEBUG" )
+    #OptimizeForArchitecture()
+    AddCompilerFlag( "-O3 -march=native -DNDEBUG" )
 endif()
 
-if( WITH_TEMPLATE_EXPLICIT_INSTANTIATION STREQUAL "yes" )
-   AddCompilerFlag( "-DTEMPLATE_EXPLICIT_INSTANTIATION " )
-endif()   
-
 #####
 # Check for CUDA
 #
@@ -42,12 +55,60 @@ if( WITH_CUDA STREQUAL "yes" )
         set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
         set(BUILD_SHARED_LIBS ON)
         set(CUDA_SEPARABLE_COMPILATION ON)
-        set(CUSPARSE_LIBRARY /usr/local/cuda/lib64/libcusparse.so) # TODO: fix this              
-        set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -DHAVE_CUDA )
-        AddCompilerFlag( "-DHAVE_NOT_CXX11 -U_GLIBCXX_ATOMIC_BUILTINS -U_GLIBCXX_USE_INT128 " )          
-        set( CUDA_ADD_EXECUTABLE_OPTIONS -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 )
-        set( CUDA_ADD_LIBRARY_OPTIONS -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -shared )
-        set( CUDA_LINKER_OPTIONS "-arch sm_20 -shared " )
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ; -DHAVE_CUDA )
+        AddCompilerFlag( "-DHAVE_NOT_CXX11" ) # -U_GLIBCXX_ATOMIC_BUILTINS -U_GLIBCXX_USE_INT128 " )
+        set( ALL_CUDA_ARCHS -gencode arch=compute_20,code=sm_20
+                            -gencode arch=compute_30,code=sm_30
+                            -gencode arch=compute_32,code=sm_32 
+                            -gencode arch=compute_35,code=sm_35 
+                            -gencode arch=compute_37,code=sm_37 
+                            -gencode arch=compute_50,code=sm_50 
+                            -gencode arch=compute_52,code=sm_52 )
+        if( WITH_CUDA_ARCH STREQUAL "all" )
+           set( CUDA_ARCH ${ALL_CUDA_ARCHS} )   
+        else()
+            if( WITH_CUDA_ARCH STREQUAL "auto")
+                ####
+                # Select GPU architecture
+                #
+                set( CUDA_ARCH_EXECUTABLE ${EXECUTABLE_OUTPUT_PATH}/tnl-cuda-arch)
+                set( CUDA_ARCH_SOURCE ${PROJECT_SOURCE_DIR}/tools/src/tnl-cuda-arch.cu)
+                message( "Compiling tnl-cuda-arch ..." )
+                file( MAKE_DIRECTORY ${EXECUTABLE_OUTPUT_PATH} )
+                execute_process( COMMAND nvcc ${CUDA_ARCH_SOURCE} -o ${CUDA_ARCH_EXECUTABLE}
+                                 RESULT_VARIABLE CUDA_ARCH_RESULT
+                                 OUTPUT_VARIABLE CUDA_ARCH_OUTPUT
+                                 ERROR_VARIABLE CUDA_ARCH_OUTPUT )
+                execute_process( COMMAND ${CUDA_ARCH_EXECUTABLE}
+                                 OUTPUT_VARIABLE CUDA_ARCH )
+                if( NOT CUDA_ARCH_RESULT )
+                    # strip linebreaks and convert to list delimited with ';'
+                    string( REGEX REPLACE "[\n ]" ";" CUDA_ARCH ${CUDA_ARCH} )
+                    # cache the result
+                    set( CUDA_ARCH ${CUDA_ARCH} CACHE LIST "GPU architecture options" )
+                else()
+                    message( "Failed to detect GPU architecture:\n${CUDA_ARCH_OUTPUT}" )
+                    message( "Using (almost) all GPU architectures as fallback." )
+                    set( CUDA_ARCH ${ALL_CUDA_ARCHS} )
+                endif()
+                message( "GPU architecture options:  ${CUDA_ARCH}" )
+            else()
+                set( CUDA_ARCH -gencode arch=compute_${WITH_CUDA_ARCH},code=sm_${WITH_CUDA_ARCH} )
+            endif()
+        endif()
+        set( CUDA_ADD_EXECUTABLE_OPTIONS ${CUDA_ARCH} )
+        set( CUDA_ADD_LIBRARY_OPTIONS ${CUDA_ARCH} -shared )
+        set( CUDA_LINKER_OPTIONS "-arch sm_20 -shared" )
+
+        ####
+        # Check for cuBLAS
+        #
+        if( WITH_CUBLAS STREQUAL "yes" ) 
+            message( "Enabling CUBLAS." )
+            set( HAVE_CUBLAS TRUE)
+            set( HAVE_CUBLAS "#define HAVE_CUBLAS" )
+        endif( WITH_CUBLAS STREQUAL "yes" )       
+
         ####
         # Check for CUSP
         #
@@ -78,6 +139,7 @@ if( WITH_CUDA STREQUAL "yes" )
                message( "CUSPARSE found. -- ${CUSPARSE_INCLUDE_DIR}" )
                set( HAVE_CUSPARSE "#define HAVE_CUSPARSE" )
                cuda_include_directories( ${CUSPARSE_INCLUDE_DIR} )
+               set( CUSPARSE_LIBRARY "${CUDA_cusparse_LIBRARY}" )
            endif()            
         endif( NOT WITH_CUSPARSE STREQUAL "no" )
    
@@ -85,6 +147,7 @@ if( WITH_CUDA STREQUAL "yes" )
       AddCompilerFlag( "-std=gnu++0x" )         
     endif( CUDA_FOUND )
 else( WITH_CUDA STREQUAL "yes" )
+   #AddCompilerFlag( "-std=gnu++0x -ftree-vectorizer-verbose=1" )       
    AddCompilerFlag( "-std=gnu++0x" )       
 endif( WITH_CUDA STREQUAL "yes" )    
 
@@ -93,8 +156,8 @@ endif( WITH_CUDA STREQUAL "yes" )
 #
 find_package( OpenMP ) 
 if( OPENMP_FOUND )
-   AddCompilerFlag( "-DHAVE_OPENMP -fopenmp" )
-# TODO: finish this
+   message( "Compiler supports OpenMP." )
+   set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_OPENMP -fopenmp")
 endif()
 
 ####
@@ -133,57 +196,58 @@ else()
     set( HAVE_SYS_RESOURCE_H "#define HAVE_SYS_RESOURCE_H" )
 endif()
 
-  
-
-
 ####
 # Check for cppunit
 #
-FIND_PATH(CPPUNIT_INCLUDE_DIR cppunit/TestCase.h
-  /usr/local/include
-  /usr/include
-  DOC "CppUnit headers."
-)
+if( WITH_TESTS STREQUAL "yes" )
+    FIND_PATH(CPPUNIT_INCLUDE_DIR cppunit/TestCase.h
+      /usr/local/include
+      /usr/include
+      DOC "CppUnit headers."
+    )
 
-####
-# With Win32, important to have both
-#
-if(WIN32)
-  FIND_LIBRARY(CPPUNIT_LIBRARY cppunit
-               ${CPPUNIT_INCLUDE_DIR}/../lib
-               /usr/local/lib
-               /usr/lib)
-  FIND_LIBRARY(CPPUNIT_DEBUG_LIBRARY cppunitd
-               ${CPPUNIT_INCLUDE_DIR}/../lib
-               /usr/local/lib
-               /usr/lib)
-else(WIN32)
-  # On unix system, debug and release have the same name
-  FIND_LIBRARY(CPPUNIT_LIBRARY cppunit
-               ${CPPUNIT_INCLUDE_DIR}/../lib
-               /usr/local/lib
-               /usr/lib)
-  FIND_LIBRARY(CPPUNIT_DEBUG_LIBRARY cppunit
-               ${CPPUNIT_INCLUDE_DIR}/../lib
-               /usr/local/lib
-               /usr/lib)
-endif(WIN32)
+    ####
+    # With Win32, important to have both
+    #
+    if(WIN32)
+      FIND_LIBRARY(CPPUNIT_LIBRARY cppunit
+                   ${CPPUNIT_INCLUDE_DIR}/../lib
+                   /usr/local/lib
+                   /usr/lib)
+      FIND_LIBRARY(CPPUNIT_DEBUG_LIBRARY cppunitd
+                   ${CPPUNIT_INCLUDE_DIR}/../lib
+                   /usr/local/lib
+                   /usr/lib)
+    else(WIN32)
+      # On unix system, debug and release have the same name
+      FIND_LIBRARY(CPPUNIT_LIBRARY cppunit
+                   ${CPPUNIT_INCLUDE_DIR}/../lib
+                   /usr/local/lib
+                   /usr/lib)
+      FIND_LIBRARY(CPPUNIT_DEBUG_LIBRARY cppunit
+                   ${CPPUNIT_INCLUDE_DIR}/../lib
+                   /usr/local/lib
+                   /usr/lib)
+    endif(WIN32)
 
 
-if( ${CPPUNIT_INCLUDE_DIR} STREQUAL "CPPUNIT_INCLUDE_DIR-NOTFOUND" )
-      message( "CPPUNIT not found." )
-      set( HAVE_CPPUNIT "//#define HAVE_CPPUNIT" )
-else( ${CPPUNIT_INCLUDE_DIR} STREQUAL "CPPUNIT_INCLUDE_DIR-NOTFOUND" )
-  message( "CPPUNIT headers found -- ${CPPUNIT_INCLUDE_DIR}" )
-  if(CPPUNIT_LIBRARY)
-    message( "CPPUNIT library found -- ${CPPUNIT_LIBRARY}" )
-    set(CPPUNIT_FOUND "YES")
-    set(CPPUNIT_LIBRARIES ${CPPUNIT_LIBRARY} ${CMAKE_DL_LIBS})
-    set(CPPUNIT_DEBUG_LIBRARIES ${CPPUNIT_DEBUG_LIBRARY}
-                                ${CMAKE_DL_LIBS})
-   set( HAVE_CPPUNIT "#define HAVE_CPPUNIT" )
-  endif(CPPUNIT_LIBRARY)
-endif( ${CPPUNIT_INCLUDE_DIR} STREQUAL "CPPUNIT_INCLUDE_DIR-NOTFOUND" )
+    if( ${CPPUNIT_INCLUDE_DIR} STREQUAL "CPPUNIT_INCLUDE_DIR-NOTFOUND" )
+          message( "CPPUNIT not found." )
+          set( HAVE_CPPUNIT "//#define HAVE_CPPUNIT" )
+    else( ${CPPUNIT_INCLUDE_DIR} STREQUAL "CPPUNIT_INCLUDE_DIR-NOTFOUND" )
+      message( "CPPUNIT headers found -- ${CPPUNIT_INCLUDE_DIR}" )
+      if(CPPUNIT_LIBRARY)
+        message( "CPPUNIT library found -- ${CPPUNIT_LIBRARY}" )
+        set(CPPUNIT_FOUND "YES")
+        set(CPPUNIT_LIBRARIES ${CPPUNIT_LIBRARY} ${CMAKE_DL_LIBS})
+        set(CPPUNIT_DEBUG_LIBRARIES ${CPPUNIT_DEBUG_LIBRARY}
+                                    ${CMAKE_DL_LIBS})
+       set( HAVE_CPPUNIT "#define HAVE_CPPUNIT" )
+      endif(CPPUNIT_LIBRARY)
+    endif( ${CPPUNIT_INCLUDE_DIR} STREQUAL "CPPUNIT_INCLUDE_DIR-NOTFOUND" )
+    ENABLE_TESTING()
+    INCLUDE( Dart )
+endif( WITH_TESTS STREQUAL "yes" )
 
 #if( BUILD_MPI )
 #   FIND_PATH( PETSC_INCLUDE_DIR petsc.h
@@ -210,9 +274,33 @@ endif( ${CPPUNIT_INCLUDE_DIR} STREQUAL "CPPUNIT_INCLUDE_DIR-NOTFOUND" )
 #   endif()
 #endif()
 
+####
+# Explicit template instantiation
+#
+if( WITH_TEMPLATE_INSTANTIATION STREQUAL "yes" )
+   AddCompilerFlag( "-DTEMPLATE_EXPLICIT_INSTANTIATION " )
+endif()   
+
+if( INSTANTIATE_INT STREQUAL "yes" )
+   AddCompilerFlag( "-DINSTANTIATE_INT " )
+endif()   
+
+if( INSTANTIATE_LONG_INT STREQUAL "yes" )
+   AddCompilerFlag( "-DINSTANTIATE_LONG_INT " )
+endif()   
+
+if( INSTANTIATE_FLOAT STREQUAL "yes" )
+   AddCompilerFlag( "-DINSTANTIATE_FLOAT " )
+endif()   
+
+if( INSTANTIATE_DOUBLE STREQUAL "yes" )
+   AddCompilerFlag( "-DINSTANTIATE_DOUBLE " )
+endif()   
+
+if( INSTANTIATE_LONG_DOUBLE STREQUAL "yes" )
+   AddCompilerFlag( "-DINSTANTIATE_LONG_DOUBLE " )
+endif()   
 
-ENABLE_TESTING()
-INCLUDE( Dart )
 set( CXX_TEST_FLAGS "-fprofile-arcs -ftest-coverage" )
 set( LD_TEST_FLAGS "-lgcov -coverage" )
 
diff --git a/TODO b/TODO
index 66e551d7dd1f6c4d28ac70885595ab74e24cf368..c8ed457124545ba0f48fc89c85a1777e92ca1339 100644
--- a/TODO
+++ b/TODO
@@ -1,8 +1,8 @@
+TODO: v tnlMeshResolver se provadi preklad pro vsechny mozne sablonove parametry => prorezat
+
 TODO: napsat FunctionDiscretizer pro jednotne rozhrani RightHandSide
 
-TODO: doladit vse s CUDA
 TODO: doplnit mesh travelsals pro jine mesh entity nez cell
-TODO: implementovat tnlProblem
 TODO: implementace maticovych resicu
       * Gaussova eliminace
       * SOR metoda
@@ -10,6 +10,8 @@ TODO: implementace maticovych resicu
       * TFQMR metoda
       * IDR metody 
 
+TODO: Nahradit sablonovy parametr dimenze sitove entity za typ entity. V pripade hran gridu, by ty v sobe mohly mit i orientaci.
+      Asi by to bylo vyhodne i pro site se smisenym typem entit. 
 
 TODO: implementovat tridu tnlFileName pro generovani jmen souboru
 
diff --git a/build b/build
new file mode 100755
index 0000000000000000000000000000000000000000..cf269c8138fe75ea889184025d938a10bb04f52c
--- /dev/null
+++ b/build
@@ -0,0 +1,105 @@
+#!/bin/bash
+
+TARGET=TNL
+PREFIX=${HOME}/local
+WITH_CUDA="yes"
+WITH_TESTS="yes"
+
+WITH_CUDA_ARCH="auto"
+WITH_CUBLAS="no"
+WITH_TEMPLATE_INSTANTIATION="yes"
+INSTANTIATE_LONG_INT="yes"
+INSTANTIATE_INT="yes"
+INSTANTIATE_LONG_DOUBLE="yes"
+INSTANTIATE_DOUBLE="yes"
+INSTANTIATE_FLOAT="yes"
+CMAKE="cmake"
+CMAKE_ONLY="no"
+HELP="no"
+VERBOSE=""
+ROOT_DIR="."
+BUILD_JOBS=`grep -c processor /proc/cpuinfo`
+
+for option in "$@"
+do
+    case $option in
+        --prefix=*                     ) PREFIX="${option#*=}" ;;
+        --build=*                      ) BUILD="${option#*=}" ;;
+        --with-tests=*                 ) WITH_TESTS="${option#*=}" ;;
+        --with-cuda=*                  ) WITH_CUDA="${option#*=}" ;;
+        --with-cublas=*                ) WITH_CUBLAS="${option#*=}" ;;
+        --with-cuda-arch=*             ) WITH_CUDA_ARCH="${option#*=}";;
+        --with-templates-instantiation=* ) WITH_TEMPLATE_INSTANTIATION="${option#*=}" ;;
+        --instantiate-long-int=*       ) INSTANTIATE_LONG_INT="${option#*=}" ;;
+        --instantiate-int=*            ) INSTANTIATE_INT="${option#*=}" ;;
+        --instantiate-long-double=*    ) INSTANTIATE_LONG_DOUBLE="${option#*=}" ;;
+        --instantiate-double=*         ) INSTANTIATE_DOUBLE="${option#*=}" ;;
+        --instantiate-float=*          ) INSTANTIATE_FLOAT="${option#*=}" ;;
+        --fast-build                   ) INSTANTIATE_LONG_INT="no"
+                                         INSTANTIATE_INT="yes"
+                                         INSTANTIATE_LONG_DOUBLE="no"
+                                         INSTANTIATE_DOUBLE="yes"
+                                         INSTANTIATE_FLOAT="no"
+                                         WITH_CUDA_ARCH="auto" ;;
+        --with-cmake=*                 ) CMAKE="${option#*=}" ;;
+        --build-jobs=*                 ) BUILD_JOBS="${option#*=}" ;;
+        --cmake-only=*                 ) CMAKE_ONLY="${option#*=}" ;;
+        --verbose                      ) VERBOSE="VERBOSE=1" ;;
+        --root-dir=*                   ) ROOT_DIR="${option#*=}" ;;
+        --help                         ) HELP="yes" ;;
+        *                              ) 
+           echo "Unknown option ${option}. Use --help for more information."
+           exit 1 ;;
+    esac
+done
+
+if test ${HELP} = "yes";
+then
+    echo "TNL build options:"
+    echo ""
+    echo "   --prefix=PATH                         Prefix for the installation directory. ${HOME}/local by default."
+    echo "   --build=Debug/Release                 Build type."
+    echo "   --with-tests=yes/no                   Enable unit tests. 'yes' by default (libcppunit-dev is required)."
+    echo "   --with-cuda=yes/no                    Enable CUDA. 'yes' by default (CUDA Toolkit is required)."
+    echo "   --with-cuda-arch=all/auto/30/35/...   Choose CUDA architecture."   
+    echo "   --with-templates-instantiation=yes/no Some TNL templates are precompiled during the build. 'yes' by default."
+    echo "   --with-cmake=CMAKE                    Path to cmake. 'cmake' by default."
+    echo "   --build-jobs=NUM                      Number of processes to be used for the build. It is set to a number of CPU cores by default."
+    echo "   --verbose                             It enables verbose build."
+    echo "   --root-dir=PATH                       Path to the TNL source code root dir."
+    echo "   --help                                Write this help."
+    exit 1
+fi
+
+echo "Configuring ${BUILD} $TARGET ..."
+
+${CMAKE} ${ROOT_DIR} \
+         -DCMAKE_BUILD_TYPE=${BUILD} \
+         -DCMAKE_INSTALL_PREFIX=${PREFIX} \
+         -DWITH_CUDA=${WITH_CUDA} \
+         -DWITH_CUDA_ARCH=${WITH_CUDA_ARCH} \
+         -DWITH_CUBLAS=${WITH_CUBLAS} \
+         -DWITH_TESTS=${WITH_TESTS} \
+         -DPETSC_DIR=${PETSC_DIR} \
+         -DWITH_TEMPLATE_INSTANTIATION=${WITH_TEMPLATE_INSTANTIATION} \
+         -DINSTANTIATE_FLOAT=${INSTANTIATE_FLOAT} \
+         -DINSTANTIATE_DOUBLE=${INSTANTIATE_DOUBLE} \
+         -DINSTANTIATE_LONG_DOUBLE=${INSTANTIATE_LONG_DOUBLE} \
+         -DINSTANTIATE_INT=${INSTANTIATE_INT} \
+         -DINSTANTIATE_LONG_INT=${INSTANTIATE_LONG_INT}
+
+if test ${CMAKE_ONLY} = "yes";
+then
+    exit 0
+fi
+
+echo "Building ${BUILD} $TARGET using $BUILD_JOBS processors ..."
+
+make -j${BUILD_JOBS} ${VERBOSE}
+
+if test ${WITH_TESTS} = "yes";
+then
+    make -j${BUILD_JOBS} test
+fi
+
+exit 0
\ No newline at end of file
diff --git a/doc/Makefile b/doc/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..0c09d9529610ac26a27bb1783e8d003721ad3315
--- /dev/null
+++ b/doc/Makefile
@@ -0,0 +1,177 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = _build
+
+# User-friendly check for sphinx-build
+ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
+$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
+endif
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
+
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html       to make standalone HTML files"
+	@echo "  dirhtml    to make HTML files named index.html in directories"
+	@echo "  singlehtml to make a single large HTML file"
+	@echo "  pickle     to make pickle files"
+	@echo "  json       to make JSON files"
+	@echo "  htmlhelp   to make HTML files and a HTML help project"
+	@echo "  qthelp     to make HTML files and a qthelp project"
+	@echo "  devhelp    to make HTML files and a Devhelp project"
+	@echo "  epub       to make an epub"
+	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+	@echo "  text       to make text files"
+	@echo "  man        to make manual pages"
+	@echo "  texinfo    to make Texinfo files"
+	@echo "  info       to make Texinfo files and run them through makeinfo"
+	@echo "  gettext    to make PO message catalogs"
+	@echo "  changes    to make an overview of all changed/added/deprecated items"
+	@echo "  xml        to make Docutils-native XML files"
+	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
+	@echo "  linkcheck  to check all external links for integrity"
+	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+	rm -rf $(BUILDDIR)/*
+
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+	@echo
+	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/TNL.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/TNL.qhc"
+
+devhelp:
+	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+	@echo
+	@echo "Build finished."
+	@echo "To view the help file:"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/TNL"
+	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/TNL"
+	@echo "# devhelp"
+
+epub:
+	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+	@echo
+	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+	@echo "Run \`make' in that directory to run these through (pdf)latex" \
+	      "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through pdflatex..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+latexpdfja:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through platex and dvipdfmx..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+	@echo
+	@echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+	@echo
+	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo
+	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+	@echo "Run \`make' in that directory to run these through makeinfo" \
+	      "(use \`make info' here to do that automatically)."
+
+info:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo "Running Texinfo files through makeinfo..."
+	make -C $(BUILDDIR)/texinfo info
+	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+	@echo
+	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/doctest/output.txt."
+
+xml:
+	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
+	@echo
+	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
+
+pseudoxml:
+	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
+	@echo
+	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdcf0336e26e9c90b32d3bfe140235ed9178156f
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,331 @@
+# -*- coding: utf-8 -*-
+#
+# TNL documentation build configuration file, created by
+# sphinx-quickstart on Sun Mar 29 13:12:39 2015.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys
+import os
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.coverage',
+    'sphinx.ext.mathjax',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'TNL'
+copyright = u'2015, Tomáš Oberhuber'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '0.1'
+# The full version, including alpha/beta/rc tags.
+release = '0.1'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+#keep_warnings = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+html_theme = 'default'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents.  If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#html_extra_path = []
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'TNLdoc'
+
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+  ('index', 'TNL.tex', u'TNL Documentation',
+   u'Tomáš Oberhuber', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    ('index', 'tnl', u'TNL Documentation',
+     [u'Tomáš Oberhuber'], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+  ('index', 'TNL', u'TNL Documentation',
+   u'Tomáš Oberhuber', 'TNL', 'One line description of project.',
+   'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#texinfo_no_detailmenu = False
+
+
+# -- Options for Epub output ----------------------------------------------
+
+# Bibliographic Dublin Core info.
+epub_title = u'TNL'
+epub_author = u'Tomáš Oberhuber'
+epub_publisher = u'Tomáš Oberhuber'
+epub_copyright = u'2015, Tomáš Oberhuber'
+
+# The basename for the epub file. It defaults to the project name.
+#epub_basename = u'TNL'
+
+# The HTML theme for the epub output. Since the default themes are not optimized
+# for small screen space, using the same theme for HTML and epub output is
+# usually not wise. This defaults to 'epub', a theme designed to save visual
+# space.
+#epub_theme = 'epub'
+
+# The language of the text. It defaults to the language option
+# or en if the language is not set.
+#epub_language = ''
+
+# The scheme of the identifier. Typical schemes are ISBN or URL.
+#epub_scheme = ''
+
+# The unique identifier of the text. This can be a ISBN number
+# or the project homepage.
+#epub_identifier = ''
+
+# A unique identification for the text.
+#epub_uid = ''
+
+# A tuple containing the cover image and cover page html template filenames.
+#epub_cover = ()
+
+# A sequence of (type, uri, title) tuples for the guide element of content.opf.
+#epub_guide = ()
+
+# HTML files that should be inserted before the pages created by sphinx.
+# The format is a list of tuples containing the path and title.
+#epub_pre_files = []
+
+# HTML files that should be inserted after the pages created by sphinx.
+# The format is a list of tuples containing the path and title.
+#epub_post_files = []
+
+# A list of files that should not be packed into the epub file.
+epub_exclude_files = ['search.html']
+
+# The depth of the table of contents in toc.ncx.
+#epub_tocdepth = 3
+
+# Allow duplicate toc entries.
+#epub_tocdup = True
+
+# Choose between 'default' and 'includehidden'.
+#epub_tocscope = 'default'
+
+# Fix unsupported image types using the PIL.
+#epub_fix_images = False
+
+# Scale large images.
+#epub_max_image_width = 0
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#epub_show_urls = 'inline'
+
+# If false, no index is generated.
+#epub_use_index = True
diff --git a/doc/index.rst b/doc/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e76c24afab71f694da91ad2ea6b4764ca75d3b88
--- /dev/null
+++ b/doc/index.rst
@@ -0,0 +1,27 @@
+.. TNL documentation master file, created by
+   sphinx-quickstart on Sun Mar 29 13:12:39 2015.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to TNL's documentation!
+===============================
+
+Contents:
+
+.. toctree::
+   :maxdepth: 2
+
+   Introduction to TNL <intro>
+   Installation <install>
+   User's guide to PDE solvers <pde-solvers>
+
+   
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
diff --git a/doc/install.rst b/doc/install.rst
new file mode 100644
index 0000000000000000000000000000000000000000..dc8261a102d61036f38566ea6f258e02759c1f46
--- /dev/null
+++ b/doc/install.rst
@@ -0,0 +1,8 @@
+============
+Installation
+============
+
+TNL can be downloaded from GitHub.
+
+
+
diff --git a/doc/intro.rst b/doc/intro.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d00f28855a518b785c947006a79002f1177acd5c
--- /dev/null
+++ b/doc/intro.rst
@@ -0,0 +1,33 @@
+============
+Introduction
+============
+
+TNL means *Template Numerical Library*. The aim of this project is to develop an *efficient, flexible and easy to use* numerical library.
+
+**Efficiency**
+   Complex numerical simulations may take hundreds of hours. Fast and efficient solvers are therefore very important. TNL is designed to profit from abilities of new accelerators like GPUs (NVidia GeForce, Tesla) and MICs (Xeon Phi). To generate efficient executables, we avoid use of virtual methods on low levels of the code. Instead, C++ templates are used. 
+
+**Flexibility**
+   Development of new numerical schemes and solvers often requires to test many different approaches. Thanks to C++ templates and the design of TNL, it should be quite easy to switch between different schemes, solvers, meshes, precision of the floating point arithmetics or parallel architectures.
+
+**Easy to use**
+   Thanks to C++ templates, TNL offers automatic set-up of underlying structures (numerical meshes, sparse matrices, etc.), solvers (linear solvers, Runge-Kutta solvers, PDE solvers) and parallel architectures (GPU, MIC or MPI (not implemented yet)). TNL can also manage configuration parameters passed from the command line. The user may then concentrate only on the numerical model. 
+
+:Authors:
+   **Tomáš Oberhuber** - TNL design
+
+   **Vítězslav Žabka** - unstructured numerical mesh
+
+   **Vladimír Klement** - multigrid methods
+
+   **Tomáš Sobotík** - numerical methods for signed distance function
+
+   **Ondřej Székely** - FDM solvers for non-linear diffusion problems
+
+   **Libor Bakajsa** - sparse matrix formats for GPUs
+
+   **Jan Vacata** - sparse matrix formats for GPUs
+
+   **Martin Heller** - sparse matrix formats for GPUs
+
+   **Matěj Novotný** - quad double arithmetics
\ No newline at end of file
diff --git a/doc/pde-solvers.rst b/doc/pde-solvers.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6f7eed46d377e19862f297d31ec6d70f3846ed56
--- /dev/null
+++ b/doc/pde-solvers.rst
@@ -0,0 +1,3 @@
+===========
+PDE Solvers
+===========
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index f86a63e693d5580955fc2c5553edd653f7f99c5c..61286338725b3f12eeca4229d2f574fc1a779f10 100755
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,5 +1,3 @@
-add_subdirectory( make-project )
-add_subdirectory( simple-solver )
 add_subdirectory( heat-equation )
 add_subdirectory( incompressible-navier-stokes )
 add_subdirectory( navier-stokes )
diff --git a/examples/heat-equation/CMakeLists.txt b/examples/heat-equation/CMakeLists.txt
index 79273b47a83aeddd9da7491153445b23b234fcf9..c974a8d9684ea4f28918e10916075f8c351781ce 100755
--- a/examples/heat-equation/CMakeLists.txt
+++ b/examples/heat-equation/CMakeLists.txt
@@ -5,13 +5,15 @@ set( tnl_heat_equation_SOURCES
 IF( BUILD_CUDA )
    CUDA_ADD_EXECUTABLE(tnl-heat-equation${debugExt} tnl-heat-equation.cu)
    CUDA_ADD_EXECUTABLE(tnl-heat-equation-eoc-test${debugExt} tnl-heat-equation-eoc.cu)
+   target_link_libraries (tnl-heat-equation${debugExt} tnl${debugExt}-${tnlVersion}  ${CUSPARSE_LIBRARY} )
+   target_link_libraries (tnl-heat-equation-eoc-test${debugExt} tnl${debugExt}-${tnlVersion}  ${CUSPARSE_LIBRARY} )
 ELSE(  BUILD_CUDA )               
    ADD_EXECUTABLE(tnl-heat-equation${debugExt} tnl-heat-equation.cpp)     
    ADD_EXECUTABLE(tnl-heat-equation-eoc-test${debugExt} tnl-heat-equation-eoc.cpp)   
+   target_link_libraries (tnl-heat-equation${debugExt} tnl${debugExt}-${tnlVersion} )
+   target_link_libraries (tnl-heat-equation-eoc-test${debugExt} tnl${debugExt}-${tnlVersion} )
 ENDIF( BUILD_CUDA )
 
-target_link_libraries (tnl-heat-equation${debugExt} tnl${debugExt}-${tnlVersion} )
-target_link_libraries (tnl-heat-equation-eoc-test${debugExt} tnl${debugExt}-${tnlVersion} )
 
 INSTALL( TARGETS tnl-heat-equation${debugExt}
                  tnl-heat-equation-eoc-test${debugExt}
diff --git a/examples/heat-equation/tnl-heat-equation-eoc.h b/examples/heat-equation/tnl-heat-equation-eoc.h
index 21157bfaaeb3fd759da28fd7be323824525c1b28..7001a2d71c0dc45166468a752844847794c0cc82 100644
--- a/examples/heat-equation/tnl-heat-equation-eoc.h
+++ b/examples/heat-equation/tnl-heat-equation-eoc.h
@@ -19,16 +19,16 @@
 #define TNL_HEAT_EQUATION_EOC_H_
 
 #include <solvers/tnlSolver.h>
-#include <solvers/tnlFastBuildConfig.h>
-#include <solvers/tnlConfigTags.h>
-#include <functions/tnlTestFunction.h>
+#include <solvers/tnlFastBuildConfigTag.h>
+#include <solvers/tnlBuildConfigTags.h>
+#include <functors/tnlTestFunction.h>
 #include <operators/diffusion/tnlLinearDiffusion.h>
 #include <operators/diffusion/tnlExactLinearDiffusion.h>
 #include <operators/tnlAnalyticDirichletBoundaryConditions.h>
 #include <problems/tnlHeatEquationEocRhs.h>
 #include <problems/tnlHeatEquationEocProblem.h>
 
-//typedef tnlDefaultConfigTag BuildConfig;
+//typedef tnlDefaultBuildConfigTag BuildConfig;
 typedef tnlFastBuildConfig BuildConfig;
 
 template< typename ConfigTag >
diff --git a/examples/heat-equation/tnl-heat-equation.h b/examples/heat-equation/tnl-heat-equation.h
index 9eaf50c9e7bd583d899ad100e1741d177cb5121b..77c3121bc4718c06921cb67789224054ff1ed041 100644
--- a/examples/heat-equation/tnl-heat-equation.h
+++ b/examples/heat-equation/tnl-heat-equation.h
@@ -19,17 +19,17 @@
 #define TNL_HEAT_EQUATION_H_
 
 #include <solvers/tnlSolver.h>
-#include <solvers/tnlFastBuildConfig.h>
-#include <solvers/tnlConfigTags.h>
+#include <solvers/tnlFastBuildConfigTag.h>
+#include <solvers/tnlBuildConfigTags.h>
 #include <operators/diffusion/tnlLinearDiffusion.h>
 #include <operators/tnlAnalyticDirichletBoundaryConditions.h>
 #include <operators/tnlDirichletBoundaryConditions.h>
 #include <operators/tnlAnalyticNeumannBoundaryConditions.h>
 #include <operators/tnlNeumannBoundaryConditions.h>
-#include <functions/tnlConstantFunction.h>
+#include <functors/tnlConstantFunction.h>
 #include <problems/tnlHeatEquationProblem.h>
 
-//typedef tnlDefaultConfigTag BuildConfig;
+//typedef tnlDefaultBuildConfigTag BuildConfig;
 typedef tnlFastBuildConfig BuildConfig;
 
 template< typename ConfigTag >
@@ -80,34 +80,33 @@ class heatEquationSetter
          if( boundaryConditionsType == "dirichlet" )
          {
             typedef tnlAnalyticDirichletBoundaryConditions< MeshType, ConstantFunction, Real, Index > BoundaryConditions;
-            typedef tnlHeatEquationProblem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Solver;
+            typedef tnlHeatEquationProblem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Problem;
             SolverStarter solverStarter;
-            return solverStarter.template run< Solver >( parameters );
+            return solverStarter.template run< Problem >( parameters );
          }
          typedef tnlAnalyticNeumannBoundaryConditions< MeshType, ConstantFunction, Real, Index > BoundaryConditions;
-         typedef tnlHeatEquationProblem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Solver;
+         typedef tnlHeatEquationProblem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Problem;
          SolverStarter solverStarter;
-         return solverStarter.template run< Solver >( parameters );
+         return solverStarter.template run< Problem >( parameters );
       }
       typedef tnlVector< Real, Device, Index > VectorType;
       if( boundaryConditionsType == "dirichlet" )
       {
          typedef tnlDirichletBoundaryConditions< MeshType, VectorType, Real, Index > BoundaryConditions;
-         typedef tnlHeatEquationProblem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Solver;
+         typedef tnlHeatEquationProblem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Problem;
          SolverStarter solverStarter;
-         return solverStarter.template run< Solver >( parameters );
+         return solverStarter.template run< Problem >( parameters );
       }
       typedef tnlNeumannBoundaryConditions< MeshType, VectorType, Real, Index > BoundaryConditions;
-      typedef tnlHeatEquationProblem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Solver;
+      typedef tnlHeatEquationProblem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Problem;
       SolverStarter solverStarter;
-      return solverStarter.template run< Solver >( parameters );
+      return solverStarter.template run< Problem >( parameters );
    };
 };
 
 int main( int argc, char* argv[] )
 {
-   tnlSolver< heatEquationSetter, heatEquationConfig, BuildConfig > solver;
-   if( ! solver. run( argc, argv ) )
+   if( ! tnlSolver< heatEquationSetter, heatEquationConfig, BuildConfig >::run( argc, argv ) )
       return EXIT_FAILURE;
    return EXIT_SUCCESS;
 }
diff --git a/examples/make-project/CMakeLists.txt b/examples/make-project/CMakeLists.txt
deleted file mode 100755
index 85e54342b7ccff7a625b52249cb0bba30b5c4455..0000000000000000000000000000000000000000
--- a/examples/make-project/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-INSTALL( FILES Makefile
-               main.cpp
-               program-name.cfg.desc
-         DESTINATION share/tnl-${tnlVersion}/examples/make-project )
\ No newline at end of file
diff --git a/examples/make-project/Makefile b/examples/make-project/Makefile
deleted file mode 100644
index b00118da70bb6ce3fcc31b53e594be3119bd0875..0000000000000000000000000000000000000000
--- a/examples/make-project/Makefile
+++ /dev/null
@@ -1,42 +0,0 @@
-TNL_VERSION=0.1
-TNL_INSTALL_DIR=${HOME}/local/lib
-TNL_INCLUDE_DIR=${HOME}/local/include/tnl-${TNL_VERSION}
-
-TARGET = program-name
-CONFIG_FILE = $(TARGET).cfg.desc
-INSTALL_DIR = ${HOME}/local
-CXX = g++
-CUDA_CXX = nvcc
-CXX_FLAGS = -std=gnu++0x -I$(TNL_INCLUDE_DIR)
-LD_FLAGS = -L$(TNL_INSTALL_DIR) -ltnl-0.1
-
-SOURCES = main.cpp
-HEADERS = 
-OBJECTS = main.o
-DIST = $(SOURCES) Makefile
-
-all: $(TARGET)
-clean: 
-	rm -f $(OBJECTS)
-	rm -f $(TARGET)-conf.h	
-
-dist: $(DIST)
-	tar zcvf $(TARGET).tgz $(DIST) 
-
-install: $(TARGET)
-	cp $(TARGET) $(INSTALL_DIR)/bin
-	cp $(CONFIG_FILE) $(INSTALL_DIR)/share
-
-uninstall: $(TARGET)
-	rm -f $(INSTALL_DIR)/bin/$(TARGET) 
-	rm -f $(CONFIG_FILE) $(INSTALL_DIR)/share
-
-$(TARGET): $(OBJECTS)
-	$(CXX) -o $(TARGET) $(OBJECTS) $(LD_FLAGS)
-
-%.o: %.cpp $(TARGET)-conf.h $(HEADERS)
-	$(CXX) -c -o $@ $(CXX_FLAGS) $<
-
-$(TARGET)-conf.h:
-	echo "#define CONFIG_FILE \"${INSTALL_DIR}/share/${CONFIG_FILE}\" " > $(TARGET)-conf.h 
-
diff --git a/examples/make-project/program-name.cfg.desc b/examples/make-project/program-name.cfg.desc
deleted file mode 100644
index f2ac640391feb48e75ad4e3f9c4fc215afd5d52a..0000000000000000000000000000000000000000
--- a/examples/make-project/program-name.cfg.desc
+++ /dev/null
@@ -1,22 +0,0 @@
-group IO
-{
-   string input-file            [Input file name.];
-   string output-file           [Output file name.];
-   real output-period           [Intervals for writing the state of the computation (in the meaning of parameter t).];
-},[Arguments describing input and output data.];
-group Problem
-{
-   real final-t(!)              [When reaching this t the computation will stop.];
-
-},[Setting up the problem we solve.];
-group Method
-{
-   string method(!)             [Method for solving the problem.];
-},[Parameters controling the method we use.];
-group Solver
-{
-   string  solver-name;
-   real    max-solver-res( 1.0e-6 ); 
-   integer max-solver-iterations( 1000000 );
-},[Parameters of the solver];
-
diff --git a/examples/navier-stokes/navierStokesSolverMonitor_impl.h b/examples/navier-stokes/navierStokesSolverMonitor_impl.h
index a913f34902aa11b2ff3400a1ded37acda0868dd0..db69fe7b212a453a43b0e7c2cbae5a50060a7dec 100644
--- a/examples/navier-stokes/navierStokesSolverMonitor_impl.h
+++ b/examples/navier-stokes/navierStokesSolverMonitor_impl.h
@@ -30,7 +30,7 @@ navierStokesSolverMonitor< Real, Index > :: navierStokesSolverMonitor()
 template< typename Real, typename Index >
 void navierStokesSolverMonitor< Real, Index > :: refresh()
 {
-   if( this -> verbose > 0 && this -> refreshing % this -> outputPeriod == 0 )
+   if( this -> verbose > 0 && this -> refreshing % this -> refreshRate == 0 )
    {
       cout << "V=( " << uMax
            << " , " << uAvg
diff --git a/examples/simple-solver/CMakeLists.txt b/examples/simple-solver/CMakeLists.txt
deleted file mode 100755
index 5b8bcc644608ccc1cd6ba3338de994df7f5ec09a..0000000000000000000000000000000000000000
--- a/examples/simple-solver/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-INSTALL( FILES Makefile
-               main.cpp
-               simpleProblemSolver.h
-               simpleProblemSolver_impl.h
-               simpleProblemSetter.h
-               simpleProblemSetter_impl.h
-               simpleProblemConfig.h
-               run-simple-solver
-         DESTINATION share/tnl-${tnlVersion}/examples/simple-solver )
\ No newline at end of file
diff --git a/examples/simple-solver/Makefile b/examples/simple-solver/Makefile
deleted file mode 100644
index 2e9fb8bb68dde48d07ffa7c3fc2fa979742fc378..0000000000000000000000000000000000000000
--- a/examples/simple-solver/Makefile
+++ /dev/null
@@ -1,37 +0,0 @@
-TNL_VERSION=0.1
-TNL_INSTALL_DIR=${HOME}/local/lib
-TNL_INCLUDE_DIR=${HOME}/local/include/tnl-${TNL_VERSION}
-
-TARGET = simple-solver
-CONFIG_FILE = $(TARGET).cfg.desc
-INSTALL_DIR = ${HOME}/local
-CXX = g++
-CUDA_CXX = nvcc
-CXX_FLAGS = -std=gnu++0x -I$(TNL_INCLUDE_DIR)
-LD_FLAGS = -L$(TNL_INSTALL_DIR) -ltnl-0.1
-
-SOURCES = main.cpp
-HEADERS = 
-OBJECTS = main.o
-DIST = $(SOURCES) Makefile
-
-all: $(TARGET)
-clean: 
-	rm -f $(OBJECTS)	
-
-dist: $(DIST)
-	tar zcvf $(TARGET).tgz $(DIST) 
-
-install: $(TARGET)
-	cp $(TARGET) $(INSTALL_DIR)/bin
-	cp $(CONFIG_FILE) $(INSTALL_DIR)/share
-
-uninstall: $(TARGET)
-	rm -f $(INSTALL_DIR)/bin/$(TARGET) 
-	rm -f $(CONFIG_FILE) $(INSTALL_DIR)/share
-
-$(TARGET): $(OBJECTS)
-	$(CXX) -o $(TARGET) $(OBJECTS) $(LD_FLAGS)
-
-%.o: %.cpp $(HEADERS)
-	$(CXX) -c -o $@ $(CXX_FLAGS) $<
diff --git a/examples/simple-solver/run-simple-solver b/examples/simple-solver/run-simple-solver
deleted file mode 100644
index ee7d7234e64ddec4478199640b92dde89e922e46..0000000000000000000000000000000000000000
--- a/examples/simple-solver/run-simple-solver
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-
-tnl-grid-setup --dimensions 2 \
-               --origin-x 0.0 \
-               --origin-y 0.0 \
-               --proportions-x 1.0 \
-               --proportions-y 1.0 \
-               --size-x 100 \
-               --size-y 100
-               
-tnl-discrete --function sin-waves \
-             --output-file u-ini.tnl               
-
-simple-solver --dimensions 2 \
-              --time-discretisation explicit \
-              --discrete-solver merson \
-              --snapshot-period 0.01 \
-              --final-time 1.0
-              
-tnl-view --mesh mesh.tnl *tnl              
-              
\ No newline at end of file
diff --git a/examples/simple-solver/simpleProblemSetter_impl.h b/examples/simple-solver/simpleProblemSetter_impl.h
deleted file mode 100644
index 5dae6e57158ad5006e580b3cbf471d6a7f30e3f9..0000000000000000000000000000000000000000
--- a/examples/simple-solver/simpleProblemSetter_impl.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/***************************************************************************
-                          simpleProblemSetter_impl.h  -  description
-                             -------------------
-    begin                : Mar 10, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef SIMPLEPROBLEMSETTER_IMPL_H_
-#define SIMPLEPROBLEMSETTER_IMPL_H_
-
-template< typename RealType,
-          typename DeviceType,
-          typename IndexType,
-          typename MeshType,
-          typename ConfigTag,
-          typename SolverStarter >
-bool simpleProblemSetter< RealType, DeviceType, IndexType, MeshType, ConfigTag, SolverStarter > :: run( const tnlParameterContainer& parameters )
-{
-   SolverStarter solverStarter;
-   return solverStarter. template run< simpleProblemSolver< MeshType > >( parameters );
-}
-
-
-#endif /* SIMPLEPROBLEMSETTER_IMPL_H_ */
diff --git a/examples/simple-solver/simpleProblemSolver.h b/examples/simple-solver/simpleProblemSolver.h
deleted file mode 100644
index 3ca680273366b95f5569cba0a38e23ec42de916a..0000000000000000000000000000000000000000
--- a/examples/simple-solver/simpleProblemSolver.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/***************************************************************************
-                          simpleProblemSolver.h  -  description
-                             -------------------
-    begin                : Feb 23, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef SIMPLEPROBLEMSOLVER_H_
-#define SIMPLEPROBLEMSOLVER_H_
-
-#include <matrices/tnlCSRMatrix.h>
-#include <solvers/preconditioners/tnlDummyPreconditioner.h>
-#include <solvers/tnlSolverMonitor.h>
-#include <core/tnlLogger.h>
-#include <core/vectors/tnlVector.h>
-#include <core/vectors/tnlSharedVector.h>
-
-template< typename Mesh >
-class simpleProblemSolver
-{
-   public:
-
-   typedef typename Mesh :: RealType RealType;
-   typedef typename Mesh :: DeviceType DeviceType;
-   typedef typename Mesh :: IndexType IndexType;
-   typedef Mesh MeshType;
-   typedef tnlVector< RealType, DeviceType, IndexType> DofVectorType;
-   typedef tnlCSRMatrix< RealType, DeviceType, IndexType > DiscreteSolverMatrixType;
-   typedef tnlDummyPreconditioner< RealType, DeviceType, IndexType > DiscreteSolverPreconditioner;
-
-   static tnlString getTypeStatic();
-
-   tnlString getPrologHeader() const;
-
-   void writeProlog( tnlLogger& logger,
-                     const tnlParameterContainer& parameters ) const;
-
-   bool setup( const tnlParameterContainer& parameters );
-
-   IndexType getDofs( const MeshType& mesh ) const;
-
-   IndexType getAuxiliaryDofs( const MeshType& mesh ) const;
-
-   void bindDofs( const MeshType& mesh,
-                  DofVectorType& dofs,
-                  DofVectorType& auxiliaryDofs );
-
-   bool setInitialCondition( const tnlParameterContainer& parameters );
-
-   bool makeSnapshot( const RealType& time,
-                      const IndexType& step,
-                      const MeshType& mesh );
-
-
-
-   void GetExplicitRHS( const RealType& time,
-                        const RealType& tau,
-                        const MeshType& mesh,
-                        DofVectorType& _u,
-                        DofVectorType& _fu );
-
-   tnlSolverMonitor< RealType, IndexType >* getSolverMonitor();
-
-   protected:
-
-   DofVectorType dofVector;
-
-   tnlSharedVector< RealType, DeviceType, IndexType > u, v;
-
-   MeshType mesh;
-
-};
-
-#include "simpleProblemSolver_impl.h"
-
-#endif /* SIMPLEPROBLEM_H_ */
diff --git a/examples/simple-solver/simpleProblemSolver_impl.h b/examples/simple-solver/simpleProblemSolver_impl.h
deleted file mode 100644
index b00f59111a6ac0bdbc7dd6096da92799dc3d1ead..0000000000000000000000000000000000000000
--- a/examples/simple-solver/simpleProblemSolver_impl.h
+++ /dev/null
@@ -1,192 +0,0 @@
-/***************************************************************************
-                          simpleProblemSolver_impl.h  -  description
-                             -------------------
-    begin                : Mar 10, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef SIMPLEPROBLEMSOLVER_IMPL_H_
-#define SIMPLEPROBLEMSOLVER_IMPL_H_
-
-#include <core/mfilename.h>
-
-template< typename Mesh >
-tnlString simpleProblemSolver< Mesh>::getTypeStatic()
-{
-   /****
-    * Replace 'simpleProblemSolver' by the name of your solver.
-    */
-   return tnlString( "simpleProblemSolver< " ) + Mesh :: getTypeStatic() + " >";
-}
-
-template< typename Mesh >
-tnlString simpleProblemSolver< Mesh>::getPrologHeader() const
-{
-   /****
-    * Replace 'Simple Problem' by the your desired title in the log table.
-    */
-   return tnlString( "Simple Problem" );
-}
-
-template< typename Mesh >
-void simpleProblemSolver< Mesh>::writeProlog( tnlLogger& logger,
-                                              const tnlParameterContainer& parameters ) const
-{
-   /****
-    * In prolog, write all input parameters which define the numerical simulation.
-    * Use methods:
-    *
-    *    logger. writeParameters< Type >( "Label:", "name", parameters );
-    *
-    *  or
-    *
-    *    logger. writeParameter< Type >( "Label:", value );
-    *
-    *  See tnlLogger.h for more details.
-    */
-
-   logger. WriteParameter< tnlString >( "Problem name:", "problem-name", parameters );
-   logger. WriteParameter< int >( "Simple parameter:", 1 );
-}
-
-template< typename Mesh >
-bool simpleProblemSolver< Mesh>::setup( const tnlParameterContainer& parameters )
-{
-   /****
-    * Set-up your solver here. It means:
-    * 1. Read input parameters and model coefficients like these
-    */
-   const tnlString& problemName = parameters. getParameter< tnlString >( "problem-name" );
-   return true;
-}
-
-template< typename Mesh >
-typename simpleProblemSolver< Mesh >::IndexType simpleProblemSolver< Mesh>::getDofs( const Mesh& mesh ) const
-{
-   /****
-    * Set-up DOFs and supporting grid functions
-    */
-   return 2*mesh.getDofs();
-}
-
-template< typename Mesh >
-typename simpleProblemSolver< Mesh >::IndexType simpleProblemSolver< Mesh>::getAuxiliaryDofs( const Mesh& mesh ) const
-{
-   /****
-    * Set-up DOFs and supporting grid functions
-    */
-   return 2*mesh.getDofs();
-}
-
-
-template< typename Mesh >
-void simpleProblemSolver< Mesh >::bindDofs( const MeshType& mesh,
-                                            DofVectorType& dofVector,
-                                            DofVectorType& auxiliaryDofVector )
-{
-   /****
-    * You may use tnlSharedVector if you need to split the dofVector into more
-    * grid functions like the following example:
-    */
-   const IndexType dofs = this->getDofs( mesh );
-   this -> u. bind( & dofVector. getData()[ 0 * dofs ], dofs );
-   this -> v. bind( & dofVector. getData()[ 1 * dofs ], dofs );
-   /****
-    * You may now treat u and v as usual vectors and indirectly work with this->dofVector.
-    */
-}
-
-template< typename Mesh >
-bool simpleProblemSolver< Mesh>::setInitialCondition( const tnlParameterContainer& parameters )
-{
-   /****
-    * Set the initial condition here. Manipulate only this -> dofVector.
-    */
-   /*const tnlString& initialConditionFile = parameters.getParameter< tnlString >( "initial-condition" );
-   if( ! this->u.load( initialConditionFile ) )
-   {
-      cerr << "I am not able to load the initial condition from the file " << initialConditionFile << "." << endl;
-      return false;
-   }*/
-   return true;
-}
-
-template< typename Mesh >
-bool simpleProblemSolver< Mesh>::makeSnapshot( const RealType& time,
-                                               const IndexType& step,
-                                               const MeshType& mesh )
-{
-   /****
-    * Use this method to write state of the solver to file(s).
-    * All data are stored in this -> dofVector. You may use
-    * supporting vectors and bind them with the dofVector as before.
-    */
-   cout << endl << "Writing output at time " << time << " step " << step << "." << endl;
-
-   /****
-    * Now write them to files.
-    */
-   tnlString fileName;
-   FileNameBaseNumberEnding( "u-", step, 5, ".tnl", fileName );
-   if( ! this -> u. save( fileName ) )
-      return false;
-
-   FileNameBaseNumberEnding( "v-", step, 5, ".tnl", fileName );
-   if( ! this -> v. save( fileName ) )
-      return false;
-
-   return true;
-}
-
-template< typename Mesh >
-void simpleProblemSolver< Mesh>::GetExplicitRHS( const RealType& time,
-                                                 const RealType& tau,
-                                                 const MeshType& mesh,
-                                                 DofVectorType& _u,
-                                                 DofVectorType& _fu )
-{
-   /****
-    * If you use an explicit solver like tnlEulerSolver or tnlMersonSolver, you
-    * need to implement this method. Compute the right-hand side of
-    *
-    *   d/dt u(x) = fu( x, u )
-    *
-    * You may use supporting vectors again if you need.
-    */
-
-   _fu.setValue( 1.0 );
-   if( DeviceType :: getDevice() == tnlHostDevice )
-   {
-      /****
-       *  Write the host solver here.
-       */
-   }
-#ifdef HAVE_CUDA
-   if( DeviceType :: getDevice() == tnlCudaDevice )
-   {
-      /****
-       * Write the CUDA solver here.
-       */
-   }
-#endif
-}
-
-template< typename Mesh >
-tnlSolverMonitor< typename simpleProblemSolver< Mesh > :: RealType,
-                  typename simpleProblemSolver< Mesh > :: IndexType >*
-   simpleProblemSolver< Mesh >::getSolverMonitor()
-{
-   return 0;
-}
-
-#endif /* SIMPLEPROBLEM_IMPL_H_ */
diff --git a/install b/install
index 731ccf9aa5c353597b56de95ec7cecf3217a80a7..950f1018e0bf5a52abc8b0034708b91a39082634 100755
--- a/install
+++ b/install
@@ -1,43 +1,95 @@
 #!/bin/bash
 
-TARGET=TNL
-INSTALL_PREFIX=${HOME}/local
-WITH_CUDA=no
-TEMPLATE_EXPLICIT_INSTANTIATION=yes
-#VERBOSE="VERBOSE=1"
-
-CMAKE="cmake"
-CPUS=`grep -c processor /proc/cpuinfo`
-#CPUS="1"
+OPTIONS=""
 
+CMAKE_TEST=`which cmake`    
+if test x${CMAKE_TEST} = "x";
+then
+    echo "Cmake is not installed on your system. Please install it by:"
+    echo ""
+    echo "   sudo apt-get install cmake     on Ubuntu and Debian based systems"
+    echo "   sudo yum install cmake         on RedHat, Fedora or CentOS"
+    echo "   sudo zypper install cmake      on OpenSuse"
+    echo ""
+    echo "You may also install it from the source code at:"
+    echo " http://www.cmake.org/download/"
+    exit 1
+fi
 
-echo "Building $TARGET using $CPUS processors."
+for option in "$@"
+do
+    case $option in
+        --no-debug                    ) BUILD_DEBUG="no" ;;
+        --no-release                  ) BUILD_RELEASE="no" ;;        
+        *                             ) OPTIONS="${OPTIONS} ${option}" ;;
+    esac
+done
 
-if [ ! -d Debug ];
+if test ${BUILD_DEBUG} = "yes";
 then
-   mkdir Debug
+    if [ ! -d Debug ];
+    then
+       mkdir Debug
+    fi
+    cd Debug
+    ../build --root-dir=.. --build=Debug ${OPTIONS}
+    if test $? != 0;
+    then
+       exit 1
+    fi
+    make install
+    cd ..
 fi
-if [ ! -d Release ];
+
+if test ${BUILD_RELEASE} = "yes";
 then
-   mkdir Release
+    if [ ! -d Release ];
+    then
+       mkdir Release
+    fi
+    cd Release
+    ../build --root-dir=.. --build=Release ${OPTIONS}
+    if test $? != 0;
+    then
+        exit 1
+    fi
+    make install
+    cd ..
 fi
 
-cd Debug
-${CMAKE} .. -DCMAKE_BUILD_TYPE=Debug \
-            -DCMAKE_INSTALL_PREFIX=${HOME}/local \
-            -DWITH_CUDA=${WITH_CUDA} \
-            -DPETSC_DIR=${PETSC_DIR} \
-            -DWITH_TEMPLATE_EXPLICIT_INSTANTIATION=${TEMPLATE_EXPLICIT_INSTANTIATION}
-make -j${CPUS} ${VERBOSE}
-#make -j${CPUS} test
-#make -j${CPUS} install
-
-#cd ../Release
-#${CMAKE} .. -DCMAKE_INSTALL_PREFIX=${HOME}/local \
-#            -DWITH_CUDA=${WITH_CUDA} \
-#            -DPETSC_DIR=${PETSC_DIR} \
-#            -DWITH_TEMPLATE_EXPLICIT_INSTANTIATION=${TEMPLATE_EXPLICIT_INSTANTIATION}
-#make -j${CPUS} ${VERBOSE}
-#make -j${CPUS} test
-#make -j${CPUS} install
+TNL_TEST=`which tnl-bindir`
+
+if test x${TNL_TEST} = x;
+then
+    echo ""
+    echo "WARNING !!!"
+    echo ""
+    echo "Your system does not see TNL which was installed right now."
+    echo "You need to add it to your system variables PATH and LD_LIBRARY_PATH."
+    echo "Add the following to your .bashrc file:"
+    echo ""
+    
+    PREFIX=${HOME}/local
+    for option in "$@"
+    do
+        case $option in
+            --prefix=*                     ) PREFIX="${option#*=}" ;;
+        esac
+    done
+
+    echo "if test x\${PATH} = x;"
+    echo "then"
+    echo "   PATH=${PREFIX}/bin"
+    echo "else"
+    echo "   PATH=\${PATH}:${PREFIX}/bin"
+    echo "fi"
+    echo "if test x\${LD_LIBRARY_PATH} = x;"
+    echo "then"
+    echo "   LD_LIBRARY_PATH=${PREFIX}/lib"
+    echo "else"
+    echo "   LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:${PREFIX}/lib"
+    echo "fi"
+    echo "export PATH"
+    echo "export LD_LIBRARY_PATH"
+fi
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d6c150901a9f30bed8ac8846dc07c38c57c53dc8..6851150184a1b4c0077b75ed992740f14442e109 100755
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,5 +1,5 @@
 INCLUDE_DIRECTORIES( config )
-ADD_SUBDIRECTORY( functions )
+ADD_SUBDIRECTORY( functors )
 ADD_SUBDIRECTORY( config )
 ADD_SUBDIRECTORY( core )
 ADD_SUBDIRECTORY( debug )
@@ -10,7 +10,7 @@ ADD_SUBDIRECTORY( problems )
 ADD_SUBDIRECTORY( solvers )
 ADD_SUBDIRECTORY( legacy )
 
-set( tnl_SOURCES ${tnl_functions_SOURCES}
+set( tnl_SOURCES ${tnl_functors_SOURCES}
                  ${tnl_config_SOURCES}
                  ${tnl_core_SOURCES}
                  ${tnl_legacy_SOURCES}
@@ -21,7 +21,7 @@ set( tnl_SOURCES ${tnl_functions_SOURCES}
                  ${tnl_problems_SOURCES}
                   )
 
-set( tnl_CUDA__SOURCES ${tnl_functions_CUDA__SOURCES}
+set( tnl_CUDA__SOURCES ${tnl_functors_CUDA__SOURCES}
                        ${tnl_config_CUDA__SOURCES}
                        ${tnl_core_CUDA__SOURCES}
                        ${tnl_legacy_CUDA__SOURCES}
@@ -35,6 +35,9 @@ set( tnl_CUDA__SOURCES ${tnl_functions_CUDA__SOURCES}
 if( BUILD_CUDA )
    CUDA_ADD_LIBRARY( tnl${debugExt}-${tnlVersion} SHARED ${tnl_CUDA__SOURCES}
                                                   OPTIONS ${CUDA_ADD_LIBRARY_OPTIONS} )
+    if( HAVE_CUBLAS )
+       CUDA_ADD_CUBLAS_TO_TARGET( tnl${debugExt}-${tnlVersion} )
+    endif( HAVE_CUBLAS )
 else( BUILD_CUDA )
    ADD_LIBRARY( tnl${debugExt}-${tnlVersion} SHARED 
                 ${tnl_SOURCES} )
@@ -51,6 +54,9 @@ IF( BUILD_MPI )
    if( BUILD_CUDA )
       CUDA_ADD_LIBRARY( tnl-mpi${debugExt}-${tnlVersion} SHARED ${tnl_CUDA__SOURCES} 
                                                          OPTIONS ${CUDA_ADD_LIBRARY_OPTIONS} )
+      if( HAVE_CUBLAS )
+         CUDA_ADD_CUBLAS_TO_TARGET( tnl-mpi${debugExt}-${tnlVersion} )
+      endif( HAVE_CUBLAS )
    else( BUILD_CUDA )
          ADD_LIBRARY( tnl-mpi${debugExt}-${tnlVersion} SHARED
                       ${tnl_SOURCES} )  
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index c4731dfc983be9b3b40f7bd8da2bc7de19944070..5ac2823e6035003a69638bb37ed74c9083e85ea1 100755
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -4,30 +4,32 @@ ADD_SUBDIRECTORY( cuda )
 ADD_SUBDIRECTORY( vectors )
 
 set (headers tnlAssert.h               
+             tnlConstants.h
              tnlCurve.h
-      	    tnlCuda.h
-  		       tnlDataElement.h
-  		       tnlDevice.h
-  		       tnlDynamicTypeTag.h
-  		       tnlFeature.h
-  		       tnlFile.h 
-  		       tnlFlopsCounter.h
-   		    tnlHost.h 
-   		    tnlIndexedSet.h
-   		    tnlList.h
-   		    tnlLogger.h 
-   		    tnlObject.h 
-   		    tnlStack.h
-   		    tnlStaticFor.h
-   		    tnlStatistics.h 
-   		    tnlString.h 
-   		    tnlReal.h
-   		    tnlTimerCPU.h  
-   		    tnlTimerRT.h    
-   		    mfilename.h 
-   		    mfuncs.h 
-   		    mpi-supp.h 
-   		    param-types.h
+      	     tnlCuda.h
+             tnlCudaDeviceInfo.h
+             tnlDataElement.h
+  	     tnlDevice.h
+             tnlDynamicTypeTag.h
+             tnlFeature.h
+             tnlFile.h 
+             tnlFlopsCounter.h
+             tnlHost.h 
+             tnlIndexedSet.h
+             tnlList.h
+             tnlLogger.h 
+             tnlObject.h 
+             tnlStack.h
+             tnlStaticFor.h
+             tnlStatistics.h 
+             tnlString.h 
+             tnlReal.h
+             tnlTimerCPU.h  
+             tnlTimerRT.h    
+             mfilename.h 
+             mfuncs.h 
+             mpi-supp.h 
+             param-types.h
              tnlCuda_impl.h
              tnlLogger_impl.h 
              tnlIndexedSet_impl.h )
@@ -55,6 +57,7 @@ IF( BUILD_CUDA )
         ${tnl_core_vectors_CUDA__SOURCES}
         ${common_SOURCES} 
         ${CURRENT_DIR}/tnlCuda.cu
+        ${CURRENT_DIR}/tnlCudaDeviceInfo.cu
         PARENT_SCOPE )
 ENDIF()    
 
@@ -64,6 +67,7 @@ set( tnl_core_SOURCES
      ${tnl_core_cuda_SOURCES}
      ${tnl_core_vectors_SOURCES}
      ${common_SOURCES}
+     ${CURRENT_DIR}/tnlCudaDeviceInfo.cpp
      PARENT_SCOPE )
     
 
diff --git a/src/core/arrays/tnlArray.h b/src/core/arrays/tnlArray.h
index e5f96e3c91014c72bdeba15515a6c4df15a644f3..6ebf9399b58afc449d8f5ec3fefa9cdfcd0df2cd 100644
--- a/src/core/arrays/tnlArray.h
+++ b/src/core/arrays/tnlArray.h
@@ -61,24 +61,15 @@ class tnlArray : public virtual tnlObject
 
    void reset();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Index getSize() const;
+   __cuda_callable__ Index getSize() const;
 
    void setElement( const Index i, const Element& x );
 
    Element getElement( Index i ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Element& operator[] ( Index i );
+   __cuda_callable__ Element& operator[] ( Index i );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const Element& operator[] ( Index i ) const;
+   __cuda_callable__ const Element& operator[] ( Index i ) const;
 
    tnlArray< Element, Device, Index >& operator = ( const tnlArray< Element, Device, Index >& array );
 
@@ -93,15 +84,9 @@ class tnlArray : public virtual tnlObject
 
    void setValue( const Element& e );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const Element* getData() const;
+   __cuda_callable__ const Element* getData() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Element* getData();
+   __cuda_callable__ Element* getData();
 
    /*!
     * Returns true if non-zero size is set.
diff --git a/src/core/arrays/tnlArrayOperationsCuda_impl.cpp b/src/core/arrays/tnlArrayOperationsCuda_impl.cpp
index b1dba6e09237516ef773e933b372e84b817010e4..18cc3c6280e573e47c6b0a218bca034ac793d086 100644
--- a/src/core/arrays/tnlArrayOperationsCuda_impl.cpp
+++ b/src/core/arrays/tnlArrayOperationsCuda_impl.cpp
@@ -22,152 +22,275 @@
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< char,        int >( char*& data, const int size );
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< int,         int >( int*& data, const int size );
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< long int,    int >( long int*& data, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< float,       int >( float*& data, const int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< double,      int >( double*& data, const int size );
-//template bool tnlArrayOperations< tnlCuda >::allocateMemory< long double, int >( long double*& data, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::allocateMemory< long double, int >( long double*& data, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< char,        long int >( char*& data, const long int size );
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< int,         long int >( int*& data, const long int size );
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< long int,    long int >( long int*& data, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< float,       long int >( float*& data, const long int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< double,      long int >( double*& data, const long int size );
-//template bool tnlArrayOperations< tnlCuda >::allocateMemory< long double, long int >( long double*& data, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::allocateMemory< long double, long int >( long double*& data, const long int size );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlCuda >::freeMemory< char        >( char* data );
 template bool tnlArrayOperations< tnlCuda >::freeMemory< int         >( int* data );
 template bool tnlArrayOperations< tnlCuda >::freeMemory< long int    >( long int* data );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::freeMemory< float       >( float* data );
+#endif
 template bool tnlArrayOperations< tnlCuda >::freeMemory< double      >( double* data );
-//template bool tnlArrayOperations< tnlCuda >::freeMemory< long double >( long double* data );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::freeMemory< long double >( long double* data );
+#endif
 
 template void tnlArrayOperations< tnlCuda >::setMemoryElement< char        >( char* data, const char& value );
 template void tnlArrayOperations< tnlCuda >::setMemoryElement< int         >( int* data, const int& value );
 template void tnlArrayOperations< tnlCuda >::setMemoryElement< long int    >( long int* data, const long int& value );
+#ifdef INSTANTIATE_FLOAT
 template void tnlArrayOperations< tnlCuda >::setMemoryElement< float       >( float* data, const float& value );
+#endif
 template void tnlArrayOperations< tnlCuda >::setMemoryElement< double      >( double* data, const double& value );
-//template void tnlArrayOperations< tnlCuda >::setMemoryElement< long double >( long double* data, const long double& value );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template void tnlArrayOperations< tnlCuda >::setMemoryElement< long double >( long double* data, const long double& value );
+#endif
 
 template char        tnlArrayOperations< tnlCuda >::getMemoryElement< char        >( const char* data );
 template int         tnlArrayOperations< tnlCuda >::getMemoryElement< int         >( const int* data );
 template long int    tnlArrayOperations< tnlCuda >::getMemoryElement< long int    >( const long int* data );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlArrayOperations< tnlCuda >::getMemoryElement< float       >( const float* data );
+#endif
 template double      tnlArrayOperations< tnlCuda >::getMemoryElement< double      >( const double* data );
-//template long double tnlArrayOperations< tnlCuda >::getMemoryElement< long double >( const long double* data );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlArrayOperations< tnlCuda >::getMemoryElement< long double >( const long double* data );
+#endif
 
 template char&        tnlArrayOperations< tnlCuda >::getArrayElementReference< char,        int >( char* data, const int i );
 template int&         tnlArrayOperations< tnlCuda >::getArrayElementReference< int,         int >( int* data, const int i );
 template long int&    tnlArrayOperations< tnlCuda >::getArrayElementReference< long int,    int >( long int* data, const int i );
+#ifdef INSTANTIATE_FLOAT
 template float&       tnlArrayOperations< tnlCuda >::getArrayElementReference< float,       int >( float* data, const int i );
+#endif
 template double&      tnlArrayOperations< tnlCuda >::getArrayElementReference< double,      int >( double* data, const int i );
-//template long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, int >( long double* data, const int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, int >( long double* data, const int i );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template char&        tnlArrayOperations< tnlCuda >::getArrayElementReference< char,        long int >( char* data, const long int i );
 template int&         tnlArrayOperations< tnlCuda >::getArrayElementReference< int,         long int >( int* data, const long int i );
 template long int&    tnlArrayOperations< tnlCuda >::getArrayElementReference< long int,    long int >( long int* data, const long int i );
+#ifdef INSTANTIATE_FLOAT
 template float&       tnlArrayOperations< tnlCuda >::getArrayElementReference< float,       long int >( float* data, const long int i );
+#endif
 template double&      tnlArrayOperations< tnlCuda >::getArrayElementReference< double,      long int >( double* data, const long int i );
-//template long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, long int >( long double* data, const long int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, long int >( long double* data, const long int i );
+#endif
+#endif
 
 template const char&        tnlArrayOperations< tnlCuda >::getArrayElementReference< char,        int >( const char* data, const int i );
 template const int&         tnlArrayOperations< tnlCuda >::getArrayElementReference< int,         int >( const int* data, const int i );
 template const long int&    tnlArrayOperations< tnlCuda >::getArrayElementReference< long int,    int >( const long int* data, const int i );
+#ifdef INSTANTIATE_FLOAT
 template const float&       tnlArrayOperations< tnlCuda >::getArrayElementReference< float,       int >( const float* data, const int i );
+#endif
 template const double&      tnlArrayOperations< tnlCuda >::getArrayElementReference< double,      int >( const double* data, const int i );
-//template const long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, int >( const long double* data, const int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template const long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, int >( const long double* data, const int i );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template const char&        tnlArrayOperations< tnlCuda >::getArrayElementReference< char,        long int >( const char* data, const long int i );
 template const int&         tnlArrayOperations< tnlCuda >::getArrayElementReference< int,         long int >( const int* data, const long int i );
 template const long int&    tnlArrayOperations< tnlCuda >::getArrayElementReference< long int,    long int >( const long int* data, const long int i );
+#ifdef INSTANTIATE_FLOAT
 template const float&       tnlArrayOperations< tnlCuda >::getArrayElementReference< float,       long int >( const float* data, const long int i );
+#endif
 template const double&      tnlArrayOperations< tnlCuda >::getArrayElementReference< double,      long int >( const double* data, const long int i );
-//template const long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, long int >( const long double* data, const long int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template const long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, long int >( const long double* data, const long int i );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlCuda >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
 template bool tnlArrayOperations< tnlCuda >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
 template bool tnlArrayOperations< tnlCuda >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
-//template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlCuda >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
 template bool tnlArrayOperations< tnlCuda >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
 template bool tnlArrayOperations< tnlCuda >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
-//template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#endif
+#endif
+
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
+#endif
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
-//template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
+#endif
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
-//template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#endif
+#endif
+
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
+#endif
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
-//template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
+#endif
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
-//template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlCuda >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
 template bool tnlArrayOperations< tnlCuda >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
 template bool tnlArrayOperations< tnlCuda >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
-//template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlCuda >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
 template bool tnlArrayOperations< tnlCuda >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
 template bool tnlArrayOperations< tnlCuda >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
-//template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#endif
+#endif
+
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
+#endif
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
-//template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
+#endif
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
-//template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#endif
+#endif
+
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
+#endif
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
-//template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
+#endif
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
-//template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
-
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlCuda >::setMemory< char,        int >( char* destination, const char& value, const int size );
 template bool tnlArrayOperations< tnlCuda >::setMemory< int,         int >( int* destination, const int& value, const int size );
 template bool tnlArrayOperations< tnlCuda >::setMemory< long int,    int >( long int* destination, const long int& value, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::setMemory< float,       int >( float* destination, const float& value, const int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::setMemory< double,      int >( double* destination, const double& value, const int size );
-//template bool tnlArrayOperations< tnlCuda >::setMemory< long double, int >( long double* destination, const long double& value, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::setMemory< long double, int >( long double* destination, const long double& value, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlCuda >::setMemory< char,        long int >( char* destination, const char& value, const long int size );
 template bool tnlArrayOperations< tnlCuda >::setMemory< int,         long int >( int* destination, const int& value, const long int size );
 template bool tnlArrayOperations< tnlCuda >::setMemory< long int,    long int >( long int* destination, const long int& value, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::setMemory< float,       long int >( float* destination, const float& value, const long int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::setMemory< double,      long int >( double* destination, const double& value, const long int size );
-//template bool tnlArrayOperations< tnlCuda >::setMemory< long double, long int >( long double* destination, const long double& value, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::setMemory< long double, long int >( long double* destination, const long double& value, const long int size );
+#endif
+#endif
 
 #endif
 
diff --git a/src/core/arrays/tnlArrayOperationsCuda_impl.cu b/src/core/arrays/tnlArrayOperationsCuda_impl.cu
index 326ead4c762d042f5fe31c4393d4abd7ab4393d4..38f97587561979873bb702f76e671ba5b25735df 100644
--- a/src/core/arrays/tnlArrayOperationsCuda_impl.cu
+++ b/src/core/arrays/tnlArrayOperationsCuda_impl.cu
@@ -22,155 +22,274 @@
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< char,        int >( char*& data, const int size );
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< int,         int >( int*& data, const int size );
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< long int,    int >( long int*& data, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< float,       int >( float*& data, const int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< double,      int >( double*& data, const int size );
-//template bool tnlArrayOperations< tnlCuda >::allocateMemory< long double, int >( long double*& data, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::allocateMemory< long double, int >( long double*& data, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< char,        long int >( char*& data, const long int size );
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< int,         long int >( int*& data, const long int size );
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< long int,    long int >( long int*& data, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< float,       long int >( float*& data, const long int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::allocateMemory< double,      long int >( double*& data, const long int size );
-//template bool tnlArrayOperations< tnlCuda >::allocateMemory< long double, long int >( long double*& data, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::allocateMemory< long double, long int >( long double*& data, const long int size );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlCuda >::freeMemory< char        >( char* data );
 template bool tnlArrayOperations< tnlCuda >::freeMemory< int         >( int* data );
 template bool tnlArrayOperations< tnlCuda >::freeMemory< long int    >( long int* data );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::freeMemory< float       >( float* data );
+#endif
 template bool tnlArrayOperations< tnlCuda >::freeMemory< double      >( double* data );
-//template bool tnlArrayOperations< tnlCuda >::freeMemory< long double >( long double* data );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::freeMemory< long double >( long double* data );
+#endif
 
 template void tnlArrayOperations< tnlCuda >::setMemoryElement< char        >( char* data, const char& value );
 template void tnlArrayOperations< tnlCuda >::setMemoryElement< int         >( int* data, const int& value );
 template void tnlArrayOperations< tnlCuda >::setMemoryElement< long int    >( long int* data, const long int& value );
+#ifdef INSTANTIATE_FLOAT
 template void tnlArrayOperations< tnlCuda >::setMemoryElement< float       >( float* data, const float& value );
+#endif
 template void tnlArrayOperations< tnlCuda >::setMemoryElement< double      >( double* data, const double& value );
-//template void tnlArrayOperations< tnlCuda >::setMemoryElement< long double >( long double* data, const long double& value );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template void tnlArrayOperations< tnlCuda >::setMemoryElement< long double >( long double* data, const long double& value );
+#endif
 
 template char        tnlArrayOperations< tnlCuda >::getMemoryElement< char        >( const char* data );
 template int         tnlArrayOperations< tnlCuda >::getMemoryElement< int         >( const int* data );
 template long int    tnlArrayOperations< tnlCuda >::getMemoryElement< long int    >( const long int* data );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlArrayOperations< tnlCuda >::getMemoryElement< float       >( const float* data );
+#endif
 template double      tnlArrayOperations< tnlCuda >::getMemoryElement< double      >( const double* data );
-//template long double tnlArrayOperations< tnlCuda >::getMemoryElement< long double >( const long double* data );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlArrayOperations< tnlCuda >::getMemoryElement< long double >( const long double* data );
+#endif
 
 template char&        tnlArrayOperations< tnlCuda >::getArrayElementReference< char,        int >( char* data, const int i );
 template int&         tnlArrayOperations< tnlCuda >::getArrayElementReference< int,         int >( int* data, const int i );
 template long int&    tnlArrayOperations< tnlCuda >::getArrayElementReference< long int,    int >( long int* data, const int i );
+#ifdef INSTANTIATE_FLOAT
 template float&       tnlArrayOperations< tnlCuda >::getArrayElementReference< float,       int >( float* data, const int i );
+#endif
 template double&      tnlArrayOperations< tnlCuda >::getArrayElementReference< double,      int >( double* data, const int i );
-//template long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, int >( long double* data, const int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, int >( long double* data, const int i );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template char&        tnlArrayOperations< tnlCuda >::getArrayElementReference< char,        long int >( char* data, const long int i );
 template int&         tnlArrayOperations< tnlCuda >::getArrayElementReference< int,         long int >( int* data, const long int i );
 template long int&    tnlArrayOperations< tnlCuda >::getArrayElementReference< long int,    long int >( long int* data, const long int i );
+#ifdef INSTANTIATE_FLOAT
 template float&       tnlArrayOperations< tnlCuda >::getArrayElementReference< float,       long int >( float* data, const long int i );
+#endif
 template double&      tnlArrayOperations< tnlCuda >::getArrayElementReference< double,      long int >( double* data, const long int i );
-//template long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, long int >( long double* data, const long int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, long int >( long double* data, const long int i );
+#endif
+#endif
 
 template const char&        tnlArrayOperations< tnlCuda >::getArrayElementReference< char,        int >( const char* data, const int i );
 template const int&         tnlArrayOperations< tnlCuda >::getArrayElementReference< int,         int >( const int* data, const int i );
 template const long int&    tnlArrayOperations< tnlCuda >::getArrayElementReference< long int,    int >( const long int* data, const int i );
+#ifdef INSTANTIATE_FLOAT
 template const float&       tnlArrayOperations< tnlCuda >::getArrayElementReference< float,       int >( const float* data, const int i );
+#endif
 template const double&      tnlArrayOperations< tnlCuda >::getArrayElementReference< double,      int >( const double* data, const int i );
-//template const long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, int >( const long double* data, const int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template const long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, int >( const long double* data, const int i );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template const char&        tnlArrayOperations< tnlCuda >::getArrayElementReference< char,        long int >( const char* data, const long int i );
 template const int&         tnlArrayOperations< tnlCuda >::getArrayElementReference< int,         long int >( const int* data, const long int i );
 template const long int&    tnlArrayOperations< tnlCuda >::getArrayElementReference< long int,    long int >( const long int* data, const long int i );
+#ifdef INSTANTIATE_FLOAT
 template const float&       tnlArrayOperations< tnlCuda >::getArrayElementReference< float,       long int >( const float* data, const long int i );
+#endif
 template const double&      tnlArrayOperations< tnlCuda >::getArrayElementReference< double,      long int >( const double* data, const long int i );
-//template const long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, long int >( const long double* data, const long int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template const long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, long int >( const long double* data, const long int i );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlCuda >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
 template bool tnlArrayOperations< tnlCuda >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
 template bool tnlArrayOperations< tnlCuda >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
-//template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlCuda >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
 template bool tnlArrayOperations< tnlCuda >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
 template bool tnlArrayOperations< tnlCuda >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
-//template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#endif
+#endif
+
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
+#endif
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
-//template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
+#endif
 template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
-//template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#endif
+#endif
+
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
+#endif
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
-//template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
+#endif
 template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
-//template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlCuda >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
 template bool tnlArrayOperations< tnlCuda >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
 template bool tnlArrayOperations< tnlCuda >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
-//template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlCuda >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
 template bool tnlArrayOperations< tnlCuda >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
 template bool tnlArrayOperations< tnlCuda >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
-//template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#endif
+#endif
+
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
+#endif
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
-//template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
+#endif
 template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
-//template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#endif
+#endif
+
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
+#endif
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
-//template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
+#endif
 template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
-//template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
-
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlCuda >::setMemory< char,        int >( char* destination, const char& value, const int size );
 template bool tnlArrayOperations< tnlCuda >::setMemory< int,         int >( int* destination, const int& value, const int size );
 template bool tnlArrayOperations< tnlCuda >::setMemory< long int,    int >( long int* destination, const long int& value, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::setMemory< float,       int >( float* destination, const float& value, const int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::setMemory< double,      int >( double* destination, const double& value, const int size );
-//template bool tnlArrayOperations< tnlCuda >::setMemory< long double, int >( long double* destination, const long double& value, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::setMemory< long double, int >( long double* destination, const long double& value, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlCuda >::setMemory< char,        long int >( char* destination, const char& value, const long int size );
 template bool tnlArrayOperations< tnlCuda >::setMemory< int,         long int >( int* destination, const int& value, const long int size );
 template bool tnlArrayOperations< tnlCuda >::setMemory< long int,    long int >( long int* destination, const long int& value, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlCuda >::setMemory< float,       long int >( float* destination, const float& value, const long int size );
+#endif
 template bool tnlArrayOperations< tnlCuda >::setMemory< double,      long int >( double* destination, const double& value, const long int size );
-//template bool tnlArrayOperations< tnlCuda >::setMemory< long double, long int >( long double* destination, const long double& value, const long int size );
-
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool tnlArrayOperations< tnlCuda >::setMemory< long double, long int >( long double* destination, const long double& value, const long int size );
+#endif
 #endif
 
-
-
-
+#endif
diff --git a/src/core/arrays/tnlArrayOperationsCuda_impl.h b/src/core/arrays/tnlArrayOperationsCuda_impl.h
index 5622d4961d85ba07e018b5bedd6dbdb44abfacc1..4179c38ffa9292b72612d7b4f093b0ef58f68382 100644
--- a/src/core/arrays/tnlArrayOperationsCuda_impl.h
+++ b/src/core/arrays/tnlArrayOperationsCuda_impl.h
@@ -374,151 +374,275 @@ bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory( const Element1* host
 extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< char,        int >( char*& data, const int size );
 extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< int,         int >( int*& data, const int size );
 extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< long int,    int >( long int*& data, const int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< float,       int >( float*& data, const int size );
+#endif
 extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< double,      int >( double*& data, const int size );
-//extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< long double, int >( long double*& data, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< long double, int >( long double*& data, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< char,        long int >( char*& data, const long int size );
 extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< int,         long int >( int*& data, const long int size );
 extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< long int,    long int >( long int*& data, const long int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< float,       long int >( float*& data, const long int size );
+#endif
 extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< double,      long int >( double*& data, const long int size );
-//extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< long double, long int >( long double*& data, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda >::allocateMemory< long double, long int >( long double*& data, const long int size );
+#endif
+#endif
 
 extern template bool tnlArrayOperations< tnlCuda >::freeMemory< char        >( char* data );
 extern template bool tnlArrayOperations< tnlCuda >::freeMemory< int         >( int* data );
 extern template bool tnlArrayOperations< tnlCuda >::freeMemory< long int    >( long int* data );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda >::freeMemory< float       >( float* data );
+#endif
 extern template bool tnlArrayOperations< tnlCuda >::freeMemory< double      >( double* data );
-//extern template bool tnlArrayOperations< tnlCuda >::freeMemory< long double >( long double* data );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda >::freeMemory< long double >( long double* data );
+#endif
 
 extern template void tnlArrayOperations< tnlCuda >::setMemoryElement< char        >( char* data, const char& value );
 extern template void tnlArrayOperations< tnlCuda >::setMemoryElement< int         >( int* data, const int& value );
 extern template void tnlArrayOperations< tnlCuda >::setMemoryElement< long int    >( long int* data, const long int& value );
+#ifdef INSTANTIATE_FLOAT
 extern template void tnlArrayOperations< tnlCuda >::setMemoryElement< float       >( float* data, const float& value );
+#endif
 extern template void tnlArrayOperations< tnlCuda >::setMemoryElement< double      >( double* data, const double& value );
-//extern template void tnlArrayOperations< tnlCuda >::setMemoryElement< long double >( long double* data, const long double& value );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template void tnlArrayOperations< tnlCuda >::setMemoryElement< long double >( long double* data, const long double& value );
+#endif
 
 extern template char        tnlArrayOperations< tnlCuda >::getMemoryElement< char        >( const char* data );
 extern template int         tnlArrayOperations< tnlCuda >::getMemoryElement< int         >( const int* data );
 extern template long int    tnlArrayOperations< tnlCuda >::getMemoryElement< long int    >( const long int* data );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlArrayOperations< tnlCuda >::getMemoryElement< float       >( const float* data );
+#endif
 extern template double      tnlArrayOperations< tnlCuda >::getMemoryElement< double      >( const double* data );
-//extern template long double tnlArrayOperations< tnlCuda >::getMemoryElement< long double >( const long double* data );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlArrayOperations< tnlCuda >::getMemoryElement< long double >( const long double* data );
+#endif
 
 extern template char&        tnlArrayOperations< tnlCuda >::getArrayElementReference< char,        int >( char* data, const int i );
 extern template int&         tnlArrayOperations< tnlCuda >::getArrayElementReference< int,         int >( int* data, const int i );
 extern template long int&    tnlArrayOperations< tnlCuda >::getArrayElementReference< long int,    int >( long int* data, const int i );
+#ifdef INSTANTIATE_FLOAT
 extern template float&       tnlArrayOperations< tnlCuda >::getArrayElementReference< float,       int >( float* data, const int i );
+#endif
 extern template double&      tnlArrayOperations< tnlCuda >::getArrayElementReference< double,      int >( double* data, const int i );
-//extern template long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, int >( long double* data, const int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, int >( long double* data, const int i );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template char&        tnlArrayOperations< tnlCuda >::getArrayElementReference< char,        long int >( char* data, const long int i );
 extern template int&         tnlArrayOperations< tnlCuda >::getArrayElementReference< int,         long int >( int* data, const long int i );
 extern template long int&    tnlArrayOperations< tnlCuda >::getArrayElementReference< long int,    long int >( long int* data, const long int i );
+#ifdef INSTANTIATE_FLOAT
 extern template float&       tnlArrayOperations< tnlCuda >::getArrayElementReference< float,       long int >( float* data, const long int i );
+#endif
 extern template double&      tnlArrayOperations< tnlCuda >::getArrayElementReference< double,      long int >( double* data, const long int i );
-//extern template long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, long int >( long double* data, const long int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, long int >( long double* data, const long int i );
+#endif
+#endif
 
 extern template const char&        tnlArrayOperations< tnlCuda >::getArrayElementReference< char,        int >( const char* data, const int i );
 extern template const int&         tnlArrayOperations< tnlCuda >::getArrayElementReference< int,         int >( const int* data, const int i );
 extern template const long int&    tnlArrayOperations< tnlCuda >::getArrayElementReference< long int,    int >( const long int* data, const int i );
+#ifdef INSTANTIATE_FLOAT
 extern template const float&       tnlArrayOperations< tnlCuda >::getArrayElementReference< float,       int >( const float* data, const int i );
+#endif
 extern template const double&      tnlArrayOperations< tnlCuda >::getArrayElementReference< double,      int >( const double* data, const int i );
-//extern template const long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, int >( const long double* data, const int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template const long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, int >( const long double* data, const int i );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template const char&        tnlArrayOperations< tnlCuda >::getArrayElementReference< char,        long int >( const char* data, const long int i );
 extern template const int&         tnlArrayOperations< tnlCuda >::getArrayElementReference< int,         long int >( const int* data, const long int i );
 extern template const long int&    tnlArrayOperations< tnlCuda >::getArrayElementReference< long int,    long int >( const long int* data, const long int i );
+#ifdef INSTANTIATE_FLOAT
 extern template const float&       tnlArrayOperations< tnlCuda >::getArrayElementReference< float,       long int >( const float* data, const long int i );
+#endif
 extern template const double&      tnlArrayOperations< tnlCuda >::getArrayElementReference< double,      long int >( const double* data, const long int i );
-//extern template const long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, long int >( const long double* data, const long int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template const long double& tnlArrayOperations< tnlCuda >::getArrayElementReference< long double, long int >( const long double* data, const long int i );
+#endif
+#endif
 
 extern template bool tnlArrayOperations< tnlCuda >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
 extern template bool tnlArrayOperations< tnlCuda >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
 extern template bool tnlArrayOperations< tnlCuda >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
+#endif
 extern template bool tnlArrayOperations< tnlCuda >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
-//extern template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template bool tnlArrayOperations< tnlCuda >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
 extern template bool tnlArrayOperations< tnlCuda >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
 extern template bool tnlArrayOperations< tnlCuda >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
+#endif
 extern template bool tnlArrayOperations< tnlCuda >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
-//extern template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#endif
+#endif
+
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
+#endif
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
-//extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
+#endif
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
-//extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda, tnlHost >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#endif
+#endif
+
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
+#endif
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
-//extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
+#endif
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
-//extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#endif
+#endif
 
 extern template bool tnlArrayOperations< tnlCuda >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
 extern template bool tnlArrayOperations< tnlCuda >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
 extern template bool tnlArrayOperations< tnlCuda >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
+#endif
 extern template bool tnlArrayOperations< tnlCuda >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
-//extern template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template bool tnlArrayOperations< tnlCuda >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
 extern template bool tnlArrayOperations< tnlCuda >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
 extern template bool tnlArrayOperations< tnlCuda >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
+#endif
 extern template bool tnlArrayOperations< tnlCuda >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
-//extern template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#endif
+#endif
+
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
+#endif
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
-//extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
+#endif
 extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
-//extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda, tnlHost >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#endif
+#endif
+
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
+#endif
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
-//extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
+#endif
 extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
-//extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#endif
+#endif
 
 extern template bool tnlArrayOperations< tnlCuda >::setMemory< char,        int >( char* destination, const char& value, const int size );
 extern template bool tnlArrayOperations< tnlCuda >::setMemory< int,         int >( int* destination, const int& value, const int size );
 extern template bool tnlArrayOperations< tnlCuda >::setMemory< long int,    int >( long int* destination, const long int& value, const int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda >::setMemory< float,       int >( float* destination, const float& value, const int size );
+#endif
 extern template bool tnlArrayOperations< tnlCuda >::setMemory< double,      int >( double* destination, const double& value, const int size );
-//extern template bool tnlArrayOperations< tnlCuda >::setMemory< long double, int >( long double* destination, const long double& value, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda >::setMemory< long double, int >( long double* destination, const long double& value, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template bool tnlArrayOperations< tnlCuda >::setMemory< char,        long int >( char* destination, const char& value, const long int size );
 extern template bool tnlArrayOperations< tnlCuda >::setMemory< int,         long int >( int* destination, const int& value, const long int size );
 extern template bool tnlArrayOperations< tnlCuda >::setMemory< long int,    long int >( long int* destination, const long int& value, const long int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlCuda >::setMemory< float,       long int >( float* destination, const float& value, const long int size );
+#endif
 extern template bool tnlArrayOperations< tnlCuda >::setMemory< double,      long int >( double* destination, const double& value, const long int size );
-//extern template bool tnlArrayOperations< tnlCuda >::setMemory< long double, long int >( long double* destination, const long double& value, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool tnlArrayOperations< tnlCuda >::setMemory< long double, long int >( long double* destination, const long double& value, const long int size );
+#endif
+#endif
 
 #endif
 
diff --git a/src/core/arrays/tnlArrayOperationsHost_impl.cpp b/src/core/arrays/tnlArrayOperationsHost_impl.cpp
index 65af28f12ecb3206a2fffb37ed552ea943e83aa8..c2688d84be3411a2bba1e6ac955afdea001e5de8 100644
--- a/src/core/arrays/tnlArrayOperationsHost_impl.cpp
+++ b/src/core/arrays/tnlArrayOperationsHost_impl.cpp
@@ -22,106 +22,178 @@
 template bool tnlArrayOperations< tnlHost >::allocateMemory< char,        int >( char*& data, const int size );
 template bool tnlArrayOperations< tnlHost >::allocateMemory< int,         int >( int*& data, const int size );
 template bool tnlArrayOperations< tnlHost >::allocateMemory< long int,    int >( long int*& data, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::allocateMemory< float,       int >( float*& data, const int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::allocateMemory< double,      int >( double*& data, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::allocateMemory< long double, int >( long double*& data, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlHost >::allocateMemory< char,        long int >( char*& data, const long int size );
 template bool tnlArrayOperations< tnlHost >::allocateMemory< int,         long int >( int*& data, const long int size );
 template bool tnlArrayOperations< tnlHost >::allocateMemory< long int,    long int >( long int*& data, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::allocateMemory< float,       long int >( float*& data, const long int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::allocateMemory< double,      long int >( double*& data, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::allocateMemory< long double, long int >( long double*& data, const long int size );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlHost >::freeMemory< char        >( char* data );
 template bool tnlArrayOperations< tnlHost >::freeMemory< int         >( int* data );
 template bool tnlArrayOperations< tnlHost >::freeMemory< long int    >( long int* data );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::freeMemory< float       >( float* data );
+#endif
 template bool tnlArrayOperations< tnlHost >::freeMemory< double      >( double* data );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::freeMemory< long double >( long double* data );
+#endif
 
 template void tnlArrayOperations< tnlHost >::setMemoryElement< char        >( char* data, const char& value );
 template void tnlArrayOperations< tnlHost >::setMemoryElement< int         >( int* data, const int& value );
 template void tnlArrayOperations< tnlHost >::setMemoryElement< long int    >( long int* data, const long int& value );
+#ifdef INSTANTIATE_FLOAT
 template void tnlArrayOperations< tnlHost >::setMemoryElement< float       >( float* data, const float& value );
+#endif
 template void tnlArrayOperations< tnlHost >::setMemoryElement< double      >( double* data, const double& value );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template void tnlArrayOperations< tnlHost >::setMemoryElement< long double >( long double* data, const long double& value );
+#endif
 
 template char        tnlArrayOperations< tnlHost >::getMemoryElement< char        >( char* data );
 template int         tnlArrayOperations< tnlHost >::getMemoryElement< int         >( int* data );
 template long int    tnlArrayOperations< tnlHost >::getMemoryElement< long int    >( long int* data );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlArrayOperations< tnlHost >::getMemoryElement< float       >( float* data );
+#endif
 template double      tnlArrayOperations< tnlHost >::getMemoryElement< double      >( double* data );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlArrayOperations< tnlHost >::getMemoryElement< long double >( long double* data );
+#endif
 
 template char&        tnlArrayOperations< tnlHost >::getArrayElementReference< char,        int >( char* data, const int i );
 template int&         tnlArrayOperations< tnlHost >::getArrayElementReference< int,         int >( int* data, const int i );
 template long int&    tnlArrayOperations< tnlHost >::getArrayElementReference< long int,    int >( long int* data, const int i );
+#ifdef INSTANTIATE_FLOAT
 template float&       tnlArrayOperations< tnlHost >::getArrayElementReference< float,       int >( float* data, const int i );
+#endif
 template double&      tnlArrayOperations< tnlHost >::getArrayElementReference< double,      int >( double* data, const int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double& tnlArrayOperations< tnlHost >::getArrayElementReference< long double, int >( long double* data, const int i );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template char&        tnlArrayOperations< tnlHost >::getArrayElementReference< char,        long int >( char* data, const long int i );
 template int&         tnlArrayOperations< tnlHost >::getArrayElementReference< int,         long int >( int* data, const long int i );
 template long int&    tnlArrayOperations< tnlHost >::getArrayElementReference< long int,    long int >( long int* data, const long int i );
+#ifdef INSTANTIATE_FLOAT
 template float&       tnlArrayOperations< tnlHost >::getArrayElementReference< float,       long int >( float* data, const long int i );
+#endif
 template double&      tnlArrayOperations< tnlHost >::getArrayElementReference< double,      long int >( double* data, const long int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double& tnlArrayOperations< tnlHost >::getArrayElementReference< long double, long int >( long double* data, const long int i );
+#endif
+#endif
 
 template const char&        tnlArrayOperations< tnlHost >::getArrayElementReference< char,        int >( const char* data, const int i );
 template const int&         tnlArrayOperations< tnlHost >::getArrayElementReference< int,         int >( const int* data, const int i );
 template const long int&    tnlArrayOperations< tnlHost >::getArrayElementReference< long int,    int >( const long int* data, const int i );
+#ifdef INSTANTIATE_FLOAT
 template const float&       tnlArrayOperations< tnlHost >::getArrayElementReference< float,       int >( const float* data, const int i );
+#endif
 template const double&      tnlArrayOperations< tnlHost >::getArrayElementReference< double,      int >( const double* data, const int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template const long double& tnlArrayOperations< tnlHost >::getArrayElementReference< long double, int >( const long double* data, const int i );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template const char&        tnlArrayOperations< tnlHost >::getArrayElementReference< char,        long int >( const char* data, const long int i );
 template const int&         tnlArrayOperations< tnlHost >::getArrayElementReference< int,         long int >( const int* data, const long int i );
 template const long int&    tnlArrayOperations< tnlHost >::getArrayElementReference< long int,    long int >( const long int* data, const long int i );
+#ifdef INSTANTIATE_FLOAT
 template const float&       tnlArrayOperations< tnlHost >::getArrayElementReference< float,       long int >( const float* data, const long int i );
+#endif
 template const double&      tnlArrayOperations< tnlHost >::getArrayElementReference< double,      long int >( const double* data, const long int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template const long double& tnlArrayOperations< tnlHost >::getArrayElementReference< long double, long int >( const long double* data, const long int i );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlHost >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
 template bool tnlArrayOperations< tnlHost >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
 template bool tnlArrayOperations< tnlHost >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlHost >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
 template bool tnlArrayOperations< tnlHost >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
 template bool tnlArrayOperations< tnlHost >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlHost >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
 template bool tnlArrayOperations< tnlHost >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
 template bool tnlArrayOperations< tnlHost >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlHost >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
 template bool tnlArrayOperations< tnlHost >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
 template bool tnlArrayOperations< tnlHost >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlHost >::setMemory< char,        int >( char* destination, const char& value, const int size );
 template bool tnlArrayOperations< tnlHost >::setMemory< int,         int >( int* destination, const int& value, const int size );
 template bool tnlArrayOperations< tnlHost >::setMemory< long int,    int >( long int* destination, const long int& value, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::setMemory< float,       int >( float* destination, const float& value, const int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::setMemory< double,      int >( double* destination, const double& value, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::setMemory< long double, int >( long double* destination, const long double& value, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlHost >::setMemory< char,        long int >( char* destination, const char& value, const long int size );
 template bool tnlArrayOperations< tnlHost >::setMemory< int,         long int >( int* destination, const int& value, const long int size );
 template bool tnlArrayOperations< tnlHost >::setMemory< long int,    long int >( long int* destination, const long int& value, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::setMemory< float,       long int >( float* destination, const float& value, const long int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::setMemory< double,      long int >( double* destination, const double& value, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::setMemory< long double, long int >( long double* destination, const long double& value, const long int size );
-
+#endif
 #endif
 
-
-
-
+#endif
diff --git a/src/core/arrays/tnlArrayOperationsHost_impl.cu b/src/core/arrays/tnlArrayOperationsHost_impl.cu
index 8e719a923940a514c9d0444df192b02025f31dd4..2f7e7e4ebfae0af37fd42863f6d40e740f0daa30 100644
--- a/src/core/arrays/tnlArrayOperationsHost_impl.cu
+++ b/src/core/arrays/tnlArrayOperationsHost_impl.cu
@@ -22,106 +22,178 @@
 template bool tnlArrayOperations< tnlHost >::allocateMemory< char,        int >( char*& data, const int size );
 template bool tnlArrayOperations< tnlHost >::allocateMemory< int,         int >( int*& data, const int size );
 template bool tnlArrayOperations< tnlHost >::allocateMemory< long int,    int >( long int*& data, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::allocateMemory< float,       int >( float*& data, const int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::allocateMemory< double,      int >( double*& data, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::allocateMemory< long double, int >( long double*& data, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlHost >::allocateMemory< char,        long int >( char*& data, const long int size );
 template bool tnlArrayOperations< tnlHost >::allocateMemory< int,         long int >( int*& data, const long int size );
 template bool tnlArrayOperations< tnlHost >::allocateMemory< long int,    long int >( long int*& data, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::allocateMemory< float,       long int >( float*& data, const long int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::allocateMemory< double,      long int >( double*& data, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::allocateMemory< long double, long int >( long double*& data, const long int size );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlHost >::freeMemory< char        >( char* data );
 template bool tnlArrayOperations< tnlHost >::freeMemory< int         >( int* data );
 template bool tnlArrayOperations< tnlHost >::freeMemory< long int    >( long int* data );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::freeMemory< float       >( float* data );
+#endif
 template bool tnlArrayOperations< tnlHost >::freeMemory< double      >( double* data );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::freeMemory< long double >( long double* data );
+#endif
 
 template void tnlArrayOperations< tnlHost >::setMemoryElement< char        >( char* data, const char& value );
 template void tnlArrayOperations< tnlHost >::setMemoryElement< int         >( int* data, const int& value );
 template void tnlArrayOperations< tnlHost >::setMemoryElement< long int    >( long int* data, const long int& value );
+#ifdef INSTANTIATE_FLOAT
 template void tnlArrayOperations< tnlHost >::setMemoryElement< float       >( float* data, const float& value );
+#endif
 template void tnlArrayOperations< tnlHost >::setMemoryElement< double      >( double* data, const double& value );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template void tnlArrayOperations< tnlHost >::setMemoryElement< long double >( long double* data, const long double& value );
+#endif
 
 template char        tnlArrayOperations< tnlHost >::getMemoryElement< char        >( char* data );
 template int         tnlArrayOperations< tnlHost >::getMemoryElement< int         >( int* data );
 template long int    tnlArrayOperations< tnlHost >::getMemoryElement< long int    >( long int* data );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlArrayOperations< tnlHost >::getMemoryElement< float       >( float* data );
+#endif
 template double      tnlArrayOperations< tnlHost >::getMemoryElement< double      >( double* data );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlArrayOperations< tnlHost >::getMemoryElement< long double >( long double* data );
+#endif
 
 template char&        tnlArrayOperations< tnlHost >::getArrayElementReference< char,        int >( char* data, const int i );
 template int&         tnlArrayOperations< tnlHost >::getArrayElementReference< int,         int >( int* data, const int i );
 template long int&    tnlArrayOperations< tnlHost >::getArrayElementReference< long int,    int >( long int* data, const int i );
+#ifdef INSTANTIATE_FLOAT
 template float&       tnlArrayOperations< tnlHost >::getArrayElementReference< float,       int >( float* data, const int i );
+#endif
 template double&      tnlArrayOperations< tnlHost >::getArrayElementReference< double,      int >( double* data, const int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double& tnlArrayOperations< tnlHost >::getArrayElementReference< long double, int >( long double* data, const int i );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template char&        tnlArrayOperations< tnlHost >::getArrayElementReference< char,        long int >( char* data, const long int i );
 template int&         tnlArrayOperations< tnlHost >::getArrayElementReference< int,         long int >( int* data, const long int i );
 template long int&    tnlArrayOperations< tnlHost >::getArrayElementReference< long int,    long int >( long int* data, const long int i );
+#ifdef INSTANTIATE_FLOAT
 template float&       tnlArrayOperations< tnlHost >::getArrayElementReference< float,       long int >( float* data, const long int i );
+#endif
 template double&      tnlArrayOperations< tnlHost >::getArrayElementReference< double,      long int >( double* data, const long int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double& tnlArrayOperations< tnlHost >::getArrayElementReference< long double, long int >( long double* data, const long int i );
+#endif
+#endif
 
 template const char&        tnlArrayOperations< tnlHost >::getArrayElementReference< char,        int >( const char* data, const int i );
 template const int&         tnlArrayOperations< tnlHost >::getArrayElementReference< int,         int >( const int* data, const int i );
 template const long int&    tnlArrayOperations< tnlHost >::getArrayElementReference< long int,    int >( const long int* data, const int i );
+#ifdef INSTANTIATE_FLOAT
 template const float&       tnlArrayOperations< tnlHost >::getArrayElementReference< float,       int >( const float* data, const int i );
+#endif
 template const double&      tnlArrayOperations< tnlHost >::getArrayElementReference< double,      int >( const double* data, const int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template const long double& tnlArrayOperations< tnlHost >::getArrayElementReference< long double, int >( const long double* data, const int i );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template const char&        tnlArrayOperations< tnlHost >::getArrayElementReference< char,        long int >( const char* data, const long int i );
 template const int&         tnlArrayOperations< tnlHost >::getArrayElementReference< int,         long int >( const int* data, const long int i );
 template const long int&    tnlArrayOperations< tnlHost >::getArrayElementReference< long int,    long int >( const long int* data, const long int i );
+#ifdef INSTANTIATE_FLOAT
 template const float&       tnlArrayOperations< tnlHost >::getArrayElementReference< float,       long int >( const float* data, const long int i );
+#endif
 template const double&      tnlArrayOperations< tnlHost >::getArrayElementReference< double,      long int >( const double* data, const long int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template const long double& tnlArrayOperations< tnlHost >::getArrayElementReference< long double, long int >( const long double* data, const long int i );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlHost >::copyMemory< char,               char, int >( char* destination, const char* source, const int size );
 template bool tnlArrayOperations< tnlHost >::copyMemory< int,                 int, int >( int* destination, const int* source, const int size );
 template bool tnlArrayOperations< tnlHost >::copyMemory< long int,       long int, int >( long int* destination, const long int* source, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::copyMemory< float,             float, int >( float* destination, const float* source, const int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::copyMemory< double,           double, int >( double* destination, const double* source, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::copyMemory< long double, long double, int >( long double* destination, const long double* source, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlHost >::copyMemory< char,               char, long int >( char* destination, const char* source, const long int size );
 template bool tnlArrayOperations< tnlHost >::copyMemory< int,                 int, long int >( int* destination, const int* source, const long int size );
 template bool tnlArrayOperations< tnlHost >::copyMemory< long int,       long int, long int >( long int* destination, const long int* source, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::copyMemory< float,             float, long int >( float* destination, const float* source, const long int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::copyMemory< double,           double, long int >( double* destination, const double* source, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::copyMemory< long double, long double, long int >( long double* destination, const long double* source, const long int size );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlHost >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
 template bool tnlArrayOperations< tnlHost >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
 template bool tnlArrayOperations< tnlHost >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlHost >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
 template bool tnlArrayOperations< tnlHost >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
 template bool tnlArrayOperations< tnlHost >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#endif
+#endif
 
 template bool tnlArrayOperations< tnlHost >::setMemory< char,        int >( char* destination, const char& value, const int size );
 template bool tnlArrayOperations< tnlHost >::setMemory< int,         int >( int* destination, const int& value, const int size );
 template bool tnlArrayOperations< tnlHost >::setMemory< long int,    int >( long int* destination, const long int& value, const int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::setMemory< float,       int >( float* destination, const float& value, const int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::setMemory< double,      int >( double* destination, const double& value, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::setMemory< long double, int >( long double* destination, const long double& value, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template bool tnlArrayOperations< tnlHost >::setMemory< char,        long int >( char* destination, const char& value, const long int size );
 template bool tnlArrayOperations< tnlHost >::setMemory< int,         long int >( int* destination, const int& value, const long int size );
 template bool tnlArrayOperations< tnlHost >::setMemory< long int,    long int >( long int* destination, const long int& value, const long int size );
+#ifdef INSTANTIATE_FLOAT
 template bool tnlArrayOperations< tnlHost >::setMemory< float,       long int >( float* destination, const float& value, const long int size );
+#endif
 template bool tnlArrayOperations< tnlHost >::setMemory< double,      long int >( double* destination, const double& value, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool tnlArrayOperations< tnlHost >::setMemory< long double, long int >( long double* destination, const long double& value, const long int size );
-
+#endif
 #endif
 
-
-
-
+#endif
diff --git a/src/core/arrays/tnlArrayOperationsHost_impl.h b/src/core/arrays/tnlArrayOperationsHost_impl.h
index afb56febb7eae7cc3e1e1224fcf280b6fa9fe736..27ee9093ff21854ab9aff2ba573f0209af10ef13 100644
--- a/src/core/arrays/tnlArrayOperationsHost_impl.h
+++ b/src/core/arrays/tnlArrayOperationsHost_impl.h
@@ -50,22 +50,22 @@ Element tnlArrayOperations< tnlHost >::getMemoryElement( Element* data )
 
 template< typename Element, typename Index >
 Element& tnlArrayOperations< tnlHost >::getArrayElementReference( Element* data,
-                                                                    const Index i )
+                                                                  const Index i )
 {
    return data[ i ];
 };
 
 template< typename Element, typename Index >
 const Element& tnlArrayOperations< tnlHost >::getArrayElementReference( const Element* data,
-                                                                          const Index i )
+                                                                       const Index i )
 {
    return data[ i ];
 };
 
 template< typename Element, typename Index >
 bool tnlArrayOperations< tnlHost >::setMemory( Element* data,
-                                                        const Element& value,
-                                                        const Index size )
+                                               const Element& value,
+                                               const Index size )
 {
    for( Index i = 0; i < size; i ++ )
       data[ i ] = value;
@@ -111,103 +111,179 @@ bool tnlArrayOperations< tnlHost >::compareMemory( const DestinationElement* des
 extern template bool tnlArrayOperations< tnlHost >::allocateMemory< char,        int >( char*& data, const int size );
 extern template bool tnlArrayOperations< tnlHost >::allocateMemory< int,         int >( int*& data, const int size );
 extern template bool tnlArrayOperations< tnlHost >::allocateMemory< long int,    int >( long int*& data, const int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost >::allocateMemory< float,       int >( float*& data, const int size );
+#endif
 extern template bool tnlArrayOperations< tnlHost >::allocateMemory< double,      int >( double*& data, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template bool tnlArrayOperations< tnlHost >::allocateMemory< long double, int >( long double*& data, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template bool tnlArrayOperations< tnlHost >::allocateMemory< char,        long int >( char*& data, const long int size );
 extern template bool tnlArrayOperations< tnlHost >::allocateMemory< int,         long int >( int*& data, const long int size );
 extern template bool tnlArrayOperations< tnlHost >::allocateMemory< long int,    long int >( long int*& data, const long int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost >::allocateMemory< float,       long int >( float*& data, const long int size );
+#endif
 extern template bool tnlArrayOperations< tnlHost >::allocateMemory< double,      long int >( double*& data, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template bool tnlArrayOperations< tnlHost >::allocateMemory< long double, long int >( long double*& data, const long int size );
+#endif
+#endif
 
 extern template bool tnlArrayOperations< tnlHost >::freeMemory< char        >( char* data );
 extern template bool tnlArrayOperations< tnlHost >::freeMemory< int         >( int* data );
 extern template bool tnlArrayOperations< tnlHost >::freeMemory< long int    >( long int* data );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost >::freeMemory< float       >( float* data );
+#endif
 extern template bool tnlArrayOperations< tnlHost >::freeMemory< double      >( double* data );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template bool tnlArrayOperations< tnlHost >::freeMemory< long double >( long double* data );
+#endif
 
 extern template void tnlArrayOperations< tnlHost >::setMemoryElement< char        >( char* data, const char& value );
 extern template void tnlArrayOperations< tnlHost >::setMemoryElement< int         >( int* data, const int& value );
 extern template void tnlArrayOperations< tnlHost >::setMemoryElement< long int    >( long int* data, const long int& value );
+#ifdef INSTANTIATE_FLOAT
 extern template void tnlArrayOperations< tnlHost >::setMemoryElement< float       >( float* data, const float& value );
+#endif
 extern template void tnlArrayOperations< tnlHost >::setMemoryElement< double      >( double* data, const double& value );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template void tnlArrayOperations< tnlHost >::setMemoryElement< long double >( long double* data, const long double& value );
+#endif
 
 extern template char        tnlArrayOperations< tnlHost >::getMemoryElement< char        >( char* data );
 extern template int         tnlArrayOperations< tnlHost >::getMemoryElement< int         >( int* data );
 extern template long int    tnlArrayOperations< tnlHost >::getMemoryElement< long int    >( long int* data );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlArrayOperations< tnlHost >::getMemoryElement< float       >( float* data );
+#endif
 extern template double      tnlArrayOperations< tnlHost >::getMemoryElement< double      >( double* data );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlArrayOperations< tnlHost >::getMemoryElement< long double >( long double* data );
+#endif
 
 extern template char&        tnlArrayOperations< tnlHost >::getArrayElementReference< char,        int >( char* data, const int i );
 extern template int&         tnlArrayOperations< tnlHost >::getArrayElementReference< int,         int >( int* data, const int i );
 extern template long int&    tnlArrayOperations< tnlHost >::getArrayElementReference< long int,    int >( long int* data, const int i );
+#ifdef INSTANTIATE_FLOAT
 extern template float&       tnlArrayOperations< tnlHost >::getArrayElementReference< float,       int >( float* data, const int i );
+#endif
 extern template double&      tnlArrayOperations< tnlHost >::getArrayElementReference< double,      int >( double* data, const int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double& tnlArrayOperations< tnlHost >::getArrayElementReference< long double, int >( long double* data, const int i );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template char&        tnlArrayOperations< tnlHost >::getArrayElementReference< char,        long int >( char* data, const long int i );
 extern template int&         tnlArrayOperations< tnlHost >::getArrayElementReference< int,         long int >( int* data, const long int i );
 extern template long int&    tnlArrayOperations< tnlHost >::getArrayElementReference< long int,    long int >( long int* data, const long int i );
+#ifdef INSTANTIATE_FLOAT
 extern template float&       tnlArrayOperations< tnlHost >::getArrayElementReference< float,       long int >( float* data, const long int i );
+#endif
 extern template double&      tnlArrayOperations< tnlHost >::getArrayElementReference< double,      long int >( double* data, const long int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double& tnlArrayOperations< tnlHost >::getArrayElementReference< long double, long int >( long double* data, const long int i );
+#endif
+#endif
 
 extern template const char&        tnlArrayOperations< tnlHost >::getArrayElementReference< char,        int >( const char* data, const int i );
 extern template const int&         tnlArrayOperations< tnlHost >::getArrayElementReference< int,         int >( const int* data, const int i );
 extern template const long int&    tnlArrayOperations< tnlHost >::getArrayElementReference< long int,    int >( const long int* data, const int i );
+#ifdef INSTANTIATE_FLOAT
 extern template const float&       tnlArrayOperations< tnlHost >::getArrayElementReference< float,       int >( const float* data, const int i );
+#endif
 extern template const double&      tnlArrayOperations< tnlHost >::getArrayElementReference< double,      int >( const double* data, const int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template const long double& tnlArrayOperations< tnlHost >::getArrayElementReference< long double, int >( const long double* data, const int i );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template const char&        tnlArrayOperations< tnlHost >::getArrayElementReference< char,        long int >( const char* data, const long int i );
 extern template const int&         tnlArrayOperations< tnlHost >::getArrayElementReference< int,         long int >( const int* data, const long int i );
 extern template const long int&    tnlArrayOperations< tnlHost >::getArrayElementReference< long int,    long int >( const long int* data, const long int i );
+#ifdef INSTANTIATE_FLOAT
 extern template const float&       tnlArrayOperations< tnlHost >::getArrayElementReference< float,       long int >( const float* data, const long int i );
+#endif
 extern template const double&      tnlArrayOperations< tnlHost >::getArrayElementReference< double,      long int >( const double* data, const long int i );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template const long double& tnlArrayOperations< tnlHost >::getArrayElementReference< long double, long int >( const long double* data, const long int i );
+#endif
+#endif
 
 extern template bool tnlArrayOperations< tnlHost >::copyMemory< char,                char, int >( char* destination, const char* source, const int size );
 extern template bool tnlArrayOperations< tnlHost >::copyMemory< int,                  int, int >( int* destination, const int* source, const int size );
 extern template bool tnlArrayOperations< tnlHost >::copyMemory< long int,        long int, int >( long int* destination, const long int* source, const int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost >::copyMemory< float,              float, int >( float* destination, const float* source, const int size );
+#endif
 extern template bool tnlArrayOperations< tnlHost >::copyMemory< double,            double, int >( double* destination, const double* source, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template bool tnlArrayOperations< tnlHost >::copyMemory< long double,  long double, int >( long double* destination, const long double* source, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template bool tnlArrayOperations< tnlHost >::copyMemory< char,                char, long int >( char* destination, const char* source, const long int size );
 extern template bool tnlArrayOperations< tnlHost >::copyMemory< int,                  int, long int >( int* destination, const int* source, const long int size );
 extern template bool tnlArrayOperations< tnlHost >::copyMemory< long int,        long int, long int >( long int* destination, const long int* source, const long int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost >::copyMemory< float,              float, long int >( float* destination, const float* source, const long int size );
+#endif
 extern template bool tnlArrayOperations< tnlHost >::copyMemory< double,            double, long int >( double* destination, const double* source, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template bool tnlArrayOperations< tnlHost >::copyMemory< long double,  long double, long int >( long double* destination, const long double* source, const long int size );
+#endif
+#endif
 
 extern template bool tnlArrayOperations< tnlHost >::compareMemory< char,               char, int >( const char* data1, const char* data2, const int size );
 extern template bool tnlArrayOperations< tnlHost >::compareMemory< int,                 int, int >( const int* data1, const int* data2, const int size );
 extern template bool tnlArrayOperations< tnlHost >::compareMemory< long int,       long int, int >( const long int* data1, const long int* data2, const int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost >::compareMemory< float,             float, int >( const float* data1, const float* data2, const int size );
+#endif
 extern template bool tnlArrayOperations< tnlHost >::compareMemory< double,           double, int >( const double* data1, const double* data2, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template bool tnlArrayOperations< tnlHost >::compareMemory< long double, long double, int >( const long double* data1, const long double* data2, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template bool tnlArrayOperations< tnlHost >::compareMemory< char,               char, long int >( const char* data1, const char* data2, const long int size );
 extern template bool tnlArrayOperations< tnlHost >::compareMemory< int,                 int, long int >( const int* data1, const int* data2, const long int size );
 extern template bool tnlArrayOperations< tnlHost >::compareMemory< long int,       long int, long int >( const long int* data1, const long int* data2, const long int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost >::compareMemory< float,             float, long int >( const float* data1, const float* data2, const long int size );
+#endif
 extern template bool tnlArrayOperations< tnlHost >::compareMemory< double,           double, long int >( const double* data1, const double* data2, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template bool tnlArrayOperations< tnlHost >::compareMemory< long double, long double, long int >( const long double* data1, const long double* data2, const long int size );
+#endif
+#endif
 
 extern template bool tnlArrayOperations< tnlHost >::setMemory< char,        int >( char* destination, const char& value, const int size );
 extern template bool tnlArrayOperations< tnlHost >::setMemory< int,         int >( int* destination, const int& value, const int size );
 extern template bool tnlArrayOperations< tnlHost >::setMemory< long int,    int >( long int* destination, const long int& value, const int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost >::setMemory< float,       int >( float* destination, const float& value, const int size );
+#endif
 extern template bool tnlArrayOperations< tnlHost >::setMemory< double,      int >( double* destination, const double& value, const int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template bool tnlArrayOperations< tnlHost >::setMemory< long double, int >( long double* destination, const long double& value, const int size );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template bool tnlArrayOperations< tnlHost >::setMemory< char,        long int >( char* destination, const char& value, const long int size );
 extern template bool tnlArrayOperations< tnlHost >::setMemory< int,         long int >( int* destination, const int& value, const long int size );
 extern template bool tnlArrayOperations< tnlHost >::setMemory< long int,    long int >( long int* destination, const long int& value, const long int size );
+#ifdef INSTANTIATE_FLOAT
 extern template bool tnlArrayOperations< tnlHost >::setMemory< float,       long int >( float* destination, const float& value, const long int size );
+#endif
 extern template bool tnlArrayOperations< tnlHost >::setMemory< double,      long int >( double* destination, const double& value, const long int size );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template bool tnlArrayOperations< tnlHost >::setMemory< long double, long int >( long double* destination, const long double& value, const long int size );
+#endif
+#endif
 
 #endif
 
diff --git a/src/core/arrays/tnlArray_impl.cpp b/src/core/arrays/tnlArray_impl.cpp
index 6a429605828f2408f2cac263a8dcba637079be8d..c021341d1618065fb9fdea67e551c8829a2c42ee 100644
--- a/src/core/arrays/tnlArray_impl.cpp
+++ b/src/core/arrays/tnlArray_impl.cpp
@@ -19,16 +19,44 @@
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlArray< float, tnlHost, int >;
+#endif
 template class tnlArray< double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlArray< long double, tnlHost, int >;
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlArray< float, tnlHost, long int >;
+#endif
 template class tnlArray< double, tnlHost, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlArray< long double, tnlHost, long int >;
+#endif
+#endif
 
 #ifndef HAVE_CUDA
+#ifdef INSTANTIATE_FLOAT
 template class tnlArray< float, tnlCuda, int >;
+#endif
 template class tnlArray< double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlArray< long double, tnlCuda, int >;
+#endif
+
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlArray< float, tnlCuda, long int >;
+#endif
 template class tnlArray< double, tnlCuda, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlArray< long double, tnlCuda, long int >;
+#endif
+#endif
+
 #endif
 
 #endif
diff --git a/src/core/arrays/tnlArray_impl.cu b/src/core/arrays/tnlArray_impl.cu
index 21a149024cc5ce0b9d65c739752139a0eaa31bb6..f6b42f22efcaa757fc27c80728fa379b46b35958 100644
--- a/src/core/arrays/tnlArray_impl.cu
+++ b/src/core/arrays/tnlArray_impl.cu
@@ -20,10 +20,25 @@
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
 #ifdef HAVE_CUDA
+#ifdef INSTANTIATE_FLOAT
 template class tnlArray< float, tnlCuda, int >;
+#endif
 template class tnlArray< double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlArray< long double, tnlCuda, int >;
+#endif
+
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlArray< float, tnlCuda, long int >;
+#endif
 template class tnlArray< double, tnlCuda, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlArray< long double, tnlCuda, long int >;
+#endif
+#endif
+
 #endif
 
 #endif
diff --git a/src/core/arrays/tnlArray_impl.h b/src/core/arrays/tnlArray_impl.h
index 11ec6f56f620af6b6190d27b366c891028bf547a..3655f886914843aa421983532d67cf3743f3a7db 100644
--- a/src/core/arrays/tnlArray_impl.h
+++ b/src/core/arrays/tnlArray_impl.h
@@ -145,9 +145,7 @@ void tnlArray< Element, Device, Index > :: reset()
 template< typename Element,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlArray< Element, Device, Index > :: getSize() const
 {
    return this -> size;
@@ -182,9 +180,7 @@ Element tnlArray< Element, Device, Index > :: getElement( Index i ) const
 template< typename Element,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Element& tnlArray< Element, Device, Index > :: operator[] ( Index i )
 {
    tnlAssert( 0 <= i && i < this -> getSize(),
@@ -198,9 +194,7 @@ Element& tnlArray< Element, Device, Index > :: operator[] ( Index i )
 template< typename Element,
            typename Device,
            typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Element& tnlArray< Element, Device, Index > :: operator[] ( Index i ) const
 {
    tnlAssert( 0 <= i && i < this -> getSize(),
@@ -295,9 +289,7 @@ void tnlArray< Element, Device, Index > :: setValue( const Element& e )
 template< typename Element,
            typename Device,
            typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Element* tnlArray< Element, Device, Index > :: getData() const
 {
    return this -> data;
@@ -306,9 +298,7 @@ const Element* tnlArray< Element, Device, Index > :: getData() const
 template< typename Element,
            typename Device,
            typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Element* tnlArray< Element, Device, Index > :: getData()
 {
    return this -> data;
@@ -434,16 +424,34 @@ ostream& operator << ( ostream& str, const tnlArray< Element, Device, Index >& v
 
 // TODO: this does not work with CUDA 5.5 - fix it later
 
-/*extern template class tnlArray< float, tnlHost, int >;
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlArray< float, tnlHost, int >;
+#endif
 extern template class tnlArray< double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlArray< long double, tnlHost, int >;
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlArray< float, tnlHost, long int >;
-extern template class tnlArray< double, tnlHost, long int >;*/
+#endif
+extern template class tnlArray< double, tnlHost, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlArray< long double, tnlHost, long int >;
+#endif
+#endif
 
 #ifdef HAVE_CUDA
-/*extern template class tnlArray< float, tnlCuda, int >;
-extern template class tnlArray< double, tnlCuda, int >;
-extern template class tnlArray< float, tnlCuda, long int >;
-extern template class tnlArray< double, tnlCuda, long int >;*/
+/*
+ #ifdef INSTANTIATE_FLOAT
+ extern template class tnlArray< float, tnlCuda, int >;
+ #endif
+ extern template class tnlArray< double, tnlCuda, int >;
+ #ifdef INSTANTIATE_FLOAT
+ extern template class tnlArray< float, tnlCuda, long int >;
+ #endif
+ extern template class tnlArray< double, tnlCuda, long int >;*/
 #endif
 
 #endif
diff --git a/src/core/arrays/tnlConstSharedArray.h b/src/core/arrays/tnlConstSharedArray.h
index bf14408b365ce74893be48a1b63fee1bddaad313..e107f9c4364a15204f330e1e3a46131d0e71372a 100644
--- a/src/core/arrays/tnlConstSharedArray.h
+++ b/src/core/arrays/tnlConstSharedArray.h
@@ -53,23 +53,19 @@ class tnlConstSharedArray : public tnlObject
               const Index _size );
 
    template< typename Array >
-   void bind( const Array& array );
+   void bind( const Array& array,
+              IndexType index = 0,
+              IndexType size = 0 );
 
    void swap( tnlConstSharedArray< Element, Device, Index >& array );
 
    void reset();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Index getSize() const;
+   __cuda_callable__ Index getSize() const;
 
    Element getElement( Index i ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const Element& operator[] ( Index i ) const;
+   __cuda_callable__ const Element& operator[] ( Index i ) const;
 
    tnlConstSharedArray< Element, Device, Index >& operator = ( const tnlConstSharedArray< Element, Device, Index >& array );
 
@@ -82,10 +78,7 @@ class tnlConstSharedArray : public tnlObject
    template< typename Array >
    bool operator != ( const Array& array ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const Element* getData() const;
+   __cuda_callable__ const Element* getData() const;
 
    /****
     * Returns true if non-zero size is set.
diff --git a/src/core/arrays/tnlConstSharedArray_impl.h b/src/core/arrays/tnlConstSharedArray_impl.h
index 4a541474a753728419cfe6b185b5bc7339e3fc6a..49b525a77fe014faa06df9dd9085bbad41e4b75b 100644
--- a/src/core/arrays/tnlConstSharedArray_impl.h
+++ b/src/core/arrays/tnlConstSharedArray_impl.h
@@ -92,10 +92,19 @@ template< typename Element,
           typename Device,
           typename Index >
    template< typename Array >
-void tnlConstSharedArray< Element, Device, Index > :: bind( const Array& array )
+void tnlConstSharedArray< Element, Device, Index > :: bind( const Array& array,
+                                                            IndexType index,
+                                                            IndexType size )
 {
-   this -> size = array. getSize();
-   this -> data = array. getData();
+   // TODO: This does not work for static arrays.
+   //tnlStaticAssert( Array::DeviceType::DeviceType == DeviceType::DeviceType,
+   //                 "Attempt to bind arrays between different devices." );
+   this->data = &( array. getData()[ index ] );
+   if( ! size )
+      this->size = array. getSize();
+   else
+      this->size = size;
+   
 };
 
 template< typename Element,
@@ -119,9 +128,7 @@ void tnlConstSharedArray< Element, Device, Index > :: reset()
 template< typename Element,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlConstSharedArray< Element, Device, Index > :: getSize() const
 {
    return this -> size;
@@ -143,9 +150,7 @@ Element tnlConstSharedArray< Element, Device, Index > :: getElement( Index i ) c
 template< typename Element,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Element& tnlConstSharedArray< Element, Device, Index > :: operator[] ( Index i ) const
 {
    tnlAssert( 0 <= i && i < this -> getSize(),
@@ -281,16 +286,43 @@ ostream& operator << ( ostream& str, const tnlConstSharedArray< Element, Device,
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlConstSharedArray< float, tnlHost, int >;
+#endif
 extern template class tnlConstSharedArray< double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlConstSharedArray< long double, tnlHost, int >;
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlConstSharedArray< float, tnlHost, long int >;
+#endif
 extern template class tnlConstSharedArray< double, tnlHost, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlConstSharedArray< long double, tnlHost, long int >;
+#endif
+#endif
 
 #ifdef HAVE_CUDA
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlConstSharedArray< float, tnlCuda, int >;
+#endif
 extern template class tnlConstSharedArray< double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlConstSharedArray< long double, tnlCuda, int >;
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlConstSharedArray< float, tnlCuda, long int >;
+#endif
 extern template class tnlConstSharedArray< double, tnlCuda, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlConstSharedArray< long double, tnlCuda, long int >;
+#endif
+
+#endif
 #endif
 
 #endif
diff --git a/src/core/arrays/tnlMultiArray.h b/src/core/arrays/tnlMultiArray.h
index 815102e5d0cfbb400e39c38c35a4287cc8cc62b1..eb24d7afa552628755684120d775a35ea0b89987 100644
--- a/src/core/arrays/tnlMultiArray.h
+++ b/src/core/arrays/tnlMultiArray.h
@@ -41,9 +41,6 @@ class tnlMultiArray< 1, Element, Device, Index > : public tnlArray< Element, Dev
    typedef tnlMultiArray< 1, Element, tnlCuda, Index > CudaType;
 
 
-#ifdef HAVE_CUDA
-   //__device__ __host__
-#endif
    tnlMultiArray();
 
    tnlMultiArray( const tnlString& name );
@@ -60,15 +57,9 @@ class tnlMultiArray< 1, Element, Device, Index > : public tnlArray< Element, Dev
 
    bool setDimensions( const tnlStaticVector< 1, Index >& dimensions );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   void getDimensions( Index& iSize ) const;
+   __cuda_callable__ void getDimensions( Index& iSize ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const tnlStaticVector< 1, Index >& getDimensions() const;
+   __cuda_callable__ const tnlStaticVector< 1, Index >& getDimensions() const;
 
    //! Set dimensions of the array using another array as a template
    template< typename MultiArray >
@@ -76,10 +67,7 @@ class tnlMultiArray< 1, Element, Device, Index > : public tnlArray< Element, Dev
    
    void reset();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Index getElementIndex( const Index i ) const;
+   __cuda_callable__ Index getElementIndex( const Index i ) const;
 
    void setElement( const Index i, Element value );
 
@@ -91,15 +79,9 @@ class tnlMultiArray< 1, Element, Device, Index > : public tnlArray< Element, Dev
    Element getElement( const Index i ) const;
 
    //! Operator for accessing elements of the array.
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Element& operator()( const Index i );
+   __cuda_callable__ Element& operator()( const Index i );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const Element& operator()( const Index i ) const;
+   __cuda_callable__ const Element& operator()( const Index i ) const;
 
 
    template< typename MultiArray >
@@ -140,9 +122,6 @@ class tnlMultiArray< 2, Element, Device, Index > : public tnlArray< Element, Dev
    typedef tnlMultiArray< 2, Element, tnlCuda, Index > CudaType;
 
 
-#ifdef HAVE_CUDA
-   //__device__ __host__
-#endif
    tnlMultiArray();
 
    tnlMultiArray( const tnlString& name );
@@ -159,15 +138,9 @@ class tnlMultiArray< 2, Element, Device, Index > : public tnlArray< Element, Dev
 
    bool setDimensions( const tnlStaticVector< 2, Index >& dimensions );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   void getDimensions( Index& jSize, Index& iSize ) const;
+   __cuda_callable__ void getDimensions( Index& jSize, Index& iSize ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const tnlStaticVector< 2, Index >& getDimensions() const;
+   __cuda_callable__ const tnlStaticVector< 2, Index >& getDimensions() const;
 
    //! Set dimensions of the array using another array as a template
    template< typename MultiArray >
@@ -175,10 +148,7 @@ class tnlMultiArray< 2, Element, Device, Index > : public tnlArray< Element, Dev
 
    void reset();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Index getElementIndex( const Index j, const Index i ) const;
+   __cuda_callable__ Index getElementIndex( const Index j, const Index i ) const;
 
    void setElement( const Index j, const Index i, Element value );
 
@@ -194,15 +164,9 @@ class tnlMultiArray< 2, Element, Device, Index > : public tnlArray< Element, Dev
     *  used to access elements of arrays in different address space
     *  (GPU device usually).
     */
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Element& operator()( const Index j, const Index i );
+   __cuda_callable__ Element& operator()( const Index j, const Index i );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const Element& operator()( const Index j, const Index i ) const;
+   __cuda_callable__ const Element& operator()( const Index j, const Index i ) const;
 
    template< typename MultiArray >
    bool operator == ( const MultiArray& array ) const;
@@ -243,9 +207,6 @@ class tnlMultiArray< 3, Element, Device, Index > : public tnlArray< Element, Dev
    typedef tnlMultiArray< 3, Element, tnlCuda, Index > CudaType;
 
 
-#ifdef HAVE_CUDA
-   //__device__ __host__
-#endif
    tnlMultiArray();
 
    tnlMultiArray( const tnlString& name );
@@ -262,15 +223,9 @@ class tnlMultiArray< 3, Element, Device, Index > : public tnlArray< Element, Dev
 
    bool setDimensions( const tnlStaticVector< 3, Index >& dimensions );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   void getDimensions( Index& k, Index& j, Index& iSize ) const;
+   __cuda_callable__ void getDimensions( Index& k, Index& j, Index& iSize ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const tnlStaticVector< 3, Index >& getDimensions() const;
+   __cuda_callable__ const tnlStaticVector< 3, Index >& getDimensions() const;
 
    //! Set dimensions of the array using another array as a template
    template< typename MultiArray >
@@ -278,10 +233,7 @@ class tnlMultiArray< 3, Element, Device, Index > : public tnlArray< Element, Dev
 
    void reset();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Index getElementIndex( const Index k, const Index j, const Index i ) const;
+   __cuda_callable__ Index getElementIndex( const Index k, const Index j, const Index i ) const;
 
    void setElement( const Index k, const Index j, const Index i, Element value );
 
@@ -297,15 +249,9 @@ class tnlMultiArray< 3, Element, Device, Index > : public tnlArray< Element, Dev
     *  used to access elements of arrays in different adress space
     *  (GPU device usualy).
     */
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Element& operator()( const Index k, const Index j, const Index i );
+   __cuda_callable__ Element& operator()( const Index k, const Index j, const Index i );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const Element& operator()( const Index k, const Index j, const Index i ) const;
+   __cuda_callable__ const Element& operator()( const Index k, const Index j, const Index i ) const;
 
    template< typename MultiArray >
    bool operator == ( const MultiArray& array ) const;
@@ -346,9 +292,6 @@ class tnlMultiArray< 4, Element, Device, Index > : public tnlArray< Element, Dev
    typedef tnlMultiArray< 4, Element, tnlCuda, Index > CudaType;
 
 
-#ifdef HAVE_CUDA
-   //__device__ __host__
-#endif
    tnlMultiArray();
 
    tnlMultiArray( const tnlString& name );
@@ -365,15 +308,9 @@ class tnlMultiArray< 4, Element, Device, Index > : public tnlArray< Element, Dev
 
    bool setDimensions( const tnlStaticVector< 4, Index >& dimensions );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   void getDimensions( Index& l, Index& k, Index& j, Index& iSize ) const;
+   __cuda_callable__ void getDimensions( Index& l, Index& k, Index& j, Index& iSize ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const tnlStaticVector< 4, Index >& getDimensions() const;
+   __cuda_callable__ const tnlStaticVector< 4, Index >& getDimensions() const;
 
    //! Set dimensions of the array using another array as a template
    template< typename MultiArray >
@@ -381,10 +318,7 @@ class tnlMultiArray< 4, Element, Device, Index > : public tnlArray< Element, Dev
 
    void reset();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Index getElementIndex( const Index l, const Index k, const Index j, const Index i ) const;
+   __cuda_callable__ Index getElementIndex( const Index l, const Index k, const Index j, const Index i ) const;
 
    void setElement( const Index l, const Index k, const Index j, const Index i, Element value );
 
@@ -400,15 +334,9 @@ class tnlMultiArray< 4, Element, Device, Index > : public tnlArray< Element, Dev
     *  used to access elements of arrays in different adress space
     *  (GPU device usualy).
     */
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Element& operator()( const Index l, const Index k, const Index j, const Index i );
+   __cuda_callable__ Element& operator()( const Index l, const Index k, const Index j, const Index i );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const Element& operator()( const Index l, const Index k, const Index j, const Index i ) const;
+   __cuda_callable__ const Element& operator()( const Index l, const Index k, const Index j, const Index i ) const;
 
    template< typename MultiArray >
    bool operator == ( const MultiArray& array ) const;
@@ -456,39 +384,83 @@ ostream& operator << ( ostream& str, const tnlMultiArray< 4, Element, device, In
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 1, float,  tnlHost, int >;
+#endif
 extern template class tnlMultiArray< 1, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 1, float,  tnlHost, long int >;
+#endif
 extern template class tnlMultiArray< 1, double, tnlHost, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 2, float,  tnlHost, int >;
+#endif
 extern template class tnlMultiArray< 2, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 2, float,  tnlHost, long int >;
+#endif
 extern template class tnlMultiArray< 2, double, tnlHost, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 3, float,  tnlHost, int >;
+#endif
 extern template class tnlMultiArray< 3, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 3, float,  tnlHost, long int >;
+#endif
 extern template class tnlMultiArray< 3, double, tnlHost, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 4, float,  tnlHost, int >;
+#endif
 extern template class tnlMultiArray< 4, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 4, float,  tnlHost, long int >;
+#endif
 extern template class tnlMultiArray< 4, double, tnlHost, long int >;
+#endif
 
 // TODO: There are problems with nvlink - it might be better in later versions
-/*extern template class tnlMultiArray< 1, float,  tnlCuda, int >;
+/*
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiArray< 1, float,  tnlCuda, int >;
+#endif
 extern template class tnlMultiArray< 1, double, tnlCuda, int >;
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 1, float,  tnlCuda, long int >;
+#endif
 extern template class tnlMultiArray< 1, double, tnlCuda, long int >;
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 2, float,  tnlCuda, int >;
+#endif
 extern template class tnlMultiArray< 2, double, tnlCuda, int >;
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 2, float,  tnlCuda, long int >;
+#endif
 extern template class tnlMultiArray< 2, double, tnlCuda, long int >;
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 3, float,  tnlCuda, int >;
+#endif
 extern template class tnlMultiArray< 3, double, tnlCuda, int >;
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 3, float,  tnlCuda, long int >;
+#endif
 extern template class tnlMultiArray< 3, double, tnlCuda, long int >;
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 4, float,  tnlCuda, int >;
+#endif
 extern template class tnlMultiArray< 4, double, tnlCuda, int >;
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlMultiArray< 4, float,  tnlCuda, long int >;
+#endif
 extern template class tnlMultiArray< 4, double, tnlCuda, long int >;*/
 
 #endif
diff --git a/src/core/arrays/tnlMultiArray1D_impl.h b/src/core/arrays/tnlMultiArray1D_impl.h
index 7d5afb4edd05481a45d9c9571e50306bbe2c3b5e..b45aabdedb0bf4b5f5d8cb72a2d39070597cd98b 100644
--- a/src/core/arrays/tnlMultiArray1D_impl.h
+++ b/src/core/arrays/tnlMultiArray1D_impl.h
@@ -19,9 +19,6 @@
 #define TNLMULTIARRAY1D_IMPL_H_
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   //__device__ __host__
-#endif
 tnlMultiArray< 1, Element, Device, Index > :: tnlMultiArray()
 {
 }
@@ -103,27 +100,21 @@ void tnlMultiArray< 1, Element, Device, Index >::reset()
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlMultiArray< 1, Element, Device, Index > :: getDimensions( Index& xSize ) const
 {
    xSize = this -> dimensions[ 0 ];
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const tnlStaticVector< 1, Index >& tnlMultiArray< 1, Element, Device, Index > :: getDimensions() const
 {
    return this -> dimensions;
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlMultiArray< 1, Element, Device, Index > :: getElementIndex( const Index i ) const
 {
    tnlAssert( i >= 0 && i < this -> dimensions[ 0 ],
@@ -144,18 +135,14 @@ void tnlMultiArray< 1, Element, Device, Index > :: setElement( const Index i, El
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Element& tnlMultiArray< 1, Element, Device, Index > :: operator()( const Index element )
 {
    return tnlArray< Element, Device, Index > :: operator[]( getElementIndex( element ) );
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Element& tnlMultiArray< 1, Element, Device, Index > :: operator()( const Index element ) const
 {
    return tnlArray< Element, Device, Index > :: operator[]( getElementIndex( element ) );
diff --git a/src/core/arrays/tnlMultiArray2D_impl.h b/src/core/arrays/tnlMultiArray2D_impl.h
index 15c1aacd25ceecd0b4715db6f94ec80428e2c0d9..cc10ee13d4ccb96750a2f63df0e02932bbc2c27e 100644
--- a/src/core/arrays/tnlMultiArray2D_impl.h
+++ b/src/core/arrays/tnlMultiArray2D_impl.h
@@ -21,9 +21,6 @@
 
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   //__device__ __host__
-#endif
 tnlMultiArray< 2, Element, Device, Index > :: tnlMultiArray()
 {
 }
@@ -114,9 +111,7 @@ void tnlMultiArray< 2, Element, Device, Index >::reset()
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlMultiArray< 2, Element, Device, Index > :: getDimensions( Index& jSize, Index& iSize ) const
 {
    iSize = this -> dimensions[ 0 ];
@@ -124,18 +119,14 @@ void tnlMultiArray< 2, Element, Device, Index > :: getDimensions( Index& jSize,
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const tnlStaticVector< 2, Index >& tnlMultiArray< 2, Element, Device, Index > :: getDimensions() const
 {
    return this -> dimensions;
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlMultiArray< 2, Element, Device, Index > :: getElementIndex( const Index j, const Index i ) const
 {
    tnlAssert( i >= 0 && i < this -> dimensions[ 0 ] && j >= 0 && j < this -> dimensions[ 1 ],
@@ -157,18 +148,14 @@ void tnlMultiArray< 2, Element, Device, Index > :: setElement( const Index j, co
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Element& tnlMultiArray< 2, Element, Device, Index > :: operator()( const Index j, const Index i )
 {
    return tnlArray< Element, Device, Index > :: operator[]( getElementIndex( j, i ) );
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Element& tnlMultiArray< 2, Element, Device, Index > :: operator()( const Index j, const Index i ) const
 {
    return tnlArray< Element, Device, Index > :: operator[]( getElementIndex( j, i ) );
diff --git a/src/core/arrays/tnlMultiArray3D_impl.h b/src/core/arrays/tnlMultiArray3D_impl.h
index 2fa78ff7c1f9478b73e81adba2a20c984bc3fbb9..3cd8dfbd237ba327d54d77d9359041320b95ead0 100644
--- a/src/core/arrays/tnlMultiArray3D_impl.h
+++ b/src/core/arrays/tnlMultiArray3D_impl.h
@@ -21,9 +21,6 @@
 
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   //__device__ __host__
-#endif
 tnlMultiArray< 3, Element, Device, Index > :: tnlMultiArray()
 {
 }
@@ -119,9 +116,7 @@ void tnlMultiArray< 3, Element, Device, Index >::reset()
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlMultiArray< 3, Element, Device, Index > :: getDimensions( Index& kSize,
                                                                   Index& jSize,
                                                                   Index& iSize ) const
@@ -132,18 +127,14 @@ void tnlMultiArray< 3, Element, Device, Index > :: getDimensions( Index& kSize,
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const tnlStaticVector< 3, Index >& tnlMultiArray< 3, Element, Device, Index > :: getDimensions() const
 {
    return this -> dimensions;
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlMultiArray< 3, Element, Device, Index > :: getElementIndex( const Index k,
                                                                      const Index j,
                                                                      const Index i ) const
@@ -176,9 +167,7 @@ void tnlMultiArray< 3, Element, Device, Index > :: setElement( const Index k,
 
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Element& tnlMultiArray< 3, Element, Device, Index > :: operator()( const Index k,
                                                                         const Index j,
                                                                         const Index i )
@@ -187,9 +176,7 @@ Element& tnlMultiArray< 3, Element, Device, Index > :: operator()( const Index k
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Element& tnlMultiArray< 3, Element, Device, Index > :: operator()( const Index k,
                                                                                const Index j,
                                                                                const Index i ) const
diff --git a/src/core/arrays/tnlMultiArray4D_impl.h b/src/core/arrays/tnlMultiArray4D_impl.h
index a309545b613e46136088922a001433cb8312aa13..a2ebfcd21f709f8da7ad77d1ec96d3ee54f03a06 100644
--- a/src/core/arrays/tnlMultiArray4D_impl.h
+++ b/src/core/arrays/tnlMultiArray4D_impl.h
@@ -21,9 +21,6 @@
 
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   //__device__ __host__
-#endif
 tnlMultiArray< 4, Element, Device, Index > :: tnlMultiArray()
 {
 }
@@ -124,9 +121,7 @@ void tnlMultiArray< 4, Element, Device, Index >::reset()
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlMultiArray< 4, Element, Device, Index > :: getDimensions( Index& lSize,
                                                                        Index& kSize,
                                                                        Index& jSize,
@@ -139,18 +134,14 @@ void tnlMultiArray< 4, Element, Device, Index > :: getDimensions( Index& lSize,
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const tnlStaticVector< 4, Index >& tnlMultiArray< 4, Element, Device, Index > :: getDimensions() const
 {
    return this -> dimensions;
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlMultiArray< 4, Element, Device, Index > :: getElementIndex( const Index l,
                                                                      const Index k,
                                                                      const Index j,
@@ -188,9 +179,7 @@ void tnlMultiArray< 4, Element, Device, Index > :: setElement( const Index l,
 
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Element& tnlMultiArray< 4, Element, Device, Index > :: operator()( const Index l,
                                                                         const Index k,
                                                                         const Index j,
@@ -200,9 +189,7 @@ Element& tnlMultiArray< 4, Element, Device, Index > :: operator()( const Index l
 }
 
 template< typename Element, typename Device, typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Element& tnlMultiArray< 4, Element, Device, Index > :: operator()( const Index l,
                                                                                const Index k,
                                                                                const Index j,
diff --git a/src/core/arrays/tnlMultiArray_impl.cpp b/src/core/arrays/tnlMultiArray_impl.cpp
index 8d6efafd088aca0c75525dab886a763a0cca5b9a..39695db81e0eb208c56811beb023aa250ac855f6 100644
--- a/src/core/arrays/tnlMultiArray_impl.cpp
+++ b/src/core/arrays/tnlMultiArray_impl.cpp
@@ -19,41 +19,95 @@
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 1, float,  tnlHost, int >;
+#endif
 template class tnlMultiArray< 1, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 1, float,  tnlHost, long int >;
+#endif
 template class tnlMultiArray< 1, double, tnlHost, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 2, float,  tnlHost, int >;
+#endif
 template class tnlMultiArray< 2, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 2, float,  tnlHost, long int >;
+#endif
 template class tnlMultiArray< 2, double, tnlHost, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 3, float,  tnlHost, int >;
+#endif
 template class tnlMultiArray< 3, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 3, float,  tnlHost, long int >;
+#endif
 template class tnlMultiArray< 3, double, tnlHost, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 4, float,  tnlHost, int >;
+#endif
 template class tnlMultiArray< 4, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 4, float,  tnlHost, long int >;
+#endif
 template class tnlMultiArray< 4, double, tnlHost, long int >;
+#endif
 
 #ifndef HAVE_CUDA
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 1, float,  tnlCuda, int >;
+#endif
 template class tnlMultiArray< 1, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 1, float,  tnlCuda, long int >;
+#endif
 template class tnlMultiArray< 1, double, tnlCuda, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 2, float,  tnlCuda, int >;
+#endif
 template class tnlMultiArray< 2, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 2, float,  tnlCuda, long int >;
+#endif
 template class tnlMultiArray< 2, double, tnlCuda, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 3, float,  tnlCuda, int >;
+#endif
 template class tnlMultiArray< 3, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 3, float,  tnlCuda, long int >;
+#endif
 template class tnlMultiArray< 3, double, tnlCuda, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 4, float,  tnlCuda, int >;
+#endif
 template class tnlMultiArray< 4, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 4, float,  tnlCuda, long int >;
+#endif
 template class tnlMultiArray< 4, double, tnlCuda, long int >;
+#endif
 
 #endif
 
diff --git a/src/core/arrays/tnlMultiArray_impl.cu b/src/core/arrays/tnlMultiArray_impl.cu
index a168d7111a98bc71a565b73493f69b562c64c9cf..a0ac41bde5a89f13589a8dada3506e5ace09706a 100644
--- a/src/core/arrays/tnlMultiArray_impl.cu
+++ b/src/core/arrays/tnlMultiArray_impl.cu
@@ -20,22 +20,49 @@
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
 #ifdef HAVE_CUDA
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 1, float,  tnlCuda, int >;
+#endif
 template class tnlMultiArray< 1, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 1, float,  tnlCuda, long int >;
+#endif
 template class tnlMultiArray< 1, double, tnlCuda, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 2, float,  tnlCuda, int >;
+#endif
 template class tnlMultiArray< 2, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 2, float,  tnlCuda, long int >;
+#endif
 template class tnlMultiArray< 2, double, tnlCuda, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 3, float,  tnlCuda, int >;
+#endif
 template class tnlMultiArray< 3, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 3, float,  tnlCuda, long int >;
+#endif
 template class tnlMultiArray< 3, double, tnlCuda, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 4, float,  tnlCuda, int >;
+#endif
 template class tnlMultiArray< 4, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiArray< 4, float,  tnlCuda, long int >;
+#endif
 template class tnlMultiArray< 4, double, tnlCuda, long int >;
+#endif
 
 #endif
 
diff --git a/src/core/arrays/tnlSharedArray.h b/src/core/arrays/tnlSharedArray.h
index e4a4b0104ac8fc4ad8b265d469aad56a2c9c9c7c..0787b1c30594a5ebb4e711412a406b87d6fe685c 100644
--- a/src/core/arrays/tnlSharedArray.h
+++ b/src/core/arrays/tnlSharedArray.h
@@ -19,6 +19,7 @@
 #define TNLSHAREDARRAY_H_
 
 #include <core/tnlObject.h>
+#include <core/tnlCuda.h>
 
 class tnlFile;
 class tnlHost;
@@ -63,7 +64,10 @@ class tnlSharedArray : public tnlObject
    void bind( Element* _data,
               const Index _size );
 
-   void bind( tnlArray< Element, Device, Index >& array );
+   template< typename Array >
+   void bind( Array& array,
+              IndexType index = 0,
+              IndexType size = 0 );
 
    template< int Size >
    void bind( tnlStaticArray< Size, Element >& array );
@@ -74,24 +78,15 @@ class tnlSharedArray : public tnlObject
 
    void reset();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Index getSize() const;
+   __cuda_callable__ Index getSize() const;
 
    void setElement( const Index i, const Element& x );
 
    Element getElement( Index i ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Element& operator[] ( Index i );
+   __cuda_callable__ Element& operator[] ( Index i );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const Element& operator[] ( Index i ) const;
+   __cuda_callable__ const Element& operator[] ( Index i ) const;
 
    tnlSharedArray< Element, Device, Index >& operator = ( const tnlSharedArray< Element, Device, Index >& array );
 
@@ -106,15 +101,9 @@ class tnlSharedArray : public tnlObject
 
    void setValue( const Element& e );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   const Element* getData() const;
+   __cuda_callable__ const Element* getData() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Element* getData();
+   __cuda_callable__ Element* getData();
 
    /*!
     * Returns true if non-zero size is set.
diff --git a/src/core/arrays/tnlSharedArray_impl.cpp b/src/core/arrays/tnlSharedArray_impl.cpp
index 15788dd5d87d245167e5972199592e6b58d234ad..b0c92f0cfeafc61c74d40e5fe66175ad35e1dd9c 100644
--- a/src/core/arrays/tnlSharedArray_impl.cpp
+++ b/src/core/arrays/tnlSharedArray_impl.cpp
@@ -18,17 +18,43 @@
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlSharedArray< float, tnlHost, int >;
+#endif
 template class tnlSharedArray< double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlSharedArray< long double, tnlHost, int >;
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlSharedArray< float, tnlHost, long int >;
+#endif
 template class tnlSharedArray< double, tnlHost, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlSharedArray< long double, tnlHost, long int >;
+#endif
+#endif
 
-#ifdef HAVE_CUDA
+/*#ifdef HAVE_CUDA
+#ifdef INSTANTIATE_FLOAT
 template class tnlSharedArray< float, tnlCuda, int >;
+#endif
 template class tnlSharedArray< double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlSharedArray< long double, tnlCuda, int >;
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlSharedArray< float, tnlCuda, long int >;
+#endif
 template class tnlSharedArray< double, tnlCuda, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlSharedArray< long double, tnlCuda, long int >;
+#endif
 #endif
+#endif*/
 
 #endif
 
diff --git a/src/core/arrays/tnlSharedArray_impl.cu b/src/core/arrays/tnlSharedArray_impl.cu
index 51d2eae92b17f2e597ba3754df9264389ad7c634..010e3ce01bb209093a4056acf182f4f422d3c5c1 100644
--- a/src/core/arrays/tnlSharedArray_impl.cu
+++ b/src/core/arrays/tnlSharedArray_impl.cu
@@ -20,10 +20,23 @@
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
 #ifdef HAVE_CUDA
+#ifdef INSTANTIATE_FLOAT
 template class tnlSharedArray< float, tnlCuda, int >;
+#endif
 template class tnlSharedArray< double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlSharedArray< long double, tnlCuda, int >;
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlSharedArray< float, tnlCuda, long int >;
+#endif
 template class tnlSharedArray< double, tnlCuda, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlSharedArray< long double, tnlCuda, long int >;
+#endif
+#endif
 #endif
 
 #endif
\ No newline at end of file
diff --git a/src/core/arrays/tnlSharedArray_impl.h b/src/core/arrays/tnlSharedArray_impl.h
index 2b298d15ba9d856bff6369e8391f7af48a9bde5b..c58802aef78569d31543cfb22f2028abea756347 100644
--- a/src/core/arrays/tnlSharedArray_impl.h
+++ b/src/core/arrays/tnlSharedArray_impl.h
@@ -117,10 +117,19 @@ void tnlSharedArray< Element, Device, Index > :: bind( Element* data,
 template< typename Element,
           typename Device,
           typename Index >
-void tnlSharedArray< Element, Device, Index > :: bind( tnlArray< Element, Device, Index >& array )
-{
-   this->size = array. getSize();
-   this->data = array. getData();
+   template< typename Array >
+void tnlSharedArray< Element, Device, Index > :: bind( Array& array,
+                                                       IndexType index,
+                                                       IndexType size )
+{
+   tnlStaticAssert( Array::DeviceType::DeviceType == DeviceType::DeviceType,
+                    "Attempt to bind arrays between different devices." );
+   this->data = &( array. getData()[ index ] );
+   if( ! size )
+      this->size = array. getSize();
+   else
+      this->size = size;
+   
 };
 
 template< typename Element,
@@ -163,9 +172,7 @@ void tnlSharedArray< Element, Device, Index > :: reset()
 template< typename Element,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlSharedArray< Element, Device, Index > :: getSize() const
 {
    return this -> size;
@@ -200,9 +207,7 @@ Element tnlSharedArray< Element, Device, Index > :: getElement( Index i ) const
 template< typename Element,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Element& tnlSharedArray< Element, Device, Index > :: operator[] ( Index i )
 {
    tnlAssert( 0 <= i && i < this -> getSize(),
@@ -216,9 +221,7 @@ Element& tnlSharedArray< Element, Device, Index > :: operator[] ( Index i )
 template< typename Element,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Element& tnlSharedArray< Element, Device, Index > :: operator[] ( Index i ) const
 {
    tnlAssert( 0 <= i && i < this -> getSize(),
@@ -441,15 +444,34 @@ ostream& operator << ( ostream& str, const tnlSharedArray< Element, Device, Inde
 
 // TODO: this does not work with CUDA 5.5 - fix it later
 
-/*extern template class tnlSharedArray< float, tnlHost, int >;
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlSharedArray< float, tnlHost, int >;
+#endif
 extern template class tnlSharedArray< double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlSharedArray< long double, tnlHost, int >;
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlSharedArray< float, tnlHost, long int >;
-extern template class tnlSharedArray< double, tnlHost, long int >;*/
+#endif
+extern template class tnlSharedArray< double, tnlHost, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlSharedArray< long double, tnlHost, long int >;
+#endif
+#endif
+
 
 #ifdef HAVE_CUDA
-/*extern template class tnlSharedArray< float, tnlCuda, int >;
+/*
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlSharedArray< float, tnlCuda, int >;
+#endif
 extern template class tnlSharedArray< double, tnlCuda, int >;
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlSharedArray< float, tnlCuda, long int >;
+#endif
 extern template class tnlSharedArray< double, tnlCuda, long int >;*/
 #endif
 
diff --git a/src/core/arrays/tnlStaticArray1D_impl.h b/src/core/arrays/tnlStaticArray1D_impl.h
index 37150c4c4d0fc0d6556861d2618e42e60ae201cc..170488b655ba13fe968628f860d81ba838b1ed6b 100644
--- a/src/core/arrays/tnlStaticArray1D_impl.h
+++ b/src/core/arrays/tnlStaticArray1D_impl.h
@@ -209,10 +209,16 @@ void tnlStaticArray< 1, Element >::sort()
 #ifndef HAVE_CUDA
 extern template class tnlStaticArray< 1, char >;
 extern template class tnlStaticArray< 1, int >;
+#ifdef INSTANTIATE_LONG_INT
 extern template class tnlStaticArray< 1, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlStaticArray< 1, float >;
+#endif
 extern template class tnlStaticArray< 1, double >;
-//extern template class tnlStaticArray< 1, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlStaticArray< 1, long double >;
+#endif
 #endif
 
 #endif
diff --git a/src/core/arrays/tnlStaticArray2D_impl.h b/src/core/arrays/tnlStaticArray2D_impl.h
index 0f7d7b26d6782924e4fd1b126a2a7c925c078f5a..3e3cd35820ec1c83a86a4ded314d5bd10c5e28f4 100644
--- a/src/core/arrays/tnlStaticArray2D_impl.h
+++ b/src/core/arrays/tnlStaticArray2D_impl.h
@@ -246,10 +246,16 @@ void tnlStaticArray< 2, Element >::sort()
 #ifndef HAVE_CUDA
 extern template class tnlStaticArray< 2, char >;
 extern template class tnlStaticArray< 2, int >;
+#ifdef INSTANTIATE_LONG_INT
 extern template class tnlStaticArray< 2, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlStaticArray< 2, float >;
+#endif
 extern template class tnlStaticArray< 2, double >;
-//extern template class tnlStaticArray< 2, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlStaticArray< 2, long double >;
+#endif
 #endif
 
 #endif
diff --git a/src/core/arrays/tnlStaticArray3D_impl.h b/src/core/arrays/tnlStaticArray3D_impl.h
index 74254d4c76b838df0892614eb4a33c358da49668..5b9755f0281d614177db0e781d2986c57c52030f 100644
--- a/src/core/arrays/tnlStaticArray3D_impl.h
+++ b/src/core/arrays/tnlStaticArray3D_impl.h
@@ -277,10 +277,16 @@ void tnlStaticArray< 3, Element >::sort()
 #ifndef HAVE_CUDA
 extern template class tnlStaticArray< 3, char >;
 extern template class tnlStaticArray< 3, int >;
+#ifdef INSTANTIATE_LONG_INT
 extern template class tnlStaticArray< 3, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlStaticArray< 3, float >;
+#endif
 extern template class tnlStaticArray< 3, double >;
-//extern template class tnlStaticArray< 3, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlStaticArray< 3, long double >;
+#endif
 #endif
 
 #endif
diff --git a/src/core/arrays/tnlStaticArray_impl.cpp b/src/core/arrays/tnlStaticArray_impl.cpp
index 4d9b09efbc0044db090137d712474aa37cf317c8..f16eb80a4eb4bd3dcf929259df2076eadb798e0f 100644
--- a/src/core/arrays/tnlStaticArray_impl.cpp
+++ b/src/core/arrays/tnlStaticArray_impl.cpp
@@ -22,31 +22,55 @@
 
 template class tnlStaticArray< 1, char >;
 template class tnlStaticArray< 1, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlStaticArray< 1, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticArray< 1, float >;
+#endif
 template class tnlStaticArray< 1, double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
 template class tnlStaticArray< 1, long double >;
+#endif
 
 template class tnlStaticArray< 2, char >;
 template class tnlStaticArray< 2, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlStaticArray< 2, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticArray< 2, float >;
+#endif
 template class tnlStaticArray< 2, double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
 template class tnlStaticArray< 2, long double >;
+#endif
 
 template class tnlStaticArray< 3, char >;
 template class tnlStaticArray< 3, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlStaticArray< 3, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticArray< 3, float >;
+#endif
 template class tnlStaticArray< 3, double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
 template class tnlStaticArray< 3, long double >;
+#endif
 
 template class tnlStaticArray< 4, char >;
 template class tnlStaticArray< 4, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlStaticArray< 4, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticArray< 4, float >;
+#endif
 template class tnlStaticArray< 4, double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
 template class tnlStaticArray< 4, long double >;
+#endif
 
 #endif
 #endif
diff --git a/src/core/arrays/tnlStaticArray_impl.cu b/src/core/arrays/tnlStaticArray_impl.cu
index 3cb51866dbc842157d6a1e8d1139e9f0481c16b6..5ae563aab4ed8cb418f1b49e05ef5b5f9bee3d20 100644
--- a/src/core/arrays/tnlStaticArray_impl.cu
+++ b/src/core/arrays/tnlStaticArray_impl.cu
@@ -22,31 +22,55 @@
 
 template class tnlStaticArray< 1, char >;
 template class tnlStaticArray< 1, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlStaticArray< 1, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticArray< 1, float >;
+#endif
 template class tnlStaticArray< 1, double >;
-//template class tnlStaticArray< 1, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlStaticArray< 1, long double >;
+#endif
 
 template class tnlStaticArray< 2, char >;
 template class tnlStaticArray< 2, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlStaticArray< 2, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticArray< 2, float >;
+#endif
 template class tnlStaticArray< 2, double >;
-//template class tnlStaticArray< 2, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlStaticArray< 2, long double >;
+#endif
 
 template class tnlStaticArray< 3, char >;
 template class tnlStaticArray< 3, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlStaticArray< 3, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticArray< 3, float >;
+#endif
 template class tnlStaticArray< 3, double >;
-//template class tnlStaticArray< 3, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlStaticArray< 3, long double >;
+#endif
 
 template class tnlStaticArray< 4, char >;
 template class tnlStaticArray< 4, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlStaticArray< 4, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticArray< 4, float >;
+#endif
 template class tnlStaticArray< 4, double >;
-//template class tnlStaticArray< 4, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlStaticArray< 4, long double >;
+#endif
 
 #endif
 #endif
\ No newline at end of file
diff --git a/src/core/arrays/tnlStaticArray_impl.h b/src/core/arrays/tnlStaticArray_impl.h
index 63afb64be467433395a855b453f863f3c3becb80..08c5606e4f387b0763060ffcb1a05e3b6176d717 100644
--- a/src/core/arrays/tnlStaticArray_impl.h
+++ b/src/core/arrays/tnlStaticArray_impl.h
@@ -222,10 +222,16 @@ ostream& operator << ( ostream& str, const tnlStaticArray< Size, Element >& a )
 #ifndef HAVE_CUDA
 extern template class tnlStaticArray< 4, char >;
 extern template class tnlStaticArray< 4, int >;
+#ifdef INSTANTIATE_LONG_INT
 extern template class tnlStaticArray< 4, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlStaticArray< 4, float >;
+#endif
 extern template class tnlStaticArray< 4, double >;
-//extern template class tnlStaticArray< 4, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlStaticArray< 4, long double >;
+#endif
 #endif
 
 #endif
diff --git a/src/core/cuda/CMakeLists.txt b/src/core/cuda/CMakeLists.txt
index 6406a6a2c318f6fa313af3ea018be4ba34a78aef..3530dda77a5aed1f1b2acdbd160af0a26555b74d 100755
--- a/src/core/cuda/CMakeLists.txt
+++ b/src/core/cuda/CMakeLists.txt
@@ -2,7 +2,11 @@ set( headers cuda-prefix-sum.h
              cuda-prefix-sum_impl.h
              cuda-reduction.h             
              cuda-reduction_impl.h
-             reduction-operations.h )
+             reduction-operations.h
+             tnlCudaReduction.h
+             tnlCudaReduction_impl.h
+             tnlCudaReductionBuffer.h
+             tnlCublasWrapper.h )
 
 SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/core/cuda ) 
 IF( BUILD_CUDA )
diff --git a/src/core/cuda/cuda-prefix-sum_impl.cu b/src/core/cuda/cuda-prefix-sum_impl.cu
index 36497541e34da0eb35d9252bf7a67754681c191d..74f3e85fb7c75554e37e3f4075aabcf8fa2092bd 100644
--- a/src/core/cuda/cuda-prefix-sum_impl.cu
+++ b/src/core/cuda/cuda-prefix-sum_impl.cu
@@ -27,12 +27,14 @@ template bool cudaPrefixSum( const int size,
                              const enumPrefixSumType prefixSumType );
 
 
+#ifdef INSTANTIATE_FLOAT
 template bool cudaPrefixSum( const int size,
                              const int blockSize,
                              const float *deviceInput,
                              float* deviceOutput,
                              const tnlParallelReductionSum< float, int >& operation,
                              const enumPrefixSumType prefixSumType );
+#endif
 
 template bool cudaPrefixSum( const int size,
                              const int blockSize,
@@ -41,13 +43,16 @@ template bool cudaPrefixSum( const int size,
                              const tnlParallelReductionSum< double, int >& operation,
                              const enumPrefixSumType prefixSumType );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool cudaPrefixSum( const int size,
                              const int blockSize,
                              const long double *deviceInput,
                              long double* deviceOutput,
                              const tnlParallelReductionSum< long double, int >& operation,
                              const enumPrefixSumType prefixSumType );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool cudaPrefixSum( const long int size,
                              const long int blockSize,
                              const int *deviceInput,
@@ -56,12 +61,14 @@ template bool cudaPrefixSum( const long int size,
                              const enumPrefixSumType prefixSumType );
 
 
+#ifdef INSTANTIATE_FLOAT
 template bool cudaPrefixSum( const long int size,
                              const long int blockSize,
                              const float *deviceInput,
                              float* deviceOutput,
                              const tnlParallelReductionSum< float, long int >& operation,
                              const enumPrefixSumType prefixSumType );
+#endif
 
 template bool cudaPrefixSum( const long int size,
                              const long int blockSize,
@@ -70,10 +77,13 @@ template bool cudaPrefixSum( const long int size,
                              const tnlParallelReductionSum< double, long int >& operation,
                              const enumPrefixSumType prefixSumType );
 
-/*template bool cudaPrefixSum( const long int size,
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool cudaPrefixSum( const long int size,
                              const long int blockSize,
                              const long double *deviceInput,
                              long double* deviceOutput,
                              const tnlParallelReductionSum< long double, long int >& operation,
-                             const enumPrefixSumType prefixSumType );*/   
+                             const enumPrefixSumType prefixSumType );
+#endif
+#endif 
 #endif
diff --git a/src/core/cuda/cuda-prefix-sum_impl.h b/src/core/cuda/cuda-prefix-sum_impl.h
index a37b818de944e023d0bc41a9dad1855f4a18b3db..ad2d02aa3341c21056b9e2994df93918aa38db52 100644
--- a/src/core/cuda/cuda-prefix-sum_impl.h
+++ b/src/core/cuda/cuda-prefix-sum_impl.h
@@ -53,7 +53,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
    if( prefixSumType == exclusivePrefixSum )
    {
       if( idx == 0 )
-         sharedData[ 0 ] = operation.identity();
+         sharedData[ 0 ] = operation.initialValue();
       while( idx < elementsInBlock && blockOffset + idx < size )
       {
          sharedData[ tnlCuda::getInterleaving( idx + 1 ) ] = input[ blockOffset + idx ];
@@ -86,8 +86,8 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
    while( chunkPointer < chunkSize &&
           chunkOffset + chunkPointer < lastElementInBlock )
    {
-      operation.performInPlace( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ],
-                                sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] );
+      operation.commonReductionOnDevice( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ],
+                                         sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] );
       auxData[ threadIdx. x ] =
          sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer  ) ];
       chunkPointer ++;
@@ -100,7 +100,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
    const int warpIdx = threadIdx. x / tnlCuda::getWarpSize();
    for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 )
       if( threadInWarpIdx >= stride && threadIdx. x < numberOfChunks )
-         operation.performInPlace( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] );
+         operation.commonReductionOnDevice( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] );
 
    if( threadInWarpIdx == tnlCuda::getWarpSize() - 1 )
       warpSums[ warpIdx ] = auxData[ threadIdx. x ];
@@ -112,14 +112,14 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
    if( warpIdx == 0 )
       for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 )
          if( threadInWarpIdx >= stride )
-            operation.performInPlace( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] );
+            operation.commonReductionOnDevice( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] );
    __syncthreads();
 
    /****
     * Shift the warp prefix-sums.
     */
    if( warpIdx > 0 )
-      operation.performInPlace( auxData[ threadIdx. x ], warpSums[ warpIdx - 1 ] );
+      operation.commonReductionOnDevice( auxData[ threadIdx. x ], warpSums[ warpIdx - 1 ] );
 
    /***
     *  Store the result back in global memory.
@@ -129,10 +129,10 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
    while( idx < elementsInBlock && blockOffset + idx < size )
    {
       const Index chunkIdx = idx / chunkSize;
-      DataType chunkShift( operation.identity() );
+      DataType chunkShift( operation.initialValue() );
       if( chunkIdx > 0 )
          chunkShift = auxData[ chunkIdx - 1 ];
-      operation.performInPlace( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift );
+      operation.commonReductionOnDevice( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift );
       output[ blockOffset + idx ] = sharedData[ tnlCuda::getInterleaving( idx ) ];
       idx += blockDim. x;
    }
@@ -141,10 +141,15 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
    if( threadIdx. x == 0 )
    {
       if( prefixSumType == exclusivePrefixSum )
-         auxArray[ blockIdx. x ] =
-            operation.commonReductionOnDevice( tnlCuda::getInterleaving( lastElementInBlock - 1 ),
-                                               tnlCuda::getInterleaving( lastElementInBlock ),
-                                               sharedData );
+      {
+         /*auxArray[ blockIdx. x ] = operation.commonReductionOnDevice( tnlCuda::getInterleaving( lastElementInBlock - 1 ),
+                                                                      tnlCuda::getInterleaving( lastElementInBlock ),
+                                                                      sharedData );*/
+         DataType aux = operation.initialValue();
+         operation.commonReductionOnDevice( aux, sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ] );
+         operation.commonReductionOnDevice( aux, sharedData[ tnlCuda::getInterleaving( lastElementInBlock ) ] );
+         auxArray[ blockIdx. x ] = aux;
+      }
       else
          auxArray[ blockIdx. x ] = sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ];
    }
@@ -163,13 +168,14 @@ __global__ void cudaSecondPhaseBlockPrefixSum( const Operation operation,
 {
    if( blockIdx. x > 0 )
    {
-      const DataType shift = operation.commonReductionOnDevice( gridShift, auxArray[ blockIdx. x - 1 ] );
+      DataType shift( gridShift );
+      operation.commonReductionOnDevice( shift, auxArray[ blockIdx. x - 1 ] );
 
       const Index readOffset = blockIdx. x * elementsInBlock;
       Index readIdx = threadIdx. x;
       while( readIdx < elementsInBlock && readOffset + readIdx < size )
       {
-         operation.performInPlace( data[ readIdx + readOffset ], shift );
+         operation.commonReductionOnDevice( data[ readIdx + readOffset ], shift );
          readIdx += blockDim. x;
       }
    }
@@ -373,13 +379,16 @@ extern template bool cudaPrefixSum( const int size,
                                     const tnlParallelReductionSum< double, int >& operation,
                                     const enumPrefixSumType prefixSumType );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template bool cudaPrefixSum( const int size,
                                     const int blockSize,
                                     const long double *deviceInput,
                                     long double* deviceOutput,
                                     const tnlParallelReductionSum< long double, int >& operation,
                                     const enumPrefixSumType prefixSumType );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool cudaPrefixSum( const long int size,
                                     const long int blockSize,
                                     const int *deviceInput,
@@ -402,6 +411,7 @@ extern template bool cudaPrefixSum( const long int size,
                                     const tnlParallelReductionSum< double, long int >& operation,
                                     const enumPrefixSumType prefixSumType );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template bool cudaPrefixSum( const long int size,
                                     const long int blockSize,
                                     const long double *deviceInput,
@@ -409,7 +419,9 @@ extern template bool cudaPrefixSum( const long int size,
                                     const tnlParallelReductionSum< long double, long int >& operation,
                                     const enumPrefixSumType prefixSumType );
 #endif
+#endif
 
+#endif
 
 #endif
 
diff --git a/src/core/cuda/cuda-reduction-abs-max_impl.cu b/src/core/cuda/cuda-reduction-abs-max_impl.cu
index 93605031fd1071be207ef28c2055761867c74011..8540fc71a381ba90b2db7c341abe24ff1e60f74a 100644
--- a/src/core/cuda/cuda-reduction-abs-max_impl.cu
+++ b/src/core/cuda/cuda-reduction-abs-max_impl.cu
@@ -52,13 +52,14 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, int > >
                                      const typename tnlParallelReductionAbsMax< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, int > >
                                    ( const tnlParallelReductionAbsMax< long double, int>& operation,
                                      const typename tnlParallelReductionAbsMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< long double, int> :: ResultType& result );
-                                     
+#endif                                     
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, long int > >
                                    ( const tnlParallelReductionAbsMax< char, long int >& operation,
@@ -67,6 +68,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, long int
                                      const typename tnlParallelReductionAbsMax< char, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< char, long int > :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< int, long int > >
                                    ( const tnlParallelReductionAbsMax< int, long int >& operation,
                                      const typename tnlParallelReductionAbsMax< int, long int > :: IndexType size,
@@ -88,11 +90,14 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, long in
                                      const typename tnlParallelReductionAbsMax< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, long int > >
                                    ( const tnlParallelReductionAbsMax< long double, long int>& operation,
                                      const typename tnlParallelReductionAbsMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionAbsMax< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionAbsMax< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 #endif                                     
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-abs-min_impl.cu b/src/core/cuda/cuda-reduction-abs-min_impl.cu
index 812159df5ff33db346c1615a128acc278b938168..629fa37ddcf71fb4adacaf7e7e872fa938fda10c 100644
--- a/src/core/cuda/cuda-reduction-abs-min_impl.cu
+++ b/src/core/cuda/cuda-reduction-abs-min_impl.cu
@@ -52,12 +52,14 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, int > >
                                      const typename tnlParallelReductionAbsMin< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, int > >
                                    ( const tnlParallelReductionAbsMin< long double, int>& operation,
                                      const typename tnlParallelReductionAbsMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< long double, int> :: ResultType& result );
+#endif
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, long int > >
                                    ( const tnlParallelReductionAbsMin< char, long int >& operation,
@@ -66,6 +68,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, long int
                                      const typename tnlParallelReductionAbsMin< char, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< char, long int > :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< int, long int > >
                                    ( const tnlParallelReductionAbsMin< int, long int >& operation,
                                      const typename tnlParallelReductionAbsMin< int, long int > :: IndexType size,
@@ -87,11 +90,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, long in
                                      const typename tnlParallelReductionAbsMin< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, long int > >
                                    ( const tnlParallelReductionAbsMin< long double, long int>& operation,
                                      const typename tnlParallelReductionAbsMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionAbsMin< long double, long int> :: ResultType& result );*/
-                                    
+                                     typename tnlParallelReductionAbsMin< long double, long int> :: ResultType& result );
+#endif
+#endif                                    
 #endif                                     
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-abs-sum_impl.cu b/src/core/cuda/cuda-reduction-abs-sum_impl.cu
index a6a22f16c8f8c2b925ee2ba2eaca16214fb59b13..a023631a03927fed1f0a57c71d671d7b25dcb01f 100644
--- a/src/core/cuda/cuda-reduction-abs-sum_impl.cu
+++ b/src/core/cuda/cuda-reduction-abs-sum_impl.cu
@@ -52,12 +52,14 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, int > >
                                      const typename tnlParallelReductionAbsSum< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, int > >
                                    ( const tnlParallelReductionAbsSum< long double, int>& operation,
                                      const typename tnlParallelReductionAbsSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsSum< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< long double, int> :: ResultType& result );
+#endif
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, long int > >
                                    ( const tnlParallelReductionAbsSum< char, long int >& operation,
@@ -66,6 +68,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, long int
                                      const typename tnlParallelReductionAbsSum< char, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< char, long int > :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< int, long int > >
                                    ( const tnlParallelReductionAbsSum< int, long int >& operation,
                                      const typename tnlParallelReductionAbsSum< int, long int > :: IndexType size,
@@ -87,11 +90,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, long in
                                      const typename tnlParallelReductionAbsSum< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, long int > >
                                    ( const tnlParallelReductionAbsSum< long double, long int>& operation,
                                      const typename tnlParallelReductionAbsSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsSum< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionAbsSum< long double, long int> :: ResultType& result );*/
-
+                                     typename tnlParallelReductionAbsSum< long double, long int> :: ResultType& result );
+#endif
+#endif
 #endif                                     
diff --git a/src/core/cuda/cuda-reduction-and_impl.cu b/src/core/cuda/cuda-reduction-and_impl.cu
index 592c0cd5847c036e70ee1dca05c724f069360ec8..ac71e46e192fef91c6bbd62fc6418aeb435355f6 100644
--- a/src/core/cuda/cuda-reduction-and_impl.cu
+++ b/src/core/cuda/cuda-reduction-and_impl.cu
@@ -51,13 +51,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, int
                                      const typename tnlParallelReductionLogicalAnd< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalAnd< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, int > >
                                    ( const tnlParallelReductionLogicalAnd< long double, int>& operation,
                                      const typename tnlParallelReductionLogicalAnd< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalAnd< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalAnd< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalAnd< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< char, long int > >
                                    ( const tnlParallelReductionLogicalAnd< char, long int >& operation,
                                      const typename tnlParallelReductionLogicalAnd< char, long int > :: IndexType size,
@@ -86,11 +89,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, lon
                                      const typename tnlParallelReductionLogicalAnd< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalAnd< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, long int > >
                                    ( const tnlParallelReductionLogicalAnd< long double, long int>& operation,
                                      const typename tnlParallelReductionLogicalAnd< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalAnd< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalAnd< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionLogicalAnd< long double, long int> :: ResultType& result );*/
-
-#endif                                     
\ No newline at end of file
+                                     typename tnlParallelReductionLogicalAnd< long double, long int> :: ResultType& result );
+#endif
+#endif                                     
+#endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-diff-abs-max_impl.cu b/src/core/cuda/cuda-reduction-diff-abs-max_impl.cu
index 291fdc9d888a2ac71106d733ba20d49a8eaff536..291810ef74385624b4213c0d6afff9b4da009781 100644
--- a/src/core/cuda/cuda-reduction-diff-abs-max_impl.cu
+++ b/src/core/cuda/cuda-reduction-diff-abs-max_impl.cu
@@ -52,13 +52,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< double, int
                                      const typename tnlParallelReductionDiffAbsMax< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMax< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< long double, int > >
                                    ( const tnlParallelReductionDiffAbsMax< long double, int>& operation,
                                      const typename tnlParallelReductionDiffAbsMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMax< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMax< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< char, long int > >
                                    ( const tnlParallelReductionDiffAbsMax< char, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsMax< char, long int > :: IndexType size,
@@ -87,11 +90,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< double, lon
                                      const typename tnlParallelReductionDiffAbsMax< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMax< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< long double, long int > >
                                    ( const tnlParallelReductionDiffAbsMax< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffAbsMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMax< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMax< long double, long int> :: ResultType& result );*/
-
-#endif                                     
\ No newline at end of file
+                                     typename tnlParallelReductionDiffAbsMax< long double, long int> :: ResultType& result );
+#endif
+#endif                        
+#endif             
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-diff-abs-min_impl.cu b/src/core/cuda/cuda-reduction-diff-abs-min_impl.cu
index 14428d5dff3cab5021c55a3168d7d67452d7bf75..d9ce714abde23b2788c9a94a81e03b283860ebf5 100644
--- a/src/core/cuda/cuda-reduction-diff-abs-min_impl.cu
+++ b/src/core/cuda/cuda-reduction-diff-abs-min_impl.cu
@@ -53,13 +53,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< double, int
                                      const typename tnlParallelReductionDiffAbsMin< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMin< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< long double, int > >
                                    ( const tnlParallelReductionDiffAbsMin< long double, int>& operation,
                                      const typename tnlParallelReductionDiffAbsMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMin< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMin< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< char, long int > >
                                    ( const tnlParallelReductionDiffAbsMin< char, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsMin< char, long int > :: IndexType size,
@@ -88,11 +91,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< double, lon
                                      const typename tnlParallelReductionDiffAbsMin< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMin< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< long double, long int > >
                                    ( const tnlParallelReductionDiffAbsMin< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffAbsMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMin< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMin< long double, long int> :: ResultType& result );*/
-
-#endif                                     
\ No newline at end of file
+                                     typename tnlParallelReductionDiffAbsMin< long double, long int> :: ResultType& result );
+#endif
+#endif                                     
+#endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-diff-abs-sum_impl.cu b/src/core/cuda/cuda-reduction-diff-abs-sum_impl.cu
index f033706ef704e81e405654e799954b136348738e..5298d033491f1d23330a05bc5528c211c1335e99 100644
--- a/src/core/cuda/cuda-reduction-diff-abs-sum_impl.cu
+++ b/src/core/cuda/cuda-reduction-diff-abs-sum_impl.cu
@@ -52,13 +52,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< double, int
                                      const typename tnlParallelReductionDiffAbsSum< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsSum< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< long double, int > >
                                    ( const tnlParallelReductionDiffAbsSum< long double, int>& operation,
                                      const typename tnlParallelReductionDiffAbsSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsSum< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsSum< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< char, long int > >
                                    ( const tnlParallelReductionDiffAbsSum< char, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsSum< char, long int > :: IndexType size,
@@ -87,11 +90,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< double, lon
                                      const typename tnlParallelReductionDiffAbsSum< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsSum< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< long double, long int > >
                                    ( const tnlParallelReductionDiffAbsSum< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffAbsSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsSum< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsSum< long double, long int> :: ResultType& result );*/
-
-#endif                                     
\ No newline at end of file
+                                     typename tnlParallelReductionDiffAbsSum< long double, long int> :: ResultType& result );
+#endif
+#endif                                     
+#endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-diff-lp-norm_impl.cu b/src/core/cuda/cuda-reduction-diff-lp-norm_impl.cu
index 1403d34b148a6c553c78454908018e81ea1812bc..2359564477c934f1bcfa62de9e1a155c6875cff3 100644
--- a/src/core/cuda/cuda-reduction-diff-lp-norm_impl.cu
+++ b/src/core/cuda/cuda-reduction-diff-lp-norm_impl.cu
@@ -37,13 +37,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< double, int
                                      const typename tnlParallelReductionDiffLpNorm< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffLpNorm< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< long double, int > >
                                    ( const tnlParallelReductionDiffLpNorm< long double, int>& operation,
                                      const typename tnlParallelReductionDiffLpNorm< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffLpNorm< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffLpNorm< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffLpNorm< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< char, long int > >
                                    ( const tnlParallelReductionDiffLpNorm< char, long int >& operation,
                                      const typename tnlParallelReductionDiffLpNorm< char, long int > :: IndexType size,
@@ -72,13 +75,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< double, lon
                                      const typename tnlParallelReductionDiffLpNorm< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffLpNorm< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< long double, long int > >
                                    ( const tnlParallelReductionDiffLpNorm< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffLpNorm< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffLpNorm< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffLpNorm< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffLpNorm< long double, long int> :: ResultType& result );*/
-
-
-
+                                     typename tnlParallelReductionDiffLpNorm< long double, long int> :: ResultType& result );
+#endif
+#endif
 #endif
diff --git a/src/core/cuda/cuda-reduction-diff-max_impl.cu b/src/core/cuda/cuda-reduction-diff-max_impl.cu
index 76fac28487598502b5ceee8d5830180dba62d9b3..fe91ae6ef6a0f9116df733f392016a588dd9c5e4 100644
--- a/src/core/cuda/cuda-reduction-diff-max_impl.cu
+++ b/src/core/cuda/cuda-reduction-diff-max_impl.cu
@@ -52,13 +52,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< double, int >
                                      const typename tnlParallelReductionDiffMax< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMax< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< long double, int > >
                                    ( const tnlParallelReductionDiffMax< long double, int>& operation,
                                      const typename tnlParallelReductionDiffMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMax< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMax< long double, int> :: ResultType& result );
-                                     
+#endif
+
+#ifdef INSTANTIATE_LONG_INT                                     
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< char, long int > >
                                    ( const tnlParallelReductionDiffMax< char, long int >& operation,
                                      const typename tnlParallelReductionDiffMax< char, long int > :: IndexType size,
@@ -87,11 +90,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< double, long i
                                      const typename tnlParallelReductionDiffMax< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMax< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< long double, long int > >
                                    ( const tnlParallelReductionDiffMax< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMax< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffMax< long double, long int> :: ResultType& result );*/
-
-#endif                                     
\ No newline at end of file
+                                     typename tnlParallelReductionDiffMax< long double, long int> :: ResultType& result );
+#endif
+#endif                                     
+#endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-diff-min_impl.cu b/src/core/cuda/cuda-reduction-diff-min_impl.cu
index fe75190d51657d6d658b2fe4623fbc18b7f6f191..ed13335b8b282352727a8a6687c10ae122196a48 100644
--- a/src/core/cuda/cuda-reduction-diff-min_impl.cu
+++ b/src/core/cuda/cuda-reduction-diff-min_impl.cu
@@ -53,13 +53,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< double, int >
                                      const typename tnlParallelReductionDiffMin< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMin< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< long double, int > >
                                    ( const tnlParallelReductionDiffMin< long double, int>& operation,
                                      const typename tnlParallelReductionDiffMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMin< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMin< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< char, long int > >
                                    ( const tnlParallelReductionDiffMin< char, long int >& operation,
                                      const typename tnlParallelReductionDiffMin< char, long int > :: IndexType size,
@@ -88,11 +91,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< double, long i
                                      const typename tnlParallelReductionDiffMin< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMin< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< long double, long int > >
                                    ( const tnlParallelReductionDiffMin< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMin< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffMin< long double, long int> :: ResultType& result );*/
-
+                                     typename tnlParallelReductionDiffMin< long double, long int> :: ResultType& result );
+#endif
+#endif
 #endif                                     
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-diff-sum_impl.cu b/src/core/cuda/cuda-reduction-diff-sum_impl.cu
index ce79e8cc4b0835b68365168e859c5b51bf8f605a..aa08778ea70f66cc6c6d24b86effa4cf942db1d3 100644
--- a/src/core/cuda/cuda-reduction-diff-sum_impl.cu
+++ b/src/core/cuda/cuda-reduction-diff-sum_impl.cu
@@ -52,13 +52,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< double, int >
                                      const typename tnlParallelReductionDiffSum< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffSum< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< long double, int > >
                                    ( const tnlParallelReductionDiffSum< long double, int>& operation,
                                      const typename tnlParallelReductionDiffSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffSum< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffSum< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< char, long int > >
                                    ( const tnlParallelReductionDiffSum< char, long int >& operation,
                                      const typename tnlParallelReductionDiffSum< char, long int > :: IndexType size,
@@ -87,11 +90,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< double, long i
                                      const typename tnlParallelReductionDiffSum< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffSum< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< long double, long int > >
                                    ( const tnlParallelReductionDiffSum< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffSum< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffSum< long double, long int> :: ResultType& result );*/
-                                    
-#endif                                     
\ No newline at end of file
+                                     typename tnlParallelReductionDiffSum< long double, long int> :: ResultType& result );
+#endif                                    
+#endif                                     
+#endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-equalities_impl.cu b/src/core/cuda/cuda-reduction-equalities_impl.cu
index 8b4fec91d0820ee5c82fdd77e30b3a441adecc93..6bf7f0263055adc7d6deda2f4316de48006ec3f6 100644
--- a/src/core/cuda/cuda-reduction-equalities_impl.cu
+++ b/src/core/cuda/cuda-reduction-equalities_impl.cu
@@ -51,13 +51,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, int
                                      const typename tnlParallelReductionEqualities< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionEqualities< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, int > >
                                    ( const tnlParallelReductionEqualities< long double, int>& operation,
                                      const typename tnlParallelReductionEqualities< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionEqualities< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionEqualities< char, long int > >
                                    ( const tnlParallelReductionEqualities< char, long int >& operation,
                                      const typename tnlParallelReductionEqualities< char, long int > :: IndexType size,
@@ -86,11 +89,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, lon
                                      const typename tnlParallelReductionEqualities< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionEqualities< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, long int > >
                                    ( const tnlParallelReductionEqualities< long double, long int>& operation,
                                      const typename tnlParallelReductionEqualities< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionEqualities< long double, long int> :: ResultType& result );*/
-
-#endif                                     
\ No newline at end of file
+                                     typename tnlParallelReductionEqualities< long double, long int> :: ResultType& result );
+#endif
+#endif                                     
+#endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-inequalities_impl.cu b/src/core/cuda/cuda-reduction-inequalities_impl.cu
index a04537c97d44b606fcc463372e92246c9ddb6caf..828c88af106c9e8965bf64d12bc798aac41a9cae 100644
--- a/src/core/cuda/cuda-reduction-inequalities_impl.cu
+++ b/src/core/cuda/cuda-reduction-inequalities_impl.cu
@@ -51,13 +51,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, i
                                      const typename tnlParallelReductionInequalities< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionInequalities< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, int > >
                                    ( const tnlParallelReductionInequalities< long double, int>& operation,
                                      const typename tnlParallelReductionInequalities< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionInequalities< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionInequalities< char, long int > >
                                    ( const tnlParallelReductionInequalities< char, long int >& operation,
                                      const typename tnlParallelReductionInequalities< char, long int > :: IndexType size,
@@ -86,11 +89,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, l
                                      const typename tnlParallelReductionInequalities< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionInequalities< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, long int > >
                                    ( const tnlParallelReductionInequalities< long double, long int>& operation,
                                      const typename tnlParallelReductionInequalities< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionInequalities< long double, long int> :: ResultType& result );*/
-
+                                     typename tnlParallelReductionInequalities< long double, long int> :: ResultType& result );
+#endif
+#endif
 #endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-lp-norm_impl.cu b/src/core/cuda/cuda-reduction-lp-norm_impl.cu
index ac4fc3a8f6e0e11f86bf078dfa29eabd545bdc58..a5f5d6644cad7c3c9ff11a83e3343f75f16471e4 100644
--- a/src/core/cuda/cuda-reduction-lp-norm_impl.cu
+++ b/src/core/cuda/cuda-reduction-lp-norm_impl.cu
@@ -37,13 +37,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, int > >
                                      const typename tnlParallelReductionLpNorm< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLpNorm< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, int > >
                                    ( const tnlParallelReductionLpNorm< long double, int>& operation,
                                      const typename tnlParallelReductionLpNorm< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLpNorm< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< int, long int > >
                                    ( const tnlParallelReductionLpNorm< int, long int >& operation,
                                      const typename tnlParallelReductionLpNorm< int, long int > :: IndexType size,
@@ -65,11 +68,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, long in
                                      const typename tnlParallelReductionLpNorm< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLpNorm< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, long int > >
                                    ( const tnlParallelReductionLpNorm< long double, long int>& operation,
                                      const typename tnlParallelReductionLpNorm< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionLpNorm< long double, long int> :: ResultType& result );*/
-
-#endif                                     
\ No newline at end of file
+                                     typename tnlParallelReductionLpNorm< long double, long int> :: ResultType& result );
+#endif
+#endif                                     
+#endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-max_impl.cu b/src/core/cuda/cuda-reduction-max_impl.cu
index 63847eb441ca9147270bb58ac63dfcfd74413e45..cba153c81bf8d4a7a0964b62f155217ff10b7a26 100644
--- a/src/core/cuda/cuda-reduction-max_impl.cu
+++ b/src/core/cuda/cuda-reduction-max_impl.cu
@@ -52,13 +52,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionMax< double, int > >
                                      const typename tnlParallelReductionMax< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMax< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, int > >
                                    ( const tnlParallelReductionMax< long double, int>& operation,
                                      const typename tnlParallelReductionMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMax< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMax< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionMax< char, long int > >
                                    ( const tnlParallelReductionMax< char, long int >& operation,
                                      const typename tnlParallelReductionMax< char, long int > :: IndexType size,
@@ -87,11 +90,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionMax< double, long int >
                                      const typename tnlParallelReductionMax< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMax< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, long int > >
                                    ( const tnlParallelReductionMax< long double, long int>& operation,
                                      const typename tnlParallelReductionMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMax< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionMax< long double, long int> :: ResultType& result );*/
-
+                                     typename tnlParallelReductionMax< long double, long int> :: ResultType& result );
+#endif
+#endif
 #endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-min_impl.cu b/src/core/cuda/cuda-reduction-min_impl.cu
index 5ba284339dcc2f19d8c9ba68a2491ef98eb03c57..dc5a1f41407b410e4333f73c70744506814288a3 100644
--- a/src/core/cuda/cuda-reduction-min_impl.cu
+++ b/src/core/cuda/cuda-reduction-min_impl.cu
@@ -52,13 +52,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionMin< double, int > >
                                      const typename tnlParallelReductionMin< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMin< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, int > >
                                    ( const tnlParallelReductionMin< long double, int>& operation,
                                      const typename tnlParallelReductionMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMin< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionMin< char, long int > >
                                    ( const tnlParallelReductionMin< char, long int >& operation,
                                      const typename tnlParallelReductionMin< char, long int > :: IndexType size,
@@ -87,11 +90,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionMin< double, long int >
                                      const typename tnlParallelReductionMin< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMin< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, long int > >
                                    ( const tnlParallelReductionMin< long double, long int>& operation,
                                      const typename tnlParallelReductionMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionMin< long double, long int> :: ResultType& result );*/
-
+                                     typename tnlParallelReductionMin< long double, long int> :: ResultType& result );
+#endif
+#endif
 #endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-or_impl.cu b/src/core/cuda/cuda-reduction-or_impl.cu
index 6128fc48c6152a85c17fb49c906cb5d8358fac34..811ec445fd4c79fc99dd02d023fabdf856c71000 100644
--- a/src/core/cuda/cuda-reduction-or_impl.cu
+++ b/src/core/cuda/cuda-reduction-or_impl.cu
@@ -51,13 +51,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, int
                                      const typename tnlParallelReductionLogicalOr< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalOr< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, int > >
                                    ( const tnlParallelReductionLogicalOr< long double, int>& operation,
                                      const typename tnlParallelReductionLogicalOr< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalOr< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< char, long int > >
                                    ( const tnlParallelReductionLogicalOr< char, long int >& operation,
                                      const typename tnlParallelReductionLogicalOr< char, long int > :: IndexType size,
@@ -86,11 +89,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, long
                                      const typename tnlParallelReductionLogicalOr< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalOr< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, long int > >
                                    ( const tnlParallelReductionLogicalOr< long double, long int>& operation,
                                      const typename tnlParallelReductionLogicalOr< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionLogicalOr< long double, long int> :: ResultType& result );*/
-
-#endif                                     
\ No newline at end of file
+                                     typename tnlParallelReductionLogicalOr< long double, long int> :: ResultType& result );
+#endif
+#endif                                     
+#endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-scalar-product_impl.cu b/src/core/cuda/cuda-reduction-scalar-product_impl.cu
index 1ee2d85af2418032f7bd441acf17d3b77588e270..082d65540ae5f5d514436d847b168a97c9ca4ad7 100644
--- a/src/core/cuda/cuda-reduction-scalar-product_impl.cu
+++ b/src/core/cuda/cuda-reduction-scalar-product_impl.cu
@@ -51,13 +51,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< double,
                                      const typename tnlParallelReductionScalarProduct< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionScalarProduct< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< long double, int > >
                                    ( const tnlParallelReductionScalarProduct< long double, int>& operation,
                                      const typename tnlParallelReductionScalarProduct< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionScalarProduct< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionScalarProduct< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionScalarProduct< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< char, long int > >
                                    ( const tnlParallelReductionScalarProduct< char, long int >& operation,
                                      const typename tnlParallelReductionScalarProduct< char, long int > :: IndexType size,
@@ -86,11 +89,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< double,
                                      const typename tnlParallelReductionScalarProduct< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionScalarProduct< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< long double, long int > >
                                    ( const tnlParallelReductionScalarProduct< long double, long int>& operation,
                                      const typename tnlParallelReductionScalarProduct< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionScalarProduct< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionScalarProduct< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionScalarProduct< long double, long int> :: ResultType& result );*/
-
-#endif                                     
\ No newline at end of file
+                                     typename tnlParallelReductionScalarProduct< long double, long int> :: ResultType& result );
+#endif
+#endif                                     
+#endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction-sum_impl.cu b/src/core/cuda/cuda-reduction-sum_impl.cu
index 9cd01bcfd0c3de5a8f612a2205323f4772a567b3..8447ea5f00d8c448362d13181ae70e4574e3c350 100644
--- a/src/core/cuda/cuda-reduction-sum_impl.cu
+++ b/src/core/cuda/cuda-reduction-sum_impl.cu
@@ -52,13 +52,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionSum< double, int > >
                                      const typename tnlParallelReductionSum< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, int > >
                                    ( const tnlParallelReductionSum< long double, int>& operation,
                                      const typename tnlParallelReductionSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionSum< char, long int > >
                                    ( const tnlParallelReductionSum< char, long int >& operation,
                                      const typename tnlParallelReductionSum< char, long int > :: IndexType size,
@@ -87,11 +90,13 @@ template bool reductionOnCudaDevice< tnlParallelReductionSum< double, long int >
                                      const typename tnlParallelReductionSum< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< double, long int> :: ResultType& result );
 
-/*template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, long int > >
                                    ( const tnlParallelReductionSum< long double, long int>& operation,
                                      const typename tnlParallelReductionSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionSum< long double, long int> :: ResultType& result );*/
-                                     
-#endif                                     
\ No newline at end of file
+                                     typename tnlParallelReductionSum< long double, long int> :: ResultType& result );
+#endif                                     
+#endif                                     
+#endif
\ No newline at end of file
diff --git a/src/core/cuda/cuda-reduction_impl.cpp b/src/core/cuda/cuda-reduction_impl.cpp
index 9fd1c74283d16ecc3515f752e47973d8b596bcbb..a36767e388bb265d410f7f77838f5598ec529e23 100644
--- a/src/core/cuda/cuda-reduction_impl.cpp
+++ b/src/core/cuda/cuda-reduction_impl.cpp
@@ -52,13 +52,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionSum< double, int > >
                                      const typename tnlParallelReductionSum< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, int > >
                                    ( const tnlParallelReductionSum< long double, int>& operation,
                                      const typename tnlParallelReductionSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionSum< char, long int > >
                                    ( const tnlParallelReductionSum< char, long int >& operation,
                                      const typename tnlParallelReductionSum< char, long int > :: IndexType size,
@@ -87,12 +90,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionSum< double, long int >
                                      const typename tnlParallelReductionSum< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, long int > >
                                    ( const tnlParallelReductionSum< long double, long int>& operation,
                                      const typename tnlParallelReductionSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Min
@@ -126,13 +132,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionMin< double, int > >
                                      const typename tnlParallelReductionMin< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMin< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, int > >
                                    ( const tnlParallelReductionMin< long double, int>& operation,
                                      const typename tnlParallelReductionMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMin< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionMin< char, long int > >
                                    ( const tnlParallelReductionMin< char, long int >& operation,
                                      const typename tnlParallelReductionMin< char, long int > :: IndexType size,
@@ -161,12 +170,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionMin< double, long int >
                                      const typename tnlParallelReductionMin< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMin< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, long int > >
                                    ( const tnlParallelReductionMin< long double, long int>& operation,
                                      const typename tnlParallelReductionMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMin< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Max
@@ -200,13 +212,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionMax< double, int > >
                                      const typename tnlParallelReductionMax< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMax< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, int > >
                                    ( const tnlParallelReductionMax< long double, int>& operation,
                                      const typename tnlParallelReductionMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMax< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMax< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionMax< char, long int > >
                                    ( const tnlParallelReductionMax< char, long int >& operation,
                                      const typename tnlParallelReductionMax< char, long int > :: IndexType size,
@@ -235,12 +250,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionMax< double, long int >
                                      const typename tnlParallelReductionMax< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMax< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, long int > >
                                    ( const tnlParallelReductionMax< long double, long int>& operation,
                                      const typename tnlParallelReductionMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMax< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMax< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Abs sum
@@ -274,13 +292,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, int > >
                                      const typename tnlParallelReductionAbsSum< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, int > >
                                    ( const tnlParallelReductionAbsSum< long double, int>& operation,
                                      const typename tnlParallelReductionAbsSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsSum< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, long int > >
                                    ( const tnlParallelReductionAbsSum< char, long int >& operation,
                                      const typename tnlParallelReductionAbsSum< char, long int > :: IndexType size,
@@ -309,12 +330,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, long in
                                      const typename tnlParallelReductionAbsSum< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, long int > >
                                    ( const tnlParallelReductionAbsSum< long double, long int>& operation,
                                      const typename tnlParallelReductionAbsSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsSum< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Abs min
@@ -348,13 +372,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, int > >
                                      const typename tnlParallelReductionAbsMin< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, int > >
                                    ( const tnlParallelReductionAbsMin< long double, int>& operation,
                                      const typename tnlParallelReductionAbsMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, long int > >
                                    ( const tnlParallelReductionAbsMin< char, long int >& operation,
                                      const typename tnlParallelReductionAbsMin< char, long int > :: IndexType size,
@@ -383,16 +410,18 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, long in
                                      const typename tnlParallelReductionAbsMin< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, long int > >
                                    ( const tnlParallelReductionAbsMin< long double, long int>& operation,
                                      const typename tnlParallelReductionAbsMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< long double, long int> :: ResultType& result );
+#endif
+#endif
 /****
  * Abs max
  */
-
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, int > >
                                    ( const tnlParallelReductionAbsMax< char, int >& operation,
                                      const typename tnlParallelReductionAbsMax< char, int > :: IndexType size,
@@ -421,13 +450,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, int > >
                                      const typename tnlParallelReductionAbsMax< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, int > >
                                    ( const tnlParallelReductionAbsMax< long double, int>& operation,
                                      const typename tnlParallelReductionAbsMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, long int > >
                                    ( const tnlParallelReductionAbsMax< char, long int >& operation,
                                      const typename tnlParallelReductionAbsMax< char, long int > :: IndexType size,
@@ -456,12 +488,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, long in
                                      const typename tnlParallelReductionAbsMax< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, long int > >
                                    ( const tnlParallelReductionAbsMax< long double, long int>& operation,
                                      const typename tnlParallelReductionAbsMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Logical AND
@@ -494,13 +529,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, int
                                      const typename tnlParallelReductionLogicalAnd< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalAnd< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, int > >
                                    ( const tnlParallelReductionLogicalAnd< long double, int>& operation,
                                      const typename tnlParallelReductionLogicalAnd< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalAnd< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalAnd< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalAnd< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< char, long int > >
                                    ( const tnlParallelReductionLogicalAnd< char, long int >& operation,
                                      const typename tnlParallelReductionLogicalAnd< char, long int > :: IndexType size,
@@ -529,12 +567,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, lon
                                      const typename tnlParallelReductionLogicalAnd< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalAnd< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, long int > >
                                    ( const tnlParallelReductionLogicalAnd< long double, long int>& operation,
                                      const typename tnlParallelReductionLogicalAnd< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalAnd< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalAnd< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalAnd< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Logical OR
@@ -567,13 +608,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, int
                                      const typename tnlParallelReductionLogicalOr< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalOr< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, int > >
                                    ( const tnlParallelReductionLogicalOr< long double, int>& operation,
                                      const typename tnlParallelReductionLogicalOr< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalOr< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< char, long int > >
                                    ( const tnlParallelReductionLogicalOr< char, long int >& operation,
                                      const typename tnlParallelReductionLogicalOr< char, long int > :: IndexType size,
@@ -602,13 +646,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, long
                                      const typename tnlParallelReductionLogicalOr< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalOr< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, long int > >
                                    ( const tnlParallelReductionLogicalOr< long double, long int>& operation,
                                      const typename tnlParallelReductionLogicalOr< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalOr< long double, long int> :: ResultType& result );
-
+#endif
+#endif
 
 /****
  * Lp Norm
@@ -627,13 +673,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, int > >
                                      const typename tnlParallelReductionLpNorm< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLpNorm< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, int > >
                                    ( const tnlParallelReductionLpNorm< long double, int>& operation,
                                      const typename tnlParallelReductionLpNorm< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLpNorm< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< char, long int > >
                                    ( const tnlParallelReductionLpNorm< char, long int >& operation,
                                      const typename tnlParallelReductionLpNorm< char, long int > :: IndexType size,
@@ -662,13 +711,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, long in
                                      const typename tnlParallelReductionLpNorm< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLpNorm< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, long int > >
                                    ( const tnlParallelReductionLpNorm< long double, long int>& operation,
                                      const typename tnlParallelReductionLpNorm< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLpNorm< long double, long int> :: ResultType& result );
-
+#endif
+#endif
 
 /****
  * Equalities
@@ -701,13 +752,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, int
                                      const typename tnlParallelReductionEqualities< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionEqualities< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, int > >
                                    ( const tnlParallelReductionEqualities< long double, int>& operation,
                                      const typename tnlParallelReductionEqualities< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionEqualities< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionEqualities< char, long int > >
                                    ( const tnlParallelReductionEqualities< char, long int >& operation,
                                      const typename tnlParallelReductionEqualities< char, long int > :: IndexType size,
@@ -736,13 +790,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, lon
                                      const typename tnlParallelReductionEqualities< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionEqualities< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, long int > >
                                    ( const tnlParallelReductionEqualities< long double, long int>& operation,
                                      const typename tnlParallelReductionEqualities< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionEqualities< long double, long int> :: ResultType& result );
-
+#endif
+#endif
 
 /****
  * Inequalities
@@ -775,13 +831,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, i
                                      const typename tnlParallelReductionInequalities< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionInequalities< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, int > >
                                    ( const tnlParallelReductionInequalities< long double, int>& operation,
                                      const typename tnlParallelReductionInequalities< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionInequalities< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionInequalities< char, long int > >
                                    ( const tnlParallelReductionInequalities< char, long int >& operation,
                                      const typename tnlParallelReductionInequalities< char, long int > :: IndexType size,
@@ -810,13 +869,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, l
                                      const typename tnlParallelReductionInequalities< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionInequalities< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, long int > >
                                    ( const tnlParallelReductionInequalities< long double, long int>& operation,
                                      const typename tnlParallelReductionInequalities< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionInequalities< long double, long int> :: ResultType& result );
-
+#endif
+#endif
 
 /****
  * ScalarProduct
@@ -849,13 +910,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< double,
                                      const typename tnlParallelReductionScalarProduct< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionScalarProduct< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< long double, int > >
                                    ( const tnlParallelReductionScalarProduct< long double, int>& operation,
                                      const typename tnlParallelReductionScalarProduct< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionScalarProduct< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionScalarProduct< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionScalarProduct< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< char, long int > >
                                    ( const tnlParallelReductionScalarProduct< char, long int >& operation,
                                      const typename tnlParallelReductionScalarProduct< char, long int > :: IndexType size,
@@ -884,12 +948,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< double,
                                      const typename tnlParallelReductionScalarProduct< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionScalarProduct< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< long double, long int > >
                                    ( const tnlParallelReductionScalarProduct< long double, long int>& operation,
                                      const typename tnlParallelReductionScalarProduct< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionScalarProduct< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionScalarProduct< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionScalarProduct< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Diff sum
@@ -923,13 +990,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< double, int >
                                      const typename tnlParallelReductionDiffSum< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffSum< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< long double, int > >
                                    ( const tnlParallelReductionDiffSum< long double, int>& operation,
                                      const typename tnlParallelReductionDiffSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffSum< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffSum< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< char, long int > >
                                    ( const tnlParallelReductionDiffSum< char, long int >& operation,
                                      const typename tnlParallelReductionDiffSum< char, long int > :: IndexType size,
@@ -958,12 +1028,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< double, long i
                                      const typename tnlParallelReductionDiffSum< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffSum< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< long double, long int > >
                                    ( const tnlParallelReductionDiffSum< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffSum< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffSum< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Diff min
@@ -997,13 +1070,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< double, int >
                                      const typename tnlParallelReductionDiffMin< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMin< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< long double, int > >
                                    ( const tnlParallelReductionDiffMin< long double, int>& operation,
                                      const typename tnlParallelReductionDiffMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMin< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMin< long double, int> :: ResultType& result );
+#endif 
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< char, long int > >
                                    ( const tnlParallelReductionDiffMin< char, long int >& operation,
                                      const typename tnlParallelReductionDiffMin< char, long int > :: IndexType size,
@@ -1032,17 +1108,19 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< double, long i
                                      const typename tnlParallelReductionDiffMin< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMin< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< long double, long int > >
                                    ( const tnlParallelReductionDiffMin< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMin< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMin< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Diff max
  */
-
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< char, int > >
                                    ( const tnlParallelReductionDiffMax< char, int >& operation,
                                      const typename tnlParallelReductionDiffMax< char, int > :: IndexType size,
@@ -1071,13 +1149,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< double, int >
                                      const typename tnlParallelReductionDiffMax< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMax< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< long double, int > >
                                    ( const tnlParallelReductionDiffMax< long double, int>& operation,
                                      const typename tnlParallelReductionDiffMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMax< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMax< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< char, long int > >
                                    ( const tnlParallelReductionDiffMax< char, long int >& operation,
                                      const typename tnlParallelReductionDiffMax< char, long int > :: IndexType size,
@@ -1106,17 +1187,19 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< double, long i
                                      const typename tnlParallelReductionDiffMax< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMax< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< long double, long int > >
                                    ( const tnlParallelReductionDiffMax< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMax< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMax< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Diff abs sum
  */
-
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< char, int > >
                                    ( const tnlParallelReductionDiffAbsSum< char, int >& operation,
                                      const typename tnlParallelReductionDiffAbsSum< char, int > :: IndexType size,
@@ -1145,13 +1228,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< double, int
                                      const typename tnlParallelReductionDiffAbsSum< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsSum< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< long double, int > >
                                    ( const tnlParallelReductionDiffAbsSum< long double, int>& operation,
                                      const typename tnlParallelReductionDiffAbsSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsSum< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsSum< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< char, long int > >
                                    ( const tnlParallelReductionDiffAbsSum< char, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsSum< char, long int > :: IndexType size,
@@ -1180,17 +1266,19 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< double, lon
                                      const typename tnlParallelReductionDiffAbsSum< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsSum< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< long double, long int > >
                                    ( const tnlParallelReductionDiffAbsSum< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffAbsSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsSum< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsSum< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Diff abs min
  */
-
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< char, int > >
                                    ( const tnlParallelReductionDiffAbsMin< char, int >& operation,
                                      const typename tnlParallelReductionDiffAbsMin< char, int > :: IndexType size,
@@ -1219,13 +1307,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< double, int
                                      const typename tnlParallelReductionDiffAbsMin< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMin< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< long double, int > >
                                    ( const tnlParallelReductionDiffAbsMin< long double, int>& operation,
                                      const typename tnlParallelReductionDiffAbsMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMin< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMin< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< char, long int > >
                                    ( const tnlParallelReductionDiffAbsMin< char, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsMin< char, long int > :: IndexType size,
@@ -1254,16 +1345,19 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< double, lon
                                      const typename tnlParallelReductionDiffAbsMin< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMin< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< long double, long int > >
                                    ( const tnlParallelReductionDiffAbsMin< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffAbsMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMin< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMin< long double, long int> :: ResultType& result );
+#endif
+#endif
+
 /****
  * Diff abs max
  */
-
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< char, int > >
                                    ( const tnlParallelReductionDiffAbsMax< char, int >& operation,
                                      const typename tnlParallelReductionDiffAbsMax< char, int > :: IndexType size,
@@ -1292,13 +1386,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< double, int
                                      const typename tnlParallelReductionDiffAbsMax< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMax< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< long double, int > >
                                    ( const tnlParallelReductionDiffAbsMax< long double, int>& operation,
                                      const typename tnlParallelReductionDiffAbsMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMax< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMax< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< char, long int > >
                                    ( const tnlParallelReductionDiffAbsMax< char, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsMax< char, long int > :: IndexType size,
@@ -1327,14 +1424,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< double, lon
                                      const typename tnlParallelReductionDiffAbsMax< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMax< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< long double, long int > >
                                    ( const tnlParallelReductionDiffAbsMax< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffAbsMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMax< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMax< long double, long int> :: ResultType& result );
-
-
+#endif
+#endif
 
 /****
  * Diff Lp Norm
@@ -1353,13 +1451,16 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< double, int
                                      const typename tnlParallelReductionDiffLpNorm< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffLpNorm< double, int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< long double, int > >
                                    ( const tnlParallelReductionDiffLpNorm< long double, int>& operation,
                                      const typename tnlParallelReductionDiffLpNorm< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffLpNorm< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffLpNorm< long double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffLpNorm< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< char, long int > >
                                    ( const tnlParallelReductionDiffLpNorm< char, long int >& operation,
                                      const typename tnlParallelReductionDiffLpNorm< char, long int > :: IndexType size,
@@ -1388,13 +1489,15 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< double, lon
                                      const typename tnlParallelReductionDiffLpNorm< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffLpNorm< double, long int> :: ResultType& result );
 
+#ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< long double, long int > >
                                    ( const tnlParallelReductionDiffLpNorm< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffLpNorm< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffLpNorm< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffLpNorm< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffLpNorm< long double, long int> :: ResultType& result );
-
+#endif
+#endif
 
 #endif /* TEMPLATE_EXPLICIT_INSTANTIATION */
 
diff --git a/src/core/cuda/cuda-reduction_impl.h b/src/core/cuda/cuda-reduction_impl.h
index 368a75a978f04cb3008e1a76a750c099744cbc60..389d166ca9a11945cab2e2dfafcbfc78106e6acf 100644
--- a/src/core/cuda/cuda-reduction_impl.h
+++ b/src/core/cuda/cuda-reduction_impl.h
@@ -18,6 +18,8 @@
 #ifndef CUDA_REDUCTION_IMPL_H_
 #define CUDA_REDUCTION_IMPL_H_
 
+//#define CUDA_REDUCTION_PROFILING
+
 #ifdef HAVE_CUDA
 #include <cuda.h>
 #endif
@@ -25,328 +27,109 @@
 #include <core/tnlAssert.h>
 #include <core/cuda/reduction-operations.h>
 #include <core/arrays/tnlArrayOperations.h>
+#include <core/mfuncs.h>
+#include <core/cuda/tnlCudaReductionBuffer.h>
+#include <core/cuda/tnlCudaReduction.h>
 
-using namespace std;
+#ifdef CUDA_REDUCTION_PROFILING
+#include <core/tnlTimerRT.h>
+#endif
 
+using namespace std;
 
 /****
- * This constant says that arrays smaller than its value
- * are going to be reduced on CPU.
+ * Arrays smaller than the following constant
+ * are reduced on CPU. The constant must not be larger
+ * than maximal CUDA grid size.
  */
-const int maxGPUReductionDataSize = 256;
-
-#ifdef HAVE_CUDA
+const int minGPUReductionDataSize = 128;//65536; //16384;//1024;//256;
 
+//static tnlCudaReductionBuffer cudaReductionBuffer( 8 * minGPUReductionDataSize );
 
-/***
- * For each thread in block with thread ID smaller then s this function reduces
- * data elements with indecis tid and tid + s. Here we assume that for each
- * tid the tid + s element also exists i.e. we have even number of elements.
- */
-template< typename Operation >
-__device__ void reduceAligned( const Operation& operation,
-                               typename Operation :: IndexType tid,
-                               typename Operation :: IndexType  s,
-                               typename Operation :: ResultType* sdata )
-{
-   if( tid < s )
-   {
-      sdata[ tid ] = operation. commonReductionOnDevice( tid, tid + s, sdata );
-   }
-}
-
-
-/***
- * For each thread in block with thread ID smaller then s this function reduces
- * data elements with indices tid and tid + s. This is a modified version of
- * the previous algorithm. This one works even for odd number of elements but
- * it is a bit slower.
- */
-template< typename Operation >
-__device__ void reduceNonAligned( const Operation& operation,
-                                  typename Operation :: IndexType tid,
-                                  typename Operation :: IndexType s,
-                                  typename Operation :: IndexType n,
-                                  typename Operation :: ResultType* sdata )
-{
-   if( tid < s )
-   {
-      sdata[ tid ] = operation. commonReductionOnDevice( tid, tid + s, sdata );
-   }
-   /* This is for the case when we have odd number of elements.
-    * The last one will be reduced using the thread with ID 0.
-    */
-   if( s > 32 )
-      __syncthreads();
-   if( 2 * s < n && tid == n - 1 )
-   {
-      sdata[ 0 ] = operation. commonReductionOnDevice( 0, tid, sdata );
-   }
-}
+#ifdef HAVE_CUDA
 
-/***
- * The parallel reduction of one vector.
- *
- * WARNING: This kernel only reduce data in one block. Use rather tnlCUDASimpleReduction2
- *          to call this kernel then doing it by yourself.
- *          This kernel is very inefficient. It is here only for educative and testing reasons.
- *          Please use tnlCUDAReduction instead.
- *
- * The kernel parameters:
- * @param size is the number of all element to reduce - not just in one block.
- * @param deviceInput input data which we want to reduce
- * @param deviceOutput an array to which we write the result of reduction.
- *                     Each block of the grid writes one element in this array
- *                     (i.e. the size of this array equals the number of CUDA blocks).
- */
-template < typename Operation, int blockSize >
+template< typename Operation, int blockSize >
 __global__ void tnlCUDAReductionKernel( const Operation operation,
                                         const typename Operation :: IndexType size,
-                                        const typename Operation :: RealType* deviceInput,
-                                        const typename Operation :: RealType* deviceInput2,
-                                        typename Operation :: ResultType* deviceOutput )
+                                        const typename Operation :: RealType* input1,
+                                        const typename Operation :: RealType* input2,
+                                        typename Operation :: ResultType* output )
 {
-   extern __shared__ __align__ ( 8 ) char __sdata[];
-   
-   typedef typename Operation :: IndexType IndexType;
-   typedef typename Operation :: RealType RealType;
-   typedef typename Operation :: ResultType ResultType;
-
-   ResultType* sdata = reinterpret_cast< ResultType* >( __sdata );
-
-   /***
-    * Get thread id (tid) and global thread id (gid).
-    * lastTId is the last relevant thread id in this block.
-    * gridSize is the number of element processed by all blocks at the
-    * same time.
-    */
-   IndexType tid = threadIdx. x;
-   IndexType gid = 2 * blockIdx. x * blockDim. x + threadIdx. x;
-   IndexType lastTId = size - 2 * blockIdx. x * blockDim. x;
-   IndexType gridSize = 2 * blockDim. x * gridDim.x;
-
-   /***
-    * Read data into the shared memory. We start with the
-    * sequential reduction.
-    */
-   if( gid + blockDim. x < size )
-      sdata[ tid ] = operation. initialValueOnDevice( gid, gid + blockDim. x, deviceInput, deviceInput2 );
-   else if( gid < size )
-      sdata[ tid ] = operation. initialValueOnDevice( gid, deviceInput, deviceInput2 );
-
-   gid += gridSize;
-   while( gid + blockDim. x < size )
-   {
-      sdata[ tid ] = operation. firstReductionOnDevice( tid, gid, gid + blockDim. x, sdata, deviceInput, deviceInput2 );
-      gid += gridSize;
-   }
-   if( gid < size )
-      sdata[ tid ] = operation. firstReductionOnDevice( tid, gid, sdata, deviceInput, deviceInput2 );
-   __syncthreads();
-
-   unsigned int n = lastTId < blockDim. x ? lastTId : blockDim. x;
-
-   /***
-    *  Perform the parallel reduction.
-    *  We reduce the data with step s which is one half of the elements to reduce.
-    *  Each thread with ID < s reduce elements tid and tid + s. The result is stored
-    *  in shared memory in sdata 0 .. s. We set s = s / 2 ( i.e. s >>= 1) and repeat
-    *  the algorithm again until s = 1.
-    *  We also separate the case when the blockDim. x is power of 2 and the algorithm
-    *  can be written in more efficient way without some conditions.
-    */
-   if( n == 128 || n ==  64 || n ==  32 || n ==  16 ||
-       n ==   8 || n ==   4 || n ==   2 || n == 256 ||
-       n == 512 )
-   {
-      if( blockSize >= 512 )
-      {
-         if( tid < 256 )
-            reduceAligned( operation, tid, 256, sdata );
-         __syncthreads();
-      }
-      if( blockSize >= 256 )
-      {
-         if( tid < 128 )
-            reduceAligned( operation, tid, 128, sdata );
-         __syncthreads();
-      }
-      if( blockSize >= 128 )
-      {
-         if( tid <  64 )
-            reduceAligned( operation, tid, 64, sdata );
-         __syncthreads();
-      }
-
-      /***
-       * This runs in one warp so it is synchronised implicitly.
-       */
-      if (tid < 32)
-      {
-         if( blockSize >= 64 )
-            reduceAligned( operation, tid, 32, sdata );
-         if( blockSize >= 32 )
-            reduceAligned( operation, tid, 16, sdata );
-         if( blockSize >= 16 )
-            reduceAligned( operation, tid,  8, sdata );
-         if( blockSize >=  8 )
-            reduceAligned( operation, tid,  4, sdata );
-         if( blockSize >=  4 )
-            reduceAligned( operation, tid,  2, sdata );
-         if( blockSize >=  2 )
-            reduceAligned( operation, tid,  1, sdata );
-      }
-   }
-   else
-   {
-      unsigned int s;
-      if( n >= 512 )
-      {
-         s = n / 2;
-         reduceNonAligned( operation, tid, s, n, sdata );
-         n = s;
-         __syncthreads();
-      }
-      if( n >= 256 )
-      {
-         s = n / 2;
-         reduceNonAligned( operation, tid, s, n, sdata );
-         n = s;
-         __syncthreads();
-      }
-      if( n >= 128 )
-      {
-         s = n / 2;
-         reduceNonAligned( operation, tid, s, n, sdata );
-         n = s;
-         __syncthreads();
-      }
-      if( n >= 64 )
-      {
-         s = n / 2;
-         reduceNonAligned( operation, tid, s, n, sdata );
-         n = s;
-         __syncthreads();
-      }
-      if( n >= 32 )
-      {
-         s = n / 2;
-         reduceNonAligned( operation, tid, s, n, sdata );
-         n = s;
-         __syncthreads();
-      }
-      /***
-       * This runs in one warp so it is synchronised implicitly.
-       */
-      if( n >= 16 )
-      {
-         s = n / 2;
-         reduceNonAligned( operation, tid, s, n, sdata );
-         n = s;
-      }
-      if( n >= 8 )
-      {
-         s = n / 2;
-         reduceNonAligned( operation, tid, s, n, sdata );
-         n = s;
-      }
-      if( n >= 4 )
-      {
-         s = n / 2;
-         reduceNonAligned( operation, tid, s, n, sdata );
-         n = s;
-      }
-      if( n >= 2 )
-      {
-         s = n / 2;
-         reduceNonAligned( operation, tid, s, n, sdata );
-         n = s;
-      }
-   }
-
-   /***
-    * Store the result back in the global memory.
-    */
-   if( tid == 0 )
-      deviceOutput[ blockIdx. x ] = sdata[ 0 ];
-}
+   typedef tnlCUDAReduction< Operation, blockSize > Reduction;
+   Reduction::reduce( operation, size, input1, input2, output );
+}
 
 template< typename Operation >
-typename Operation :: IndexType reduceOnCudaDevice( const Operation& operation,
-                                                    const typename Operation :: IndexType size,
-                                                    const typename Operation :: RealType* input1,
-                                                    const typename Operation :: RealType* input2,
-                                                    typename Operation :: ResultType*& output)
+typename Operation::IndexType reduceOnCudaDevice( const Operation& operation,
+                                                  const typename Operation::IndexType size,
+                                                  const typename Operation::RealType* input1,
+                                                  const typename Operation::RealType* input2,
+                                                  typename Operation::ResultType*& output)
 {
-   typedef typename Operation :: IndexType IndexType;
-   typedef typename Operation :: RealType RealType;
-   typedef typename Operation :: ResultType ResultType;
-
-   const IndexType desBlockSize( 512 );
-   const IndexType desGridSize( 2048 );
-   dim3 blockSize( 0 ), gridSize( 0 );
-
-   /***
-    * Compute the CUDA block size aligned to the power of two.
-    */
-   blockSize. x = :: Min( size, desBlockSize );
-   IndexType alignedBlockSize = 1;
-   while( alignedBlockSize < blockSize. x ) alignedBlockSize <<= 1;
-   blockSize. x = alignedBlockSize;
-
-   gridSize. x = Min( ( IndexType ) ( size / blockSize. x + 1 ) / 2, desGridSize );
-
-   if( ! output &&
-       ! tnlArrayOperations< tnlCuda >::allocateMemory( output, :: Max( ( IndexType ) 1, size / desBlockSize ) ) )
-         return false;
-
-   IndexType shmem = blockSize. x * sizeof( ResultType );
+   typedef typename Operation::IndexType IndexType;
+   typedef typename Operation::RealType RealType;
+   typedef typename Operation::ResultType ResultType;
+   
+   const IndexType desGridSize( minGPUReductionDataSize );   
+   dim3 blockSize( 256 ), gridSize( 0 );   
+   gridSize.x = Min( tnlCuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
+  
+   // create reference to the reduction buffer singleton and set default size
+   tnlCudaReductionBuffer & cudaReductionBuffer = tnlCudaReductionBuffer::getInstance( 8 * minGPUReductionDataSize );
+   
+   //tnlCudaReductionBuffer cudaReductionBuffer( 8 * minGPUReductionDataSize );
+   if( ! cudaReductionBuffer.setSize( gridSize.x * sizeof( ResultType ) ) )
+      return false;
+   output = cudaReductionBuffer.template getData< ResultType >();      
+   IndexType shmem = blockSize.x * sizeof( ResultType );
+   
    /***
     * Depending on the blockSize we generate appropriate template instance.
     */
-      switch( blockSize. x )
-      {
-         case 512:
-            tnlCUDAReductionKernel< Operation, 512 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 256:
-            tnlCUDAReductionKernel< Operation, 256 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 128:
-            tnlCUDAReductionKernel< Operation, 128 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case  64:
-            tnlCUDAReductionKernel< Operation,  64 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case  32:
-            tnlCUDAReductionKernel< Operation,  32 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case  16:
-            tnlCUDAReductionKernel< Operation,  16 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case   8:
-            tnlCUDAReductionKernel< Operation,   8 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case   4:
-            tnlCUDAReductionKernel< Operation,   4 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case   2:
-            tnlCUDAReductionKernel< Operation,   2 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case   1:
-            tnlAssert( false, cerr << "blockSize should not be 1." << endl );
-         default:
-            tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
-      }
+   switch( blockSize.x )         
+   {
+      case 512:
+         tnlCUDAReductionKernel< Operation, 512 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 256:
+         tnlCUDAReductionKernel< Operation, 256 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 128:
+         tnlCUDAReductionKernel< Operation, 128 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case  64:
+         tnlCUDAReductionKernel< Operation,  64 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case  32:
+         tnlCUDAReductionKernel< Operation,  32 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case  16:
+         tnlCUDAReductionKernel< Operation,  16 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case   8:
+         tnlCUDAReductionKernel< Operation,   8 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case   4:
+         tnlCUDAReductionKernel< Operation,   4 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case   2:
+         tnlCUDAReductionKernel< Operation,   2 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case   1:
+         tnlAssert( false, cerr << "blockSize should not be 1." << endl );
+      default:
+         tnlAssert( false, cerr << "Block size is " << blockSize.x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
+   }
+   //checkCudaDevice;
    return gridSize. x;
 }
 #endif
@@ -360,75 +143,83 @@ bool reductionOnCudaDevice( const Operation& operation,
 {
 #ifdef HAVE_CUDA
 
-   typedef typename Operation :: IndexType IndexType;
-   typedef typename Operation :: RealType RealType;
-   typedef typename Operation :: ResultType ResultType;
-   typedef typename Operation :: LaterReductionOperation LaterReductionOperation;
-
+   typedef typename Operation::IndexType IndexType;
+   typedef typename Operation::RealType RealType;
+   typedef typename Operation::ResultType ResultType;
+   typedef typename Operation::LaterReductionOperation LaterReductionOperation;
+   
    /***
     * First check if the input array(s) is/are large enough for the reduction on GPU.
     * Otherwise copy it/them to host and reduce on CPU.
-    */
-   RealType hostArray1[ maxGPUReductionDataSize ];
-   RealType hostArray2[ maxGPUReductionDataSize ];
-   if( size <= maxGPUReductionDataSize )
+    */   
+   RealType hostArray1[ minGPUReductionDataSize ];
+   RealType hostArray2[ minGPUReductionDataSize ];
+   if( size <= minGPUReductionDataSize )
    {
       if( ! tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< RealType, RealType, IndexType >( hostArray1, deviceInput1, size ) )
          return false;
       if( deviceInput2 && ! 
           tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< RealType, RealType, IndexType >( hostArray2, deviceInput2, size ) )
          return false;
-      result = operation. initialValueOnHost( 0, hostArray1, hostArray2 );
-      for( IndexType i = 1; i < size; i ++ )
-         result = operation. reduceOnHost( i, result, hostArray1, hostArray2 );
+      result = operation.initialValue();
+      for( IndexType i = 0; i < size; i ++ )
+         result = operation.reduceOnHost( i, result, hostArray1, hostArray2 );
       return true;
    }
 
+   #ifdef CUDA_REDUCTION_PROFILING
+      tnlTimerRT timer;
+      timer.reset();
+      timer.start();
+   #endif   
+
    /****
     * Reduce the data on the CUDA device.
-    */
-   ResultType* deviceAux1( 0 ), *deviceAux2( 0 );
+    */      
+   ResultType* deviceAux1( 0 );
    IndexType reducedSize = reduceOnCudaDevice( operation,
                                                size,
                                                deviceInput1,
                                                deviceInput2,
                                                deviceAux1 );
-
-   LaterReductionOperation laterReductionOperation;
-   while( reducedSize > maxGPUReductionDataSize )
-   {
-      reducedSize = reduceOnCudaDevice( laterReductionOperation,
-                                        reducedSize,
-                                        deviceAux1,
-                                        ( ResultType* ) 0,
-                                        deviceAux2 );
-      Swap( deviceAux1, deviceAux2 );
-   }
+   #ifdef CUDA_REDUCTION_PROFILING
+      timer.stop();
+      cout << "   Reduction on GPU to size " << reducedSize << " took " << timer.getTime() << " sec. " << endl;
+      timer.reset();
+      timer.start();
+   #endif   
 
    /***
     * Transfer the reduced data from device to host.
     */
-   ResultType resultArray[ maxGPUReductionDataSize ];
+   ResultType resultArray[ minGPUReductionDataSize ];
    if( ! tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< ResultType, ResultType, IndexType >( resultArray, deviceAux1, reducedSize ) )
       return false;
-
+   
+   #ifdef CUDA_REDUCTION_PROFILING   
+      timer.stop();
+      cout << "   Transferring data to CPU took " << timer.getTime() << " sec. " << endl;
+   #endif   
+
+   #ifdef CUDA_REDUCTION_PROFILING
+      timer.reset();
+      timer.start();
+   #endif      
+   
    /***
     * Reduce the data on the host system.
-    */
-   //for( IndexType i = 0; i < reducedSize; i ++ )
-   //   cout << resultArray[ i ] << ", ";
-   result = laterReductionOperation. initialValueOnHost( 0, resultArray, ( ResultType* ) 0 );
-   for( IndexType i = 1; i < reducedSize; i ++ )
-      result = laterReductionOperation. reduceOnHost( i, result, resultArray, ( ResultType*) 0 );
-
-   /****
-    * Free the memory allocated on the device.
-    */
-   if( deviceAux1 && ! tnlArrayOperations< tnlCuda >::freeMemory( deviceAux1 ) )
-      return false;
-   if( deviceAux2 && ! tnlArrayOperations< tnlCuda >::freeMemory( deviceAux2 ) )
-      return false;
-   return true;
+    */    
+   LaterReductionOperation laterReductionOperation;
+   result = laterReductionOperation. initialValue();
+   for( IndexType i = 0; i < reducedSize; i ++ )
+      result = laterReductionOperation.reduceOnHost( i, result, resultArray, ( ResultType*) 0 );
+   
+   #ifdef CUDA_REDUCTION_PROFILING
+      timer.stop();
+      cout << "   Reduction of small data set on CPU took " << timer.getTime() << " sec. " << endl;
+   #endif 
+   
+   return checkCudaDevice;
 #else
    tnlCudaSupportMissingMessage;;
    return false;
@@ -469,13 +260,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionSum< double, int
                                      const typename tnlParallelReductionSum< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, int > >
                                    ( const tnlParallelReductionSum< long double, int>& operation,
                                      const typename tnlParallelReductionSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionSum< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionSum< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionSum< char, long int > >
                                    ( const tnlParallelReductionSum< char, long int >& operation,
                                      const typename tnlParallelReductionSum< char, long int > :: IndexType size,
@@ -504,12 +298,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionSum< double, lon
                                      const typename tnlParallelReductionSum< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, long int > >
                                    ( const tnlParallelReductionSum< long double, long int>& operation,
                                      const typename tnlParallelReductionSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionSum< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionSum< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Min
@@ -543,13 +340,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionMin< double, int
                                      const typename tnlParallelReductionMin< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMin< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, int > >
                                    ( const tnlParallelReductionMin< long double, int>& operation,
                                      const typename tnlParallelReductionMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionMin< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionMin< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionMin< char, long int > >
                                    ( const tnlParallelReductionMin< char, long int >& operation,
                                      const typename tnlParallelReductionMin< char, long int > :: IndexType size,
@@ -578,12 +378,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionMin< double, lon
                                      const typename tnlParallelReductionMin< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMin< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, long int > >
                                    ( const tnlParallelReductionMin< long double, long int>& operation,
                                      const typename tnlParallelReductionMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionMin< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionMin< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Max
@@ -617,13 +420,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionMax< double, int
                                      const typename tnlParallelReductionMax< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMax< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, int > >
                                    ( const tnlParallelReductionMax< long double, int>& operation,
                                      const typename tnlParallelReductionMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMax< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionMax< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionMax< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionMax< char, long int > >
                                    ( const tnlParallelReductionMax< char, long int >& operation,
                                      const typename tnlParallelReductionMax< char, long int > :: IndexType size,
@@ -652,12 +458,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionMax< double, lon
                                      const typename tnlParallelReductionMax< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMax< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, long int > >
                                    ( const tnlParallelReductionMax< long double, long int>& operation,
                                      const typename tnlParallelReductionMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMax< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionMax< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionMax< long double, long int> :: ResultType& result );
+#endif
+#endif
+
 
 /****
  * Abs sum
@@ -691,13 +501,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double,
                                      const typename tnlParallelReductionAbsSum< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, int > >
                                    ( const tnlParallelReductionAbsSum< long double, int>& operation,
                                      const typename tnlParallelReductionAbsSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsSum< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionAbsSum< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionAbsSum< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, long int > >
                                    ( const tnlParallelReductionAbsSum< char, long int >& operation,
                                      const typename tnlParallelReductionAbsSum< char, long int > :: IndexType size,
@@ -726,12 +539,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double,
                                      const typename tnlParallelReductionAbsSum< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, long int > >
                                    ( const tnlParallelReductionAbsSum< long double, long int>& operation,
                                      const typename tnlParallelReductionAbsSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsSum< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionAbsSum< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionAbsSum< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Abs min
@@ -765,13 +581,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double,
                                      const typename tnlParallelReductionAbsMin< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, int > >
                                    ( const tnlParallelReductionAbsMin< long double, int>& operation,
                                      const typename tnlParallelReductionAbsMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionAbsMin< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionAbsMin< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, long int > >
                                    ( const tnlParallelReductionAbsMin< char, long int >& operation,
                                      const typename tnlParallelReductionAbsMin< char, long int > :: IndexType size,
@@ -800,12 +619,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double,
                                      const typename tnlParallelReductionAbsMin< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, long int > >
                                    ( const tnlParallelReductionAbsMin< long double, long int>& operation,
                                      const typename tnlParallelReductionAbsMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionAbsMin< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionAbsMin< long double, long int> :: ResultType& result );
+#endif
+#endif
+
 /****
  * Abs max
  */
@@ -838,13 +661,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double,
                                      const typename tnlParallelReductionAbsMax< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, int > >
                                    ( const tnlParallelReductionAbsMax< long double, int>& operation,
                                      const typename tnlParallelReductionAbsMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionAbsMax< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionAbsMax< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, long int > >
                                    ( const tnlParallelReductionAbsMax< char, long int >& operation,
                                      const typename tnlParallelReductionAbsMax< char, long int > :: IndexType size,
@@ -873,12 +699,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double,
                                      const typename tnlParallelReductionAbsMax< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, long int > >
                                    ( const tnlParallelReductionAbsMax< long double, long int>& operation,
                                      const typename tnlParallelReductionAbsMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionAbsMax< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionAbsMax< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Logical AND
@@ -911,13 +740,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< doub
                                      const typename tnlParallelReductionLogicalAnd< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalAnd< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, int > >
                                    ( const tnlParallelReductionLogicalAnd< long double, int>& operation,
                                      const typename tnlParallelReductionLogicalAnd< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalAnd< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalAnd< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionLogicalAnd< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionLogicalAnd< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< char, long int > >
                                    ( const tnlParallelReductionLogicalAnd< char, long int >& operation,
                                      const typename tnlParallelReductionLogicalAnd< char, long int > :: IndexType size,
@@ -946,12 +778,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< doub
                                      const typename tnlParallelReductionLogicalAnd< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalAnd< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, long int > >
                                    ( const tnlParallelReductionLogicalAnd< long double, long int>& operation,
                                      const typename tnlParallelReductionLogicalAnd< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalAnd< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalAnd< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionLogicalAnd< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionLogicalAnd< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Logical OR
@@ -984,13 +819,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< doubl
                                      const typename tnlParallelReductionLogicalOr< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalOr< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, int > >
                                    ( const tnlParallelReductionLogicalOr< long double, int>& operation,
                                      const typename tnlParallelReductionLogicalOr< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionLogicalOr< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionLogicalOr< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< char, long int > >
                                    ( const tnlParallelReductionLogicalOr< char, long int >& operation,
                                      const typename tnlParallelReductionLogicalOr< char, long int > :: IndexType size,
@@ -1019,13 +857,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< doubl
                                      const typename tnlParallelReductionLogicalOr< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalOr< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, long int > >
                                    ( const tnlParallelReductionLogicalOr< long double, long int>& operation,
                                      const typename tnlParallelReductionLogicalOr< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionLogicalOr< long double, long int> :: ResultType& result );*/
-
+                                     typename tnlParallelReductionLogicalOr< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Lp Norm
@@ -1044,13 +884,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double,
                                      const typename tnlParallelReductionLpNorm< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLpNorm< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, int > >
                                    ( const tnlParallelReductionLpNorm< long double, int>& operation,
                                      const typename tnlParallelReductionLpNorm< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionLpNorm< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionLpNorm< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< char, long int > >
                                    ( const tnlParallelReductionLpNorm< char, long int >& operation,
                                      const typename tnlParallelReductionLpNorm< char, long int > :: IndexType size,
@@ -1079,13 +922,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double,
                                      const typename tnlParallelReductionLpNorm< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLpNorm< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, long int > >
                                    ( const tnlParallelReductionLpNorm< long double, long int>& operation,
                                      const typename tnlParallelReductionLpNorm< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionLpNorm< long double, long int> :: ResultType& result );*/
-
+                                     typename tnlParallelReductionLpNorm< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Equalities
@@ -1118,13 +963,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< doub
                                      const typename tnlParallelReductionEqualities< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionEqualities< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, int > >
                                    ( const tnlParallelReductionEqualities< long double, int>& operation,
                                      const typename tnlParallelReductionEqualities< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionEqualities< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionEqualities< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< char, long int > >
                                    ( const tnlParallelReductionEqualities< char, long int >& operation,
                                      const typename tnlParallelReductionEqualities< char, long int > :: IndexType size,
@@ -1153,13 +1001,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< doub
                                      const typename tnlParallelReductionEqualities< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionEqualities< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, long int > >
                                    ( const tnlParallelReductionEqualities< long double, long int>& operation,
                                      const typename tnlParallelReductionEqualities< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionEqualities< long double, long int> :: ResultType& result );*/
-
+                                     typename tnlParallelReductionEqualities< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Inequalities
@@ -1192,13 +1042,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< do
                                      const typename tnlParallelReductionInequalities< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionInequalities< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, int > >
                                    ( const tnlParallelReductionInequalities< long double, int>& operation,
                                      const typename tnlParallelReductionInequalities< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionInequalities< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionInequalities< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< char, long int > >
                                    ( const tnlParallelReductionInequalities< char, long int >& operation,
                                      const typename tnlParallelReductionInequalities< char, long int > :: IndexType size,
@@ -1227,13 +1080,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< do
                                      const typename tnlParallelReductionInequalities< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionInequalities< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, long int > >
                                    ( const tnlParallelReductionInequalities< long double, long int>& operation,
                                      const typename tnlParallelReductionInequalities< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionInequalities< long double, long int> :: ResultType& result );*/
-
+                                     typename tnlParallelReductionInequalities< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * ScalarProduct
@@ -1266,13 +1121,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< d
                                      const typename tnlParallelReductionScalarProduct< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionScalarProduct< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< long double, int > >
                                    ( const tnlParallelReductionScalarProduct< long double, int>& operation,
                                      const typename tnlParallelReductionScalarProduct< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionScalarProduct< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionScalarProduct< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionScalarProduct< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionScalarProduct< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< char, long int > >
                                    ( const tnlParallelReductionScalarProduct< char, long int >& operation,
                                      const typename tnlParallelReductionScalarProduct< char, long int > :: IndexType size,
@@ -1301,12 +1159,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< d
                                      const typename tnlParallelReductionScalarProduct< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionScalarProduct< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< long double, long int > >
                                    ( const tnlParallelReductionScalarProduct< long double, long int>& operation,
                                      const typename tnlParallelReductionScalarProduct< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionScalarProduct< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionScalarProduct< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionScalarProduct< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionScalarProduct< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Diff sum
@@ -1340,13 +1201,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< double,
                                      const typename tnlParallelReductionDiffSum< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffSum< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< long double, int > >
                                    ( const tnlParallelReductionDiffSum< long double, int>& operation,
                                      const typename tnlParallelReductionDiffSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffSum< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffSum< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionDiffSum< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< char, long int > >
                                    ( const tnlParallelReductionDiffSum< char, long int >& operation,
                                      const typename tnlParallelReductionDiffSum< char, long int > :: IndexType size,
@@ -1375,12 +1239,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< double,
                                      const typename tnlParallelReductionDiffSum< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffSum< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< long double, long int > >
                                    ( const tnlParallelReductionDiffSum< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffSum< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffSum< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionDiffSum< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Diff min
@@ -1414,13 +1281,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< double,
                                      const typename tnlParallelReductionDiffMin< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMin< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< long double, int > >
                                    ( const tnlParallelReductionDiffMin< long double, int>& operation,
                                      const typename tnlParallelReductionDiffMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMin< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffMin< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionDiffMin< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< char, long int > >
                                    ( const tnlParallelReductionDiffMin< char, long int >& operation,
                                      const typename tnlParallelReductionDiffMin< char, long int > :: IndexType size,
@@ -1449,12 +1319,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< double,
                                      const typename tnlParallelReductionDiffMin< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMin< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< long double, long int > >
                                    ( const tnlParallelReductionDiffMin< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMin< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffMin< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionDiffMin< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Diff max
@@ -1488,13 +1361,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< double,
                                      const typename tnlParallelReductionDiffMax< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMax< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< long double, int > >
                                    ( const tnlParallelReductionDiffMax< long double, int>& operation,
                                      const typename tnlParallelReductionDiffMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMax< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffMax< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionDiffMax< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< char, long int > >
                                    ( const tnlParallelReductionDiffMax< char, long int >& operation,
                                      const typename tnlParallelReductionDiffMax< char, long int > :: IndexType size,
@@ -1523,12 +1399,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< double,
                                      const typename tnlParallelReductionDiffMax< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMax< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< long double, long int > >
                                    ( const tnlParallelReductionDiffMax< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMax< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffMax< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionDiffMax< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Diff abs sum
@@ -1562,13 +1441,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< doub
                                      const typename tnlParallelReductionDiffAbsSum< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsSum< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< long double, int > >
                                    ( const tnlParallelReductionDiffAbsSum< long double, int>& operation,
                                      const typename tnlParallelReductionDiffAbsSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsSum< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsSum< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionDiffAbsSum< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< char, long int > >
                                    ( const tnlParallelReductionDiffAbsSum< char, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsSum< char, long int > :: IndexType size,
@@ -1597,12 +1479,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< doub
                                      const typename tnlParallelReductionDiffAbsSum< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsSum< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< long double, long int > >
                                    ( const tnlParallelReductionDiffAbsSum< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffAbsSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsSum< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsSum< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionDiffAbsSum< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 /****
  * Diff abs min
@@ -1636,13 +1521,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< doub
                                      const typename tnlParallelReductionDiffAbsMin< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMin< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< long double, int > >
                                    ( const tnlParallelReductionDiffAbsMin< long double, int>& operation,
                                      const typename tnlParallelReductionDiffAbsMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMin< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMin< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionDiffAbsMin< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< char, long int > >
                                    ( const tnlParallelReductionDiffAbsMin< char, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsMin< char, long int > :: IndexType size,
@@ -1671,12 +1559,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< doub
                                      const typename tnlParallelReductionDiffAbsMin< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMin< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< long double, long int > >
                                    ( const tnlParallelReductionDiffAbsMin< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffAbsMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMin< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMin< long double, long int> :: ResultType& result );*/
+                                     typename tnlParallelReductionDiffAbsMin< long double, long int> :: ResultType& result );
+#endif
+#endif
+
 /****
  * Diff abs max
  */
@@ -1709,13 +1601,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< doub
                                      const typename tnlParallelReductionDiffAbsMax< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMax< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< long double, int > >
                                    ( const tnlParallelReductionDiffAbsMax< long double, int>& operation,
                                      const typename tnlParallelReductionDiffAbsMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMax< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMax< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionDiffAbsMax< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< char, long int > >
                                    ( const tnlParallelReductionDiffAbsMax< char, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsMax< char, long int > :: IndexType size,
@@ -1744,13 +1639,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< doub
                                      const typename tnlParallelReductionDiffAbsMax< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMax< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< long double, long int > >
                                    ( const tnlParallelReductionDiffAbsMax< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffAbsMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMax< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffAbsMax< long double, long int> :: ResultType& result );*/
-
+                                     typename tnlParallelReductionDiffAbsMax< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 
 /****
@@ -1770,13 +1667,16 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< doub
                                      const typename tnlParallelReductionDiffLpNorm< double, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffLpNorm< double, int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< long double, int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< long double, int > >
                                    ( const tnlParallelReductionDiffLpNorm< long double, int>& operation,
                                      const typename tnlParallelReductionDiffLpNorm< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffLpNorm< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffLpNorm< long double, int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffLpNorm< long double, int> :: ResultType& result );*/
+                                     typename tnlParallelReductionDiffLpNorm< long double, int> :: ResultType& result );
+#endif
 
+#ifdef INSTANTIATE_LONG_INT
 extern template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< char, long int > >
                                    ( const tnlParallelReductionDiffLpNorm< char, long int >& operation,
                                      const typename tnlParallelReductionDiffLpNorm< char, long int > :: IndexType size,
@@ -1805,15 +1705,15 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< doub
                                      const typename tnlParallelReductionDiffLpNorm< double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffLpNorm< double, long int> :: ResultType& result );
 
-/*extern template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< long double, long int > >
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< long double, long int > >
                                    ( const tnlParallelReductionDiffLpNorm< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffLpNorm< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffLpNorm< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffLpNorm< long double, long int > :: RealType* deviceInput2,
-                                     typename tnlParallelReductionDiffLpNorm< long double, long int> :: ResultType& result );*/
-
-
-
+                                     typename tnlParallelReductionDiffLpNorm< long double, long int> :: ResultType& result );
+#endif
+#endif
 
 #endif /* TEMPLATE_EXPLICIT_INSTANTIATION */
 
diff --git a/src/core/cuda/reduction-operations.h b/src/core/cuda/reduction-operations.h
index f5dad03987b6a010f340ad548bf732b8be2de6c2..93de5076d8fad0fd24c41c60d5f829cadd5b27c9 100644
--- a/src/core/cuda/reduction-operations.h
+++ b/src/core/cuda/reduction-operations.h
@@ -18,6 +18,9 @@
 #ifndef REDUCTION_OPERATIONS_H_
 #define REDUCTION_OPERATIONS_H_
 
+#include <core/tnlConstants.h>
+#include <core/tnlCuda.h>
+
 #ifdef HAVE_CUDA
 #include <cuda.h>
 #include <core/mfuncs.h>
@@ -50,6 +53,31 @@ __device__ inline  double tnlCudaMin( const double& a,
    return fmin( a, b );
 }
 
+template< class T > __device__ T tnlCudaMin( volatile const T& a,
+                                             volatile const T& b )
+{
+   return a < b ? a : b;
+}
+
+__device__ inline int tnlCudaMin( volatile const int& a,
+                                  volatile const int& b )
+{
+   return min( a, b );
+}
+
+__device__ inline  float tnlCudaMin( volatile const float& a,
+                                     volatile const float& b )
+{
+   return fminf( a, b );
+}
+
+__device__ inline  double tnlCudaMin( volatile const double& a,
+                                      volatile const double& b )
+{
+   return fmin( a, b );
+}
+
+
 /***
  * This function returns maximum of two numbers stored on the device.
  */
@@ -77,6 +105,30 @@ __device__  inline double tnlCudaMax( const double& a,
    return fmax( a, b );
 }
 
+template< class T > __device__ T tnlCudaMax( volatile const T& a,
+                                             volatile const T& b )
+{
+   return a > b ? a : b;
+}
+
+__device__  inline int tnlCudaMax( volatile const int& a,
+                                   volatile const int& b )
+{
+   return max( a, b );
+}
+
+__device__  inline float tnlCudaMax( volatile const float& a,
+                                     volatile const float& b )
+{
+   return fmaxf( a, b );
+}
+
+__device__  inline double tnlCudaMax( volatile const double& a,
+                                      volatile const double& b )
+{
+   return fmax( a, b );
+}
+
 /***
  * This function returns absolute value of given number on the device.
  */
@@ -105,6 +157,32 @@ __device__  inline long double tnlCudaAbs( const long double& a )
    return fabs( ( double ) a );
 }
 
+__device__  inline int tnlCudaAbs( volatile const int& a )
+{
+   return abs( a );
+}
+
+__device__  inline long int tnlCudaAbs( volatile const long int& a )
+{
+   return abs( a );
+}
+
+__device__  inline float tnlCudaAbs( volatile const float& a )
+{
+   return fabs( a );
+}
+
+__device__  inline double tnlCudaAbs( volatile const double& a )
+{
+   return fabs( a );
+}
+
+__device__  inline long double tnlCudaAbs( volatile const long double& a )
+{
+   return fabs( ( double ) a );
+}
+
+
 template< typename Type1, typename Type2 >
 __device__ Type1 tnlCudaPow( const Type1& x, const Type2& power )
 {
@@ -122,13 +200,6 @@ class tnlParallelReductionSum
    typedef Real ResultType;
    typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return data1[ idx ];
-   };
-
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
@@ -136,66 +207,31 @@ class tnlParallelReductionSum
    {
       return current + data1[ idx ];
    };
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ] + data1[ idx2 ];
-   };
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
+   
+   __cuda_callable__ ResultType initialValue() const { return 0; };
+   
+   __cuda_callable__ void cudaFirstReduction( ResultType& result, 
+                                              const IndexType index,
+                                              const RealType* data1,
+                                              const RealType* data2 ) const
    {
-      return data1[ idx1 ];
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const IndexType idx3,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return data1[ idx1 ] + data2[ idx2 ] + data2[ idx3 ];
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return data1[ idx1 ] + data2[ idx2 ];
-   };
-
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  const ResultType* data ) const
-   {
-      return data[ idx1 ] + data[ idx2 ];
-   };
+      result += data1[ index ];
+   }
+   
+#ifdef HAVE_CUDA
 
-   __device__ ResultType commonReductionOnDevice( const ResultType& a,
-                                                  const ResultType& b ) const
+   __device__ void commonReductionOnDevice( ResultType& result,
+                                            const ResultType& data ) const
    {
-      return a + b;
+      result += data;
    };
-
-
-   __device__ RealType identity() const
+   
+   __device__ void commonReductionOnDevice( volatile ResultType& result,
+                                            volatile const ResultType& data ) const
    {
-      return 0;
+      result += data;
    };
 
-   __device__ void performInPlace( ResultType& a,
-                                   const ResultType& b ) const
-   {
-      a += b;
-   }
-
 #endif
 };
 
@@ -209,13 +245,6 @@ class tnlParallelReductionMin
    typedef Real ResultType;
    typedef tnlParallelReductionMin< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return data1[ idx ];
-   };
-
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
@@ -224,47 +253,30 @@ class tnlParallelReductionMin
       return Min( current, data1[ idx ] );
    };
 
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
+   __cuda_callable__ ResultType initialValue() const { return tnlMaxValue< ResultType>(); };
+   
+   __cuda_callable__ void cudaFirstReduction( ResultType& result, 
+                                              const IndexType index,
+                                              const RealType* data1,
+                                              const RealType* data2 ) const
    {
-      return tnlCudaMin( data1[ idx1 ], data1[ idx2 ] );
+      result = tnlCudaMin( result, data1[ index ] );
    }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ];
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const IndexType idx3,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
+   
+#ifdef HAVE_CUDA   
+   __device__ void commonReductionOnDevice( ResultType& result,
+                                            const ResultType& data ) const
    {
-      return tnlCudaMin( data1[ idx1 ], tnlCudaMin(  data2[ idx2 ],  data2[ idx3 ] ) );
+      result = tnlCudaMin( result, data );
    };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
+   
+   __device__ void commonReductionOnDevice( volatile ResultType& result,
+                                            volatile const ResultType& data ) const
    {
-      return tnlCudaMin( data1[ idx1 ], data2[ idx2 ] );
+      result = tnlCudaMin( result, data );
    };
+   
 
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  const ResultType* data ) const
-   {
-      return tnlCudaMin( data[ idx1 ], data[ idx2 ] );
-   };
 #endif
 };
 
@@ -278,13 +290,6 @@ class tnlParallelReductionMax
    typedef Real ResultType;
    typedef tnlParallelReductionMax< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return data1[ idx ];
-   };
-
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
@@ -293,398 +298,212 @@ class tnlParallelReductionMax
       return Max( current, data1[ idx ] );
    };
 
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ], data1[ idx2 ] );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ];
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const IndexType idx3,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
+   __cuda_callable__ ResultType initialValue() const { return tnlMinValue< ResultType>(); };
+   
+   __cuda_callable__ void cudaFirstReduction( ResultType& result, 
+                                              const IndexType index,
+                                              const RealType* data1,
+                                              const RealType* data2 ) const
    {
-      return tnlCudaMax( data1[ idx1 ], tnlCudaMax( data2[ idx2 ], data2[ idx3 ] ) );
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
+      result = tnlCudaMax( result, data1[ index ] );
+   }   
+   
+#ifdef HAVE_CUDA   
+   __device__ void commonReductionOnDevice( ResultType& result,
+                                            const ResultType& data ) const
    {
-      return tnlCudaMax( data1[ idx1 ], data2[ idx2 ] );
+      result = tnlCudaMax( result, data );
    };
 
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  const ResultType* data ) const
+   __device__ void commonReductionOnDevice( volatile ResultType& result,
+                                            volatile const ResultType& data ) const
    {
-      return tnlCudaMax( data[ idx1 ], data[ idx2 ] );
-   };
+      result = tnlCudaMax( result, data );
+   };   
 #endif
 };
 
 template< typename Real, typename Index >
-class tnlParallelReductionAbsSum
+class tnlParallelReductionLogicalAnd
 {
    public:
 
    typedef Real RealType;
    typedef Index IndexType;
    typedef Real ResultType;
-   typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
-
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return tnlAbs( data1[ idx ] );
-   };
+   typedef tnlParallelReductionLogicalAnd< Real, Index > LaterReductionOperation;
 
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
                             const RealType* data2 ) const
    {
-      return current + tnlAbs( data1[ idx ] );
-   };
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaAbs( data1[ idx1 ] ) + tnlCudaAbs( data1[ idx2 ] );
+      return current && data1[ idx ];
    };
 
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
+   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) true; };
+   
+   __cuda_callable__ void cudaFirstReduction( ResultType& result, 
+                                              const IndexType index,
+                                              const RealType* data1,
+                                              const RealType* data2 ) const
    {
-      return tnlCudaAbs( data1[ idx1 ] );
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const IndexType idx3,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
+      result = result && data1[ index ];
+   }
+   
+   
+#ifdef HAVE_CUDA   
+   __device__ void commonReductionOnDevice( ResultType& result,
+                                            const ResultType& data ) const
    {
-      return data1[ idx1 ] + tnlCudaAbs( data2[ idx2 ] ) + tnlCudaAbs( data2[ idx3 ] );
+      result = result && data;
    };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
+   
+   __device__ void commonReductionOnDevice( volatile ResultType& result,
+                                            volatile const ResultType& data ) const
    {
-      return data1[ idx1 ] + tnlCudaAbs( data2[ idx2 ] );
+      result = result && data;
    };
+   
 
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  const ResultType* data ) const
-   {
-      return data[ idx1 ] + data[ idx2 ];
-   };
 #endif
 };
 
+
 template< typename Real, typename Index >
-class tnlParallelReductionAbsMin
+class tnlParallelReductionLogicalOr
 {
    public:
 
    typedef Real RealType;
    typedef Index IndexType;
    typedef Real ResultType;
-   typedef tnlParallelReductionMin< Real, Index > LaterReductionOperation;
-
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return tnlAbs( data1[ idx ] );
-   };
+   typedef tnlParallelReductionLogicalOr< Real, Index > LaterReductionOperation;
 
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
                             const RealType* data2 ) const
    {
-      return Min( current, tnlAbs( data1[ idx ] ) );
+      return current || data1[ idx ];
    };
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
+   
+   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) false; };
+   
+   __cuda_callable__ void cudaFirstReduction( ResultType& result, 
+                                              const IndexType index,
+                                              const RealType* data1,
+                                              const RealType* data2 ) const
    {
-      return tnlCudaMin( tnlCudaAbs( data1[ idx1 ] ), tnlCudaAbs( data1[ idx2 ] ) );
+      result = result || data1[ index ];
    }
 
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaAbs( data1[ idx1 ] );
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const IndexType idx3,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ], tnlCudaMin(  tnlCudaAbs( data2[ idx2 ] ),  tnlCudaAbs( data2[ idx3 ] ) ) );
-   };
 
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
+#ifdef HAVE_CUDA   
+   __device__ void commonReductionOnDevice( ResultType& result,
+                                            const ResultType& data ) const
    {
-      return tnlCudaMin( data1[ idx1 ], tnlCudaAbs( data2[ idx2 ] ) );
+      result = result || data;
    };
-
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  const ResultType* data ) const
+   
+   __device__ void commonReductionOnDevice( volatile ResultType& result,
+                                            volatile const ResultType& data ) const
    {
-      return tnlCudaMin( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
+      result = result || data;
    };
 #endif
 };
 
 template< typename Real, typename Index >
-class tnlParallelReductionAbsMax
+class tnlParallelReductionAbsSum : public tnlParallelReductionSum< Real, Index >
 {
    public:
 
    typedef Real RealType;
    typedef Index IndexType;
    typedef Real ResultType;
-   typedef tnlParallelReductionMax< Real, Index > LaterReductionOperation;
-
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return tnlAbs( data1[ idx ] );
-   };
+   typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
 
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
                             const RealType* data2 ) const
    {
-      return Max( current, tnlAbs( data1[ idx ] ) );
-   };
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaMax( tnlCudaAbs( data1[ idx1 ] ), tnlCudaAbs( data1[ idx2 ] ) );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaAbs( data1[ idx1 ] );
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const IndexType idx3,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ], tnlCudaMax( tnlCudaAbs( data2[ idx2 ] ), tnlCudaAbs( data2[ idx3 ] ) ) );
+      return current + tnlAbs( data1[ idx ] );
    };
 
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ], tnlCudaAbs( data2[ idx2 ] ) );
-   };
+   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
 
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  const ResultType* data ) const
+   __cuda_callable__ void cudaFirstReduction( ResultType& result, 
+                                              const IndexType index,
+                                              const RealType* data1,
+                                              const RealType* data2 ) const
    {
-      return tnlCudaMax( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
-   };
-#endif
+      result += tnlCudaAbs( data1[ index ] );
+   }
 };
 
 template< typename Real, typename Index >
-class tnlParallelReductionLogicalAnd
+class tnlParallelReductionAbsMin : public tnlParallelReductionMin< Real, Index >
 {
    public:
 
    typedef Real RealType;
    typedef Index IndexType;
    typedef Real ResultType;
-   typedef tnlParallelReductionLogicalAnd< Real, Index > LaterReductionOperation;
-
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return data1[ idx ];
-   };
+   typedef tnlParallelReductionMin< Real, Index > LaterReductionOperation;
 
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
                             const RealType* data2 ) const
    {
-      return current && data1[ idx ];
-   };
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ] && data1[ idx2 ];
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ];
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const IndexType idx3,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return data1[ idx1 ] && data2[ idx2 ] && data2[ idx3 ];
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return data1[ idx1 ] && data2[ idx2 ];
+      return Min( current, tnlAbs( data1[ idx ] ) );
    };
 
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  const ResultType* data ) const
+   __cuda_callable__ ResultType initialValue() const { return tnlMaxValue< ResultType>(); };
+   
+   __cuda_callable__ void cudaFirstReduction( ResultType& result, 
+                                              const IndexType index,
+                                              const RealType* data1,
+                                              const RealType* data2 ) const
    {
-      return data[ idx1 ] && data[ idx2 ];
-   };
-#endif
+      result = tnlCudaMin( result, tnlCudaAbs( data1[ index ] ) );
+   }   
 };
 
-
 template< typename Real, typename Index >
-class tnlParallelReductionLogicalOr
+class tnlParallelReductionAbsMax : public tnlParallelReductionMax< Real, Index >
 {
    public:
 
    typedef Real RealType;
    typedef Index IndexType;
    typedef Real ResultType;
-   typedef tnlParallelReductionLogicalOr< Real, Index > LaterReductionOperation;
-
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return data1[ idx ];
-   };
+   typedef tnlParallelReductionMax< Real, Index > LaterReductionOperation;
 
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
                             const RealType* data2 ) const
    {
-      return current || data1[ idx ];
-   };
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ] || data1[ idx2 ];
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ];
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const IndexType idx3,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return data1[ idx1 ] || data2[ idx2 ] || data2[ idx3 ];
+      return Max( current, tnlAbs( data1[ idx ] ) );
    };
 
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return data1[ idx1 ] || data2[ idx2 ];
-   };
+   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
 
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  const ResultType* data ) const
+   __cuda_callable__ void cudaFirstReduction( ResultType& result, 
+                                              const IndexType index,
+                                              const RealType* data1,
+                                              const RealType* data2 ) const
    {
-      return data[ idx1 ] || data[ idx2 ];
-   };
-#endif
+      result = tnlCudaMax( result, tnlCudaAbs( data1[ index ] ) );
+   }   
 };
 
+
 template< typename Real, typename Index >
-class tnlParallelReductionLpNorm
+class tnlParallelReductionLpNorm : public tnlParallelReductionSum< Real, Index >
 {
    public:
 
@@ -698,13 +517,6 @@ class tnlParallelReductionLpNorm
       this -> p = p;
    };
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return pow( tnlAbs( data1[ idx ] ), p );
-   };
-
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
@@ -713,58 +525,23 @@ class tnlParallelReductionLpNorm
       return current + pow( tnlAbs( data1[ idx ] ), p );
    };
 
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
+   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
+   
+   __cuda_callable__ void cudaFirstReduction( ResultType& result, 
+                                              const IndexType index,
+                                              const RealType* data1,
+                                              const RealType* data2 ) const
    {
-      return tnlCudaPow( tnlCudaAbs( data1[ idx1 ] ), p ) + tnlCudaPow( tnlCudaAbs( data1[ idx2 ] ), p );
+      result += tnlCudaPow( tnlCudaAbs( data1[ index ] ), p );
    }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaPow( tnlCudaAbs( data1[ idx1 ] ), p );
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const IndexType idx3,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return data1[ idx1 ] +
-             tnlCudaPow( tnlCudaAbs( data2[ idx2 ] ), p ) +
-             tnlCudaPow( tnlCudaAbs( data2[ idx3 ] ), p );
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return data1[ idx1 ] + tnlCudaPow( tnlCudaAbs( data2[ idx2 ] ), p );
-   };
-
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  const ResultType* data ) const
-   {
-      return data[ idx1 ] + data[ idx2 ];
-   };
-#endif
-
+   
    protected:
 
    RealType p;
 };
 
 template< typename Real, typename Index >
-class tnlParallelReductionEqualities
+class tnlParallelReductionEqualities : public tnlParallelReductionLogicalAnd< bool, Index >
 {
    public:
 
@@ -773,13 +550,6 @@ class tnlParallelReductionEqualities
    typedef bool ResultType;
    typedef tnlParallelReductionLogicalAnd< bool, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return  ( data1[ idx ] == data2[ idx ] );
-   };
-
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
@@ -788,54 +558,19 @@ class tnlParallelReductionEqualities
       return current && ( data1[ idx ] == data2[ idx ] );
    };
 
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
+   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) true; }; 
+   
+   __cuda_callable__ void cudaFirstReduction( ResultType& result, 
+                                              const IndexType index,
+                                              const RealType* data1,
+                                              const RealType* data2 ) const
    {
-      return ( data1[ idx1 ] == data2[ idx1 ] ) && ( data1[ idx2 ] == data2[ idx2] );
+      result = result && ( data1[ index ] == data2[ index ] );
    }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return ( data1[ idx1 ]== data2[ idx1 ] );
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const IndexType idx3,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return data1[ idx1 ] &&
-             ( data2[ idx2 ] == data2[ idx2] ) &&
-             ( data2[ idx3 ] == data3[ idx3] );
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return data1[ idx1 ] && ( data2[ idx2 ] == data3[ idx2 ] );
-   };
-
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  const ResultType* data ) const
-   {
-      return data[ idx1 ] && data[ idx2 ];
-   };
-#endif
 };
 
 template< typename Real, typename Index >
-class tnlParallelReductionInequalities
+class tnlParallelReductionInequalities : public tnlParallelReductionLogicalAnd< bool, Index >
 {
    public:
 
@@ -844,13 +579,6 @@ class tnlParallelReductionInequalities
    typedef bool ResultType;
    typedef tnlParallelReductionLogicalAnd< bool, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return  ( data1[ idx ] != data2[ idx ] );
-   };
-
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
@@ -859,54 +587,19 @@ class tnlParallelReductionInequalities
       return current && ( data1[ idx ] != data2[ idx ] );
    };
 
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
+   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) false; };
+   
+   __cuda_callable__ void cudaFirstReduction( ResultType& result, 
+                                              const IndexType index,
+                                              const RealType* data1,
+                                              const RealType* data2 ) const
    {
-      return ( data1[ idx1 ] != data2[ idx1 ] ) && ( data1[ idx2 ] != data2[ idx2] );
+      result = result && ( data1[ index ] != data2[ index ] );
    }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return ( data1[ idx1 ] != data2[ idx1 ] );
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const IndexType idx3,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return data1[ idx1 ] &&
-             ( data2[ idx2 ] != data2[ idx2] ) &&
-             ( data2[ idx3 ] != data3[ idx3] );
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return data1[ idx1 ] && ( data2[ idx2 ] != data3[ idx2 ] );
-   };
-
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  const ResultType* data ) const
-   {
-      return data[ idx1 ] && data[ idx2 ];
-   };
-#endif
 };
 
 template< typename Real, typename Index >
-class tnlParallelReductionScalarProduct
+class tnlParallelReductionScalarProduct : public tnlParallelReductionSum< Real, Index >
 {
    public:
 
@@ -915,13 +608,6 @@ class tnlParallelReductionScalarProduct
    typedef Real ResultType;
    typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return  data1[ idx ] * data2[ idx ];
-   };
-
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
@@ -930,54 +616,19 @@ class tnlParallelReductionScalarProduct
       return current + ( data1[ idx ] * data2[ idx ] );
    };
 
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return ( data1[ idx1 ] * data2[ idx1 ] ) + ( data1[ idx2 ] * data2[ idx2] );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return ( data1[ idx1 ] * data2[ idx1 ] );
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const IndexType idx3,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return data1[ idx1 ] +
-             ( data2[ idx2 ] * data2[ idx2] ) +
-             ( data2[ idx3 ] * data3[ idx3] );
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return data1[ idx1 ] + ( data2[ idx2 ] * data3[ idx2 ] );
-   };
-
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  const ResultType* data ) const
+   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
+   
+   __cuda_callable__ inline void cudaFirstReduction( ResultType& result, 
+                                                 const IndexType index,
+                                                 const RealType* data1,
+                                                 const RealType* data2 ) const
    {
-      return data[ idx1 ] + data[ idx2 ];
-   };
-#endif
+      result += data1[ index ] * data2[ index ];
+   }   
 };
 
 template< typename Real, typename Index >
-class tnlParallelReductionDiffSum
+class tnlParallelReductionDiffSum : public tnlParallelReductionSum< Real, Index >
 {
    public:
 
@@ -986,13 +637,6 @@ class tnlParallelReductionDiffSum
    typedef Real ResultType;
    typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return data1[ idx ] - data2[ idx ];
-   };
-
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
@@ -1000,54 +644,20 @@ class tnlParallelReductionDiffSum
    {
       return current + ( data1[ idx ] - data2[ idx ] );
    };
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
+   
+   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };   
+   
+   __cuda_callable__ void cudaFirstReduction( ResultType& result, 
+                                          const IndexType index,
+                                          const RealType* data1,
+                                          const RealType* data2 ) const
    {
-      return ( data1[ idx1 ] - data2[ idx1 ] ) + ( data1[ idx2 ] - data2[ idx2 ] );
-   };
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ] - data2[ idx1 ];
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const IndexType idx3,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return data1[ idx1 ] +
-             ( data2[ idx2 ] - data3[ idx2 ] ) +
-             ( data2[ idx3 ] - data3[ idx3 ] );
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return data1[ idx1 ] + ( data2[ idx2 ] - data3[ idx2 ] );
-   };
-
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  const ResultType* data ) const
-   {
-      return data[ idx1 ] + data[ idx2 ];
-   };
-#endif
+      result += data1[ index ] - data2[ index ];
+   }   
 };
 
 template< typename Real, typename Index >
-class tnlParallelReductionDiffMin
+class tnlParallelReductionDiffMin : public tnlParallelReductionMin< Real, Index >
 {
    public:
 
@@ -1056,13 +666,6 @@ class tnlParallelReductionDiffMin
    typedef Real ResultType;
    typedef tnlParallelReductionMin< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return data1[ idx ] - data2[ idx ];
-   };
-
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
@@ -1071,54 +674,19 @@ class tnlParallelReductionDiffMin
       return Min( current, data1[ idx ] - data2[ idx ] );
    };
 
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
+   __cuda_callable__ ResultType initialValue() const { return tnlMaxValue< ResultType>(); };
+   
+   __cuda_callable__ void cudaFirstReduction( ResultType& result, 
+                                          const IndexType index,
+                                          const RealType* data1,
+                                          const RealType* data2 ) const
    {
-      return tnlCudaMin( data1[ idx1 ] - data2[ idx1 ], data1[ idx2 ] - data2[ idx2 ] );
+      result = tnlCudaMin( result, data1[ index ] - data2[ index ] );
    }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ] - data2[ idx1 ];
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const IndexType idx3,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ],
-                         tnlCudaMin(  data2[ idx2 ] - data3[ idx2 ],
-                                      data2[ idx3 ] - data3[ idx3 ] ) );
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ], data2[ idx2 ] - data3[ idx2 ] );
-   };
-
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  const ResultType* data ) const
-   {
-      return tnlCudaMin( data[ idx1 ], data[ idx2 ] );
-   };
-#endif
 };
 
 template< typename Real, typename Index >
-class tnlParallelReductionDiffMax
+class tnlParallelReductionDiffMax : public tnlParallelReductionMax< Real, Index >
 {
    public:
 
@@ -1127,13 +695,6 @@ class tnlParallelReductionDiffMax
    typedef Real ResultType;
    typedef tnlParallelReductionMax< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return data1[ idx ] - data2[ idx ];
-   };
-
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
@@ -1142,55 +703,19 @@ class tnlParallelReductionDiffMax
       return Max( current, data1[ idx ] - data2[ idx ] );
    };
 
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
+   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
+   
+   __cuda_callable__ void cudaFirstReduction( ResultType& result, 
+                                              const IndexType index,
+                                              const RealType* data1,
+                                              const RealType* data2 ) const
    {
-      return tnlCudaMax( data1[ idx1 ] - data2[ idx1 ],
-                         data1[ idx2 ] - data2[ idx2 ] );
+      result = tnlCudaMax( result, data1[ index ] - data2[ index ] );
    }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ] - data2[ idx1 ];
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const IndexType idx3,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ],
-                         tnlCudaMax( data2[ idx2 ] - data3[ idx2 ],
-                                     data2[ idx3 ] - data3[ idx3 ] ) );
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ], data2[ idx2 ] - data3[ idx2 ] );
-   };
-
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  const ResultType* data ) const
-   {
-      return tnlCudaMax( data[ idx1 ], data[ idx2 ] );
-   };
-#endif
 };
 
 template< typename Real, typename Index >
-class tnlParallelReductionDiffAbsSum
+class tnlParallelReductionDiffAbsSum : public tnlParallelReductionSum< Real, Index >
 {
    public:
 
@@ -1199,13 +724,6 @@ class tnlParallelReductionDiffAbsSum
    typedef Real ResultType;
    typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return tnlAbs( data1[ idx ] - data2[ idx ] );
-   };
-
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
@@ -1214,55 +732,19 @@ class tnlParallelReductionDiffAbsSum
       return current + tnlAbs( data1[ idx ] - data2[ idx ] );
    };
 
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] ) + tnlCudaAbs( data1[ idx2 ] - data2[ idx2 ] );
-   };
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
+   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
+   
+   __cuda_callable__ void cudaFirstReduction( ResultType& result, 
+                                          const IndexType index,
+                                          const RealType* data1,
+                                          const RealType* data2 ) const
    {
-      return tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] );
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const IndexType idx3,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return data1[ idx1 ] +
-             tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ) +
-             tnlCudaAbs( data2[ idx3 ] - data3[ idx3 ] );
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return data1[ idx1 ] +
-             tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] );
-   };
-
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  const ResultType* data ) const
-   {
-      return data[ idx1 ] + data[ idx2 ];
-   };
-#endif
+      result += tnlCudaAbs( data1[ index ] - data2[ index ] );
+   }
 };
 
 template< typename Real, typename Index >
-class tnlParallelReductionDiffAbsMin
+class tnlParallelReductionDiffAbsMin : public tnlParallelReductionMin< Real, Index >
 {
    public:
 
@@ -1271,13 +753,6 @@ class tnlParallelReductionDiffAbsMin
    typedef Real ResultType;
    typedef tnlParallelReductionMin< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return tnlAbs( data1[ idx ] - data2[ idx ] );
-   };
-
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
@@ -1286,57 +761,19 @@ class tnlParallelReductionDiffAbsMin
       return Min( current, tnlAbs( data1[ idx ] - data2[ idx ] ) );
    };
 
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
+   __cuda_callable__ ResultType initialValue() const { return tnlMaxValue< ResultType>(); };
+   
+   __cuda_callable__ void cudaFirstReduction( ResultType& result, 
+                                          const IndexType index,
+                                          const RealType* data1,
+                                          const RealType* data2 ) const
    {
-      return tnlCudaMin( tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] ),
-                         tnlCudaAbs( data1[ idx2 ] - data2[ idx2 ] ) );
+      result = tnlCudaMin( result, tnlCudaAbs( data1[ index ] - data2[ index ] ) );
    }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] );
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const IndexType idx3,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ],
-                         tnlCudaMin(  tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ),
-                                      tnlCudaAbs( data2[ idx3 ] - data3[ idx3 ] ) ) );
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ],
-                         tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ) );
-   };
-
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  const ResultType* data ) const
-   {
-      //return tnlCudaMin( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
-      return tnlCudaMin( data[ idx1 ], data[ idx2 ] );
-   };
-#endif
 };
 
 template< typename Real, typename Index >
-class tnlParallelReductionDiffAbsMax
+class tnlParallelReductionDiffAbsMax : public tnlParallelReductionMax< Real, Index >
 {
    public:
 
@@ -1345,13 +782,6 @@ class tnlParallelReductionDiffAbsMax
    typedef Real ResultType;
    typedef tnlParallelReductionMax< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return tnlAbs( data1[ idx ] -data2[ idx ] );
-   };
-
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
@@ -1360,57 +790,19 @@ class tnlParallelReductionDiffAbsMax
       return Max( current, tnlAbs( data1[ idx ] - data2[ idx ] ) );
    };
 
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
+   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
+   
+   __cuda_callable__ void cudaFirstReduction( ResultType& result, 
+                                          const IndexType index,
+                                          const RealType* data1,
+                                          const RealType* data2 ) const
    {
-      return tnlCudaMax( tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] ),
-                         tnlCudaAbs( data1[ idx2 ] - data2[ idx2 ] ) );
+      result = tnlCudaMax( result, tnlCudaAbs( data1[ index ] - data2[ index ] ) );
    }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] );
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const IndexType idx3,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ],
-                         tnlCudaMax( tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ),
-                                     tnlCudaAbs( data2[ idx3 ] - data3[ idx3 ] ) ) );
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ],
-                         tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ) );
-   };
-
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  const ResultType* data ) const
-   {
-      //return tnlCudaMax( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
-      return tnlCudaMax( data[ idx1 ], data[ idx2 ] );
-   };
-#endif
 };
 
 template< typename Real, typename Index >
-class tnlParallelReductionDiffLpNorm
+class tnlParallelReductionDiffLpNorm : public tnlParallelReductionSum< Real, Index >
 {
    public:
 
@@ -1424,13 +816,6 @@ class tnlParallelReductionDiffLpNorm
       this -> p = p;
    };
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return pow( tnlAbs( data1[ idx ] - data2[ idx ] ), p );
-   };
-
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
@@ -1439,52 +824,16 @@ class tnlParallelReductionDiffLpNorm
       return current + pow( tnlAbs( data1[ idx ] - data2[ idx ] ), p );
    };
 
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
+   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
+   
+   __cuda_callable__ void cudaFirstReduction( ResultType& result, 
+                                              const IndexType index,
+                                              const RealType* data1,
+                                              const RealType* data2 ) const
    {
-      return tnlCudaPow( tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] ), p ) +
-             tnlCudaPow( tnlCudaAbs( data1[ idx2 ] - data2[ idx2 ] ), p );
+      result += tnlCudaPow( tnlCudaAbs( data1[ index ] - data2[ index ] ), p );
    }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaPow( tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] ), p );
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const IndexType idx3,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return data1[ idx1 ] +
-             tnlCudaPow( tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ), p ) +
-             tnlCudaPow( tnlCudaAbs( data2[ idx3 ] - data3[ idx3 ] ), p );
-   };
-
-   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
-                                                 const IndexType idx2,
-                                                 const ResultType* data1,
-                                                 const RealType* data2,
-                                                 const RealType* data3 ) const
-   {
-      return data1[ idx1 ] + tnlCudaPow( tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ), p );
-   };
-
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  const ResultType* data ) const
-   {
-      return data[ idx1 ] + data[ idx2 ];
-   };
-#endif
-
+   
    protected:
 
    RealType p;
diff --git a/src/core/cuda/tnlCublasWrapper.h b/src/core/cuda/tnlCublasWrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..54d3e861a251605886f37c40ae1ecb0b1571504d
--- /dev/null
+++ b/src/core/cuda/tnlCublasWrapper.h
@@ -0,0 +1,70 @@
+/***************************************************************************
+                          tnlCublasWrapper.h  -  description
+                             -------------------
+    begin                : Apr 7, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#ifndef TNLCUBLASWRAPPER_H
+#define	TNLCUBLASWRAPPER_H
+
+#if defined HAVE_CUBLAS && defined HAVE_CUDA
+#include <cublas_v2.h>
+#endif
+
+template< typename Real1, 
+          typename Real2,
+          typename Index >
+class tnlCublasWrapper
+{
+    public:
+        static bool dot( const Real1* v1, const Real2* v2, const Index size, Real1& result)
+        {
+            return false;
+        }        
+};
+
+#if defined HAVE_CUBLAS && defined HAVE_CUDA
+
+template< typename Index >
+class tnlCublasWrapper< float, float, Index >
+{
+    public:
+        static bool dot( const float* v1, const float* v2, const Index size, float& result)
+        {
+
+            cublasHandle_t handle;
+            cublasCreate( &handle );
+            cublasSdot( handle, size, v1, 1, v2, 1, &result );
+            cublasDestroy( handle );
+            return true;
+        }        
+};
+
+template< typename Index >
+class tnlCublasWrapper< double, double, Index >
+{
+    public:
+        static bool dot( const double* v1, const double* v2, const Index size, double& result)
+        {
+            cublasHandle_t handle;
+            cublasCreate( &handle );
+            cublasDdot( handle, size, v1, 1, v2, 1, &result );
+            cublasDestroy( handle );
+            return true;
+        }        
+};
+#endif            
+
+#endif	/* TNLCUBLASWRAPPER_H */
+
diff --git a/src/core/cuda/tnlCudaReduction.h b/src/core/cuda/tnlCudaReduction.h
new file mode 100644
index 0000000000000000000000000000000000000000..9b7bf7ab2cb234ecbeae74f4a571cc8383720cb0
--- /dev/null
+++ b/src/core/cuda/tnlCudaReduction.h
@@ -0,0 +1,62 @@
+/***************************************************************************
+                          tnlCudaReduction.h  -  description
+                             -------------------
+    begin                : Jun 17, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#ifndef TNLCUDAREDUCTION_H
+#define	TNLCUDAREDUCTION_H
+
+#ifdef HAVE_CUDA
+
+template< typename Operation, int blockSize >
+class tnlCUDAReduction
+{
+   public:
+
+      typedef typename Operation::IndexType IndexType;
+      typedef typename Operation::RealType RealType;
+      typedef typename Operation::ResultType ResultType;
+
+      
+      __device__ static void reduce( const Operation operation,
+                                     const IndexType size,
+                                     const RealType* input1,
+                                     const RealType* input2,
+                                     ResultType* output );
+};
+      
+/*template< typename Real, typename Index, int blockSize >
+class tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize >
+{
+   public:
+      
+      typedef tnlParallelReductionScalarProduct< Real, Index > Operation;      
+      typedef typename Operation::IndexType IndexType;
+      typedef typename Operation::RealType RealType;
+      typedef typename Operation::ResultType ResultType;
+      
+      __device__ static void reduce( const Operation operation,
+                                     const IndexType size,
+                                     const RealType* input1,
+                                     const RealType* input2,
+                                     ResultType* output );
+};*/
+
+#include <core/cuda/tnlCudaReduction_impl.h>
+
+#endif
+
+#endif	/* TNLCUDAREDUCTION_H */
+
diff --git a/src/core/cuda/tnlCudaReductionBuffer.h b/src/core/cuda/tnlCudaReductionBuffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..b47ab5baa5539580e34724d4807f61f8f72df697
--- /dev/null
+++ b/src/core/cuda/tnlCudaReductionBuffer.h
@@ -0,0 +1,96 @@
+/***************************************************************************
+                          tnlCudaReductionBuffer.h  -  description
+                             -------------------
+    begin                : June 17, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#ifndef TNLCUDAREDUCTIONBUFFER_H
+#define	TNLCUDAREDUCTIONBUFFER_H
+
+#include <stdlib.h>
+
+#include <core/tnlCuda.h>
+
+class tnlCudaReductionBuffer
+{
+   public:
+      inline static tnlCudaReductionBuffer& getInstance( size_t size = 0 )
+      {
+         static tnlCudaReductionBuffer instance( size );
+         return instance;
+      }
+
+      inline bool setSize( size_t size )
+      {
+#ifdef HAVE_CUDA
+         if( size > this->size )
+         {
+            if( data ) cudaFree( data );
+            this->size = size;
+            if( cudaMalloc( ( void** ) &this->data, size ) != cudaSuccess )
+            {
+               cerr << "I am not able to allocate reduction buffer on the GPU." << endl;
+               this->data = 0;
+            }
+            return checkCudaDevice;
+         }
+         else
+            return true;
+#else
+         return false;
+#endif
+      }
+
+      template< typename Type >
+      Type* getData() { return ( Type* ) this->data; }
+
+   private:
+      // stop the compiler generating methods of copy the object
+      tnlCudaReductionBuffer( tnlCudaReductionBuffer const& copy );            // Not Implemented
+      tnlCudaReductionBuffer& operator=( tnlCudaReductionBuffer const& copy ); // Not Implemented
+
+      // private constructor of the singleton
+      inline tnlCudaReductionBuffer( size_t size = 0 ): data( 0 ), size( 0 )
+      {
+#ifdef HAVE_CUDA
+         if( size != 0 ) setSize( size );
+         atexit( tnlCudaReductionBuffer::free_atexit );
+#endif
+      }
+
+      inline static void free_atexit( void )
+      {
+         tnlCudaReductionBuffer::getInstance().free();
+      }
+
+   protected:
+      inline void free( void )
+      {
+#ifdef HAVE_CUDA
+         if( data )
+         {
+            cudaFree( data );
+            data = 0;
+         }
+#endif
+      }
+
+      void* data;
+
+      size_t size;
+};
+
+
+#endif	/* TNLCUDAREDUCTIONBUFFER_H */
+
diff --git a/src/core/cuda/tnlCudaReduction_impl.h b/src/core/cuda/tnlCudaReduction_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..5187141a1a70c4d601b3fb87ba5e7018b5e8f0ed
--- /dev/null
+++ b/src/core/cuda/tnlCudaReduction_impl.h
@@ -0,0 +1,296 @@
+/***************************************************************************
+                          tnlCudaReduction_impl.h  -  description
+                             -------------------
+    begin                : Jun 17, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#ifndef TNLCUDAREDUCTION_IMPL_H
+#define	TNLCUDAREDUCTION_IMPL_H
+
+template< typename Operation, int blockSize >      
+__device__
+void
+tnlCUDAReduction< Operation, blockSize >::
+reduce( const Operation operation,
+        const IndexType size,
+        const RealType* input1,
+        const RealType* input2,
+        ResultType* output )
+{
+   extern __shared__ __align__ ( 8 ) char __sdata[];
+
+   ResultType* sdata = reinterpret_cast< ResultType* >( __sdata );
+
+   /***
+    * Get thread id (tid) and global thread id (gid).
+    * gridSize is the number of elements processed by all blocks at the
+    * same time.
+    */
+   IndexType tid = threadIdx. x;
+   IndexType gid = blockIdx. x * blockDim. x + threadIdx. x;
+   IndexType gridSize = blockDim. x * gridDim.x;
+
+   sdata[ tid ] = operation.initialValue();
+   /***
+    * Read data into the shared memory. We start with the
+    * sequential reduction.
+    */
+   while( gid + 4 * gridSize < size )
+   {
+      operation.cudaFirstReduction( sdata[ tid ], gid,                input1, input2 );
+      operation.cudaFirstReduction( sdata[ tid ], gid + gridSize,     input1, input2 );
+      operation.cudaFirstReduction( sdata[ tid ], gid + 2 * gridSize, input1, input2 );
+      operation.cudaFirstReduction( sdata[ tid ], gid + 3 * gridSize, input1, input2 );
+      gid += 4*gridSize;
+   }
+   while( gid + 2 * gridSize < size )
+   {
+      operation.cudaFirstReduction( sdata[ tid ], gid,                input1, input2 );
+      operation.cudaFirstReduction( sdata[ tid ], gid + gridSize,     input1, input2 );
+      gid += 2*gridSize;
+   }
+   while( gid < size )
+   {
+      operation.cudaFirstReduction( sdata[ tid ], gid,                input1, input2 );
+      gid += gridSize;
+   }
+   __syncthreads();
+
+
+   //printf( "1: tid %d data %f \n", tid, sdata[ tid ] );
+
+   //return;
+   /***
+    *  Perform the parallel reduction.
+    */
+   if( blockSize >= 1024 )
+   {
+      if( tid < 512 )
+         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 512 ] );
+      __syncthreads();
+   }
+   if( blockSize >= 512 )
+   {
+      if( tid < 256 )
+         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 256 ] );
+      __syncthreads();
+   }
+   if( blockSize >= 256 )
+   {
+      if( tid < 128 )
+         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 128 ] );
+      __syncthreads();
+      //printf( "2: tid %d data %f \n", tid, sdata[ tid ] );
+   }
+
+   if( blockSize >= 128 )
+   {
+      if( tid <  64 )
+         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 64 ] );
+      __syncthreads();
+      //printf( "3: tid %d data %f \n", tid, sdata[ tid ] );
+   }
+
+
+   /***
+    * This runs in one warp so it is synchronized implicitly.
+    */
+   if( tid < 32 )
+   {
+      volatile ResultType* vsdata = sdata;
+      if( blockSize >= 64 )
+      {
+         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 32 ] );
+         //printf( "4: tid %d data %f \n", tid, sdata[ tid ] );
+      }
+      if( blockSize >= 32 )
+      {
+         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 16 ] );
+         //printf( "5: tid %d data %f \n", tid, sdata[ tid ] );
+      }
+      if( blockSize >= 16 )
+      {
+         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 8 ] );
+         //printf( "6: tid %d data %f \n", tid, sdata[ tid ] );
+      }
+      if( blockSize >=  8 )
+      {
+         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 4 ] );
+         //printf( "7: tid %d data %f \n", tid, sdata[ tid ] );
+      }
+      if( blockSize >=  4 )
+      {
+         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 2 ] );
+         //printf( "8: tid %d data %f \n", tid, sdata[ tid ] );
+      }
+      if( blockSize >=  2 )
+      {
+         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 1 ] );
+         //printf( "9: tid %d data %f \n", tid, sdata[ tid ] );
+      }
+   }
+
+   /***
+    * Store the result back in the global memory.
+    */
+   if( tid == 0 )
+   {
+      //printf( "Block %d result = %f \n", blockIdx.x, sdata[ 0 ] );
+      output[ blockIdx.x ] = sdata[ 0 ];
+   }
+
+}
+
+#ifdef UNDEF
+
+template< typename Real, typename Index, int blockSize >      
+__device__
+void
+tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize >::
+reduce( const Operation operation,
+        const IndexType size,
+        const RealType* input1,
+        const RealType* input2,
+        ResultType* output )
+{
+  extern __shared__ __align__ ( 8 ) char __sdata[];
+
+   ResultType* sdata = reinterpret_cast< ResultType* >( __sdata );        
+
+   /***
+    * Get thread id (tid) and global thread id (gid).
+    * gridSize is the number of elements processed by all blocks at the
+    * same time.
+    */
+   IndexType tid = threadIdx. x;
+   IndexType gid = blockIdx. x * blockDim. x + threadIdx. x;
+   IndexType gridSize = blockDim. x * gridDim.x;
+
+   /***
+    * Read data into the shared memory. We start with the
+    * sequential reduction.
+    */
+   sdata[ tid ] = ( RealType ) 0;
+   while( gid + 4 * gridSize < size )
+   {
+      sdata[ tid ] += input1[ gid                ] * input2[ gid ];
+      sdata[ tid ] += input1[ gid + gridSize     ] * input2[ gid + gridSize ];
+      sdata[ tid ] += input1[ gid + 2 * gridSize ] * input2[ gid + 2 * gridSize ];
+      sdata[ tid ] += input1[ gid + 3 * gridSize ] * input2[ gid + 3 * gridSize ];
+      gid += 4*gridSize;
+   }
+   while( gid + 2 * gridSize < size )
+   {
+      sdata[ tid ] += input1[ gid            ] * input2[ gid ];
+      sdata[ tid ] += input1[ gid + gridSize ] * input2[ gid + gridSize ];
+      gid += 2*gridSize;
+   }
+   while( gid < size )
+   {
+      sdata[ tid ] += input1[ gid ] * input2[ gid ];
+      gid += gridSize;
+   }
+   __syncthreads();
+
+   //printf( "1: tid %d data %f \n", tid, sdata[ tid ] );
+
+   /***
+    *  Perform the parallel reduction.
+    */
+   if( blockSize >= 1024 )
+   {
+      if( tid < 512 )
+         sdata[ tid ] += sdata[ tid + 512 ];
+      __syncthreads();
+   }
+   if( blockSize >= 512 )
+   {
+      if( tid < 256 )
+         sdata[ tid ] += sdata[ tid + 256 ];
+      __syncthreads();
+   }
+   if( blockSize >= 256 )
+   {
+      if( tid < 128 )
+         sdata[ tid ] += sdata[ tid + 128 ];
+      __syncthreads();
+      //printf( "2: tid %d data %f \n", tid, sdata[ tid ] );
+   }
+
+   if( blockSize >= 128 )
+   {
+      if( tid <  64 )
+         sdata[ tid ] += sdata[ tid + 64 ];
+      __syncthreads();
+      //printf( "3: tid %d data %f \n", tid, sdata[ tid ] );
+   }
+
+   /***
+    * This runs in one warp so it is synchronized implicitly.
+    */
+   if( tid < 32 )
+   {
+      volatile ResultType* vsdata = sdata;
+      if( blockSize >= 64 )
+      {
+         vsdata[ tid ] += vsdata[ tid + 32 ];
+         //__syncthreads();
+         //printf( "4: tid %d data %f \n", tid, sdata[ tid ] );
+      }
+      if( blockSize >= 32 )
+      {
+         vsdata[ tid ] += vsdata[ tid + 16 ];
+         //__syncthreads();
+         //printf( "5: tid %d data %f \n", tid, sdata[ tid ] );
+      }
+      if( blockSize >= 16 )
+      {
+         vsdata[ tid ] += vsdata[ tid + 8 ];
+         //__syncthreads();
+         //printf( "6: tid %d data %f \n", tid, sdata[ tid ] );
+      }
+      if( blockSize >=  8 )
+      {
+         vsdata[ tid ] += vsdata[ tid + 4 ];
+         //__syncthreads();
+         //printf( "7: tid %d data %f \n", tid, sdata[ tid ] );
+      }
+      if( blockSize >=  4 )
+      {
+         vsdata[ tid ] += vsdata[ tid + 2 ];
+         //__syncthreads();
+         //printf( "8: tid %d data %f \n", tid, sdata[ tid ] );
+      }
+      if( blockSize >=  2 )
+      {
+         vsdata[ tid ] += vsdata[ tid + 1 ];
+         //__syncthreads();
+         //printf( "9: tid %d data %f \n", tid, sdata[ tid ] );
+      }
+   }
+
+   /***
+    * Store the result back in the global memory.
+    */
+   if( tid == 0 )
+   {
+      //printf( "Block %d result = %f \n", blockIdx.x, sdata[ 0 ] );
+      output[ blockIdx.x ] = sdata[ 0 ];
+   }
+}
+
+#endif
+
+#endif	/* TNLCUDAREDUCTION_IMPL_H */
+
diff --git a/src/core/mfuncs.h b/src/core/mfuncs.h
index 4719ad2fa9a6844700080a011e4010bd158f3353..4958a8c84ed1f97cb8bc8b8405012c718da1cff4 100644
--- a/src/core/mfuncs.h
+++ b/src/core/mfuncs.h
@@ -20,29 +20,24 @@
 
 #include <math.h>
 #include <stdlib.h>
+#include <core/tnlCuda.h>
 
 template< typename Type1, typename Type2 >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Type1 Min( const Type1& a, const Type2& b )
 {
    return a < b ? a : b;
 };
 
 template< typename Type1, typename Type2 >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Type1 Max( const Type1& a, const Type2& b )
 {
    return a > b ? a : b;
 };
 
 template< typename Type >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 void Swap( Type& a, Type& b )
 {
    Type tmp( a );
@@ -51,9 +46,7 @@ void Swap( Type& a, Type& b )
 };
 
 template< class T >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 T Sign( const T& a )
 {
    if( a < ( T ) 0 ) return -1;
@@ -62,9 +55,7 @@ T Sign( const T& a )
 };
 
 template< class T >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 T tnlAbs( const T& n )
 {
    if( n < ( T ) 0 )
@@ -88,34 +79,36 @@ inline double tnlAbs( const double& d )
 };
 
 template< typename Real >
+__cuda_callable__
 bool isSmall( const Real& v,
               const Real& tolerance = 1.0e-5 )
 {
    return ( -tolerance <= v && v <= tolerance );
 }
 
+__cuda_callable__
 inline int roundUpDivision( const int num, const int div )
 {
    return num / div + ( num % div != 0 );
 }
 
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 inline int roundToMultiple( int number, int multiple )
 {
    return multiple*( number/ multiple + ( number % multiple != 0 ) );
 }
 
+__cuda_callable__
+inline bool isPow2( int x )
+{
+   return ( ( x & ( x - 1 ) ) == 0 );
+}
 
-/*template< typename T >
-void swap( T& a, T& b)
+__cuda_callable__
+inline bool isPow2( long int x )
 {
-   T aux;
-   aux = a;
-   a = b;
-   b = aux;
-}*/
+   return ( ( x & ( x - 1 ) ) == 0 );
+}
 
 
 #endif
diff --git a/src/core/tnlAssert.h b/src/core/tnlAssert.h
index 1f59f11b3bc83f8d2e5ff3610310195ea997a414..3a37eb52f2789986c413f11c0c601fc38caa7183 100644
--- a/src/core/tnlAssert.h
+++ b/src/core/tnlAssert.h
@@ -38,7 +38,7 @@ using namespace std;
            __STRING( ___tnl__assert_condition ),                                                         \
            __FILE__,                                                                                     \
            __LINE__ );                                                                                   \
-    abort();                                                                   \
+                                                              \
    }
 
 #else
diff --git a/src/core/tnlConstants.h b/src/core/tnlConstants.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc7d1f5ea12d067283e01f5143755fe4c3e36a5d
--- /dev/null
+++ b/src/core/tnlConstants.h
@@ -0,0 +1,57 @@
+/***************************************************************************
+                           tnlConstants.h -  description
+                             -------------------
+    begin                : June 17, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#ifndef TNLCONSTANTS_H
+#define	TNLCONSTANTS_H
+
+#include <limits.h>
+#include <float.h>
+#include <cstdio>
+#include <core/tnlAssert.h>
+#include <core/tnlCuda.h>
+
+template< typename T > __cuda_callable__ T tnlMinValue(){ tnlAssert( false,)};
+template<> inline __cuda_callable__ char               tnlMinValue< char >() { return CHAR_MIN; }
+template<> inline __cuda_callable__ unsigned char      tnlMinValue< unsigned char >() { return 0; }
+template<> inline __cuda_callable__ short int          tnlMinValue< short int >() { return SHRT_MIN; }
+template<> inline __cuda_callable__ unsigned short int tnlMinValue< unsigned short int >() { return 0; }
+template<> inline __cuda_callable__ int                tnlMinValue< int >() { return INT_MIN; }
+template<> inline __cuda_callable__ unsigned int       tnlMinValue< unsigned int >() { return 0; }
+template<> inline __cuda_callable__ long int           tnlMinValue< long int >() { return LONG_MIN; }
+template<> inline __cuda_callable__ unsigned long int  tnlMinValue< unsigned long int >() { return 0; }
+template<> inline __cuda_callable__ float              tnlMinValue< float >() { return -FLT_MAX; }
+template<> inline __cuda_callable__ double             tnlMinValue< double >() { return -DBL_MAX; }
+template<> inline __cuda_callable__ long double        tnlMinValue< long double >() { return -LDBL_MAX; }
+
+template< typename T > __cuda_callable__ T tnlMaxValue(){ tnlAssert( false,)};
+template<> inline __cuda_callable__ char               tnlMaxValue< char >() { return CHAR_MAX; }
+template<> inline __cuda_callable__ unsigned char      tnlMaxValue< unsigned char >() { return UCHAR_MAX; }
+template<> inline __cuda_callable__ short int          tnlMaxValue< short int >() { return SHRT_MAX; }
+template<> inline __cuda_callable__ unsigned short int tnlMaxValue< unsigned short int >() { return USHRT_MAX; }
+template<> inline __cuda_callable__ int                tnlMaxValue< int >() { return INT_MAX; }
+template<> inline __cuda_callable__ unsigned int       tnlMaxValue< unsigned int >() { return UINT_MAX; }
+template<> inline __cuda_callable__ long int           tnlMaxValue< long int >() { return LONG_MAX; }
+template<> inline __cuda_callable__ unsigned long int  tnlMaxValue< unsigned long int >() { return ULONG_MAX; }
+template<> inline __cuda_callable__ float              tnlMaxValue< float >() { return FLT_MAX; }
+template<> inline __cuda_callable__ double             tnlMaxValue< double >() { return DBL_MAX; }
+template<> inline __cuda_callable__ long double        tnlMaxValue< long double >() { return LDBL_MAX; }
+
+
+
+
+#endif	/* TNLCONSTANTS_H */
+
diff --git a/src/core/tnlCuda.cu b/src/core/tnlCuda.cu
index ee30490b89728546a9eb1b6d7e594a87f83b9615..9178a3261e48a300267b0f292f06cec68575ba4d 100644
--- a/src/core/tnlCuda.cu
+++ b/src/core/tnlCuda.cu
@@ -391,6 +391,6 @@ bool tnlCuda::checkDevice( const char* file_name, int line )
        break;
 
    }
-   throw EXIT_FAILURE;
+   //throw EXIT_FAILURE;
    return false;
 }
diff --git a/src/core/tnlCuda.h b/src/core/tnlCuda.h
index 79d145e4bf74b21d30277670d17631e3e0c4823e..6889f3f109f9075205eaa0591bf29b06703ad6fd 100644
--- a/src/core/tnlCuda.h
+++ b/src/core/tnlCuda.h
@@ -24,6 +24,13 @@
 #include <core/tnlString.h>
 #include <core/tnlAssert.h>
 
+#ifdef HAVE_CUDA
+#define __cuda_callable__ __device__ __host__
+#else
+#define __cuda_callable__
+#endif
+
+
 class tnlCuda
 {
    public:
@@ -32,36 +39,20 @@ class tnlCuda
 
    static tnlString getDeviceType();
 
-#ifdef HAVE_CUDA
-   __host__ __device__
-#endif
-   static inline tnlDeviceEnum getDevice();
-
+   __cuda_callable__ static inline tnlDeviceEnum getDevice();
 
-#ifdef HAVE_CUDA
-   __host__ __device__
-#endif
-   static inline int getMaxGridSize();
+   __cuda_callable__ static inline int getMaxGridSize();
 
-#ifdef HAVE_CUDA
-   __host__ __device__
-#endif
-   static inline int getMaxBlockSize();
+   __cuda_callable__ static inline int getMaxBlockSize();
 
-#ifdef HAVE_CUDA
-   __host__ __device__
-#endif
-   static inline int getWarpSize();
+   __cuda_callable__ static inline int getWarpSize();
 
 #ifdef HAVE_CUDA
    template< typename Index >
    __device__ static Index getGlobalThreadIdx( const Index gridIdx = 0 );
 #endif
 
-#ifdef HAVE_CUDA
-   __host__ __device__
-#endif
-   static inline int getNumberOfSharedMemoryBanks();
+   __cuda_callable__ static inline int getNumberOfSharedMemoryBanks();
 
    static int getGPUTransferBufferSize();
 
diff --git a/src/core/tnlCudaDeviceInfo.cpp b/src/core/tnlCudaDeviceInfo.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cd5a4623900ebc174ee5da705674fe23eccb4ccc
--- /dev/null
+++ b/src/core/tnlCudaDeviceInfo.cpp
@@ -0,0 +1,100 @@
+/***************************************************************************
+                          tnlCudaDeviceInfo.cpp  -  description
+                             -------------------
+    begin                : Jun 21, 2015
+    copyright            : (C) 2007 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#ifndef HAVE_CUDA
+
+#include <core/tnlCudaDeviceInfo.h>
+
+int
+tnlCudaDeviceInfo::
+getNumberOfDevices()
+{
+   return -1;
+}
+      
+tnlString
+tnlCudaDeviceInfo::
+getDeviceName( int deviceNum )
+{
+   return tnlString( "" );
+}
+
+int
+tnlCudaDeviceInfo::
+getArchitectureMajor( int deviceNum )
+{
+    return 0;
+}
+      
+int
+tnlCudaDeviceInfo::
+getArchitectureMinor( int deviceNum )
+{
+    return 0;
+}
+
+int
+tnlCudaDeviceInfo::
+getClockRate( int deviceNum )
+{
+   return 0;
+}
+      
+size_t
+tnlCudaDeviceInfo::
+getGlobalMemory( int deviceNum )
+{
+   return 0;
+}
+
+int
+tnlCudaDeviceInfo::
+getMemoryClockRate( int deviceNum )
+{
+   return 0;
+}
+
+bool
+tnlCudaDeviceInfo::
+getECCEnabled( int deviceNum )
+{
+   return 0;
+}
+
+int
+tnlCudaDeviceInfo::
+getCudaMultiprocessors( int deviceNum )
+{
+   return 0;
+}
+
+int
+tnlCudaDeviceInfo::
+getCudaCoresPerMultiprocessors( int deviceNum )
+{
+   return 0;
+}
+
+int
+tnlCudaDeviceInfo::
+getCudaCores( int deviceNum )
+{
+   return 0;
+}
+
+
+#endif
\ No newline at end of file
diff --git a/src/core/tnlCudaDeviceInfo.cu b/src/core/tnlCudaDeviceInfo.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5754f38b43a7ac42360f3f2443c0e89e34d965d6
--- /dev/null
+++ b/src/core/tnlCudaDeviceInfo.cu
@@ -0,0 +1,140 @@
+/***************************************************************************
+                          tnlCudaDeviceInfo.cu  -  description
+                             -------------------
+    begin                : Jun 21, 2015
+    copyright            : (C) 2007 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#ifdef HAVE_CUDA
+
+#include <core/tnlCudaDeviceInfo.h>
+#include <core/tnlCuda.h>
+
+int
+tnlCudaDeviceInfo::
+getNumberOfDevices()
+{
+    int devices;
+    cudaGetDeviceCount( &devices );
+    return devices;
+}
+      
+tnlString
+tnlCudaDeviceInfo::
+getDeviceName( int deviceNum )
+{
+    cudaDeviceProp properties;
+    cudaGetDeviceProperties( &properties, deviceNum );
+    return tnlString( properties.name );
+}
+
+int
+tnlCudaDeviceInfo::
+getArchitectureMajor( int deviceNum )
+{
+    cudaDeviceProp properties;
+    cudaGetDeviceProperties( &properties, deviceNum );
+    return properties.major;
+}
+      
+int
+tnlCudaDeviceInfo::
+getArchitectureMinor( int deviceNum )
+{
+    cudaDeviceProp properties;
+    cudaGetDeviceProperties( &properties, deviceNum );
+    return properties.minor;
+}
+      
+int
+tnlCudaDeviceInfo::
+getClockRate( int deviceNum )
+{
+    cudaDeviceProp properties;
+    cudaGetDeviceProperties( &properties, deviceNum );
+    return properties.clockRate;
+}
+      
+size_t
+tnlCudaDeviceInfo::
+getGlobalMemory( int deviceNum )
+{
+    cudaDeviceProp properties;
+    cudaGetDeviceProperties( &properties, deviceNum );
+    return properties.totalGlobalMem;
+}
+
+int
+tnlCudaDeviceInfo::
+getMemoryClockRate( int deviceNum )
+{
+    cudaDeviceProp properties;
+    cudaGetDeviceProperties( &properties, deviceNum );
+    return properties.memoryClockRate;
+}
+
+bool
+tnlCudaDeviceInfo::
+getECCEnabled( int deviceNum )
+{
+    cudaDeviceProp properties;
+    cudaGetDeviceProperties( &properties, deviceNum );
+    return properties.ECCEnabled;
+}
+
+int
+tnlCudaDeviceInfo::
+getCudaMultiprocessors( int deviceNum )
+{
+    cudaDeviceProp properties;
+    cudaGetDeviceProperties( &properties, deviceNum );
+    return properties.multiProcessorCount;
+}
+
+int
+tnlCudaDeviceInfo::
+getCudaCoresPerMultiprocessors( int deviceNum )
+{
+    int major = tnlCudaDeviceInfo::getArchitectureMajor( deviceNum );
+    int minor = tnlCudaDeviceInfo::getArchitectureMinor( deviceNum );
+    switch( major )
+    {
+        case 1:   // Tesla generation, G80, G8x, G9x classes
+            return 8;
+        case 2:   // Fermi generation
+        switch( minor )
+        {
+            case 0:  // GF100 class
+                return 32; 
+            case 1:  // GF10x class
+                return 48;
+        }
+        case 3: // Kepler generation -- GK10x, GK11x classes
+            return 192;
+        case 5: // Maxwell generation -- GM10x, GM20x classes
+            return 128;
+        default:
+            return -1;
+    }
+}
+
+int
+tnlCudaDeviceInfo::
+getCudaCores( int deviceNum )
+{
+    return tnlCudaDeviceInfo::getCudaMultiprocessors( deviceNum ) *
+           tnlCudaDeviceInfo::getCudaCoresPerMultiprocessors( deviceNum );
+}
+
+
+#endif
\ No newline at end of file
diff --git a/src/core/tnlCudaDeviceInfo.h b/src/core/tnlCudaDeviceInfo.h
new file mode 100644
index 0000000000000000000000000000000000000000..34a560e3303ee17d3077b2613ecf968aec9a9d94
--- /dev/null
+++ b/src/core/tnlCudaDeviceInfo.h
@@ -0,0 +1,56 @@
+/***************************************************************************
+                          tnlCudaDeviceInfo.h  -  description
+                             -------------------
+    begin                : Jun 21, 2015
+    copyright            : (C) 2007 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#ifndef TNLCUDADEVICEINFO_H
+#define	TNLCUDADEVICEINFO_H
+
+#include <stdlib.h>
+#include <core/tnlCuda.h>
+
+class tnlCudaDeviceInfo
+{
+   public:
+      
+      static int getNumberOfDevices();
+      
+      static tnlString getDeviceName( int deviceNum );
+      
+      static int getArchitectureMajor( int deviceNum );
+      
+      static int getArchitectureMinor( int deviceNum );
+      
+      static int getClockRate( int deviceNum );
+      
+      static size_t getGlobalMemory( int deviceNum );
+
+      static int getMemoryClockRate( int deviceNum );
+
+      static bool getECCEnabled( int deviceNum );
+
+      static int getCudaMultiprocessors( int deviceNum );      
+      
+      static int getCudaCoresPerMultiprocessors( int deviceNum );      
+      
+      static int getCudaCores( int deviceNum );      
+      
+};
+
+
+
+
+#endif	/* TNLCUDADEVICEINFO_H */
+
diff --git a/src/core/tnlIndexedSet.h b/src/core/tnlIndexedSet.h
index 05e263749f83065a2dec9a2b2a8aa45add91d098..1762afdd836b993b16bd7411cd48537a33b2c734 100644
--- a/src/core/tnlIndexedSet.h
+++ b/src/core/tnlIndexedSet.h
@@ -54,6 +54,12 @@ class tnlIndexedSet
 
    struct DataWithIndex
    {
+      // This constructor is here only because of bug in g++, we might fix it later.
+      // http://stackoverflow.com/questions/22357887/comparing-two-mapiterators-why-does-it-need-the-copy-constructor-of-stdpair
+      DataWithIndex(){};
+      
+      DataWithIndex( const DataWithIndex& d ) : data( d.data ), index( d.index) {}
+      
       explicit DataWithIndex( const Element data) : data( data ) {}
 
       DataWithIndex( const Element data,
diff --git a/src/core/tnlLogger.cpp b/src/core/tnlLogger.cpp
index 3aca3e1de7773a9fe9a728a30b3316f55a293eb1..492af9eb93ebb6a497e39422dc065e12583d7fd3 100644
--- a/src/core/tnlLogger.cpp
+++ b/src/core/tnlLogger.cpp
@@ -22,6 +22,7 @@
 #include <iomanip>
 #include <core/tnlLogger.h>
 #include <tnlConfig.h>
+#include <core/tnlCudaDeviceInfo.h>
 
 tnlLogger :: tnlLogger( int _width,
                         ostream& _stream )
@@ -50,7 +51,7 @@ void tnlLogger :: writeSeparator()
    stream. fill( fill );
 }
 
-bool tnlLogger :: writeSystemInformation()
+bool tnlLogger :: writeSystemInformation( const tnlParameterContainer& parameters )
 {
    char host_name[ 256 ];
    struct utsname uts;
@@ -59,6 +60,17 @@ bool tnlLogger :: writeSystemInformation()
    writeParameter< char* >( "Host name:", host_name );
    writeParameter< char* >( "Architecture:", uts. machine );
    fstream file;
+   file.open( "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", ios::in );
+   int maxCpuFreq( 0 );
+   if( file )
+   {
+      char line[ 1024 ];
+      file.getline( line, 1024 );
+      maxCpuFreq = atoi( line );
+   }
+   else
+       cerr << "Unable to read information from /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq." << endl;
+   file.close();
    file. open( "/proc/cpuinfo", ios :: in );
    if( file )
    {
@@ -67,50 +79,94 @@ bool tnlLogger :: writeSystemInformation()
       char* cpu_model_name;
       char* cpu_mhz;
       char* cpu_cache;
+      tnlString modelName, Mhz, cache;
+      int cores( 0 ), siblings( 0 );
       while( ! file. eof() )
       {
          int i;
          file. getline( line, 1024 );
-         if( strncmp( line, "processor", strlen( "processor" ) ) == 0 )
+         /*if( strncmp( line, "processor", strlen( "processor" ) ) == 0 )
          {
             i = strlen( "processor" );
             while( line[ i ] != ':' && line[ i ] ) i ++;
             cpu_id = &line[ i + 1 ];
             writeParameter< char * >( "CPU Id.:", cpu_id );
             continue;
-         }
+         }*/
          if( strncmp( line, "model name", strlen( "model name" ) ) == 0 )
          {
             i = strlen( "model name" );
             while( line[ i ] != ':' && line[ i ] ) i ++;
-            cpu_model_name = &line[ i + 1 ];
-            writeParameter< char * >( "Model name:", cpu_model_name );
+            //cpu_model_name = &line[ i + 1 ];
+            modelName.setString( &line[ i + 1 ] );
+            //writeParameter< char * >( "Model name:", cpu_model_name );
             continue;
          }
-         if( strncmp( line, "cpu MHz", strlen( "cpu MHz" ) ) == 0 )
+         if( strncmp( line, "cpu cores", strlen( "cpu cores" ) ) == 0 )
          {
             i = strlen( "cpu MHz" );
             while( line[ i ] != ':' && line[ i ] ) i ++;
-            cpu_mhz = &line[ i + 1 ];
-            writeParameter< char * >( "CPU MHz:", cpu_mhz );
+            cores = atoi( &line[ i + 1 ] );
             continue;
          }
+         if( strncmp( line, "siblings", strlen( "siblings" ) ) == 0 )
+         {
+            i = strlen( "siblings" );
+            while( line[ i ] != ':' && line[ i ] ) i ++;
+            siblings = atoi( &line[ i + 1 ] );
+         }
+         /*if( strncmp( line, "cpu MHz", strlen( "cpu MHz" ) ) == 0 )
+         {
+            i = strlen( "cpu MHz" );
+            while( line[ i ] != ':' && line[ i ] ) i ++;
+            //cpu_mhz = &line[ i + 1 ];
+            Mhz.setString( &line[ i + 1 ] );
+            //writeParameter< char * >( "CPU MHz:", cpu_mhz );
+            continue;
+         }*/
          if( strncmp( line, "cache size", strlen( "cache size" ) ) == 0 )
          {
             i = strlen( "cache size" );
             while( line[ i ] != ':' && line[ i ] ) i ++;
-            cpu_cache = &line[ i + 1 ];
-            writeParameter< char * >( "CPU cache:", cpu_cache );
+            //cpu_cache = &line[ i + 1 ];
+            cache.setString( &line[ i + 1 ] );
+            //writeParameter< char * >( "CPU cache:", cpu_cache );
             continue;
          }
       }
-   }
+      int threadsPerCore = siblings / cores;
+      writeParameter< tnlString >( "CPU info", tnlString("") );
+      writeParameter< tnlString >( "Model name:", modelName, 1 );
+      writeParameter< int >( "Cores:", cores, 1 );
+      writeParameter< int >( "Threads per core:", threadsPerCore, 1 );
+      writeParameter< int >( "Max clock rate (in MHz):", maxCpuFreq / 1000, 1 );
+      writeParameter< tnlString >( "Cache:", cache, 1 );
+    }
    else
-   {
       cerr << "Unable to read information from /proc/cpuinfo." << endl;
-      return false;
+   file.close();
+   if( parameters.getParameter< tnlString >( "device" ) == "cuda" )
+   {      
+      int devices = tnlCudaDeviceInfo::getNumberOfDevices();
+      writeParameter< tnlString >( "CUDA GPU info", tnlString("") );   
+      writeParameter< int >( "Number of devices", devices,1 );
+      for( int i = 0; i < devices; i++ )
+      {
+        writeParameter< int >( "Device no.", i, 1 );       
+        writeParameter< tnlString >( "Name", tnlCudaDeviceInfo::getDeviceName( i ), 2 );
+        tnlString deviceArch = tnlString( tnlCudaDeviceInfo::getArchitectureMajor( i ) ) + "." +
+                                tnlString( tnlCudaDeviceInfo::getArchitectureMinor( i ) );
+        writeParameter< tnlString >( "Architecture", deviceArch, 2 );
+        writeParameter< int >( "CUDA cores", tnlCudaDeviceInfo::getCudaCores( i ), 2 );         
+        double clockRate = ( double ) tnlCudaDeviceInfo::getClockRate( i ) / 1.0e3;
+        writeParameter< double >( "Clock rate (in MHz)", clockRate, 2 );
+        double globalMemory = ( double ) tnlCudaDeviceInfo::getGlobalMemory( i ) / 1.0e9;
+        writeParameter< double >( "Global memory (in GB)", globalMemory, 2 );         
+        double memoryClockRate = ( double ) tnlCudaDeviceInfo::getMemoryClockRate( i ) / 1.0e3;
+        writeParameter< double >( "Memory clock rate (in MHz)", memoryClockRate, 2 );
+        writeParameter< bool >( "ECC enabled", tnlCudaDeviceInfo::getECCEnabled( i ), 2 );         
+      }
    }
-   file. close();
    writeParameter< char* >( "System:", uts. sysname );
    writeParameter< char* >( "Release:", uts. release );
    writeParameter< char* >( "TNL Compiler:", ( char* ) TNL_CPP_COMPILER_NAME );
diff --git a/src/core/tnlLogger.h b/src/core/tnlLogger.h
index eb5a19a15e3ffe7200019e35889851f199a6298d..c657ee4fe3e549bcfb6762533e8df83718110311 100644
--- a/src/core/tnlLogger.h
+++ b/src/core/tnlLogger.h
@@ -34,7 +34,8 @@ class tnlLogger
 
    void writeSeparator();
 
-   bool writeSystemInformation();
+   bool writeSystemInformation( const tnlParameterContainer& parameters );
+   
 
    void writeCurrentTime( const char* label );
 
diff --git a/src/core/tnlTimerCPU.cpp b/src/core/tnlTimerCPU.cpp
index e89abc687896a685c874f4f056b2d999635dd719..d618b455e372c9daa16bfdfaa129019adbc395be 100644
--- a/src/core/tnlTimerCPU.cpp
+++ b/src/core/tnlTimerCPU.cpp
@@ -21,10 +21,10 @@ tnlTimerCPU defaultCPUTimer;
 
 tnlTimerCPU :: tnlTimerCPU()
 {
-   Reset();
+   reset();
 }
-//--------------------------------------------------------------------------
-void tnlTimerCPU :: Reset()
+
+void tnlTimerCPU::reset()
 {
 #ifdef HAVE_SYS_RESOURCE_H
    rusage init_usage;
@@ -36,8 +36,8 @@ void tnlTimerCPU :: Reset()
    total_time = 0.0;
    stop_state = false;
 }
-//--------------------------------------------------------------------------
-void tnlTimerCPU :: Stop()
+
+void tnlTimerCPU::stop()
 {
 #ifdef HAVE_SYS_RESOURCE_H
    if( ! stop_state )
@@ -49,8 +49,8 @@ void tnlTimerCPU :: Stop()
    }
 #endif
 }
-//--------------------------------------------------------------------------
-void tnlTimerCPU :: Continue()
+
+void tnlTimerCPU::start()
 {
 #ifdef HAVE_SYS_RESOURCE_H
    rusage init_usage;
@@ -59,12 +59,12 @@ void tnlTimerCPU :: Continue()
 #endif
   stop_state = false;
 }
-//--------------------------------------------------------------------------
-double tnlTimerCPU :: GetTime( int root, MPI_Comm comm )
+
+double tnlTimerCPU::getTime( int root, MPI_Comm comm )
 {
 #ifdef HAVE_SYS_RESOURCE_H
-   Stop();
-   Continue();
+   stop();
+   start();
    double mpi_total_time;
    MPIReduce( total_time, mpi_total_time, 1, MPI_SUM, root, comm );
    return mpi_total_time;
diff --git a/src/core/tnlTimerCPU.h b/src/core/tnlTimerCPU.h
index 098d90ab1822a0254d488e4d2c0f0cdb2e8797f3..c982018b5b1bb0ee6e02085569d7c00c9bff732f 100644
--- a/src/core/tnlTimerCPU.h
+++ b/src/core/tnlTimerCPU.h
@@ -32,13 +32,13 @@ class tnlTimerCPU
 
    tnlTimerCPU();
 
-   void Reset();
+   void reset();
    
-   void Stop();
+   void stop();
 
-   void Continue();
+   void start();
 
-   double GetTime( int root = 0, MPI_Comm = MPI_COMM_WORLD );
+   double getTime( int root = 0, MPI_Comm = MPI_COMM_WORLD );
       
    protected:
 
diff --git a/src/core/tnlTimerRT.cpp b/src/core/tnlTimerRT.cpp
index 8d2fa297e056a4f216ad0b344e2e8a77ce6e4360..7351972dc5cb83697d7d5884170ab74fcd9f392a 100644
--- a/src/core/tnlTimerRT.cpp
+++ b/src/core/tnlTimerRT.cpp
@@ -27,12 +27,12 @@
 
 tnlTimerRT defaultRTTimer;
 
-tnlTimerRT :: tnlTimerRT()
+tnlTimerRT::tnlTimerRT()
 {
-   Reset();
+   reset();
 }
 
-void tnlTimerRT :: Reset()
+void tnlTimerRT::reset()
 {
 #ifdef HAVE_TIME
    struct timeval tp;
@@ -46,7 +46,7 @@ void tnlTimerRT :: Reset()
 
 }
 
-void tnlTimerRT :: Stop()
+void tnlTimerRT::stop()
 {
 #ifdef HAVE_TIME
    if( ! stop_state )
@@ -59,7 +59,7 @@ void tnlTimerRT :: Stop()
 #endif
 }
 
-void tnlTimerRT :: Continue()
+void tnlTimerRT::start()
 {
 #ifdef HAVE_TIME
    struct timeval tp;
@@ -69,11 +69,11 @@ void tnlTimerRT :: Continue()
 #endif
 }
 
-double tnlTimerRT :: GetTime()
+double tnlTimerRT::getTime()
 {
 #ifdef HAVE_TIME
-	Stop();
-	Continue();
+	stop();
+	start();
 	return total_time;
 #endif
  return -1;
diff --git a/src/core/tnlTimerRT.h b/src/core/tnlTimerRT.h
index 31ab6a4f8003c1dcb7e2d4bb26a0495fc591e8d8..7aa6305c6f8f48e8cdf6f25616b3438f0efb09f3 100644
--- a/src/core/tnlTimerRT.h
+++ b/src/core/tnlTimerRT.h
@@ -26,13 +26,13 @@ class tnlTimerRT
 
    tnlTimerRT();
 
-   void Reset();
+   void reset();
 
-   void Stop();
+   void stop();
 
-   void Continue();
+   void start();
 
-   double GetTime();
+   double getTime();
 
    protected:
 
diff --git a/src/core/vectors/tnlMultiVector1D_impl.h b/src/core/vectors/tnlMultiVector1D_impl.h
index 4bb82a17674f8144682eeb46b626d4e3ab39432d..6e1b6a6695f41c38454f24a8493d9ae68b4bbe90 100644
--- a/src/core/vectors/tnlMultiVector1D_impl.h
+++ b/src/core/vectors/tnlMultiVector1D_impl.h
@@ -250,4 +250,32 @@ bool tnlMultiVector< 1, Real, Device, Index > :: load( const tnlString& fileName
    return tnlObject :: load( fileName );
 }
 
+#ifdef TEMPLATE_EXPLICIT_INSTANTIATION
+
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 1, float,  tnlHost, int >;
+#endif
+extern template class tnlMultiVector< 1, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 1, float,  tnlHost, long int >;
+#endif
+extern template class tnlMultiVector< 1, double, tnlHost, long int >;
+#endif
+
+#ifdef HAVE_CUDA
+/*#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 1, float,  tnlCuda, int >;
+#endif
+extern template class tnlMultiVector< 1, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 1, float,  tnlCuda, long int >;
+#endif
+extern template class tnlMultiVector< 1, double, tnlCuda, long int >;
+#endif*/
+#endif
+
+#endif
+
 #endif /* TNLMULTIVECTOR1D_IMPL_H_ */
diff --git a/src/core/vectors/tnlMultiVector2D_impl.h b/src/core/vectors/tnlMultiVector2D_impl.h
index 479d64a31c3216c4f19baf5c48f18ec805dd2306..a964b062e641695bfcc29f30861c4c43b50f35f4 100644
--- a/src/core/vectors/tnlMultiVector2D_impl.h
+++ b/src/core/vectors/tnlMultiVector2D_impl.h
@@ -262,4 +262,33 @@ ostream& operator << ( ostream& str, const tnlMultiVector< 2, Real, Device, Inde
    return str;
 }
 
+#ifdef TEMPLATE_EXPLICIT_INSTANTIATION
+
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 2, float,  tnlHost, int >;
+#endif
+extern template class tnlMultiVector< 2, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 2, float,  tnlHost, long int >;
+#endif
+extern template class tnlMultiVector< 2, double, tnlHost, long int >;
+#endif
+
+#ifdef HAVE_CUDA
+/*#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 2, float,  tnlCuda, int >;
+#endif
+extern template class tnlMultiVector< 2, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 2, float,  tnlCuda, long int >;
+#endif
+extern template class tnlMultiVector< 2, double, tnlCuda, long int >;
+#endif*/
+#endif
+
+#endif
+
+
 #endif /* TNLMULTIVECTOR2D_IMPL_H_ */
diff --git a/src/core/vectors/tnlMultiVector3D_impl.h b/src/core/vectors/tnlMultiVector3D_impl.h
index 50082d3f9e6d4f758947deb78ca09560240f3a38..3a19302c373be1f7b56db3fee36b688aa6aa6a04 100644
--- a/src/core/vectors/tnlMultiVector3D_impl.h
+++ b/src/core/vectors/tnlMultiVector3D_impl.h
@@ -285,4 +285,33 @@ bool tnlMultiVector< 3, Real, Device, Index > :: load( const tnlString& fileName
    return tnlObject :: load( fileName );
 }
 
+#ifdef TEMPLATE_EXPLICIT_INSTANTIATION
+
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 3, float,  tnlHost, int >;
+#endif
+extern template class tnlMultiVector< 3, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 3, float,  tnlHost, long int >;
+#endif
+extern template class tnlMultiVector< 3, double, tnlHost, long int >;
+#endif
+
+#ifdef HAVE_CUDA
+/*#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 3, float,  tnlCuda, int >;
+#endif
+extern template class tnlMultiVector< 3, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 3, float,  tnlCuda, long int >;
+#endif
+extern template class tnlMultiVector< 3, double, tnlCuda, long int >;
+#endif*/
+#endif
+
+#endif
+
+
 #endif /* TNLMULTIVECTOR3D_IMPL_H_ */
diff --git a/src/core/vectors/tnlMultiVector4D_impl.h b/src/core/vectors/tnlMultiVector4D_impl.h
index 416acb45eaf492ee6b22bd267fc893c164ea6073..b843e4838df4691c6f101bdc793e3dad71c7aebe 100644
--- a/src/core/vectors/tnlMultiVector4D_impl.h
+++ b/src/core/vectors/tnlMultiVector4D_impl.h
@@ -306,4 +306,33 @@ bool tnlMultiVector< 4, Real, Device, Index > :: load( const tnlString& fileName
    return tnlObject :: load( fileName );
 }
 
+#ifdef TEMPLATE_EXPLICIT_INSTANTIATION
+
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 4, float,  tnlHost, int >;
+#endif
+extern template class tnlMultiVector< 4, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 4, float,  tnlHost, long int >;
+#endif
+extern template class tnlMultiVector< 4, double, tnlHost, long int >;
+#endif
+
+#ifdef HAVE_CUDA
+/*#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 4, float,  tnlCuda, int >;
+#endif
+extern template class tnlMultiVector< 4, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlMultiVector< 4, float,  tnlCuda, long int >;
+#endif
+extern template class tnlMultiVector< 4, double, tnlCuda, long int >;
+#endif*/
+#endif
+
+#endif
+
+
 #endif /* TNLMULTIVECTOR4D_IMPL_H_ */
diff --git a/src/core/vectors/tnlMultiVector_impl.cpp b/src/core/vectors/tnlMultiVector_impl.cpp
index 5dff1b99997c483d257bbced57a5a39cfe9f8903..5920c03ff640ad3500e4e88877a62344c24cb7aa 100644
--- a/src/core/vectors/tnlMultiVector_impl.cpp
+++ b/src/core/vectors/tnlMultiVector_impl.cpp
@@ -19,41 +19,92 @@
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 1, float,  tnlHost, int >;
+#endif
 template class tnlMultiVector< 1, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 1, float,  tnlHost, long int >;
+#endif
 template class tnlMultiVector< 1, double, tnlHost, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 2, float,  tnlHost, int >;
+#endif
 template class tnlMultiVector< 2, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 2, float,  tnlHost, long int >;
+#endif
 template class tnlMultiVector< 2, double, tnlHost, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 3, float,  tnlHost, int >;
+#endif
 template class tnlMultiVector< 3, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 3, float,  tnlHost, long int >;
+#endif
 template class tnlMultiVector< 3, double, tnlHost, long int >;
+#endif
+
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 4, float,  tnlHost, int >;
+#endif
 template class tnlMultiVector< 4, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 4, float,  tnlHost, long int >;
+#endif
 template class tnlMultiVector< 4, double, tnlHost, long int >;
+#endif
 
 #ifdef HAVE_CUDA
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 1, float,  tnlCuda, int >;
+#endif
 template class tnlMultiVector< 1, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 1, float,  tnlCuda, long int >;
+#endif
 template class tnlMultiVector< 1, double, tnlCuda, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 2, float,  tnlCuda, int >;
+#endif
 template class tnlMultiVector< 2, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 2, float,  tnlCuda, long int >;
+#endif
 template class tnlMultiVector< 2, double, tnlCuda, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 3, float,  tnlCuda, int >;
+#endif
 template class tnlMultiVector< 3, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 3, float,  tnlCuda, long int >;
+#endif
 template class tnlMultiVector< 3, double, tnlCuda, long int >;
+#endif
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 4, float,  tnlCuda, int >;
+#endif
 template class tnlMultiVector< 4, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlMultiVector< 4, float,  tnlCuda, long int >;
+#endif
 template class tnlMultiVector< 4, double, tnlCuda, long int >;
+#endif
 
 #endif
 
diff --git a/src/core/vectors/tnlSharedVector.h b/src/core/vectors/tnlSharedVector.h
index a3d6621a85da8b17fadab11d2065fbfa57723db3..d3e19391f1008a8219572c1c521e91e543a85bf3 100644
--- a/src/core/vectors/tnlSharedVector.h
+++ b/src/core/vectors/tnlSharedVector.h
@@ -20,7 +20,7 @@
 
 #include <core/arrays/tnlSharedArray.h>
 #include <core/vectors/tnlVector.h>
-#include <functions/tnlFunctionType.h>
+#include <functors/tnlFunctionType.h>
 
 class tnlHost;
 
@@ -78,6 +78,10 @@ class tnlSharedVector : public tnlSharedArray< Real, Device, Index >
 
    template< typename Vector >
    tnlSharedVector< Real, Device, Index >& operator += ( const Vector& vector );
+   
+   tnlSharedVector< Real, Device, Index >& operator *= ( const RealType& c );
+   
+   tnlSharedVector< Real, Device, Index >& operator /= ( const RealType& c );
 
    //bool save( tnlFile& file ) const;
 
@@ -125,25 +129,13 @@ class tnlSharedVector : public tnlSharedArray< Real, Device, Index >
                    const Real& alpha = 1.0,
                    const Real& thisMultiplicator = 1.0 );
 
-   //! Computes Y = alpha * X + beta * Y.
+   //! Computes this = thisMultiplicator * this + multiplicator1 * v1 + multiplicator2 * v2.
    template< typename Vector >
-   void alphaXPlusBetaY( const Real& alpha,
-                         const Vector& x,
-                         const Real& beta );
-
-   //! Computes Y = alpha * X + beta * Z
-   template< typename Vector >
-   void alphaXPlusBetaZ( const Real& alpha,
-                         const Vector& x,
-                         const Real& beta,
-                         const Vector& z );
-
-   //! Computes Y = Scalar Alpha X Plus Scalar Beta Z Plus Y
-   template< typename Vector >
-   void alphaXPlusBetaZPlusY( const Real& alpha,
-                              const Vector& x,
-                              const Real& beta,
-                              const Vector& z );
+   void addVectors( const Vector& v1,
+                    const Real& multiplicator1,
+                    const Vector& v2,
+                    const Real& multiplicator2,
+                    const Real& thisMultiplicator = 1.0 );
 
    void computePrefixSum();
 
diff --git a/src/core/vectors/tnlSharedVector_impl.cpp b/src/core/vectors/tnlSharedVector_impl.cpp
index 77a908e3a9939aa7ff9eaee38fa7ebca92fac84a..1993b3d301a7c8934514611be5ec032a7d35d1db 100644
--- a/src/core/vectors/tnlSharedVector_impl.cpp
+++ b/src/core/vectors/tnlSharedVector_impl.cpp
@@ -19,16 +19,42 @@
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlSharedVector< float, tnlHost, int >;
+#endif
 template class tnlSharedVector< double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlSharedVector< long double, tnlHost, int >;
+#endif
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlSharedVector< float, tnlHost, long int >;
+#endif
 template class tnlSharedVector< double, tnlHost, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlSharedVector< long double, tnlHost, long int >;
+#endif
+#endif
 
 #ifdef HAVE_CUDA
+#ifdef INSTANTIATE_FLOAT
 template class tnlSharedVector< float, tnlCuda, int >;
+#endif
 template class tnlSharedVector< double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlSharedVector< long double, tnlCuda, int >;
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlSharedVector< float, tnlCuda, long int >;
+#endif
 template class tnlSharedVector< double, tnlCuda, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlSharedVector< long double, tnlCuda, long int >;
+#endif
+#endif
+
 #endif
 
 #endif
diff --git a/src/core/vectors/tnlSharedVector_impl.h b/src/core/vectors/tnlSharedVector_impl.h
index 3e98e54783639601b37c388ed10ee352196d25e3..83a5c92e396a3a27de2075ae1e96db929f3d42de 100644
--- a/src/core/vectors/tnlSharedVector_impl.h
+++ b/src/core/vectors/tnlSharedVector_impl.h
@@ -151,7 +151,7 @@ template< typename Real,
    template< typename Vector >
 tnlSharedVector< Real, Device, Index >& tnlSharedVector< Real, Device, Index > :: operator -= ( const Vector& vector )
 {
-   alphaXPlusBetaY( -1.0, vector, 1.0 );
+   this->addVector( vector, -1.0 );
    return ( *this );
 }
 
@@ -161,10 +161,28 @@ template< typename Real,
    template< typename Vector >
 tnlSharedVector< Real, Device, Index >& tnlSharedVector< Real, Device, Index > :: operator += ( const Vector& vector )
 {
-   alphaXPlusBetaY( 1.0, vector, 1.0 );
+   this->addVector( vector );
    return ( *this );
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+tnlSharedVector< Real, Device, Index >& tnlSharedVector< Real, Device, Index > :: operator *= ( const RealType& c )
+{
+   tnlVectorOperations< Device >::vectorScalarMultiplication( *this, c );
+   return *this;
+}
+
+template< typename Real,
+          typename Device,
+          typename Index >
+tnlSharedVector< Real, Device, Index >& tnlSharedVector< Real, Device, Index > :: operator /= ( const RealType& c )
+{
+   tnlVectorOperations< Device >::vectorScalarMultiplication( *this, 1.0/ c );
+   return *this;
+}
+
 template< typename Real,
           typename Device,
           typename Index >
@@ -307,36 +325,17 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vector >
-void tnlSharedVector< Real, Device, Index > :: alphaXPlusBetaY( const Real& alpha,
-                                                                const Vector& x,
-                                                                const Real& beta )
-{
-   tnlVectorOperations< Device > :: alphaXPlusBetaY( *this, x, alpha, beta );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Vector >
-void tnlSharedVector< Real, Device, Index > :: alphaXPlusBetaZ( const Real& alpha,
-                                                                const Vector& x,
-                                                                const Real& beta,
-                                                                const Vector& z )
+void
+tnlSharedVector< Real, Device, Index >::
+addVectors( const Vector& v1,
+            const Real& multiplicator1,
+            const Vector& v2,
+            const Real& multiplicator2,
+            const Real& thisMultiplicator )
 {
-   tnlVectorOperations< Device > :: alphaXPlusBetaZ( *this, x, alpha, z, beta );
+   tnlVectorOperations< Device >::addVectors( *this, v1, multiplicator1, v2, multiplicator2, thisMultiplicator );
 }
 
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Vector >
-void tnlSharedVector< Real, Device, Index > :: alphaXPlusBetaZPlusY( const Real& alpha,
-                                                                     const Vector& x,
-                                                                     const Real& beta,
-                                                                     const Vector& z )
-{
-   tnlVectorOperations< Device > :: alphaXPlusBetaZPlusY( *this, x, alpha, z, beta );
-}
 template< typename Real,
           typename Device,
           typename Index >
@@ -374,17 +373,43 @@ void tnlSharedVector< Real, Device, Index > :: computeExclusivePrefixSum( const
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlSharedVector< float, tnlHost, int >;
+#endif
 extern template class tnlSharedVector< double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlSharedVector< long double, tnlHost, int >;
+#endif
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlSharedVector< float, tnlHost, long int >;
+#endif
 extern template class tnlSharedVector< double, tnlHost, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlSharedVector< long double, tnlHost, long int >;
+#endif
+#endif
 
 #ifdef HAVE_CUDA
 // TODO: fix this - it does not work with CUDA 5.5
-/*extern template class tnlSharedVector< float, tnlCuda, int >;
+/*
+#ifdef INSTANTIATE_FLOAT
+extern template class tnlSharedVector< float, tnlCuda, int >;
+#endif
 extern template class tnlSharedVector< double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlSharedVector< long double, tnlCuda, int >;
+#endif
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlSharedVector< float, tnlCuda, long int >;
-extern template class tnlSharedVector< double, tnlCuda, long int >;*/
+#endif
+extern template class tnlSharedVector< double, tnlCuda, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlSharedVector< long double, tnlCuda, long int >;
+#endif
+ #endif 
+ */
 #endif
 
 #endif
diff --git a/src/core/vectors/tnlStaticVector1D_impl.h b/src/core/vectors/tnlStaticVector1D_impl.h
index 5fd6030414486a4d3a16de8f2cff013fbf4d91df..048c9b9619f8c2f5f17aadf3891f2a1915b8a87a 100644
--- a/src/core/vectors/tnlStaticVector1D_impl.h
+++ b/src/core/vectors/tnlStaticVector1D_impl.h
@@ -166,9 +166,13 @@ bool tnlStaticVector< 1, Real >::operator >= ( const tnlStaticVector& v ) const
 
 #ifndef HAVE_CUDA
 // TODO: does not work with CUDA
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlStaticVector< 1, float >;
+#endif
 extern template class tnlStaticVector< 1, double >;
-//extern template class tnlStaticVector< 1, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlStaticVector< 1, long double >;
+#endif
 #endif
 
 #endif
diff --git a/src/core/vectors/tnlStaticVector2D_impl.h b/src/core/vectors/tnlStaticVector2D_impl.h
index 81c152dce7423aa1a6421dabf74b12e36b79a5d3..5a780d57fac158ab2c6f60e7c3f28b88c9aa0ff7 100644
--- a/src/core/vectors/tnlStaticVector2D_impl.h
+++ b/src/core/vectors/tnlStaticVector2D_impl.h
@@ -195,9 +195,13 @@ bool tnlStaticVector< 2, Real >::operator >= ( const tnlStaticVector& v ) const
 
 #ifndef HAVE_CUDA
 // TODO: does not work with CUDA
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlStaticVector< 2, float >;
+#endif
 extern template class tnlStaticVector< 2, double >;
-//extern template class tnlStaticVector< 2, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlStaticVector< 2, long double >;
+#endif
 #endif
 
 #endif
diff --git a/src/core/vectors/tnlStaticVector3D_impl.h b/src/core/vectors/tnlStaticVector3D_impl.h
index 50f4078e63aaa5c3b13166b89ea24bfc62d81b27..8772a796f8ee52f34e03fe6dc7b4c59ab77ceac8 100644
--- a/src/core/vectors/tnlStaticVector3D_impl.h
+++ b/src/core/vectors/tnlStaticVector3D_impl.h
@@ -206,9 +206,13 @@ bool tnlStaticVector< 3, Real >::operator >= ( const tnlStaticVector& v ) const
 
 #ifndef HAVE_CUDA
 // TODO: does not work with CUDA
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlStaticVector< 3, float >;
+#endif
 extern template class tnlStaticVector< 3, double >;
-//extern template class tnlStaticVector< 3, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlStaticVector< 3, long double >;
+#endif
 #endif
 
 #endif
diff --git a/src/core/vectors/tnlStaticVector_impl.cpp b/src/core/vectors/tnlStaticVector_impl.cpp
index ab9fe2be93e7b3c7c311659cd965e054e6acc0f4..1b20e1a53aaf6fedf9b0de09d8d1d6691506741d 100644
--- a/src/core/vectors/tnlStaticVector_impl.cpp
+++ b/src/core/vectors/tnlStaticVector_impl.cpp
@@ -20,21 +20,37 @@
 #ifndef HAVE_CUDA
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticVector< 1, float >;
+#endif
 template class tnlStaticVector< 1, double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
 template class tnlStaticVector< 1, long double >;
+#endif
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticVector< 2, float >;
+#endif
 template class tnlStaticVector< 2, double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
 template class tnlStaticVector< 2, long double >;
+#endif
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticVector< 3, float >;
+#endif
 template class tnlStaticVector< 3, double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
 template class tnlStaticVector< 3, long double >;
+#endif
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticVector< 4, float >;
+#endif
 template class tnlStaticVector< 4, double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
 template class tnlStaticVector< 4, long double >;
+#endif
 
 #endif
 #endif
diff --git a/src/core/vectors/tnlStaticVector_impl.cu b/src/core/vectors/tnlStaticVector_impl.cu
index 938d0c47337fb389add036ab68dc7444474e4b16..c8ecd5fe583f9336a6e1e2283d2c4f187e8076fc 100644
--- a/src/core/vectors/tnlStaticVector_impl.cu
+++ b/src/core/vectors/tnlStaticVector_impl.cu
@@ -20,21 +20,37 @@
 #ifdef HAVE_CUDA
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticVector< 1, float >;
+#endif
 template class tnlStaticVector< 1, double >;
-//template class tnlStaticVector< 1, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlStaticVector< 1, long double >;
+#endif
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticVector< 2, float >;
+#endif
 template class tnlStaticVector< 2, double >;
-//template class tnlStaticVector< 2, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlStaticVector< 2, long double >;
+#endif
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticVector< 3, float >;
+#endif
 template class tnlStaticVector< 3, double >;
-//template class tnlStaticVector< 3, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlStaticVector< 3, long double >;
+#endif
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlStaticVector< 4, float >;
+#endif
 template class tnlStaticVector< 4, double >;
-//template class tnlStaticVector< 4, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlStaticVector< 4, long double >;
+#endif
 
 #endif
 #endif
diff --git a/src/core/vectors/tnlStaticVector_impl.h b/src/core/vectors/tnlStaticVector_impl.h
index 208aca2bf923b66688382c502e08d17f759822c2..7adfd45fc5279341605dc6ccb53239b7be97405d 100644
--- a/src/core/vectors/tnlStaticVector_impl.h
+++ b/src/core/vectors/tnlStaticVector_impl.h
@@ -202,9 +202,13 @@ tnlStaticVector< Size, Real > operator * ( const Real& c, const tnlStaticVector<
 
 #ifndef HAVE_CUDA
 // TODO: does not work with CUDA
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlStaticVector< 4, float >;
+#endif
 extern template class tnlStaticVector< 4, double >;
-//extern template class tnlStaticVector< 4, long double >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlStaticVector< 4, long double >;
+#endif
 #endif
 
 #endif
diff --git a/src/core/vectors/tnlVector.h b/src/core/vectors/tnlVector.h
index 3578812a583db6a9bde34c94b9576ca128fce638..be32ab03b7eaab1b199cc55d71ec1e9f15a9c51c 100644
--- a/src/core/vectors/tnlVector.h
+++ b/src/core/vectors/tnlVector.h
@@ -19,7 +19,7 @@
 #define TNLVECTOR_H_
 
 #include <core/arrays/tnlArray.h>
-#include <functions/tnlFunctionType.h>
+#include <functors/tnlFunctionType.h>
 
 class tnlHost;
 
@@ -75,8 +75,9 @@ class tnlVector : public tnlArray< Real, Device, Index >
    template< typename Vector >
    tnlVector< Real, Device, Index >& operator += ( const Vector& vector );
 
-   // TODO: implement
-   //tnlVector< Real, Device, Index >& operator *= ( const RealType& c );
+   tnlVector< Real, Device, Index >& operator *= ( const RealType& c );
+   
+   tnlVector< Real, Device, Index >& operator /= ( const RealType& c );
 
    Real max() const;
 
@@ -114,31 +115,20 @@ class tnlVector : public tnlArray< Real, Device, Index >
    template< typename Vector >
    Real scalarProduct( const Vector& v );
 
-   //! Computes Y = alpha * X + Y.
+   //! Computes this = thisMultiplicator * this + multiplicator * v.
    template< typename Vector >
    void addVector( const Vector& v,
                    const Real& multiplicator = 1.0,
                    const Real& thisMultiplicator = 1.0 );
 
-   //! Computes Y = alpha * X + beta * Y.
-   template< typename Vector >
-   void alphaXPlusBetaY( const Real& alpha,
-                         const Vector& x,
-                         const Real& beta );
-
-   //! Computes Y = alpha * X + beta * Z
-   template< typename Vector >
-   void alphaXPlusBetaZ( const Real& alpha,
-                         const Vector& x,
-                         const Real& beta,
-                         const Vector& z );
 
-   //! Computes Y = Scalar Alpha X Plus Scalar Beta Z Plus Y
+   //! Computes this = thisMultiplicator * this + multiplicator1 * v1 + multiplicator2 * v2.
    template< typename Vector >
-   void alphaXPlusBetaZPlusY( const Real& alpha,
-                              const Vector& x,
-                              const Real& beta,
-                              const Vector& z );
+   void addVectors( const Vector& v1,
+                    const Real& multiplicator1,
+                    const Vector& v2,
+                    const Real& multiplicator2,
+                    const Real& thisMultiplicator = 1.0 );
 
    void computePrefixSum();
 
diff --git a/src/core/vectors/tnlVectorOperations.h b/src/core/vectors/tnlVectorOperations.h
index b065a2bf97cb3a690433011dcc73cbbf5dc6d5b4..3438b5250ff5a39dfc3529c5cf452938ff59cdbc 100644
--- a/src/core/vectors/tnlVectorOperations.h
+++ b/src/core/vectors/tnlVectorOperations.h
@@ -18,7 +18,6 @@
 #ifndef TNLVECTOROPERATIONS_H_
 #define TNLVECTOROPERATIONS_H_
 
-#include <core/tnlCuda.h>
 #include <core/cuda/cuda-reduction.h>
 #include <core/cuda/reduction-operations.h>
 #include <core/tnlHost.h>
@@ -99,26 +98,14 @@ class tnlVectorOperations< tnlHost >
                           const Vector2& v,
                           const typename Vector2::RealType& multiplicator,
                           const typename Vector1::RealType& thisMultiplicator = 1.0 );
-
-   template< typename Vector1, typename Vector2 >
-   static void alphaXPlusBetaY( Vector1& y,
-                                const Vector2& x,
-                                const typename Vector1::RealType& alpha,
-                                const typename Vector1::RealType& beta );
-
-   template< typename Vector1, typename Vector2 >
-   static void alphaXPlusBetaZ( Vector1& y,
-                                const Vector2& x,
-                                const typename Vector1::RealType& alpha,
-                                const Vector2& z,
-                                const typename Vector1::RealType& beta );
-
-   template< typename Vector1, typename Vector2 >
-   static void alphaXPlusBetaZPlusY( Vector1& y,
-                                     const Vector2& x,
-                                     const typename Vector1::RealType& alpha,
-                                     const Vector2& z,
-                                     const typename Vector1::RealType& beta );
+   
+   template< typename Vector1, typename Vector2, typename Vector3 >
+   static void addVectors( Vector1& v,
+                           const Vector2& v1,
+                           const typename Vector2::RealType& multiplicator1,
+                           const Vector3& v2,
+                           const typename Vector3::RealType& multiplicator2,
+                           const typename Vector1::RealType& thisMultiplicator = 1.0 );
 
    template< typename Vector >
    static void computePrefixSum( Vector& v,
@@ -204,26 +191,15 @@ class tnlVectorOperations< tnlCuda >
                           const Vector2& x,
                           const typename Vector2::RealType& alpha,
                           const typename Vector1::RealType& thisMultiplicator = 1.0 );
-
-   template< typename Vector1, typename Vector2 >
-   static void alphaXPlusBetaY( Vector1& y,
-                                const Vector2& x,
-                                const typename Vector1::RealType& alpha,
-                                const typename Vector1::RealType& beta );
-
-   template< typename Vector1, typename Vector2 >
-   static void alphaXPlusBetaZ( Vector1& y,
-                                const Vector2& x,
-                                const typename Vector1::RealType& alpha,
-                                const Vector2& z,
-                                const typename Vector1::RealType& beta );
-
-   template< typename Vector1, typename Vector2 >
-   static void alphaXPlusBetaZPlusY( Vector1& y,
-                                     const Vector2& x,
-                                     const typename Vector1::RealType& alpha,
-                                     const Vector2& z,
-                                     const typename Vector1::RealType& beta );
+   
+   template< typename Vector1, typename Vector2, typename Vector3 >
+   static void addVectors( Vector1& v,
+                           const Vector2& v1,
+                           const typename Vector2::RealType& multiplicator1,
+                           const Vector3& v2,
+                           const typename Vector3::RealType& multiplicator2,
+                           const typename Vector1::RealType& thisMultiplicator = 1.0 );
+   
 
    template< typename Vector >
    static void computePrefixSum( Vector& v,
diff --git a/src/core/vectors/tnlVectorOperationsCuda_impl.h b/src/core/vectors/tnlVectorOperationsCuda_impl.h
index 0f5eb8371407acd4c8b5bac8494364085acc3d69..41d537fb5f9c4cb85bd7e4bf503048765b43198b 100644
--- a/src/core/vectors/tnlVectorOperationsCuda_impl.h
+++ b/src/core/vectors/tnlVectorOperationsCuda_impl.h
@@ -18,7 +18,9 @@
 #ifndef TNLVECTOROPERATIONSCUDA_IMPL_H_
 #define TNLVECTOROPERATIONSCUDA_IMPL_H_
 
+#include <tnlConfig.h>
 #include <core/cuda/cuda-prefix-sum.h>
+#include <core/cuda/tnlCublasWrapper.h>
 
 template< typename Vector >
 void tnlVectorOperations< tnlCuda >::addElement( Vector& v,
@@ -52,7 +54,7 @@ typename Vector :: RealType tnlVectorOperations< tnlCuda > :: getVectorMax( cons
                           v. getSize(),
                           v. getData(),
                           ( Real* ) 0,
-                          result );
+                          result );   
    return result;
 }
 
@@ -245,9 +247,11 @@ typename Vector1 :: RealType tnlVectorOperations< tnlCuda > :: getVectorDifferen
 }
 
 template< typename Vector1, typename Vector2 >
-typename Vector1 :: RealType tnlVectorOperations< tnlCuda > :: getVectorDifferenceLpNorm( const Vector1& v1,
-                                                               const Vector2& v2,
-                                                               const typename Vector1 :: RealType& p )
+typename Vector1::RealType
+tnlVectorOperations< tnlCuda >::
+getVectorDifferenceLpNorm( const Vector1& v1,
+                           const Vector2& v2,
+                           const typename Vector1 :: RealType& p )
 {
    typedef typename Vector1 :: RealType Real;
    typedef typename Vector1 :: IndexType Index;
@@ -347,6 +351,12 @@ typename Vector1 :: RealType tnlVectorOperations< tnlCuda > :: getScalarProduct(
               cerr << "Vector names are " << v1. getName() << " and " << v2. getName() );
 
    Real result( 0 );
+/*#if defined HAVE_CUBLAS && defined HAVE_CUDA
+   if( tnlCublasWrapper< typename Vector1::RealType,
+                         typename Vector2::RealType,
+                         typename Vector1::IndexType >::dot( v1.getData(), v1.getData(), v1.getSize(), result ) )
+       return result;
+#endif*/
    tnlParallelReductionScalarProduct< Real, Index > operation;
    reductionOnCudaDevice( operation,
                           v1. getSize(),
@@ -422,158 +432,81 @@ void tnlVectorOperations< tnlCuda > :: addVector( Vector1& y,
 #ifdef HAVE_CUDA
 template< typename Real,
           typename Index >
-__global__ void vectorAlphaXPlusBetaYCudaKernel( Real* y,
-                                                 const Real* x,
-                                                 const Index size,
-                                                 const Real alpha,
-                                                 const Real beta )
+__global__ void vectorAddVectorsCudaKernel( Real* v,
+                                            const Real* v1,
+                                            const Real* v2,
+                                            const Index size,
+                                            const Real multiplicator1,
+                                            const Real multiplicator2,
+                                            const Real thisMultiplicator )
 {
    Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x;
    const Index maxGridSize = blockDim. x * gridDim. x;
-   while( elementIdx < size )
-   {
-      y[ elementIdx ] = alpha * x[ elementIdx ] + beta * y[ elementIdx ];
-      elementIdx += maxGridSize;
-   }
+   if( thisMultiplicator == 1.0 )
+      while( elementIdx < size )
+      {
+         v[ elementIdx ] += multiplicator1 * v1[ elementIdx ] +
+                            multiplicator2 * v2[ elementIdx ];
+         elementIdx += maxGridSize;
+      }
+   else
+      while( elementIdx < size )
+      {
+         v[ elementIdx ] = thisMultiplicator * v[ elementIdx ] +
+                           multiplicator1 * v1[ elementIdx ] +
+                           multiplicator2 * v2[ elementIdx ];
+         elementIdx += maxGridSize;
+      }
 }
 #endif
 
-template< typename Vector1, typename Vector2 >
-void tnlVectorOperations< tnlCuda > :: alphaXPlusBetaY( Vector1& y,
-                                                        const Vector2& x,
-                                                        const typename Vector1::RealType& alpha,
-                                                        const typename Vector1::RealType& beta )
+
+template< typename Vector1,
+          typename Vector2,
+          typename Vector3 >
+void
+tnlVectorOperations< tnlCuda >::
+addVectors( Vector1& v,
+            const Vector2& v1,
+            const typename Vector2::RealType& multiplicator1,
+            const Vector3& v2,
+            const typename Vector3::RealType& multiplicator2,
+            const typename Vector1::RealType& thisMultiplicator )
 {
    typedef typename Vector1 :: RealType Real;
    typedef typename Vector1 :: IndexType Index;
 
-   tnlAssert( y. getSize() > 0,
-              cerr << "Vector name is " << y. getName() );
-   tnlAssert( y. getSize() == x. getSize(),
-              cerr << "Vector names are " << x. getName() << " and " << y. getName() );
+   tnlAssert( v.getSize() > 0,
+              cerr << "Vector name is " << v.getName() );
+   tnlAssert( v.getSize() == v1.getSize(),
+              cerr << "Vector names are " << v.getName() << " and " << v1.getName() );
+   tnlAssert( v.getSize() == v2.getSize(),
+              cerr << "Vector names are " << v.getName() << " and " << v2.getName() );
+   tnlAssert( v.getData() != 0, );
+   tnlAssert( v1.getData() != 0, );
+   tnlAssert( v2.getData() != 0, );
 
    #ifdef HAVE_CUDA
       dim3 blockSize( 0 ), gridSize( 0 );
-      const Index& size = x.getSize();
-      blockSize. x = 256;
-      Index blocksNumber = ceil( ( double ) size / ( double ) blockSize. x );
-      gridSize. x = Min( blocksNumber, tnlCuda::getMaxGridSize() );
-      vectorAlphaXPlusBetaYCudaKernel<<< gridSize, blockSize >>>( y.getData(),
-                                                                  x.getData(),
-                                                                  size,
-                                                                  alpha,
-                                                                  beta );
-      checkCudaDevice;
-   #else
-      tnlCudaSupportMissingMessage;;
-   #endif
-}
-
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index >
-__global__ void vectorAlphaXPlusBetaZCudaKernel( Real* y,
-                                                 const Real* x,
-                                                 const Real* z,
-                                                 const Index size,
-                                                 const Real alpha,
-                                                 const Real beta )
-{
-   Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x;
-   const Index maxGridSize = blockDim. x * gridDim. x;
-   while( elementIdx < size )
-   {
-      y[ elementIdx ] = alpha * x[ elementIdx ] + beta * z[ elementIdx ];
-      elementIdx += maxGridSize;
-   }
-}
-#endif
-
-
-template< typename Vector1, typename Vector2 >
-void tnlVectorOperations< tnlCuda > :: alphaXPlusBetaZ( Vector1& y,
-                                                        const Vector2& x,
-                                                        const typename Vector1 :: RealType& alpha,
-                                                        const Vector2& z,
-                                                        const typename Vector1 :: RealType& beta )
-{
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
 
-   tnlAssert( y.getSize() > 0,
-              cerr << "Vector name is " << y.getName() );
-   tnlAssert( y.getSize() == x.getSize() && x.getSize() == z.getSize(),
-              cerr << "Vector names are " << x.getName() << ", " << y.getName() << " and " << z.getName() );
+      const Index& size = v.getSize();
+      dim3 cudaBlockSize( 256 );
+      dim3 cudaBlocks;
+      cudaBlocks.x = Min( tnlCuda::getMaxGridSize(), tnlCuda::getNumberOfBlocks( size, cudaBlockSize.x ) );      
 
-   #ifdef HAVE_CUDA
-      dim3 blockSize( 0 ), gridSize( 0 );
-      const Index& size = x.getSize();
-      blockSize. x = 256;
-      Index blocksNumber = ceil( ( double ) size / ( double ) blockSize. x );
-      gridSize. x = Min( blocksNumber, tnlCuda::getMaxGridSize() );
-      vectorAlphaXPlusBetaZCudaKernel<<< gridSize, blockSize >>>( y.getData(),
-                                                                  x.getData(),
-                                                                  z.getData(),
-                                                                  size,
-                                                                  alpha,
-                                                                  beta );
+      vectorAddVectorsCudaKernel<<< cudaBlocks, cudaBlockSize >>>( v.getData(),
+                                                                   v1.getData(),
+                                                                   v2.getData(),
+                                                                   size,
+                                                                   multiplicator1,
+                                                                   multiplicator2,
+                                                                   thisMultiplicator);
       checkCudaDevice;
    #else
       tnlCudaSupportMissingMessage;;
    #endif
-}
 
-#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index >
-__global__ void vectorAlphaXPlusBetaZPlusYCudaKernel( Real* y,
-                                                     const Real* x,
-                                                     const Real* z,
-                                                     const Index size,
-                                                     const Real alpha,
-                                                     const Real beta )
-{
-   Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x;
-   const Index maxGridSize = blockDim. x * gridDim. x;
-   while( elementIdx < size )
-   {
-      y[ elementIdx ] += alpha * x[ elementIdx ] + beta * z[ elementIdx ];
-      elementIdx += maxGridSize;
-   }
-}
-#endif
 
-template< typename Vector1, typename Vector2 >
-void tnlVectorOperations< tnlCuda > :: alphaXPlusBetaZPlusY( Vector1& y,
-                                                             const Vector2& x,
-                                                             const typename Vector1 :: RealType& alpha,
-                                                             const Vector2& z,
-                                                             const typename Vector1 :: RealType& beta )
-{
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
-
-   tnlAssert( y.getSize() > 0,
-              cerr << "Vector name is " << y.getName() );
-   tnlAssert( y.getSize() == x.getSize() && x.getSize() == z.getSize(),
-              cerr << "Vector names are " << x.getName() << ", " << y.getName() << " and " << z.getName() );
-
-   #ifdef HAVE_CUDA
-      dim3 blockSize( 0 ), gridSize( 0 );
-      const Index& size = x.getSize();
-      blockSize. x = 256;
-      Index blocksNumber = ceil( ( double ) size / ( double ) blockSize. x );
-      gridSize. x = Min( blocksNumber, tnlCuda::getMaxGridSize() );
-      vectorAlphaXPlusBetaZPlusYCudaKernel<<< gridSize, blockSize >>>( y.getData(),
-                                                                       x.getData(),
-                                                                       z.getData(),
-                                                                       size,
-                                                                       alpha,
-                                                                       beta );
-      checkCudaDevice;
-   #else
-      tnlCudaSupportMissingMessage;;
-   #endif
 }
 
 template< typename Vector >
@@ -626,140 +559,250 @@ void tnlVectorOperations< tnlCuda >::computeExclusivePrefixSum( Vector& v,
  */
 extern template int         tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< int, tnlCuda, int >& v );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< float, tnlCuda, int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< double, tnlCuda, int >& v );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< int, tnlCuda, long int >& v );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< double, tnlCuda, long int >& v );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
 
 /****
  * Min
  */
 extern template int         tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< int, tnlCuda, int >& v );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< float, tnlCuda, int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< double, tnlCuda, int >& v );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< int, tnlCuda, long int >& v );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< double, tnlCuda, long int >& v );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
 
 /****
  * Abs max
  */
 extern template int         tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< int, tnlCuda, int >& v );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< float, tnlCuda, int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< double, tnlCuda, int >& v );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< int, tnlCuda, long int >& v );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< double, tnlCuda, long int >& v );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
 
 /****
  * Abs min
  */
 extern template int         tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< int, tnlCuda, int >& v );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< float, tnlCuda, int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< double, tnlCuda, int >& v );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< int, tnlCuda, long int >& v );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< double, tnlCuda, long int >& v );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
 
 /****
  * Lp norm
  */
 extern template int         tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< int, tnlCuda, int >& v, const int& p );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long int, tnlCuda, int >& v, const long int& p );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< float, tnlCuda, int >& v, const float& p );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< double, tnlCuda, int >& v, const double& p );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long double, tnlCuda, int >& v, const long double& p );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long double, tnlCuda, int >& v, const long double& p );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< int, tnlCuda, long int >& v, const int& p );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long int, tnlCuda, long int >& v, const long int& p );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< float, tnlCuda, long int >& v, const float& p );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< double, tnlCuda, long int >& v, const double& p );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long double, tnlCuda, long int >& v, const long double& p );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long double, tnlCuda, long int >& v, const long double& p );
+#endif
+#endif
 
 /****
  * Sum
  */
 extern template int         tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< int, tnlCuda, int >& v );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< float, tnlCuda, int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< double, tnlCuda, int >& v );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< int, tnlCuda, long int >& v );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< double, tnlCuda, long int >& v );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
 
 /****
  * Difference max
  */
 extern template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference min
  */
 extern template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference abs max
  */
 extern template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference abs min
  */
 extern template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
 extern template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
 extern template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
-//extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
 
 #endif
 
diff --git a/src/core/vectors/tnlVectorOperationsHost_impl.h b/src/core/vectors/tnlVectorOperationsHost_impl.h
index cd3c2a17a17c1bf933d4bad46e2daa69404964f8..29d8485eb08c20501b6cefe1c49bb3e055fb091a 100644
--- a/src/core/vectors/tnlVectorOperationsHost_impl.h
+++ b/src/core/vectors/tnlVectorOperationsHost_impl.h
@@ -18,6 +18,7 @@
 #ifndef TNLVECTOROPERATIONSHOST_IMPL_H_
 #define TNLVECTOROPERATIONSHOST_IMPL_H_
 
+static const int OpenMPVectorOperationsThreshold = 65536; // TODO: check this threshold
 
 template< typename Vector >
 void tnlVectorOperations< tnlHost >::addElement( Vector& v,
@@ -313,8 +314,11 @@ typename Vector1 :: RealType tnlVectorOperations< tnlHost > :: getScalarProduct(
 
    Real result = 0;
    const Index n = v1. getSize();
+#ifdef HAVE_OPENMP
+#pragma omp parallel for reduction(+:result) if( n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
+#endif     
    for( Index i = 0; i < n; i ++ )
-      result += v1. getElement( i ) * v2. getElement( i );
+      result += v1[ i ] * v2[ i ];
    return result;
 }
 
@@ -339,72 +343,38 @@ void tnlVectorOperations< tnlHost > :: addVector( Vector1& y,
    else
       for( Index i = 0; i < n; i ++ )
          y[ i ] = thisMultiplicator * y[ i ] + alpha * x[ i ];
-
-}
-
-template< typename Vector1, typename Vector2 >
-void tnlVectorOperations< tnlHost > :: alphaXPlusBetaY( Vector1& y,
-                                                        const Vector2& x,
-                                                        const typename Vector1::RealType& alpha,
-                                                        const typename Vector1::RealType& beta )
-{
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
-
-   tnlAssert( x. getSize() > 0,
-              cerr << "Vector name is " << x. getName() );
-   tnlAssert( x. getSize() == y. getSize(),
-              cerr << "Vector names are " << x. getName() << " and " << y. getName() );
-
-   const Index n = y. getSize();
-   for( Index i = 0; i < n; i ++ )
-      y[ i ] = alpha * x[ i ] + beta * y[ i ];
-}
-
-
-template< typename Vector1, typename Vector2 >
-void tnlVectorOperations< tnlHost > :: alphaXPlusBetaZ( Vector1& y,
-                                                        const Vector2& x,
-                                                        const typename Vector1 :: RealType& alpha,
-                                                        const Vector2& z,
-                                                        const typename Vector1 :: RealType& beta )
-{
-   typedef typename Vector1 :: RealType Real;
-   typedef typename Vector1 :: IndexType Index;
-
-   tnlAssert( x. getSize() > 0,
-              cerr << "Vector name is " << x. getName() );
-   tnlAssert( x. getSize() == y. getSize(),
-              cerr << "Vector names are " << x. getName() << " and " << y. getName() );
-   tnlAssert( x. getSize() == z. getSize(),
-              cerr << "Vector names are " << x. getName() << " and " << z. getName() );
-
-
-   const Index n = y. getSize();
-   for( Index i = 0; i < n; i ++ )
-      y[ i ] = alpha * x[ i ] + beta *  z[ i ];
 }
 
-template< typename Vector1, typename Vector2 >
-void tnlVectorOperations< tnlHost > :: alphaXPlusBetaZPlusY( Vector1& y,
-                                                             const Vector2& x,
-                                                             const typename Vector1 :: RealType& alpha,
-                                                             const Vector2& z,
-                                                             const typename Vector1 :: RealType& beta )
+template< typename Vector1,
+          typename Vector2,
+          typename Vector3 >
+void
+tnlVectorOperations< tnlHost >::
+addVectors( Vector1& v,
+            const Vector2& v1,
+            const typename Vector2::RealType& multiplicator1,
+            const Vector3& v2,
+            const typename Vector3::RealType& multiplicator2,
+            const typename Vector1::RealType& thisMultiplicator )
 {
    typedef typename Vector1 :: RealType Real;
    typedef typename Vector1 :: IndexType Index;
 
-   tnlAssert( x. getSize() > 0,
-              cerr << "Vector name is " << x. getName() );
-   tnlAssert( x. getSize() == y. getSize(),
-              cerr << "Vector names are " << x. getName() << " and " << y. getName() );
-   tnlAssert( x. getSize() == z. getSize(),
-              cerr << "Vector names are " << x. getName() << " and " << z. getName() );
+   tnlAssert( v.getSize() > 0,
+              cerr << "Vector name is " << v.getName() );
+   tnlAssert( v.getSize() == v1.getSize(),
+              cerr << "Vector names are " << v.getName() << " and " << v1.getName() );
+   tnlAssert( v.getSize() == v2.getSize(),
+              cerr << "Vector names are " << v.getName() << " and " << v2.getName() );
 
-   const Index n = y. getSize();
-   for( Index i = 0; i < n; i ++ )
-      y[ i ] += alpha * x[ i ] + beta *  z[ i ];
+   
+   const Index n = v.getSize();
+   if( thisMultiplicator == 1.0 )
+      for( Index i = 0; i < n; i ++ )
+         v[ i ] += multiplicator1 * v1[ i ] + multiplicator2 * v2[ i ];
+   else
+      for( Index i = 0; i < n; i ++ )
+         v[ i ] = thisMultiplicator * v[ i ] + multiplicator1 * v1[ i ] + multiplicator2 * v2[ i ];
 }
 
 template< typename Vector >
@@ -443,141 +413,251 @@ void tnlVectorOperations< tnlHost >::computeExclusivePrefixSum( Vector& v,
  */
 extern template int         tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< int, tnlHost, int >& v );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< long int, tnlHost, int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< float, tnlHost, int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< double, tnlHost, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< long double, tnlHost, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< int, tnlHost, long int >& v );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< long int, tnlHost, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< float, tnlHost, long int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< double, tnlHost, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< long double, tnlHost, long int >& v );
+#endif
+#endif
 
 /****
  * Min
  */
 extern template int         tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< int, tnlHost, int >& v );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< long int, tnlHost, int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< float, tnlHost, int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< double, tnlHost, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< long double, tnlHost, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< int, tnlHost, long int >& v );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< long int, tnlHost, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< float, tnlHost, long int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< double, tnlHost, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< long double, tnlHost, long int >& v );
+#endif
+#endif
 
 /****
  * Abs max
  */
 extern template int         tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< int, tnlHost, int >& v );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< long int, tnlHost, int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< float, tnlHost, int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< double, tnlHost, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< long double, tnlHost, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< int, tnlHost, long int >& v );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< long int, tnlHost, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< float, tnlHost, long int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< double, tnlHost, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< long double, tnlHost, long int >& v );
+#endif
+#endif
 
 /****
  * Abs min
  */
 extern template int         tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< int, tnlHost, int >& v );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< long int, tnlHost, int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< float, tnlHost, int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< double, tnlHost, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< long double, tnlHost, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< int, tnlHost, long int >& v );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< long int, tnlHost, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< float, tnlHost, long int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< double, tnlHost, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< long double, tnlHost, long int >& v );
+#endif
+#endif
 
 /****
  * Lp norm
  */
 extern template int         tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< int, tnlHost, int >& v, const int& p );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< long int, tnlHost, int >& v, const long int& p );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< float, tnlHost, int >& v, const float& p );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< double, tnlHost, int >& v, const double& p );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< long double, tnlHost, int >& v, const long double& p );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< int, tnlHost, long int >& v, const int& p );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< long int, tnlHost, long int >& v, const long int& p );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< float, tnlHost, long int >& v, const float& p );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< double, tnlHost, long int >& v, const double& p );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< long double, tnlHost, long int >& v, const long double& p );
+#endif
+#endif
 
 /****
  * Sum
  */
 extern template int         tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< int, tnlHost, int >& v );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< long int, tnlHost, int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< float, tnlHost, int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< double, tnlHost, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< long double, tnlHost, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< int, tnlHost, long int >& v );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< long int, tnlHost, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< float, tnlHost, long int >& v );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< double, tnlHost, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< long double, tnlHost, long int >& v );
+#endif
+#endif
 
 /****
  * Difference max
  */
 extern template int         tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< int, tnlHost, int >& v1, const tnlVector< int, tnlHost, int >& v2 );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< long int, tnlHost, int >& v1, const tnlVector< long int, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< float, tnlHost, int >& v1,  const tnlVector< float, tnlHost, int >& v2);
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< double, tnlHost, int >& v1, const tnlVector< double, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< long double, tnlHost, int >& v1, const tnlVector< long double, tnlHost, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< int, tnlHost, long int >& v1, const tnlVector< int, tnlHost, long int >& v2 );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< long int, tnlHost, long int >& v1, const tnlVector< long int, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< float, tnlHost, long int >& v1, const tnlVector< float, tnlHost, long int >& v2 );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< double, tnlHost, long int >& v1, const tnlVector< double, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< long double, tnlHost, long int >& v1, const tnlVector< long double, tnlHost, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference min
  */
 extern template int         tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< int, tnlHost, int >& v1, const tnlVector< int, tnlHost, int >& v2 );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< long int, tnlHost, int >& v1, const tnlVector< long int, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< float, tnlHost, int >& v1,  const tnlVector< float, tnlHost, int >& v2);
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< double, tnlHost, int >& v1, const tnlVector< double, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< long double, tnlHost, int >& v1, const tnlVector< long double, tnlHost, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< int, tnlHost, long int >& v1, const tnlVector< int, tnlHost, long int >& v2 );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< long int, tnlHost, long int >& v1, const tnlVector< long int, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< float, tnlHost, long int >& v1, const tnlVector< float, tnlHost, long int >& v2 );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< double, tnlHost, long int >& v1, const tnlVector< double, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< long double, tnlHost, long int >& v1, const tnlVector< long double, tnlHost, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference abs max
  */
 extern template int         tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< int, tnlHost, int >& v1, const tnlVector< int, tnlHost, int >& v2 );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< long int, tnlHost, int >& v1, const tnlVector< long int, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< float, tnlHost, int >& v1,  const tnlVector< float, tnlHost, int >& v2);
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< double, tnlHost, int >& v1, const tnlVector< double, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlHost, int >& v1, const tnlVector< long double, tnlHost, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< int, tnlHost, long int >& v1, const tnlVector< int, tnlHost, long int >& v2 );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< long int, tnlHost, long int >& v1, const tnlVector< long int, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< float, tnlHost, long int >& v1, const tnlVector< float, tnlHost, long int >& v2 );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< double, tnlHost, long int >& v1, const tnlVector< double, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlHost, long int >& v1, const tnlVector< long double, tnlHost, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference abs min
  */
 extern template int         tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< int, tnlHost, int >& v1, const tnlVector< int, tnlHost, int >& v2 );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< long int, tnlHost, int >& v1, const tnlVector< long int, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< float, tnlHost, int >& v1,  const tnlVector< float, tnlHost, int >& v2);
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< double, tnlHost, int >& v1, const tnlVector< double, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlHost, int >& v1, const tnlVector< long double, tnlHost, int >& v2 );
+#endif
+
+
+#ifdef INSTANTIATE_LONG_INT
 extern template int         tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< int, tnlHost, long int >& v1, const tnlVector< int, tnlHost, long int >& v2 );
 extern template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< long int, tnlHost, long int >& v1, const tnlVector< long int, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 extern template float       tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< float, tnlHost, long int >& v1, const tnlVector< float, tnlHost, long int >& v2 );
+#endif
 extern template double      tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< double, tnlHost, long int >& v1, const tnlVector< double, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 extern template long double tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlHost, long int >& v1, const tnlVector< long double, tnlHost, long int >& v2 );
-
+#endif
+#endif
 
 #endif
 
diff --git a/src/core/vectors/tnlVectorOperations_impl.cpp b/src/core/vectors/tnlVectorOperations_impl.cpp
index 23ee7d01ebd17cbb964fa32d7ec94c2207dfe0cf..dbd2275c17f405ec2733f8fb19018dafd2ced47f 100644
--- a/src/core/vectors/tnlVectorOperations_impl.cpp
+++ b/src/core/vectors/tnlVectorOperations_impl.cpp
@@ -24,282 +24,503 @@
  */
 template int         tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< int, tnlHost, int >& v );
 template long int    tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< long int, tnlHost, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< float, tnlHost, int >& v );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< double, tnlHost, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< long double, tnlHost, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< int, tnlHost, long int >& v );
 template long int    tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< long int, tnlHost, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< float, tnlHost, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< double, tnlHost, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorMax( const tnlVector< long double, tnlHost, long int >& v );
+#endif
+#endif
 
 /****
  * Min
  */
 template int         tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< int, tnlHost, int >& v );
 template long int    tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< long int, tnlHost, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< float, tnlHost, int >& v );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< double, tnlHost, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< long double, tnlHost, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< int, tnlHost, long int >& v );
 template long int    tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< long int, tnlHost, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< float, tnlHost, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< double, tnlHost, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorMin( const tnlVector< long double, tnlHost, long int >& v );
+#endif
+#endif
 
 /****
  * Abs max
  */
 template int         tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< int, tnlHost, int >& v );
 template long int    tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< long int, tnlHost, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< float, tnlHost, int >& v );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< double, tnlHost, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< long double, tnlHost, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< int, tnlHost, long int >& v );
 template long int    tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< long int, tnlHost, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< float, tnlHost, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< double, tnlHost, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorAbsMax( const tnlVector< long double, tnlHost, long int >& v );
+#endif
+#endif
 
 /****
  * Abs min
  */
 template int         tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< int, tnlHost, int >& v );
 template long int    tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< long int, tnlHost, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< float, tnlHost, int >& v );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< double, tnlHost, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< long double, tnlHost, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< int, tnlHost, long int >& v );
 template long int    tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< long int, tnlHost, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< float, tnlHost, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< double, tnlHost, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlVector< long double, tnlHost, long int >& v );
+#endif
+#endif
 
 /****
  * Lp norm
  */
 template int         tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< int, tnlHost, int >& v, const int& p );
 template long int    tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< long int, tnlHost, int >& v, const long int& p );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< float, tnlHost, int >& v, const float& p );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< double, tnlHost, int >& v, const double& p );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< long double, tnlHost, int >& v, const long double& p );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< int, tnlHost, long int >& v, const int& p );
 template long int    tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< long int, tnlHost, long int >& v, const long int& p );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< float, tnlHost, long int >& v, const float& p );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< double, tnlHost, long int >& v, const double& p );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlVector< long double, tnlHost, long int >& v, const long double& p );
+#endif
+#endif
 
 /****
  * Sum
  */
 template int         tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< int, tnlHost, int >& v );
 template long int    tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< long int, tnlHost, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< float, tnlHost, int >& v );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< double, tnlHost, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< long double, tnlHost, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< int, tnlHost, long int >& v );
 template long int    tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< long int, tnlHost, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< float, tnlHost, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< double, tnlHost, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorSum( const tnlVector< long double, tnlHost, long int >& v );
+#endif
+#endif
 
 /****
  * Difference max
  */
 template int         tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< int, tnlHost, int >& v1, const tnlVector< int, tnlHost, int >& v2 );
 template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< long int, tnlHost, int >& v1, const tnlVector< long int, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< float, tnlHost, int >& v1,  const tnlVector< float, tnlHost, int >& v2);
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< double, tnlHost, int >& v1, const tnlVector< double, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< long double, tnlHost, int >& v1, const tnlVector< long double, tnlHost, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< int, tnlHost, long int >& v1, const tnlVector< int, tnlHost, long int >& v2 );
 template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< long int, tnlHost, long int >& v1, const tnlVector< long int, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< float, tnlHost, long int >& v1, const tnlVector< float, tnlHost, long int >& v2 );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< double, tnlHost, long int >& v1, const tnlVector< double, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorDifferenceMax( const tnlVector< long double, tnlHost, long int >& v1, const tnlVector< long double, tnlHost, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference min
  */
 template int         tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< int, tnlHost, int >& v1, const tnlVector< int, tnlHost, int >& v2 );
 template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< long int, tnlHost, int >& v1, const tnlVector< long int, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< float, tnlHost, int >& v1,  const tnlVector< float, tnlHost, int >& v2);
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< double, tnlHost, int >& v1, const tnlVector< double, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< long double, tnlHost, int >& v1, const tnlVector< long double, tnlHost, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< int, tnlHost, long int >& v1, const tnlVector< int, tnlHost, long int >& v2 );
 template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< long int, tnlHost, long int >& v1, const tnlVector< long int, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< float, tnlHost, long int >& v1, const tnlVector< float, tnlHost, long int >& v2 );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< double, tnlHost, long int >& v1, const tnlVector< double, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorDifferenceMin( const tnlVector< long double, tnlHost, long int >& v1, const tnlVector< long double, tnlHost, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference abs max
  */
 template int         tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< int, tnlHost, int >& v1, const tnlVector< int, tnlHost, int >& v2 );
 template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< long int, tnlHost, int >& v1, const tnlVector< long int, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< float, tnlHost, int >& v1,  const tnlVector< float, tnlHost, int >& v2);
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< double, tnlHost, int >& v1, const tnlVector< double, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlHost, int >& v1, const tnlVector< long double, tnlHost, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< int, tnlHost, long int >& v1, const tnlVector< int, tnlHost, long int >& v2 );
 template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< long int, tnlHost, long int >& v1, const tnlVector< long int, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< float, tnlHost, long int >& v1, const tnlVector< float, tnlHost, long int >& v2 );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< double, tnlHost, long int >& v1, const tnlVector< double, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlHost, long int >& v1, const tnlVector< long double, tnlHost, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference abs min
  */
 template int         tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< int, tnlHost, int >& v1, const tnlVector< int, tnlHost, int >& v2 );
 template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< long int, tnlHost, int >& v1, const tnlVector< long int, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< float, tnlHost, int >& v1,  const tnlVector< float, tnlHost, int >& v2);
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< double, tnlHost, int >& v1, const tnlVector< double, tnlHost, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlHost, int >& v1, const tnlVector< long double, tnlHost, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< int, tnlHost, long int >& v1, const tnlVector< int, tnlHost, long int >& v2 );
 template long int    tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< long int, tnlHost, long int >& v1, const tnlVector< long int, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< float, tnlHost, long int >& v1, const tnlVector< float, tnlHost, long int >& v2 );
+#endif
 template double      tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< double, tnlHost, long int >& v1, const tnlVector< double, tnlHost, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlHost, long int >& v1, const tnlVector< long double, tnlHost, long int >& v2 );
-
+#endif
+#endif
 
 /****
  * Max
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< int, tnlCuda, int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< float, tnlCuda, int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< int, tnlCuda, long int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
+
 
 /****
  * Min
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< int, tnlCuda, int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< float, tnlCuda, int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< int, tnlCuda, long int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
 
 /****
  * Abs max
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< int, tnlCuda, int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< float, tnlCuda, int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< int, tnlCuda, long int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
 
 /****
  * Abs min
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< int, tnlCuda, int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< float, tnlCuda, int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< int, tnlCuda, long int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
 
 /****
  * Lp norm
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< int, tnlCuda, int >& v, const int& p );
 template long int    tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long int, tnlCuda, int >& v, const long int& p );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< float, tnlCuda, int >& v, const float& p );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< double, tnlCuda, int >& v, const double& p );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long double, tnlCuda, int >& v, const long double& p );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< int, tnlCuda, long int >& v, const int& p );
 template long int    tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long int, tnlCuda, long int >& v, const long int& p );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< float, tnlCuda, long int >& v, const float& p );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< double, tnlCuda, long int >& v, const double& p );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long double, tnlCuda, long int >& v, const long double& p );
+#endif
+#endif
 
 /****
  * Sum
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< int, tnlCuda, int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< float, tnlCuda, int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< int, tnlCuda, long int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
+
 
 /****
  * Difference max
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
+
 
 /****
  * Difference min
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference abs max
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference abs min
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
 template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
-
+#endif
+#endif
 
 #endif
 
diff --git a/src/core/vectors/tnlVectorOperations_impl.cu b/src/core/vectors/tnlVectorOperations_impl.cu
index 6ad481eef135afc207929c3cf70f22f7ee99b460..b8af43c24dba48c7b2775a4c27eba77a1a77b6fa 100644
--- a/src/core/vectors/tnlVectorOperations_impl.cu
+++ b/src/core/vectors/tnlVectorOperations_impl.cu
@@ -24,140 +24,253 @@
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< int, tnlCuda, int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< float, tnlCuda, int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< double, tnlCuda, int >& v );
-//template long double tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< int, tnlCuda, long int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< double, tnlCuda, long int >& v );
-//template long double tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
 
 /****
  * Min
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< int, tnlCuda, int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< float, tnlCuda, int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< double, tnlCuda, int >& v );
-//template long double tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< int, tnlCuda, long int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< double, tnlCuda, long int >& v );
-//template long double tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
 
 /****
  * Abs max
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< int, tnlCuda, int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< float, tnlCuda, int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< double, tnlCuda, int >& v );
-//template long double tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< int, tnlCuda, long int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< double, tnlCuda, long int >& v );
-//template long double tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
+
 
 /****
  * Abs min
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< int, tnlCuda, int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< float, tnlCuda, int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< double, tnlCuda, int >& v );
-//template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< int, tnlCuda, long int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< double, tnlCuda, long int >& v );
-//template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
+
 
 /****
  * Lp norm
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< int, tnlCuda, int >& v, const int& p );
 template long int    tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long int, tnlCuda, int >& v, const long int& p );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< float, tnlCuda, int >& v, const float& p );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< double, tnlCuda, int >& v, const double& p );
-//template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long double, tnlCuda, int >& v, const long double& p );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long double, tnlCuda, int >& v, const long double& p );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< int, tnlCuda, long int >& v, const int& p );
 template long int    tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long int, tnlCuda, long int >& v, const long int& p );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< float, tnlCuda, long int >& v, const float& p );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< double, tnlCuda, long int >& v, const double& p );
-//template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long double, tnlCuda, long int >& v, const long double& p );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long double, tnlCuda, long int >& v, const long double& p );
+#endif
+#endif
 
 /****
  * Sum
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< int, tnlCuda, int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< float, tnlCuda, int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< double, tnlCuda, int >& v );
-//template long double tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< int, tnlCuda, long int >& v );
 template long int    tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< float, tnlCuda, long int >& v );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< double, tnlCuda, long int >& v );
-//template long double tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
 
 /****
  * Difference max
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
-//template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
-//template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference min
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
-//template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
-//template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
 
 /****
  * Difference abs max
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
-//template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
-//template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
+
 
 /****
  * Difference abs min
  */
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
-//template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
 template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
 template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
 template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
 template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
-//template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
         
 #endif
- 
\ No newline at end of file
+ 
diff --git a/src/core/vectors/tnlVector_impl.cpp b/src/core/vectors/tnlVector_impl.cpp
index e4515ae77448d5f5ca3fe858ef048208b8a0ae54..8d8df49b4937ae9da2c12a8b97b4b568e616124b 100644
--- a/src/core/vectors/tnlVector_impl.cpp
+++ b/src/core/vectors/tnlVector_impl.cpp
@@ -19,12 +19,25 @@
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
+#ifdef INSTANTIATE_FLOAT
 template class tnlVector< float, tnlHost, int >;
 template tnlVector< float, tnlHost, int >& tnlVector< float, tnlHost, int >:: operator = ( const tnlVector< double, tnlHost, int >& vector );
+#endif
+
 
 template class tnlVector< double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlVector< long double, tnlHost, int >;
+#endif
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlVector< float, tnlHost, long int >;
+#endif
 template class tnlVector< double, tnlHost, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlVector< long double, tnlHost, long int >;
+#endif
+#endif
 
 #endif
 
diff --git a/src/core/vectors/tnlVector_impl.cu b/src/core/vectors/tnlVector_impl.cu
index 9daa8dcc9ce9b0b5b5b1a766209e09a520449d65..f5406187ec7ae03f628f98782131a6fe9eef02ca 100644
--- a/src/core/vectors/tnlVector_impl.cu
+++ b/src/core/vectors/tnlVector_impl.cu
@@ -20,10 +20,23 @@
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
 #ifdef HAVE_CUDA
+#ifdef INSTANTIATE_FLOAT
 template class tnlVector< float, tnlCuda, int >;
+#endif
 template class tnlVector< double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlVector< long double, tnlCuda, int >;
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 template class tnlVector< float, tnlCuda, long int >;
+#endif
 template class tnlVector< double, tnlCuda, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+template class tnlVector< long double, tnlCuda, long int >;
+#endif
+#endif
 #endif
 
 #endif
diff --git a/src/core/vectors/tnlVector_impl.h b/src/core/vectors/tnlVector_impl.h
index 636a80320344fdde2dc3e0d3beb4af3d00147a80..c0d08a6ae2baea5e04aabfb27479839707287439 100644
--- a/src/core/vectors/tnlVector_impl.h
+++ b/src/core/vectors/tnlVector_impl.h
@@ -145,7 +145,7 @@ template< typename Real,
    template< typename Vector >
 tnlVector< Real, Device, Index >& tnlVector< Real, Device, Index > :: operator -= ( const Vector& vector )
 {
-   alphaXPlusBetaY( -1.0, vector, 1.0 );
+   this->addVector( vector, -1.0 );
    return *this;
 }
 
@@ -155,10 +155,29 @@ template< typename Real,
    template< typename Vector >
 tnlVector< Real, Device, Index >& tnlVector< Real, Device, Index > :: operator += ( const Vector& vector )
 {
-   alphaXPlusBetaY( 1.0, vector, 1.0 );
+   this->addVector( vector );
    return *this;
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+tnlVector< Real, Device, Index >& tnlVector< Real, Device, Index > :: operator *= ( const RealType& c )
+{
+   tnlVectorOperations< Device >::vectorScalarMultiplication( *this, c );
+   return *this;
+}
+
+template< typename Real,
+          typename Device,
+          typename Index >
+tnlVector< Real, Device, Index >& tnlVector< Real, Device, Index > :: operator /= ( const RealType& c )
+{
+   tnlVectorOperations< Device >::vectorScalarMultiplication( *this, 1.0 / c );
+   return *this;
+}
+
+
 template< typename Real,
           typename Device,
           typename Index >
@@ -301,35 +320,15 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vector >
-void tnlVector< Real, Device, Index > :: alphaXPlusBetaY( const Real& alpha,
-                                                          const Vector& x,
-                                                          const Real& beta )
-{
-   tnlVectorOperations< Device > :: alphaXPlusBetaY( *this, x, alpha, beta );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Vector >
-void tnlVector< Real, Device, Index > :: alphaXPlusBetaZ( const Real& alpha,
-                                                          const Vector& x,
-                                                          const Real& beta,
-                                                          const Vector& z )
-{
-   tnlVectorOperations< Device > :: alphaXPlusBetaZ( *this, x, alpha, z, beta );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< typename Vector >
-void tnlVector< Real, Device, Index > :: alphaXPlusBetaZPlusY( const Real& alpha,
-                                                    const Vector& x,
-                                                    const Real& beta,
-                                                    const Vector& z )
-{
-   tnlVectorOperations< Device > :: alphaXPlusBetaZPlusY( *this, x, alpha, z, beta );
+void
+tnlVector< Real, Device, Index >::
+addVectors( const Vector& v1,
+            const Real& multiplicator1,
+            const Vector& v2,
+            const Real& multiplicator2,
+            const Real& thisMultiplicator )
+{
+   tnlVectorOperations< Device >::addVectors( *this, v1, multiplicator1, v2, multiplicator2, thisMultiplicator );
 }
 
 template< typename Real,
@@ -369,19 +368,44 @@ void tnlVector< Real, Device, Index > :: computeExclusivePrefixSum( const IndexT
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlVector< float, tnlHost, int >;
 extern template tnlVector< float, tnlHost, int >& tnlVector< float, tnlHost, int >:: operator = ( const tnlVector< double, tnlHost, int >& vector );
+#endif
 
 extern template class tnlVector< double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlVector< long double, tnlHost, int >;
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlVector< float, tnlHost, long int >;
+#endif
 extern template class tnlVector< double, tnlHost, long int >;
-
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlVector< long double, tnlHost, long int >;
+#endif
+#endif
 
 #ifdef HAVE_CUDA
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlVector< float, tnlCuda, int >;
+#endif
 extern template class tnlVector< double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlVector< long double, tnlCuda, int >;
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+#ifdef INSTANTIATE_FLOAT
 extern template class tnlVector< float, tnlCuda, long int >;
+#endif
 extern template class tnlVector< double, tnlCuda, long int >;
+#ifdef INSTANTIATE_LONG_DOUBLE
+extern template class tnlVector< long double, tnlCuda, long int >;
+#endif
+#endif
 #endif
 
 #endif
diff --git a/src/functions/CMakeLists.txt b/src/functors/CMakeLists.txt
similarity index 76%
rename from src/functions/CMakeLists.txt
rename to src/functors/CMakeLists.txt
index 920be4b577584b297cc61c4a0979e317e0e72dea..8b8dae1c5b86e9a3753a7e0003d1c657a1f3ce30 100755
--- a/src/functions/CMakeLists.txt
+++ b/src/functors/CMakeLists.txt
@@ -1,6 +1,8 @@
 SET( headers tnlFunctionDiscretizer.h
              tnlFunctionDiscretizer_impl.h
-             tnlFunctionAdapter.h
+             tnlFunctionEnumerator.h
+             tnlFunctionEnumerator_impl.h
+             tnlFunctorAdapter.h
              tnlConstantFunction.h
              tnlConstantFunction_impl.h
              tnlExpBumpFunction.h
@@ -13,19 +15,19 @@ SET( headers tnlFunctionDiscretizer.h
              tnlFunctionType.h
              tnlTestFunction_impl.h )
 
-SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/functions )
+SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/functors )
 set( common_SOURCES
      ${CURRENT_DIR}/tnlTestFunction_impl.cpp )       
 
 IF( BUILD_CUDA )
-   set( tnl_functions_CUDA__SOURCES
+   set( tnl_functors_CUDA__SOURCES
         ${common_SOURCES} 
         ${CURRENT_DIR}/tnlTestFunction_impl.cu
         PARENT_SCOPE )
 ENDIF()    
 
-set( tnl_functions_SOURCES     
+set( tnl_functors_SOURCES     
      ${common_SOURCES}
      PARENT_SCOPE )
         
-INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/functions )
\ No newline at end of file
+INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/functors )
\ No newline at end of file
diff --git a/src/functions/tnlConstantFunction.h b/src/functors/tnlConstantFunction.h
similarity index 94%
rename from src/functions/tnlConstantFunction.h
rename to src/functors/tnlConstantFunction.h
index bd64f268a4292edd54578ab45c8425cf6ec7365d..df85597ef742b7d5eb9cc9877b99ecb3d6429ba0 100644
--- a/src/functions/tnlConstantFunction.h
+++ b/src/functors/tnlConstantFunction.h
@@ -20,7 +20,7 @@
 
 #include <iostream>
 #include <core/vectors/tnlStaticVector.h>
-#include <functions/tnlFunctionType.h>
+#include <functors/tnlFunctionType.h>
 
 template< int FunctionDimensions,
           typename Real = double >
@@ -55,16 +55,12 @@ class tnlConstantFunction
              int ZDiffOrder = 0,
              typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
    RealType getValue( const Vertex& v,
                       const Real& time = 0.0 ) const;
 
    template< typename Vertex >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
    RealType getValue( const Vertex& v,
                       const Real& time = 0.0 ) const
    {
@@ -93,6 +89,6 @@ class tnlFunctionType< tnlConstantFunction< FunctionDimensions, Real > >
       enum { Type = tnlAnalyticFunction };
 };
 
-#include <functions/tnlConstantFunction_impl.h>
+#include <functors/tnlConstantFunction_impl.h>
 
 #endif /* TNLCONSTANTFUNCTION_H_ */
diff --git a/src/functions/tnlConstantFunction_impl.h b/src/functors/tnlConstantFunction_impl.h
similarity index 100%
rename from src/functions/tnlConstantFunction_impl.h
rename to src/functors/tnlConstantFunction_impl.h
diff --git a/src/functions/tnlExpBumpFunction.h b/src/functors/tnlExpBumpFunction.h
similarity index 88%
rename from src/functions/tnlExpBumpFunction.h
rename to src/functors/tnlExpBumpFunction.h
index 14ed47cedce92cef5a5b85e19795af4517ee4526..1159f1849f3bd4bb911a321f8843c70312fe3efa 100644
--- a/src/functions/tnlExpBumpFunction.h
+++ b/src/functors/tnlExpBumpFunction.h
@@ -20,7 +20,7 @@
 
 #include <config/tnlParameterContainer.h>
 #include <core/vectors/tnlStaticVector.h>
-#include <functions/tnlFunctionType.h>
+#include <functors/tnlFunctionType.h>
 
 template< typename Real >
 class tnlExpBumpFunctionBase
@@ -75,11 +75,8 @@ class tnlExpBumpFunction< 1, Real > : public tnlExpBumpFunctionBase< Real >
                 int ZDiffOrder = 0,
                 typename Vertex = VertexType >
 #endif   
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
-      RealType getValue( const Vertex& v,
-                         const Real& time = 0.0 ) const;
+   __cuda_callable__ RealType getValue( const Vertex& v,
+                                        const Real& time = 0.0 ) const;
 };
 
 template< typename Real >
@@ -106,11 +103,8 @@ class tnlExpBumpFunction< 2, Real > : public tnlExpBumpFunctionBase< Real >
                 int ZDiffOrder = 0,
                 typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
-      RealType getValue( const Vertex& v,
-                         const Real& time = 0.0 ) const;
+   __cuda_callable__ RealType getValue( const Vertex& v,
+                                        const Real& time = 0.0 ) const;
 };
 
 template< typename Real >
@@ -137,11 +131,8 @@ class tnlExpBumpFunction< 3, Real > : public tnlExpBumpFunctionBase< Real >
                 int ZDiffOrder = 0,
                 typename Vertex = VertexType >
 #endif   
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
-      RealType getValue( const Vertex& v,
-                         const Real& time = 0.0 ) const;
+   __cuda_callable__    RealType getValue( const Vertex& v,
+                                           const Real& time = 0.0 ) const;
 };
 
 template< int Dimensions,
@@ -162,7 +153,7 @@ class tnlFunctionType< tnlExpBumpFunction< FunctionDimensions, Real > >
 };
 
 
-#include <functions/tnlExpBumpFunction_impl.h>
+#include <functors/tnlExpBumpFunction_impl.h>
 
 
 #endif /* TNLEXPBUMPFUNCTION_H_ */
diff --git a/src/functions/tnlExpBumpFunction_impl.h b/src/functors/tnlExpBumpFunction_impl.h
similarity index 97%
rename from src/functions/tnlExpBumpFunction_impl.h
rename to src/functors/tnlExpBumpFunction_impl.h
index e701d9ee068c76f6c39eb4506129709d2f2ee3f6..c468873a23c73d5744556d0e019bb6a730b943a5 100644
--- a/src/functions/tnlExpBumpFunction_impl.h
+++ b/src/functors/tnlExpBumpFunction_impl.h
@@ -18,7 +18,7 @@
 #ifndef TNLEXPBUMPFUNCTION_IMPL_H_
 #define TNLEXPBUMPFUNCTION_IMPL_H_
 
-#include <functions/tnlExpBumpFunction.h>
+#include <functors/tnlExpBumpFunction.h>
 
 template< typename Real >
 bool
@@ -76,9 +76,7 @@ template< typename Real >
              int YDiffOrder,
              int ZDiffOrder,
              typename Vertex >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlExpBumpFunction< 1, Real >::getValue( const Vertex& v,
                                          const Real& time ) const
@@ -116,9 +114,7 @@ template< typename Real >
              int YDiffOrder,
              int ZDiffOrder,
              typename Vertex >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlExpBumpFunction< 2, Real >::
 getValue( const Vertex& v,
@@ -162,9 +158,7 @@ template< typename Real >
              int YDiffOrder,
              int ZDiffOrder,
              typename Vertex >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlExpBumpFunction< 3, Real >::
 getValue( const Vertex& v,
diff --git a/src/functions/tnlFunctionDiscretizer.h b/src/functors/tnlFunctionDiscretizer.h
similarity index 97%
rename from src/functions/tnlFunctionDiscretizer.h
rename to src/functors/tnlFunctionDiscretizer.h
index dfdef2112fcbcb7ccd91f0dfdf3f299a5e1a5ded..a060328b93c742e695e50a0558b1e31379be156d 100644
--- a/src/functions/tnlFunctionDiscretizer.h
+++ b/src/functors/tnlFunctionDiscretizer.h
@@ -48,6 +48,6 @@ class tnlFunctionDiscretizer
    
 };
 
-#include <functions/tnlFunctionDiscretizer_impl.h>
+#include <functors/tnlFunctionDiscretizer_impl.h>
 
 #endif /* TNLFUNCTIONDISCRETIZER_H_ */
diff --git a/src/functions/tnlFunctionDiscretizer_impl.h b/src/functors/tnlFunctionDiscretizer_impl.h
similarity index 100%
rename from src/functions/tnlFunctionDiscretizer_impl.h
rename to src/functors/tnlFunctionDiscretizer_impl.h
diff --git a/src/functors/tnlFunctionEnumerator.h b/src/functors/tnlFunctionEnumerator.h
new file mode 100644
index 0000000000000000000000000000000000000000..2e25baf478eee26469c56ad03e9e68f6883e6518
--- /dev/null
+++ b/src/functors/tnlFunctionEnumerator.h
@@ -0,0 +1,203 @@
+/***************************************************************************
+                          tnlFunctionEnumerator.h  -  description
+                             -------------------
+    begin                : Mar 5, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+#ifndef SRC_FUNCTIONS_TNLFUNCTIONENUMERATOR_H_
+#define SRC_FUNCTIONS_TNLFUNCTIONENUMERATOR_H_
+
+#include <functors/tnlFunctorAdapter.h>
+
+template< typename Function,
+          typename DofVector >
+class tnlFunctionEnumeratorTraverserUserData
+{
+   public:
+
+      typedef typename DofVector::RealType RealType;
+
+      const RealType *time;
+
+      const Function* function;
+
+      DofVector *u;
+
+      const RealType* functionCoefficient;
+
+      const RealType* dofVectorCoefficient;
+
+      tnlFunctionEnumeratorTraverserUserData( const RealType& time,
+                                              const Function& function,
+                                              DofVector& u,
+                                              const RealType& functionCoefficient,
+                                              const RealType& dofVectorCoefficient )
+      : time( &time ),
+        function( &function ),
+        u( &u ),
+        functionCoefficient( &functionCoefficient ),
+        dofVectorCoefficient( &dofVectorCoefficient )
+      {};
+};
+
+
+template< typename Mesh,
+          typename Function,
+          typename DofVector >
+class tnlFunctionEnumerator
+{
+   public:
+      typedef Mesh MeshType;
+      typedef typename DofVector::RealType RealType;
+      typedef typename DofVector::DeviceType DeviceType;
+      typedef typename DofVector::IndexType IndexType;
+      typedef tnlFunctionEnumeratorTraverserUserData< Function,
+                                                      DofVector > TraverserUserData;
+
+      template< int EntityDimensions >
+      void enumerate( const MeshType& mesh,
+                      const Function& function,
+                      DofVector& u,
+                      const RealType& functionCoefficient = 1.0,
+                      const RealType& dofVectorCoefficient = 0.0,
+                      const RealType& time = 0.0 ) const;
+
+
+      class TraverserEntitiesProcessor
+      {
+         public:
+
+            template< int EntityDimensions >
+#ifdef HAVE_CUDA
+            __host__ __device__
+#endif
+            static void processEntity( const MeshType& mesh,
+                                       TraverserUserData& userData,
+                                       const IndexType index )
+            {
+               typedef tnlFunctorAdapter< MeshType, Function > FunctionAdapter;
+               if( ! *userData.dofVectorCoefficient  )
+                  ( *userData.u )[ index ] =
+                     ( *userData.functionCoefficient ) * FunctionAdapter::getValue( mesh,
+                                                                                    *userData.function,
+                                                                                    index,
+                                                                                    *userData.time );
+               else                                                                                            
+                 ( *userData.u )[ index ] =
+                             ( *userData.dofVectorCoefficient ) * ( *userData.u )[ index ] +
+                             ( *userData.functionCoefficient ) * FunctionAdapter::getValue( mesh,
+                                                                                            *userData.function,
+                                                                                            index,
+                                                                                            *userData.time );
+            }
+
+      };
+
+};
+
+template< int Dimensions,
+          typename Real,
+          typename Device,
+          typename Index,
+          typename Function,
+          typename DofVector >
+class tnlFunctionEnumerator< tnlGrid< Dimensions, Real, Device, Index >,
+                             Function,
+                             DofVector >
+{
+   public:
+
+      typedef tnlGrid< Dimensions, Real, Device, Index > MeshType;
+      typedef typename MeshType::RealType RealType;
+      typedef typename MeshType::DeviceType DeviceType;
+      typedef typename MeshType::IndexType IndexType;
+      typedef typename MeshType::CoordinatesType CoordinatesType;
+      typedef tnlFunctionEnumeratorTraverserUserData< Function,
+                                                      DofVector > TraverserUserData;
+
+      template< int EntityDimensions >
+      void enumerate( const MeshType& mesh,
+                      const Function& function,
+                      DofVector& u,
+                      const RealType& functionCoefficient = 1.0,
+                      const RealType& dofVectorCoefficient = 0.0,
+                      const RealType& time = 0.0 ) const;
+
+      class TraverserEntitiesProcessor
+      {
+         public:
+
+         typedef typename MeshType::VertexType VertexType;
+
+#ifdef HAVE_CUDA
+            __host__ __device__
+#endif
+            static void processCell( const MeshType& mesh,
+                                     TraverserUserData& userData,
+                                     const IndexType index,
+                                     const CoordinatesType& coordinates )
+            {
+               //printf( "Enumerator::processCell mesh =%p \n", &mesh );
+               typedef tnlFunctorAdapter< MeshType, Function > FunctionAdapter;
+               if( ! ( *userData.dofVectorCoefficient ) )
+                  ( *userData.u )[ index ] =
+                     ( *userData.functionCoefficient ) * FunctionAdapter::getValue( mesh,
+                                                                                    *userData.function,
+                                                                                    index,
+                                                                                    coordinates,
+                                                                                    *userData.time );
+               else
+                  ( *userData.u )[ index ] =
+                           ( *userData.dofVectorCoefficient ) * ( *userData.u )[ index ] +
+                           ( *userData.functionCoefficient ) * FunctionAdapter::getValue( mesh,
+                                                                                          *userData.function,
+                                                                                          index,
+                                                                                          coordinates,
+                                                                                          *userData.time );
+
+            }
+
+#ifdef HAVE_CUDA
+            __host__ __device__
+#endif
+            static void processFace( const MeshType& mesh,
+                                     TraverserUserData& userData,
+                                     const IndexType index,
+                                     const CoordinatesType& coordinates )
+            {
+               typedef tnlFunctorAdapter< MeshType, Function > FunctionAdapter;
+               if( ! ( *userData.dofVectorCoefficient ) )
+                  ( *userData.u )[ index ] =
+                     ( *userData.functionCoefficient ) * FunctionAdapter::getValue( mesh,
+                                                                                    *userData.function,
+                                                                                    index,
+                                                                                    coordinates,
+                                                                                    *userData.time );
+               else
+                  ( *userData.u )[ index ] =
+                           ( *userData.dofVectorCoefficient ) * ( *userData.u )[ index ] +
+                           ( *userData.functionCoefficient ) * FunctionAdapter::getValue( mesh,
+                                                                                          *userData.function,
+                                                                                          index,
+                                                                                          coordinates,
+                                                                                          *userData.time );
+            }
+      };
+
+};
+
+#include <functors/tnlFunctionEnumerator_impl.h>
+
+
+
+#endif /* SRC_FUNCTIONS_TNLFUNCTIONENUMERATOR_H_ */
diff --git a/src/functors/tnlFunctionEnumerator_impl.h b/src/functors/tnlFunctionEnumerator_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..4e3d824b9d8aa5c50e2377a3702d9d4081f460b0
--- /dev/null
+++ b/src/functors/tnlFunctionEnumerator_impl.h
@@ -0,0 +1,143 @@
+/***************************************************************************
+                          tnlFunctionEnumerator_impl.h  -  description
+                             -------------------
+    begin                : Mar 5, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+#ifndef SRC_FUNCTIONS_TNLFUNCTIONENUMERATOR_IMPL_H_
+#define SRC_FUNCTIONS_TNLFUNCTIONENUMERATOR_IMPL_H_
+
+#include <functors/tnlFunctionEnumerator.h>
+#include <mesh/tnlTraverser_Grid1D.h>
+#include <mesh/tnlTraverser_Grid2D.h>
+#include <mesh/tnlTraverser_Grid3D.h>
+
+template< typename Mesh,
+          typename Function,
+          typename DofVector >
+   template< int EntityDimensions >
+void
+tnlFunctionEnumerator< Mesh, Function, DofVector >::
+enumerate( const MeshType& mesh,
+           const Function& function,
+           DofVector& u,
+           const RealType& functionCoefficient,
+           const RealType& dofVectorCoefficient,
+           const RealType& time ) const
+
+{
+   if( DeviceType::DeviceType == tnlHostDevice )
+   {
+      TraverserUserData userData( time, function, u, functionCoefficient, dofVectorCoefficient );
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+
+   }
+   if( DeviceType::DeviceType == tnlCudaDevice )
+   {
+      RealType* kernelTime = tnlCuda::passToDevice( time );
+      Function* kernelFunction = tnlCuda::passToDevice( function );
+      DofVector* kernelU = tnlCuda::passToDevice( u );
+      RealType* kernelFunctionCoefficient = tnlCuda::passToDevice( functionCoefficient );
+      RealType* kernelDofVectorCoefficient = tnlCuda::passToDevice( dofVectorCoefficient );
+      TraverserUserData userData( *kernelTime, *kernelFunction, *kernelU, *kernelFunctionCoefficient, *kernelDofVectorCoefficient );
+      checkCudaDevice;
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+
+      checkCudaDevice;
+      tnlCuda::freeFromDevice( kernelTime );
+      tnlCuda::freeFromDevice( kernelFunction );
+      tnlCuda::freeFromDevice( kernelU );
+      tnlCuda::freeFromDevice( kernelFunctionCoefficient );
+      tnlCuda::freeFromDevice( kernelDofVectorCoefficient );
+      checkCudaDevice;
+   }
+}
+
+template< int Dimensions,
+          typename Real,
+          typename Device,
+          typename Index,
+          typename Function,
+          typename DofVector >
+   template< int EntityDimensions >
+void
+tnlFunctionEnumerator< tnlGrid< Dimensions, Real, Device, Index >, Function, DofVector  >::
+enumerate( const tnlGrid< Dimensions, Real, Device, Index >& mesh,
+           const Function& function,
+           DofVector& u,
+           const RealType& functionCoefficient,
+           const RealType& dofVectorCoefficient,
+           const RealType& time ) const
+{
+   if( ( tnlDeviceEnum ) DeviceType::DeviceType == tnlHostDevice )
+   {
+      TraverserUserData userData( time, function, u, functionCoefficient, dofVectorCoefficient );
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+
+   }
+   if( ( tnlDeviceEnum ) DeviceType::DeviceType == tnlCudaDevice )
+   {
+      RealType* kernelTime = tnlCuda::passToDevice( time );
+      Function* kernelFunction = tnlCuda::passToDevice( function );
+      DofVector* kernelU = tnlCuda::passToDevice( u );
+      RealType* kernelFunctionCoefficient = tnlCuda::passToDevice( functionCoefficient );
+      RealType* kernelDofVectorCoefficient = tnlCuda::passToDevice( dofVectorCoefficient );
+      TraverserUserData userData( *kernelTime, *kernelFunction, *kernelU, *kernelFunctionCoefficient, *kernelDofVectorCoefficient );
+      checkCudaDevice;
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+
+      checkCudaDevice;
+      tnlCuda::freeFromDevice( kernelTime );
+      tnlCuda::freeFromDevice( kernelFunction );
+      tnlCuda::freeFromDevice( kernelU );
+      tnlCuda::freeFromDevice( kernelFunctionCoefficient );
+      tnlCuda::freeFromDevice( kernelDofVectorCoefficient );
+      checkCudaDevice;
+   }
+}
+
+
+
+#endif /* SRC_FUNCTIONS_TNLFUNCTIONENUMERATOR_IMPL_H_ */
diff --git a/src/functions/tnlFunctionType.h b/src/functors/tnlFunctionType.h
similarity index 100%
rename from src/functions/tnlFunctionType.h
rename to src/functors/tnlFunctionType.h
diff --git a/src/functions/tnlFunctionAdapter.h b/src/functors/tnlFunctorAdapter.h
similarity index 74%
rename from src/functions/tnlFunctionAdapter.h
rename to src/functors/tnlFunctorAdapter.h
index 6bd94d72cc9bb009d1fe1c7b827f8c821bfeef4c..3d6a96df58f61a84357cb6927487881816b5ef3f 100644
--- a/src/functions/tnlFunctionAdapter.h
+++ b/src/functors/tnlFunctorAdapter.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          tnlFunctionAdapter.h  -  description
+                          tnlFunctorAdapter.h  -  description
                              -------------------
     begin                : Nov 28, 2014
     copyright            : (C) 2014 by Tomas Oberhuber
@@ -15,16 +15,16 @@
  *                                                                         *
  ***************************************************************************/
 
-#ifndef TNLFUNCTIONADAPTER_H_
-#define TNLFUNCTIONADAPTER_H_
+#ifndef tnlFunctorAdapter_H_
+#define tnlFunctorAdapter_H_
 
-#include <functions/tnlConstantFunction.h>
-#include <functions/tnlFunctionType.h>
+#include <functors/tnlConstantFunction.h>
+#include <functors/tnlFunctionType.h>
 
 template< typename Mesh,
           typename Function,
           int FunctionType = tnlFunctionType< Function >::Type >
-class tnlFunctionAdapter
+class tnlFunctorAdapter
 {
 };
 
@@ -34,7 +34,7 @@ class tnlFunctionAdapter
  */
 template< typename Mesh,
           typename Function >
-class tnlFunctionAdapter< Mesh, Function, tnlGeneralFunction >
+class tnlFunctorAdapter< Mesh, Function, tnlGeneralFunction >
 {
    public:
 
@@ -44,15 +44,13 @@ class tnlFunctionAdapter< Mesh, Function, tnlGeneralFunction >
       typedef typename MeshType::IndexType IndexType;
 
       template< int MeshEntityDimension >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      __cuda_callable__ 
       static RealType getValue( const MeshType& mesh,
                                 const FunctionType& function,
                                 const IndexType index,
                                 const RealType& time = 0.0 )
       {
-         return function.getValue( mesh.template getEntityCenter< MeshEntityDimension >,
+         return function.getValue( mesh, //.template getEntityCenter< MeshEntityDimension >,
                                    index,
                                    time );
       }
@@ -67,7 +65,7 @@ template< int Dimensions,
           typename Device,
           typename Index,
           typename Function >
-class tnlFunctionAdapter< tnlGrid< Dimensions, Real, Device, Index >, Function, tnlGeneralFunction >
+class tnlFunctorAdapter< tnlGrid< Dimensions, Real, Device, Index >, Function, tnlGeneralFunction >
 {
          public:
 
@@ -78,16 +76,14 @@ class tnlFunctionAdapter< tnlGrid< Dimensions, Real, Device, Index >, Function,
       typedef typename MeshType::VertexType VertexType;
       typedef typename MeshType::CoordinatesType CoordinatesType;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      __cuda_callable__
       static RealType getValue( const MeshType& mesh,
                                 const FunctionType& function,
                                 const IndexType index,
                                 const CoordinatesType& coordinates,
                                 const RealType& time = 0.0 )
       {
-         return function.getValue( mesh.template getCellCenter< VertexType >( coordinates ),
+         return function.getValue( mesh, //.template getCellCenter< VertexType >( coordinates ),
                                    index,
                                    time );
       }
@@ -99,7 +95,7 @@ class tnlFunctionAdapter< tnlGrid< Dimensions, Real, Device, Index >, Function,
  */
 template< typename Mesh,
           typename Function >
-class tnlFunctionAdapter< Mesh, Function, tnlDiscreteFunction >
+class tnlFunctorAdapter< Mesh, Function, tnlDiscreteFunction >
 {
    public:
 
@@ -109,9 +105,7 @@ class tnlFunctionAdapter< Mesh, Function, tnlDiscreteFunction >
       typedef typename MeshType::IndexType IndexType;
 
       template< int MeshEntityDimension >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      __cuda_callable__
       static RealType getValue( const MeshType& mesh,
                                 const FunctionType& function,
                                 const IndexType index,
@@ -122,13 +116,47 @@ class tnlFunctionAdapter< Mesh, Function, tnlDiscreteFunction >
       }
 };
 
+/****
+ * Specialization for discrete functions:
+ * - it passes only the mesh entity index
+ */
+template< int Dimensions,
+          typename Real,
+          typename Device,
+          typename Index,
+          typename Function >
+class tnlFunctorAdapter< tnlGrid< Dimensions, Real, Device, Index >, Function, tnlDiscreteFunction >
+{
+   public:
+
+      typedef tnlGrid< Dimensions, Real, Device, Index > MeshType;
+      typedef Function FunctionType;
+      typedef typename FunctionType::RealType RealType;
+      typedef typename MeshType::IndexType IndexType;
+      typedef typename MeshType::VertexType VertexType;
+      typedef typename MeshType::CoordinatesType CoordinatesType;
+
+      //template< int MeshEntityDimension >
+      __cuda_callable__
+      static RealType getValue( const MeshType& mesh,
+                                const FunctionType& function,
+                                const IndexType index,
+                                const CoordinatesType& coordinates,
+                                const RealType& time = 0.0 )
+      {
+         return function.getValue( index,
+                                   time );
+      }
+};
+
+
 /****
  * Specialization for analytic functions:
  * - it does not pass the mesh entity index
  */
 template< typename Mesh,
           typename Function >
-class tnlFunctionAdapter< Mesh, Function, tnlAnalyticFunction >
+class tnlFunctorAdapter< Mesh, Function, tnlAnalyticFunction >
 {
    public:
 
@@ -138,9 +166,7 @@ class tnlFunctionAdapter< Mesh, Function, tnlAnalyticFunction >
       typedef typename MeshType::IndexType IndexType;
 
       template< int MeshEntityDimension >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      __cuda_callable__
       static RealType getValue( const MeshType& mesh,
                                 const FunctionType& function,
                                 const IndexType index,
@@ -160,7 +186,7 @@ template< int Dimensions,
           typename Device,
           typename Index,
           typename Function >
-class tnlFunctionAdapter< tnlGrid< Dimensions, Real, Device, Index >, Function, tnlAnalyticFunction >
+class tnlFunctorAdapter< tnlGrid< Dimensions, Real, Device, Index >, Function, tnlAnalyticFunction >
 {
          public:
 
@@ -171,9 +197,7 @@ class tnlFunctionAdapter< tnlGrid< Dimensions, Real, Device, Index >, Function,
       typedef typename MeshType::VertexType VertexType;
       typedef typename MeshType::CoordinatesType CoordinatesType;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      __cuda_callable__
       static RealType getValue( const MeshType& mesh,
                                 const FunctionType& function,
                                 const IndexType index,
@@ -185,15 +209,16 @@ class tnlFunctionAdapter< tnlGrid< Dimensions, Real, Device, Index >, Function,
       }
 };
 
+// TODO: Fix the specializations for the constant function.
+#ifdef UNDEF
 /****
  * Specialization for constant function
  *  - it does not ask the mesh for the mesh entity center
  */
-
 template< typename Mesh,
           int FunctionDimensions,
           typename Real >
-class tnlFunctionAdapter< Mesh, tnlConstantFunction< FunctionDimensions, Real >, tnlAnalyticFunction >
+class tnlFunctorAdapter< Mesh, tnlConstantFunction< FunctionDimensions, Real >, tnlAnalyticFunction >
 {
    public:
 
@@ -204,9 +229,7 @@ class tnlFunctionAdapter< Mesh, tnlConstantFunction< FunctionDimensions, Real >,
       typedef typename MeshType::VertexType VertexType;
 
       template< int MeshEntityDimension >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      __cuda_callable__
       static RealType getValue( const MeshType& mesh,
                                 const FunctionType& function,
                                 const IndexType index,
@@ -225,7 +248,7 @@ template< int Dimensions,
           typename Real,
           typename Device,
           typename Index >
-class tnlFunctionAdapter< tnlGrid< Dimensions, Real, Device, Index >,
+class tnlFunctorAdapter< tnlGrid< Dimensions, Real, Device, Index >,
                           tnlConstantFunction< Dimensions, Real >,
                           tnlAnalyticFunction >
 {
@@ -238,9 +261,7 @@ class tnlFunctionAdapter< tnlGrid< Dimensions, Real, Device, Index >,
       typedef typename MeshType::VertexType VertexType;
       typedef typename MeshType::CoordinatesType CoordinatesType;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      __cuda_callable__
       static RealType getValue( const MeshType& mesh,
                                 const FunctionType& function,
                                 const IndexType index,
@@ -252,4 +273,6 @@ class tnlFunctionAdapter< tnlGrid< Dimensions, Real, Device, Index >,
       }
 };
 
-#endif /* TNLFUNCTIONADAPTER_H_ */
+#endif /* UNDEF */
+
+#endif /* tnlFunctorAdapter_H_ */
diff --git a/src/functions/tnlSinBumpsFunction.h b/src/functors/tnlSinBumpsFunction.h
similarity index 95%
rename from src/functions/tnlSinBumpsFunction.h
rename to src/functors/tnlSinBumpsFunction.h
index d3ae9dcea77c452903806e55554426b99d045248..2b3183f8c252605c7501c9e88312d6a207a860d5 100644
--- a/src/functions/tnlSinBumpsFunction.h
+++ b/src/functors/tnlSinBumpsFunction.h
@@ -20,7 +20,7 @@
 
 #include <config/tnlParameterContainer.h>
 #include <core/vectors/tnlStaticVector.h>
-#include <functions/tnlFunctionType.h>
+#include <functors/tnlFunctionType.h>
 
 template< typename Vertex >
 class tnlSinBumpsFunctionBase
@@ -81,9 +81,7 @@ class tnlSinBumpsFunction< 1, Real  > : public tnlSinBumpsFunctionBase< tnlStati
                 int ZDiffOrder = 0,
                 typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       RealType getValue( const Vertex& v,
                          const Real& time = 0.0 ) const;
 };
@@ -114,9 +112,7 @@ class tnlSinBumpsFunction< 2, Real > : public tnlSinBumpsFunctionBase< tnlStatic
                 int ZDiffOrder = 0,
                 typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       RealType getValue( const Vertex& v,
                          const Real& time = 0.0 ) const;
 };
@@ -147,9 +143,7 @@ class tnlSinBumpsFunction< 3, Real > : public tnlSinBumpsFunctionBase< tnlStatic
                 int ZDiffOrder = 0,
                 typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       RealType getValue( const Vertex& v,
                          const Real& time = 0.0 ) const;
 };
@@ -173,7 +167,7 @@ class tnlFunctionType< tnlSinBumpsFunction< FunctionDimensions, Real > >
       enum { Type = tnlAnalyticFunction };
 };
 
-#include <functions/tnlSinBumpsFunction_impl.h>
+#include <functors/tnlSinBumpsFunction_impl.h>
 
 
 #endif /* TNLSINBUMPSFUNCTION_H_ */
diff --git a/src/functions/tnlSinBumpsFunction_impl.h b/src/functors/tnlSinBumpsFunction_impl.h
similarity index 97%
rename from src/functions/tnlSinBumpsFunction_impl.h
rename to src/functors/tnlSinBumpsFunction_impl.h
index 20a58d22934dd3573f5d9481b8dcb23b1e06841c..09109aeb10eb5d1660bdaf8f54e76a62fce0a1a4 100644
--- a/src/functions/tnlSinBumpsFunction_impl.h
+++ b/src/functors/tnlSinBumpsFunction_impl.h
@@ -18,7 +18,7 @@
 #ifndef TNLSINBUMPSFUNCTION_IMPL_H_
 #define TNLSINBUMPSFUNCTION_IMPL_H_
 
-#include <functions/tnlSinBumpsFunction.h>
+#include <functors/tnlSinBumpsFunction.h>
 
 template< typename Vertex >
 void tnlSinBumpsFunctionBase< Vertex >::setWaveLength( const Vertex& waveLength )
@@ -81,9 +81,7 @@ template< typename Real >
              int YDiffOrder,
              int ZDiffOrder,
              typename Vertex >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlSinBumpsFunction< 1, Real >::
 getValue( const Vertex& v,
@@ -128,9 +126,7 @@ template< typename Real >
              int YDiffOrder,
              int ZDiffOrder,
              typename Vertex >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlSinBumpsFunction< 2, Real>::
 getValue( const Vertex& v,
@@ -184,9 +180,7 @@ template< typename Real >
              int YDiffOrder,
              int ZDiffOrder,
              typename Vertex >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlSinBumpsFunction< 3, Real >::
 getValue( const Vertex& v,
diff --git a/src/functions/tnlSinWaveFunction.h b/src/functors/tnlSinWaveFunction.h
similarity index 94%
rename from src/functions/tnlSinWaveFunction.h
rename to src/functors/tnlSinWaveFunction.h
index 114ab844faa8d33549a83b309dbf06e470164660..e6d066d80d73a3a7f67fc4495b5f01fe545e6ff6 100644
--- a/src/functions/tnlSinWaveFunction.h
+++ b/src/functors/tnlSinWaveFunction.h
@@ -20,7 +20,7 @@
 
 #include <config/tnlParameterContainer.h>
 #include <core/vectors/tnlStaticVector.h>
-#include <functions/tnlFunctionType.h>
+#include <functors/tnlFunctionType.h>
 
 template< typename Real = double >
 class tnlSinWaveFunctionBase
@@ -74,9 +74,7 @@ class tnlSinWaveFunction< 1, Real > : public tnlSinWaveFunctionBase< Real >
                 int ZDiffOrder = 0,
                 typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       RealType getValue( const Vertex& v,
                          const Real& time = 0.0 ) const;
 
@@ -102,9 +100,7 @@ class tnlSinWaveFunction< 2, Real > : public tnlSinWaveFunctionBase< Real >
                 int ZDiffOrder = 0,
                 typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       RealType getValue( const Vertex& v,
                          const Real& time = 0.0 ) const;
 };
@@ -130,9 +126,7 @@ class tnlSinWaveFunction< 3, Real > : public tnlSinWaveFunctionBase< Real >
                 int ZDiffOrder = 0,
                 typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       RealType getValue( const Vertex& v,
                          const Real& time = 0.0 ) const;
 };
@@ -156,6 +150,6 @@ class tnlFunctionType< tnlSinWaveFunction< FunctionDimensions, Real > >
       enum { Type = tnlAnalyticFunction };
 };
 
-#include <functions/tnlSinWaveFunction_impl.h>
+#include <functors/tnlSinWaveFunction_impl.h>
 
 #endif /* TNLSINWAVEFUNCTION_H_ */
diff --git a/src/functions/tnlSinWaveFunction_impl.h b/src/functors/tnlSinWaveFunction_impl.h
similarity index 97%
rename from src/functions/tnlSinWaveFunction_impl.h
rename to src/functors/tnlSinWaveFunction_impl.h
index 49c671af5d702b9b2dcee8d6e9eca83d02c14c0d..fe8ffbfac0ad619036837cf5ff3f2b17f761c01e 100644
--- a/src/functions/tnlSinWaveFunction_impl.h
+++ b/src/functors/tnlSinWaveFunction_impl.h
@@ -18,7 +18,7 @@
 #ifndef TNLSINWAVEFUNCTION_IMPL_H_
 #define TNLSINWAVEFUNCTION_IMPL_H_
 
-#include <functions/tnlSinWaveFunction.h>
+#include <functors/tnlSinWaveFunction.h>
 
 template< typename Real >
 tnlSinWaveFunctionBase< Real >::tnlSinWaveFunctionBase()
@@ -81,9 +81,7 @@ template< typename Real >
              int YDiffOrder,
              int ZDiffOrder,
              typename Vertex >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlSinWaveFunction< 1, Real >::
 getValue( const Vertex& v,
@@ -116,9 +114,7 @@ template< typename Real >
              int YDiffOrder,
              int ZDiffOrder,
              typename Vertex >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlSinWaveFunction< 2, Real >::
 getValue( const Vertex& v,
@@ -148,9 +144,7 @@ template< typename Real >
              int YDiffOrder,
              int ZDiffOrder,
              typename Vertex >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlSinWaveFunction< 3, Real >::
 getValue( const Vertex& v,
diff --git a/src/functions/tnlTestFunction.h b/src/functors/tnlTestFunction.h
similarity index 96%
rename from src/functions/tnlTestFunction.h
rename to src/functors/tnlTestFunction.h
index 285009b40e952b2464ab3d3a9a734db99ebf9062..8d09994f141c12ffbbf1efcfe95e08a4641c5f73 100644
--- a/src/functions/tnlTestFunction.h
+++ b/src/functors/tnlTestFunction.h
@@ -67,9 +67,7 @@ class tnlTestFunction
              int ZDiffOrder = 0,
              typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Real getValue( const Vertex& vertex,
                   const Real& time = 0 ) const;
 
@@ -93,9 +91,7 @@ class tnlTestFunction
              int ZDiffOrder = 0,
              typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Real getTimeDerivative( const Vertex& vertex,
                            const Real& time = 0 ) const;
 
@@ -148,6 +144,6 @@ ostream& operator << ( ostream& str, const tnlTestFunction< FunctionDimensions,
    return f.print( str );
 }
 
-#include <functions/tnlTestFunction_impl.h>
+#include <functors/tnlTestFunction_impl.h>
 
 #endif /* TNLTESTFUNCTION_H_ */
diff --git a/src/functions/tnlTestFunction_impl.cpp b/src/functors/tnlTestFunction_impl.cpp
similarity index 97%
rename from src/functions/tnlTestFunction_impl.cpp
rename to src/functors/tnlTestFunction_impl.cpp
index c8baf09f22da5f5871f785e763fdf4c254110c62..948a11afa1b1593ffa7a8462b489096b82406e21 100644
--- a/src/functions/tnlTestFunction_impl.cpp
+++ b/src/functors/tnlTestFunction_impl.cpp
@@ -18,7 +18,7 @@
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
-#include <functions/tnlTestFunction.h>
+#include <functors/tnlTestFunction.h>
 
 template class tnlTestFunction< 1, float, tnlHost >;
 template class tnlTestFunction< 2, float, tnlHost >;
diff --git a/src/functions/tnlTestFunction_impl.cu b/src/functors/tnlTestFunction_impl.cu
similarity index 97%
rename from src/functions/tnlTestFunction_impl.cu
rename to src/functors/tnlTestFunction_impl.cu
index 69354815a11103d8134d35d27e97e323bc3f314b..b6a154832e2cdc277e06361eb5b3dab1431f4303 100644
--- a/src/functions/tnlTestFunction_impl.cu
+++ b/src/functors/tnlTestFunction_impl.cu
@@ -18,7 +18,7 @@
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 #ifdef HAVE_CUDA
 
-#include <functions/tnlTestFunction.h>
+#include <functors/tnlTestFunction.h>
 
 template class tnlTestFunction< 1, float, tnlCuda >;
 template class tnlTestFunction< 2, float, tnlCuda >;
diff --git a/src/functions/tnlTestFunction_impl.h b/src/functors/tnlTestFunction_impl.h
similarity index 95%
rename from src/functions/tnlTestFunction_impl.h
rename to src/functors/tnlTestFunction_impl.h
index 6cd5b627a26bc9f08465f2bd4777ed575debba9c..ec59c13f2c2baecb32bd0c22368dad0cb4a485f8 100644
--- a/src/functions/tnlTestFunction_impl.h
+++ b/src/functors/tnlTestFunction_impl.h
@@ -19,10 +19,10 @@
 #define TNLTESTFUNCTION_IMPL_H_
 
 #include <core/tnlCuda.h>
-#include <functions/tnlConstantFunction.h>
-#include <functions/tnlExpBumpFunction.h>
-#include <functions/tnlSinBumpsFunction.h>
-#include <functions/tnlSinWaveFunction.h>
+#include <functors/tnlConstantFunction.h>
+#include <functors/tnlExpBumpFunction.h>
+#include <functors/tnlSinBumpsFunction.h>
+#include <functors/tnlSinWaveFunction.h>
 
 template< int FunctionDimensions,
           typename Real,
@@ -202,9 +202,7 @@ template< int FunctionDimensions,
              int YDiffOrder,
              int ZDiffOrder,
              typename Vertex >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlTestFunction< FunctionDimensions, Real, Device >::
 getValue( const Vertex& vertex,
@@ -253,9 +251,7 @@ template< int FunctionDimensions,
              int YDiffOrder,
              int ZDiffOrder,
              typename Vertex >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlTestFunction< FunctionDimensions, Real, Device >::
 getTimeDerivative( const Vertex& vertex,
@@ -281,22 +277,17 @@ getTimeDerivative( const Vertex& vertex,
       case constant:
          return scale * ( ( tnlConstantFunction< Dimensions, Real >* ) function )->
                   getValue< XDiffOrder, YDiffOrder, ZDiffOrder, Vertex >( vertex, time );
-         break;
       case expBump:
          return scale * ( ( tnlExpBumpFunction< Dimensions, Real >* ) function )->
                   getValue< XDiffOrder, YDiffOrder, ZDiffOrder, Vertex >( vertex, time );
-         break;
       case sinBumps:
          return scale * ( ( tnlSinBumpsFunction< Dimensions, Real >* ) function )->
                   getValue< XDiffOrder, YDiffOrder, ZDiffOrder, Vertex >( vertex, time );
-         break;
       case sinWave:
          return scale * ( ( tnlSinWaveFunction< Dimensions, Real >* ) function )->
                   getValue< XDiffOrder, YDiffOrder, ZDiffOrder, Vertex >( vertex, time );
-         break;
       default:
          return 0.0;
-         break;
    }
 }
 
@@ -352,7 +343,6 @@ void
 tnlTestFunction< FunctionDimensions, Real, Device >::
 copyFunction( const void* function )
 {
-   cout << "Copy function ********************************* " << endl;
    if( Device::DeviceType == ( int ) tnlHostDevice ) 
    {
       FunctionType* f = new FunctionType;
@@ -374,17 +364,17 @@ tnlTestFunction< FunctionDimensions, Real, Device >::
 printFunction( ostream& str ) const
 {
    FunctionType* f = ( FunctionType* ) this->function;
-   if( Device::DeviceType == ( int ) tnlHostDevice )
+   switch( Device::DeviceType )
    {
-      str << *f;
-      return str;
-   }
-   if( Device::DeviceType == ( int ) tnlCudaDevice )
-   {
-      tnlCuda::print( f, str );
-      return str;
+      case tnlHostDevice:
+         str << *f;
+         return str;
+      case tnlCudaDevice:
+         tnlCuda::print( f, str );
+         return str;
+      default:
+         return str;
    }
-   return str;
 }
 
 template< int FunctionDimensions,
diff --git a/src/legacy/benchmarks/tnlSpmvBenchmarkBase_impl.h b/src/legacy/benchmarks/tnlSpmvBenchmarkBase_impl.h
index 2b1216bb1b82569fd34d7aa255d75026eeab2bab..9066c57d7e5b8b76806346e5651a5232c384210f 100644
--- a/src/legacy/benchmarks/tnlSpmvBenchmarkBase_impl.h
+++ b/src/legacy/benchmarks/tnlSpmvBenchmarkBase_impl.h
@@ -115,7 +115,7 @@ void tnlSpmvBenchmarkBase< Matrix >::runBenchmark( const tnlVector< RealType, De
       iterations ++;
    }
 
-   this -> time = rt_timer. GetTime();
+   this -> time = rt_timer. getTime();
 
    firstErrorOccurence = 0;
    tnlVector< RealType, tnlHost, IndexType > resB( "tnlSpmvBenchmark< Real, Device, Index, Matrix > :: runBenchmark : b" );
diff --git a/src/legacy/benchmarks/tnlSpmvBenchmarkHybridMatrix.h b/src/legacy/benchmarks/tnlSpmvBenchmarkHybridMatrix.h
index ca4b3b5f13f47f8089c813bc015a5228c3692147..8dffd254eedc72d39c48991e462824127a670047 100644
--- a/src/legacy/benchmarks/tnlSpmvBenchmarkHybridMatrix.h
+++ b/src/legacy/benchmarks/tnlSpmvBenchmarkHybridMatrix.h
@@ -109,7 +109,7 @@ void tnlSpmvBenchmarkHybridMatrix< Real, Index > :: runBenchmark( const tnlVecto
       rt_timer. Reset();
 
       this -> iterations = 0;
-      //while( rt_timer. GetTime() < time )
+      //while( rt_timer. getTime() < time )
       {
          for( int i = 0; i < this -> maxIterations; i ++ )
          {
@@ -118,7 +118,7 @@ void tnlSpmvBenchmarkHybridMatrix< Real, Index > :: runBenchmark( const tnlVecto
             this -> iterations ++;
          }
       }
-      this -> time = rt_timer. GetTime();
+      this -> time = rt_timer. getTime();
 
       cusp::array1d< Real, cusp::host_memory > host_b( b );
       host_b = b;
diff --git a/src/legacy/solvers/tnlMatrixSolver.h b/src/legacy/solvers/tnlMatrixSolver.h
index bfdab721b6873af83090482698b4285d4b9aa931..f5351963d3cfdec6be6ef77aa18fe467f324b178 100644
--- a/src/legacy/solvers/tnlMatrixSolver.h
+++ b/src/legacy/solvers/tnlMatrixSolver.h
@@ -122,7 +122,7 @@ void tnlMatrixSolver< Real, Device, Index > :: printOut()
    if( this -> verbosity > 0 )
    {
       int cpu_time = 0;
-      if( this -> cpu_timer ) cpu_time = this -> cpu_timer -> GetTime( 0, this -> solver_comm );
+      if( this -> cpu_timer ) cpu_time = this -> cpu_timer -> getTime( 0, this -> solver_comm );
       if( MPIGetRank() != 0 ) return;
       // TODO: add EST
       //cout << " EST: " << estimated;
@@ -131,7 +131,7 @@ void tnlMatrixSolver< Real, Device, Index > :: printOut()
       if( this -> cpu_timer )
          cout << " CPU: " << setw( 8 ) << cpu_time;
       if( this -> rt_timer )
-         cout << " ELA: " << setw( 8 ) << this -> rt_timer -> GetTime();
+         cout << " ELA: " << setw( 8 ) << this -> rt_timer -> getTime();
       cout << "   \r" << flush;
    }
 };
diff --git a/src/matrices/CMakeLists.txt b/src/matrices/CMakeLists.txt
index 17422420f12d77164c8fb07045c9a15b58dc2d80..58591b19776e1e61fac9171713a1fe8c2fcecec9 100755
--- a/src/matrices/CMakeLists.txt
+++ b/src/matrices/CMakeLists.txt
@@ -30,7 +30,8 @@ SET( headers tnlMatrix.h
              tnlTridiagonalMatrixRow_impl.h
              tnlMultidiagonalMatrixSetter.h
              tnlMultidiagonalMatrixSetter_impl.h
-             tnlMultidiagonalMatrixRow.h )
+             tnlMultidiagonalMatrixRow.h
+             tnlMultidiagonalMatrixRow_impl.h  )
 
 SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/matrices )
 set( common_SOURCES  )
diff --git a/src/matrices/tnlCSRMatrix.h b/src/matrices/tnlCSRMatrix.h
index 2216ce6b62c434802f37b23b0c9f127d08224865..ac272baf2247aaa93622eb8e4ed8888f23e13ec8 100644
--- a/src/matrices/tnlCSRMatrix.h
+++ b/src/matrices/tnlCSRMatrix.h
@@ -35,7 +35,7 @@ class tnlCSRMatrix : public tnlSparseMatrix< Real, Device, Index >
    typedef Real RealType;
    typedef Device DeviceType;
    typedef Index IndexType;
-   typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >:: RowLengthsVector RowLengthsVector;
+   typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >:: CompressedRowsLengthsVector CompressedRowsLengthsVector;
    typedef tnlCSRMatrix< Real, Device, Index > ThisType;
    typedef tnlCSRMatrix< Real, tnlHost, Index > HostType;
    typedef tnlCSRMatrix< Real, tnlCuda, Index > CudaType;
@@ -54,7 +54,7 @@ class tnlCSRMatrix : public tnlSparseMatrix< Real, Device, Index >
    bool setDimensions( const IndexType rows,
                        const IndexType columns );
 
-   bool setRowLengths( const RowLengthsVector& rowLengths );
+   bool setCompressedRowsLengths( const CompressedRowsLengthsVector& rowLengths );
 
    IndexType getRowLength( const IndexType row ) const;
 
@@ -63,9 +63,7 @@ class tnlCSRMatrix : public tnlSparseMatrix< Real, Device, Index >
 
    void reset();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__ 
    bool setElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value );
@@ -73,9 +71,7 @@ class tnlCSRMatrix : public tnlSparseMatrix< Real, Device, Index >
    bool setElement( const IndexType row,
                     const IndexType column,
                     const RealType& value );
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value,
@@ -87,9 +83,7 @@ class tnlCSRMatrix : public tnlSparseMatrix< Real, Device, Index >
                     const RealType& thisElementMultiplicator = 1.0 );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setRowFast( const IndexType row,
                     const IndexType* columnIndexes,
                     const RealType* values,
@@ -101,9 +95,7 @@ class tnlCSRMatrix : public tnlSparseMatrix< Real, Device, Index >
                 const IndexType elements );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addRowFast( const IndexType row,
                     const IndexType* columns,
                     const RealType* values,
@@ -117,36 +109,26 @@ class tnlCSRMatrix : public tnlSparseMatrix< Real, Device, Index >
                 const RealType& thisElementMultiplicator = 1.0 );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    RealType getElementFast( const IndexType row,
                             const IndexType column ) const;
 
    RealType getElement( const IndexType row,
                         const IndexType column ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void getRowFast( const IndexType row,
                     IndexType* columns,
                     RealType* values ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    MatrixRow getRow( const IndexType rowIndex );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const MatrixRow getRow( const IndexType rowIndex ) const;
 
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
@@ -183,9 +165,7 @@ class tnlCSRMatrix : public tnlSparseMatrix< Real, Device, Index >
 
    void setCudaKernelType( const SPMVCudaKernel kernel );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    SPMVCudaKernel getCudaKernelType() const;
 
    void setCudaWarpSize( const int warpSize );
@@ -194,9 +174,7 @@ class tnlCSRMatrix : public tnlSparseMatrix< Real, Device, Index >
 
    void setHybridModeSplit( const IndexType hybridModeSplit );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getHybridModeSplit() const;
 
 #ifdef HAVE_CUDA
diff --git a/src/matrices/tnlCSRMatrix_impl.h b/src/matrices/tnlCSRMatrix_impl.h
index 9002a256735e78450bf9ebbd91031cc5be3bb5ea..d99ee5732fb53c4111bfe60de1914fd4f484946d 100644
--- a/src/matrices/tnlCSRMatrix_impl.h
+++ b/src/matrices/tnlCSRMatrix_impl.h
@@ -23,6 +23,14 @@
 #include <core/vectors/tnlSharedVector.h>
 #include <core/mfuncs.h>
 
+#ifdef HAVE_CUSPARSE
+#include <cusparse.h>
+
+template< typename Real, typename Index >
+class tnlCusparseCSRWrapper {};
+#endif
+
+
 template< typename Real,
           typename Device,
           typename Index >
@@ -69,7 +77,7 @@ bool tnlCSRMatrix< Real, Device, Index >::setDimensions( const IndexType rows,
 template< typename Real,
           typename Device,
           typename Index >
-bool tnlCSRMatrix< Real, Device, Index >::setRowLengths( const RowLengthsVector& rowLengths )
+bool tnlCSRMatrix< Real, Device, Index >::setCompressedRowsLengths( const CompressedRowsLengthsVector& rowLengths )
 {
    /****
     * Compute the rows pointers. The last one is
@@ -130,9 +138,7 @@ void tnlCSRMatrix< Real, Device, Index >::reset()
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlCSRMatrix< Real, Device, Index >::setElementFast( const IndexType row,
                                                           const IndexType column,
                                                           const Real& value )
@@ -154,9 +160,7 @@ bool tnlCSRMatrix< Real, Device, Index >::setElement( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlCSRMatrix< Real, Device, Index >::addElementFast( const IndexType row,
                                                           const IndexType column,
                                                           const RealType& value,
@@ -258,9 +262,7 @@ bool tnlCSRMatrix< Real, Device, Index >::addElement( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlCSRMatrix< Real, Device, Index > :: setRowFast( const IndexType row,
                                                         const IndexType* columnIndexes,
                                                         const RealType* values,
@@ -310,9 +312,7 @@ bool tnlCSRMatrix< Real, Device, Index > :: setRow( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlCSRMatrix< Real, Device, Index > :: addRowFast( const IndexType row,
                                                         const IndexType* columns,
                                                         const RealType* values,
@@ -338,9 +338,7 @@ bool tnlCSRMatrix< Real, Device, Index > :: addRow( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real tnlCSRMatrix< Real, Device, Index >::getElementFast( const IndexType row,
                                                           const IndexType column ) const
 {
@@ -377,9 +375,7 @@ Real tnlCSRMatrix< Real, Device, Index >::getElement( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlCSRMatrix< Real, Device, Index >::getRowFast( const IndexType row,
                                                       IndexType* columns,
                                                       RealType* values ) const
@@ -397,9 +393,7 @@ void tnlCSRMatrix< Real, Device, Index >::getRowFast( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlCSRMatrix< Real, Device, Index >::MatrixRow
 tnlCSRMatrix< Real, Device, Index >::
 getRow( const IndexType rowIndex )
@@ -415,9 +409,7 @@ getRow( const IndexType rowIndex )
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlCSRMatrix< Real, Device, Index >::MatrixRow
 tnlCSRMatrix< Real, Device, Index >::
 getRow( const IndexType rowIndex ) const
@@ -434,9 +426,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename Vector::RealType tnlCSRMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row,
                                                                                  const Vector& vector ) const
 {
@@ -591,9 +581,7 @@ void tnlCSRMatrix< Real, Device, Index >::setCudaKernelType( const SPMVCudaKerne
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlCSRMatrix< Real, Device, Index >::SPMVCudaKernel tnlCSRMatrix< Real, Device, Index >::getCudaKernelType() const
 {
    return this->spmvCudaKernel;
@@ -626,9 +614,7 @@ void tnlCSRMatrix< Real, Device, Index >::setHybridModeSplit( const IndexType hy
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlCSRMatrix< Real, Device, Index >::getHybridModeSplit() const
 {
    return this->hybridModeSplit;
@@ -648,7 +634,7 @@ void tnlCSRMatrix< Real, Device, Index >::spmvCudaVectorized( const InVector& in
                                                               const IndexType warpEnd,
                                                               const IndexType inWarpIdx ) const
 {
-   Real* aux = getSharedMemory< Real >();
+   volatile Real* aux = getSharedMemory< Real >();
    for( IndexType row = warpStart; row < warpEnd; row++ )
    {
       aux[ threadIdx.x ] = 0.0;
@@ -672,8 +658,6 @@ void tnlCSRMatrix< Real, Device, Index >::spmvCudaVectorized( const InVector& in
          if( inWarpIdx < 2 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 2 ];
       if( warpSize >= 2 )
          if( inWarpIdx < 1 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 1 ];
-      __syncthreads(); // TODO: I am not sure why
-
       if( inWarpIdx == 0 )
          outVector[ row ] = aux[ threadIdx.x ];
    }
@@ -739,7 +723,14 @@ class tnlCSRMatrixDeviceDependentCode< tnlHost >
                                  const InVector& inVector,
                                  OutVector& outVector )
       {
-         for( Index row = 0; row < matrix.getRows(); row ++ )
+         const Index rows = matrix.getRows();
+         const tnlCSRMatrix< Real, Device, Index >* matrixPtr = &matrix;
+         const InVector* inVectorPtr = &inVector;
+         OutVector* outVectorPtr = &outVector;
+#ifdef HAVE_OPENMP
+#pragma omp parallel for private( matrixPtr, inVectorPtr, outVectorPtr ), schedule(static )
+#endif         
+         for( Index row = 0; row < rows; row ++ )
             outVector[ row ] = matrix.rowVectorProduct( row, inVector );
       }
 
@@ -849,6 +840,89 @@ void tnlCSRMatrixVectorProductCuda( const tnlCSRMatrix< Real, tnlCuda, Index >&
 }
 
 
+#ifdef HAVE_CUSPARSE
+template<>
+class tnlCusparseCSRWrapper< float, int >
+{
+   public:
+      
+      typedef float Real;
+      typedef int Index;
+      
+      static void vectorProduct( const Index rows,
+                                 const Index columns,
+                                 const Index nnz,
+                                 const Real* values,
+                                 const Index* columnIndexes,
+                                 const Index* rowPointers,
+                                 const Real* x,
+                                 Real* y )
+      {
+         cusparseHandle_t   cusparseHandle;
+         cusparseMatDescr_t cusparseMatDescr;         
+         cusparseCreate( &cusparseHandle );
+         cusparseCreateMatDescr( &cusparseMatDescr );
+         cusparseSetMatType( cusparseMatDescr, CUSPARSE_MATRIX_TYPE_GENERAL );
+         cusparseSetMatIndexBase( cusparseMatDescr, CUSPARSE_INDEX_BASE_ZERO );
+         Real alpha( 1.0 ), beta( 0.0 );
+         cusparseScsrmv( cusparseHandle,
+                         CUSPARSE_OPERATION_NON_TRANSPOSE,
+                         rows,
+                         columns,
+                         nnz,
+                         &alpha,
+                         cusparseMatDescr,
+                         values,
+                         rowPointers,
+                         columnIndexes,
+                         x,
+                         &beta,
+                         y );
+      };
+};
+
+template<>
+class tnlCusparseCSRWrapper< double, int >
+{
+   public:
+      
+      typedef double Real;
+      typedef int Index;
+      
+      static void vectorProduct( const Index rows,
+                                 const Index columns,
+                                 const Index nnz,
+                                 const Real* values,
+                                 const Index* columnIndexes,
+                                 const Index* rowPointers,
+                                 const Real* x,
+                                 Real* y )
+      {
+         cusparseHandle_t   cusparseHandle;
+         cusparseMatDescr_t cusparseMatDescr;         
+         cusparseCreate( &cusparseHandle );
+         cusparseCreateMatDescr( &cusparseMatDescr );
+         cusparseSetMatType( cusparseMatDescr, CUSPARSE_MATRIX_TYPE_GENERAL );
+         cusparseSetMatIndexBase( cusparseMatDescr, CUSPARSE_INDEX_BASE_ZERO );
+         Real alpha( 1.0 ), beta( 0.0 );
+         cusparseDcsrmv( cusparseHandle,
+                         CUSPARSE_OPERATION_NON_TRANSPOSE,
+                         rows,
+                         columns,
+                         nnz,
+                         &alpha,
+                         cusparseMatDescr,
+                         values,
+                         rowPointers,
+                         columnIndexes,
+                         x,
+                         &beta,
+                         y );
+      };
+};
+
+#endif
+
 template<>
 class tnlCSRMatrixDeviceDependentCode< tnlCuda >
 {
@@ -864,10 +938,22 @@ class tnlCSRMatrixDeviceDependentCode< tnlCuda >
                                  const InVector& inVector,
                                  OutVector& outVector )
       {
+#ifdef HAVE_CUSPARSE         
+         tnlCusparseCSRWrapper< Real, Index >::vectorProduct( matrix.getRows(),
+                                                              matrix.getColumns(),
+                                                              matrix.values.getSize(),
+                                                              matrix.values.getData(),
+                                                              matrix.columnIndexes.getData(),
+                                                              matrix.rowPointers.getData(),
+                                                              inVector.getData(),
+                                                              outVector.getData() );
+#else
          tnlCSRMatrixVectorProductCuda( matrix, inVector, outVector );
+#endif         
       }
 
 };
 
 
+
 #endif /* TNLCSRMATRIX_IMPL_H_ */
diff --git a/src/matrices/tnlChunkedEllpackMatrix.h b/src/matrices/tnlChunkedEllpackMatrix.h
index 3c4a4b7f649ea662d2afca0d0a6d8dad45e57d91..dc89069ced382f9c28bde37a5a76c290282e7f92 100644
--- a/src/matrices/tnlChunkedEllpackMatrix.h
+++ b/src/matrices/tnlChunkedEllpackMatrix.h
@@ -2,7 +2,7 @@
                           tnlChunkedEllpackMatrix.h  -  description
                              -------------------
     begin                : Dec 12, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
+    copyright            : (C) 2013 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
@@ -15,6 +15,18 @@
  *                                                                         *
  ***************************************************************************/
 
+/***
+ * Authors:
+ * Oberhuber Tomas, tomas.oberhuber@fjfi.cvut.cz
+ * Heller Martin
+ * 
+ * The algorithm/method was published in:
+ * Heller M., Oberhuber T., Improved Row-grouped CSR Format for Storing of
+ * Sparse Matrices on GPU, Proceedings of Algoritmy 2012, 2012, Handlovičová A.,
+ * Minarechová Z. and Ševčovič D. (ed.), pages 282-290.
+ */
+
+
 #ifndef TNLCHUNKEDELLPACKMATRIX_H_
 #define TNLCHUNKEDELLPACKMATRIX_H_
 
@@ -61,7 +73,7 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
    typedef Device DeviceType;
    typedef Index IndexType;
    typedef tnlChunkedEllpackSliceInfo< IndexType > ChunkedEllpackSliceInfo;
-   typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >:: RowLengthsVector RowLengthsVector;
+   typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >:: CompressedRowsLengthsVector CompressedRowsLengthsVector;
    typedef tnlChunkedEllpackMatrix< Real, Device, Index > ThisType;
    typedef tnlChunkedEllpackMatrix< Real, tnlHost, Index > HostType;
    typedef tnlChunkedEllpackMatrix< Real, tnlCuda, Index > CudaType;
@@ -77,7 +89,7 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
    bool setDimensions( const IndexType rows,
                        const IndexType columns );
 
-   bool setRowLengths( const RowLengthsVector& rowLengths );
+   bool setCompressedRowsLengths( const CompressedRowsLengthsVector& rowLengths );
 
    IndexType getRowLength( const IndexType row ) const;
 
@@ -94,23 +106,17 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
 
    void setNumberOfChunksInSlice( const IndexType chunksInSlice );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getNumberOfChunksInSlice() const;
 
    void setDesiredChunkSize( const IndexType desiredChunkSize );
 
    IndexType getDesiredChunkSize() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getNumberOfSlices() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value );
@@ -119,9 +125,7 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                     const IndexType column,
                     const RealType& value );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value,
@@ -133,9 +137,7 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                     const RealType& thisElementMultiplicator = 1.0 );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setRowFast( const IndexType row,
                     const IndexType* columnIndexes,
                     const RealType* values,
@@ -147,9 +149,7 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                 const IndexType elements );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addRowFast( const IndexType row,
                     const IndexType* columns,
                     const RealType* values,
@@ -163,18 +163,14 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                 const RealType& thisElementMultiplicator = 1.0 );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    RealType getElementFast( const IndexType row,
                             const IndexType column ) const;
 
    RealType getElement( const IndexType row,
                         const IndexType column ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void getRowFast( const IndexType row,
                     IndexType* columns,
                     RealType* values ) const;
@@ -183,20 +179,14 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                 IndexType* columns,
                 RealType* values ) const;*/
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    MatrixRow getRow( const IndexType rowIndex );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const MatrixRow getRow( const IndexType rowIndex ) const;
 
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
@@ -246,7 +236,7 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
 
    void resolveSliceSizes( const tnlVector< Index, tnlHost, Index >& rowLengths );
 
-   bool setSlice( const RowLengthsVector& rowLengths,
+   bool setSlice( const CompressedRowsLengthsVector& rowLengths,
                   const IndexType sliceIdx,
                   IndexType& elementsToAllocation );
 
@@ -257,9 +247,7 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                            RealType& value,
                            RealType& thisElementMultiplicator );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addElementToChunkFast( const IndexType sliceOffset,
                                const IndexType chunkIndex,
                                const IndexType chunkSize,
@@ -267,9 +255,7 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                                RealType& value,
                                RealType& thisElementMultiplicator );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void setChunkFast( const IndexType sliceOffset,
                       const IndexType chunkIndex,
                       const IndexType chunkSize,
@@ -290,9 +276,7 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                            const IndexType column,
                            RealType& value ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool getElementInChunkFast( const IndexType sliceOffset,
                                const IndexType chunkIndex,
                                const IndexType chunkSize,
@@ -305,9 +289,7 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                   IndexType* columns,
                   RealType* values ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void getChunkFast( const IndexType sliceOffset,
                       const IndexType chunkIndex,
                       const IndexType chunkSize,
@@ -315,9 +297,7 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                       RealType* values ) const;
 
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    typename Vector::RealType chunkVectorProduct( const IndexType sliceOffset,
                                                  const IndexType chunkIndex,
                                                  const IndexType chunkSize,
diff --git a/src/matrices/tnlChunkedEllpackMatrix_impl.h b/src/matrices/tnlChunkedEllpackMatrix_impl.h
index 12a7e6866c56b03e7446d5966fee369ce8e40794..49ea5e9ab286a91bc44cfb8b2fb7bd187b152f80 100644
--- a/src/matrices/tnlChunkedEllpackMatrix_impl.h
+++ b/src/matrices/tnlChunkedEllpackMatrix_impl.h
@@ -134,7 +134,7 @@ void tnlChunkedEllpackMatrix< Real, Device, Index >::resolveSliceSizes( const tn
 template< typename Real,
           typename Device,
           typename Index >
-bool tnlChunkedEllpackMatrix< Real, Device, Index >::setSlice( const RowLengthsVector& rowLengths,
+bool tnlChunkedEllpackMatrix< Real, Device, Index >::setSlice( const CompressedRowsLengthsVector& rowLengths,
                                                                const IndexType sliceIndex,
                                                                IndexType& elementsToAllocation )
 {
@@ -217,7 +217,7 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::setSlice( const RowLengthsV
 template< typename Real,
           typename Device,
           typename Index >
-bool tnlChunkedEllpackMatrix< Real, Device, Index >::setRowLengths( const RowLengthsVector& rowLengths )
+bool tnlChunkedEllpackMatrix< Real, Device, Index >::setCompressedRowsLengths( const CompressedRowsLengthsVector& rowLengths )
 {
    tnlAssert( this->getRows() > 0, );
    tnlAssert( this->getColumns() > 0, );
@@ -236,12 +236,12 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::setRowLengths( const RowLen
    {
       tnlChunkedEllpackMatrix< RealType, tnlHost, IndexType > hostMatrix;
       hostMatrix.setDimensions( this->getRows(), this->getColumns() );
-      tnlVector< IndexType, tnlHost, IndexType > hostRowLengths;
-      hostRowLengths.setLike( rowLengths);
-      hostRowLengths = rowLengths;
+      tnlVector< IndexType, tnlHost, IndexType > hostCompressedRowsLengths;
+      hostCompressedRowsLengths.setLike( rowLengths);
+      hostCompressedRowsLengths = rowLengths;
       hostMatrix.setNumberOfChunksInSlice( this->chunksInSlice );
       hostMatrix.setDesiredChunkSize( this->desiredChunkSize );
-      hostMatrix.setRowLengths( hostRowLengths );
+      hostMatrix.setCompressedRowsLengths( hostCompressedRowsLengths );
 
       this->rowToChunkMapping.setLike( hostMatrix.rowToChunkMapping );
       this->rowToChunkMapping = hostMatrix.rowToChunkMapping;
@@ -309,9 +309,7 @@ void tnlChunkedEllpackMatrix< Real, Device, Index >::setNumberOfChunksInSlice( c
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlChunkedEllpackMatrix< Real, Device, Index >::getNumberOfChunksInSlice() const
 {
    return this->chunksInSlice;
@@ -336,9 +334,7 @@ Index tnlChunkedEllpackMatrix< Real, Device, Index >::getDesiredChunkSize() cons
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlChunkedEllpackMatrix< Real, Device, Index >::getNumberOfSlices() const
 {
    return this->numberOfSlices;
@@ -378,9 +374,7 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::operator != ( const tnlChun
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlChunkedEllpackMatrix< Real, Device, Index >::setElementFast( const IndexType row,
                                                                      const IndexType column,
                                                                      const Real& value )
@@ -401,9 +395,7 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::setElement( const IndexType
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlChunkedEllpackMatrix< Real, Device, Index >::addElementFast( const IndexType row,
                                                                      const IndexType _column,
                                                                      const RealType& _value,
@@ -438,9 +430,7 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::addElementFast( const Index
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlChunkedEllpackMatrix< Real, Device, Index >::addElementToChunkFast( const IndexType sliceOffset,
                                                                             const IndexType chunkIndex,
                                                                             const IndexType chunkSize,
@@ -601,9 +591,7 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::addElementToChunk( const In
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlChunkedEllpackMatrix< Real, Device, Index >::setRowFast( const IndexType row,
                                                                  const IndexType* columnIndexes,
                                                                  const RealType* values,
@@ -645,9 +633,7 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::setRowFast( const IndexType
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlChunkedEllpackMatrix< Real, Device, Index >::setChunkFast( const IndexType sliceOffset,
                                                                    const IndexType chunkIndex,
                                                                    const IndexType chunkSize,
@@ -757,9 +743,7 @@ void tnlChunkedEllpackMatrix< Real, Device, Index >::setChunk( const IndexType s
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlChunkedEllpackMatrix< Real, Device, Index > :: addRowFast( const IndexType row,
                                                                    const IndexType* columns,
                                                                    const RealType* values,
@@ -785,9 +769,7 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index > :: addRow( const IndexType r
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real tnlChunkedEllpackMatrix< Real, Device, Index >::getElementFast( const IndexType row,
                                                                      const IndexType column ) const
 {
@@ -809,9 +791,7 @@ Real tnlChunkedEllpackMatrix< Real, Device, Index >::getElementFast( const Index
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlChunkedEllpackMatrix< Real, Device, Index >::getElementInChunkFast( const IndexType sliceOffset,
                                                                             const IndexType chunkIndex,
                                                                             const IndexType chunkSize,
@@ -889,9 +869,7 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::getElementInChunk( const In
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlChunkedEllpackMatrix< Real, Device, Index >::getRowFast( const IndexType row,
                                                                  IndexType* columns,
                                                                  RealType* values ) const
@@ -921,9 +899,7 @@ void tnlChunkedEllpackMatrix< Real, Device, Index >::getRowFast( const IndexType
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlChunkedEllpackMatrix< Real, Device, Index >::getChunkFast( const IndexType sliceOffset,
                                                                    const IndexType chunkIndex,
                                                                    const IndexType chunkSize,
@@ -945,9 +921,7 @@ void tnlChunkedEllpackMatrix< Real, Device, Index >::getChunkFast( const IndexTy
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlChunkedEllpackMatrix< Real, Device, Index >::MatrixRow
 tnlChunkedEllpackMatrix< Real, Device, Index >::
 getRow( const IndexType rowIndex )
@@ -963,9 +937,7 @@ getRow( const IndexType rowIndex )
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlChunkedEllpackMatrix< Real, Device, Index >::MatrixRow
 tnlChunkedEllpackMatrix< Real, Device, Index >::
 getRow( const IndexType rowIndex ) const
@@ -1039,9 +1011,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename Vector::RealType tnlChunkedEllpackMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row,
                                                                                             const Vector& vector ) const
 {
@@ -1072,9 +1042,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename Vector::RealType tnlChunkedEllpackMatrix< Real, Device, Index >::chunkVectorProduct( const IndexType sliceOffset,
                                                                                               const IndexType chunkIndex,
                                                                                               const IndexType chunkSize,
@@ -1320,12 +1288,13 @@ class tnlChunkedEllpackMatrixDeviceDependentCode< tnlHost >
       template< typename Real,
                 typename Index >
       static void resolveSliceSizes( tnlChunkedEllpackMatrix< Real, Device, Index >& matrix,
-                                     const typename tnlChunkedEllpackMatrix< Real, Device, Index >::RowLengthsVector& rowLengths )
+                                     const typename tnlChunkedEllpackMatrix< Real, Device, Index >::CompressedRowsLengthsVector& rowLengths )
       {
          matrix.resolveSliceSizes( rowLengths );
       }
 
       template< typename Index >
+      __cuda_callable__
       static void initChunkTraverse( const Index sliceOffset,
                                      const Index chunkIndex,
                                      const Index chunkSize,
@@ -1380,14 +1349,12 @@ class tnlChunkedEllpackMatrixDeviceDependentCode< tnlCuda >
       template< typename Real,
                 typename Index >
       static void resolveSliceSizes( tnlChunkedEllpackMatrix< Real, Device, Index >& matrix,
-                                     const typename tnlChunkedEllpackMatrix< Real, Device, Index >::RowLengthsVector& rowLengths )
+                                     const typename tnlChunkedEllpackMatrix< Real, Device, Index >::CompressedRowsLengthsVector& rowLengths )
       {
       }
       
       template< typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      __cuda_callable__
       static void initChunkTraverse( const Index sliceOffset,
                                      const Index chunkIndex,
                                      const Index chunkSize,
diff --git a/src/matrices/tnlDenseMatrix.h b/src/matrices/tnlDenseMatrix.h
index bf86805192a074c4e8808d6cdc318da649ae63c8..7e158ff519f6b86a89552ca6d281920224c7d2bc 100644
--- a/src/matrices/tnlDenseMatrix.h
+++ b/src/matrices/tnlDenseMatrix.h
@@ -36,7 +36,7 @@ class tnlDenseMatrix : public tnlMatrix< Real, Device, Index >
    typedef Real RealType;
    typedef Device DeviceType;
    typedef Index IndexType;
-   typedef typename tnlMatrix< Real, Device, Index >::RowLengthsVector RowLengthsVector;
+   typedef typename tnlMatrix< Real, Device, Index >::CompressedRowsLengthsVector CompressedRowsLengthsVector;
    typedef tnlDenseMatrix< Real, Device, Index > ThisType;
    typedef tnlDenseMatrix< Real, tnlHost, Index > HostType;
    typedef tnlDenseMatrix< Real, tnlCuda, Index > CudaType;
@@ -59,7 +59,7 @@ class tnlDenseMatrix : public tnlMatrix< Real, Device, Index >
    /****
     * This method is only for the compatibility with the sparse matrices.
     */
-   bool setRowLengths( const RowLengthsVector& rowLengths );
+   bool setCompressedRowsLengths( const CompressedRowsLengthsVector& rowLengths );
 
    /****
     * Returns maximal number of the nonzero matrix elements that can be stored
@@ -77,9 +77,7 @@ class tnlDenseMatrix : public tnlMatrix< Real, Device, Index >
 
    void setValue( const RealType& v );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value );
@@ -88,9 +86,7 @@ class tnlDenseMatrix : public tnlMatrix< Real, Device, Index >
                     const IndexType column,
                     const RealType& value );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value,
@@ -101,9 +97,7 @@ class tnlDenseMatrix : public tnlMatrix< Real, Device, Index >
                     const RealType& value,
                     const RealType& thisElementMultiplicator = 1.0 );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setRowFast( const IndexType row,
                     const IndexType* columns,
                     const RealType* values,
@@ -114,9 +108,7 @@ class tnlDenseMatrix : public tnlMatrix< Real, Device, Index >
                 const RealType* values,
                 const IndexType elements );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addRowFast( const IndexType row,
                     const IndexType* columns,
                     const RealType* values,
@@ -129,18 +121,14 @@ class tnlDenseMatrix : public tnlMatrix< Real, Device, Index >
                 const IndexType elements,
                 const RealType& thisRowMultiplicator = 1.0 );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Real getElementFast( const IndexType row,
                         const IndexType column ) const;
 
    Real getElement( const IndexType row,
                     const IndexType column ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void getRowFast( const IndexType row,
                     IndexType* columns,
                     RealType* values ) const;
@@ -149,20 +137,14 @@ class tnlDenseMatrix : public tnlMatrix< Real, Device, Index >
                 IndexType* columns,
                 RealType* values ) const;*/
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    MatrixRow getRow( const IndexType rowIndex );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const MatrixRow getRow( const IndexType rowIndex ) const;
 
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
@@ -217,9 +199,7 @@ class tnlDenseMatrix : public tnlMatrix< Real, Device, Index >
 
    protected:
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getElementIndex( const IndexType row,
                               const IndexType column ) const;
 
diff --git a/src/matrices/tnlDenseMatrixRow.h b/src/matrices/tnlDenseMatrixRow.h
index 29686a58e547f0eb9738e370deae29771fb9755d..0882c960b0e59e099ed0d57ed73f5ca17e871c1a 100644
--- a/src/matrices/tnlDenseMatrixRow.h
+++ b/src/matrices/tnlDenseMatrixRow.h
@@ -23,28 +23,20 @@ class tnlDenseMatrixRow
 {
    public:
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       tnlDenseMatrixRow();
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       tnlDenseMatrixRow( Real* values,
                          const Index columns,
                          const Index step );
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       void bind( Real* values,
                  const Index columns,
                  const Index step );
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       void setElement( const Index& elementIndex,
                        const Index& column,
                        const Real& value );
diff --git a/src/matrices/tnlDenseMatrixRow_impl.h b/src/matrices/tnlDenseMatrixRow_impl.h
index f752e26157fac6bdf82ba9f9f8a09df8904cd41b..3ad6af7f2b83533c02016467bbf9f5307b0cf2ca 100644
--- a/src/matrices/tnlDenseMatrixRow_impl.h
+++ b/src/matrices/tnlDenseMatrixRow_impl.h
@@ -19,9 +19,7 @@
 #define TNLDENSEMATRIXROW_IMPL_H_
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 tnlDenseMatrixRow< Real, Index >::
 tnlDenseMatrixRow()
 : values( 0 ),
@@ -31,9 +29,7 @@ tnlDenseMatrixRow()
 }
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 tnlDenseMatrixRow< Real, Index >::
 tnlDenseMatrixRow( Real* values,
                    const Index columns,
@@ -45,9 +41,7 @@ tnlDenseMatrixRow( Real* values,
 }
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlDenseMatrixRow< Real, Index >::
 bind( Real* values,
@@ -60,9 +54,7 @@ bind( Real* values,
 }
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlDenseMatrixRow< Real, Index >::
 setElement( const Index& elementIndex,
diff --git a/src/matrices/tnlDenseMatrix_impl.h b/src/matrices/tnlDenseMatrix_impl.h
index fea62c6a9f3315472e06f380d1dcb19509827001..aa5f316aa027ccc916c7042ac034aaa9e657364c 100644
--- a/src/matrices/tnlDenseMatrix_impl.h
+++ b/src/matrices/tnlDenseMatrix_impl.h
@@ -78,7 +78,7 @@ bool tnlDenseMatrix< Real, Device, Index >::setLike( const tnlDenseMatrix< Real2
 template< typename Real,
           typename Device,
           typename Index >
-bool tnlDenseMatrix< Real, Device, Index >::setRowLengths( const RowLengthsVector& rowLengths )
+bool tnlDenseMatrix< Real, Device, Index >::setCompressedRowsLengths( const CompressedRowsLengthsVector& rowLengths )
 {
    return true;
 }
@@ -141,9 +141,7 @@ void tnlDenseMatrix< Real, Device, Index >::setValue( const Real& value )
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlDenseMatrix< Real, Device, Index >::setElementFast( const IndexType row,
                                                             const IndexType column,
                                                             const RealType& value )
@@ -171,9 +169,7 @@ bool tnlDenseMatrix< Real, Device, Index >::setElement( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlDenseMatrix< Real, Device, Index >::addElementFast( const IndexType row,
                                                             const IndexType column,
                                                             const RealType& value,
@@ -213,9 +209,7 @@ bool tnlDenseMatrix< Real, Device, Index >::addElement( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlDenseMatrix< Real, Device, Index >::setRowFast( const IndexType row,
                                                         const IndexType* columns,
                                                         const RealType* values,
@@ -250,9 +244,7 @@ bool tnlDenseMatrix< Real, Device, Index >::setRow( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlDenseMatrix< Real, Device, Index >::addRowFast( const IndexType row,
                                                         const IndexType* columns,
                                                         const RealType* values,
@@ -292,9 +284,7 @@ bool tnlDenseMatrix< Real, Device, Index >::addRow( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real tnlDenseMatrix< Real, Device, Index >::getElementFast( const IndexType row,
                                                             const IndexType column ) const
 {
@@ -316,9 +306,7 @@ Real tnlDenseMatrix< Real, Device, Index >::getElement( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlDenseMatrix< Real, Device, Index >::getRowFast( const IndexType row,
                                                         IndexType* columns,
                                                         RealType* values ) const
@@ -333,9 +321,7 @@ void tnlDenseMatrix< Real, Device, Index >::getRowFast( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlDenseMatrix< Real, Device, Index >::MatrixRow
 tnlDenseMatrix< Real, Device, Index >::
 getRow( const IndexType rowIndex )
@@ -353,9 +339,7 @@ getRow( const IndexType rowIndex )
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlDenseMatrix< Real, Device, Index >::MatrixRow
 tnlDenseMatrix< Real, Device, Index >::
 getRow( const IndexType rowIndex ) const
@@ -374,9 +358,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename Vector::RealType tnlDenseMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row,
                                                                                    const Vector& vector ) const
 {
@@ -937,9 +919,7 @@ void tnlDenseMatrix< Real, Device, Index >::print( ostream& str ) const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlDenseMatrix< Real, Device, Index >::getElementIndex( const IndexType row,
                                                               const IndexType column ) const
 {
@@ -966,6 +946,9 @@ class tnlDenseMatrixDeviceDependentCode< tnlHost >
                                  const InVector& inVector,
                                  OutVector& outVector )
       {
+#ifdef HAVE_OPENMP
+#pragma omp parallel for
+#endif           
          for( Index row = 0; row < matrix.getRows(); row ++ )
             outVector[ row ] = matrix.rowVectorProduct( row, inVector );
       }
diff --git a/src/matrices/tnlEllpackMatrix.h b/src/matrices/tnlEllpackMatrix.h
index 4667c4111d0b619509d8af54c1041cf8d524fe6e..4003ceed39fb218da89926fc800fd6ea5c7171ac 100644
--- a/src/matrices/tnlEllpackMatrix.h
+++ b/src/matrices/tnlEllpackMatrix.h
@@ -32,7 +32,7 @@ class tnlEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
    typedef Real RealType;
    typedef Device DeviceType;
    typedef Index IndexType;
-   typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::RowLengthsVector RowLengthsVector;
+   typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::CompressedRowsLengthsVector CompressedRowsLengthsVector;
    typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
    typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
    typedef tnlEllpackMatrix< Real, Device, Index > ThisType;
@@ -50,9 +50,9 @@ class tnlEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
    bool setDimensions( const IndexType rows,
                        const IndexType columns );
 
-   bool setRowLengths( const RowLengthsVector& rowLengths );
+   bool setCompressedRowsLengths( const CompressedRowsLengthsVector& rowLengths );
 
-   bool setConstantRowLengths( const IndexType& rowLengths );
+   bool setConstantCompressedRowsLengths( const IndexType& rowLengths );
 
    IndexType getRowLength( const IndexType row ) const;
 
@@ -69,11 +69,9 @@ class tnlEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
 
    /*template< typename Matrix >
    bool copyFrom( const Matrix& matrix,
-                  const RowLengthsVector& rowLengths );*/
+                  const CompressedRowsLengthsVector& rowLengths );*/
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value );
@@ -82,9 +80,7 @@ class tnlEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                     const IndexType column,
                     const RealType& value );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value,
@@ -96,9 +92,7 @@ class tnlEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                     const RealType& thisElementMultiplicator = 1.0 );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setRowFast( const IndexType row,
                     const IndexType* columnIndexes,
                     const RealType* values,
@@ -110,9 +104,7 @@ class tnlEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                 const IndexType elements );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addRowFast( const IndexType row,
                     const IndexType* columns,
                     const RealType* values,
@@ -125,36 +117,26 @@ class tnlEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                 const IndexType numberOfElements,
                 const RealType& thisElementMultiplicator = 1.0 );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    RealType getElementFast( const IndexType row,
                             const IndexType column ) const;
 
    RealType getElement( const IndexType row,
                         const IndexType column ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void getRowFast( const IndexType row,
                     IndexType* columns,
                     RealType* values ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    MatrixRow getRow( const IndexType rowIndex );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const MatrixRow getRow( const IndexType rowIndex ) const;
 
-template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   template< typename Vector >
+   __cuda_callable__
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
diff --git a/src/matrices/tnlEllpackMatrix_impl.h b/src/matrices/tnlEllpackMatrix_impl.h
index f0f3459f8f450938b239d7b74aed2d8354fd960c..088dd84e404b0663e18f0b53552f1738aa5138f4 100644
--- a/src/matrices/tnlEllpackMatrix_impl.h
+++ b/src/matrices/tnlEllpackMatrix_impl.h
@@ -72,7 +72,7 @@ bool tnlEllpackMatrix< Real, Device, Index >::setDimensions( const IndexType row
 template< typename Real,
           typename Device,
           typename Index >
-bool tnlEllpackMatrix< Real, Device, Index >::setRowLengths( const RowLengthsVector& rowLengths )
+bool tnlEllpackMatrix< Real, Device, Index >::setCompressedRowsLengths( const CompressedRowsLengthsVector& rowLengths )
 {
    tnlAssert( this->getRows() > 0, );
    tnlAssert( this->getColumns() > 0, );
@@ -84,7 +84,7 @@ bool tnlEllpackMatrix< Real, Device, Index >::setRowLengths( const RowLengthsVec
 template< typename Real,
           typename Device,
           typename Index >
-bool tnlEllpackMatrix< Real, Device, Index >::setConstantRowLengths( const IndexType& rowLengths )
+bool tnlEllpackMatrix< Real, Device, Index >::setConstantCompressedRowsLengths( const IndexType& rowLengths )
 {
    tnlAssert( rowLengths > 0,
               cerr << " rowLengths = " << rowLengths );
@@ -161,7 +161,7 @@ bool tnlEllpackMatrix< Real, Device, Index >::operator != ( const tnlEllpackMatr
           typename Index >
    template< typename Matrix >
 bool tnlEllpackMatrix< Real, Device, Index >::copyFrom( const Matrix& matrix,
-                                                        const RowLengthsVector& rowLengths )
+                                                        const CompressedRowsLengthsVector& rowLengths )
 {
    return tnlMatrix< RealType, DeviceType, IndexType >::copyFrom( matrix, rowLengths );
 }*/
@@ -169,9 +169,7 @@ bool tnlEllpackMatrix< Real, Device, Index >::copyFrom( const Matrix& matrix,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlEllpackMatrix< Real, Device, Index > :: setElementFast( const IndexType row,
                                                                 const IndexType column,
                                                                 const Real& value )
@@ -193,9 +191,7 @@ bool tnlEllpackMatrix< Real, Device, Index > :: setElement( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlEllpackMatrix< Real, Device, Index > :: addElementFast( const IndexType row,
                                                                 const IndexType column,
                                                                 const RealType& value,
@@ -291,9 +287,7 @@ bool tnlEllpackMatrix< Real, Device, Index > :: addElement( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlEllpackMatrix< Real, Device, Index > :: setRowFast( const IndexType row,
                                                             const IndexType* columnIndexes,
                                                             const RealType* values,
@@ -359,9 +353,7 @@ bool tnlEllpackMatrix< Real, Device, Index > :: setRow( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlEllpackMatrix< Real, Device, Index > :: addRowFast( const IndexType row,
                                                             const IndexType* columns,
                                                             const RealType* values,
@@ -388,9 +380,7 @@ bool tnlEllpackMatrix< Real, Device, Index > :: addRow( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real tnlEllpackMatrix< Real, Device, Index >::getElementFast( const IndexType row,
                                                               const IndexType column ) const
 {
@@ -430,9 +420,7 @@ Real tnlEllpackMatrix< Real, Device, Index >::getElement( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlEllpackMatrix< Real, Device, Index >::getRowFast( const IndexType row,
                                                           IndexType* columns,
                                                           RealType* values ) const
@@ -453,13 +441,12 @@ void tnlEllpackMatrix< Real, Device, Index >::getRowFast( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlEllpackMatrix< Real, Device, Index >::MatrixRow
 tnlEllpackMatrix< Real, Device, Index >::
 getRow( const IndexType rowIndex )
 {
+   //printf( "this->rowLengths = %d this = %p \n", this->rowLengths, this );
    IndexType rowBegin = DeviceDependentCode::getRowBegin( *this, rowIndex );
    return MatrixRow( &this->columnIndexes[ rowBegin ],
                      &this->values[ rowBegin ],
@@ -470,13 +457,12 @@ getRow( const IndexType rowIndex )
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlEllpackMatrix< Real, Device, Index >::MatrixRow
 tnlEllpackMatrix< Real, Device, Index >::
 getRow( const IndexType rowIndex ) const
 {
+   //printf( "this->rowLengths = %d this = %p \n", this->rowLengths, this );
    IndexType rowBegin = DeviceDependentCode::getRowBegin( *this, rowIndex );
    return MatrixRow( &this->columnIndexes[ rowBegin ],
                      &this->values[ rowBegin ],
@@ -488,9 +474,7 @@ template< typename Real,
           typename Device,
           typename Index >
   template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename Vector::RealType tnlEllpackMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row,
                                                                                      const Vector& vector ) const
 {
@@ -636,15 +620,16 @@ void tnlEllpackMatrix< Real, Device, Index >::print( ostream& str ) const
    for( IndexType row = 0; row < this->getRows(); row++ )
    {
       str <<"Row: " << row << " -> ";
-      IndexType i( row * this->rowLengths );
-      const IndexType rowEnd( i + this->rowLengths );
+      IndexType i = DeviceDependentCode::getRowBegin( *this, row );
+      const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, row );
+      const IndexType step = DeviceDependentCode::getElementStep( *this );
       while( i < rowEnd &&
              this->columnIndexes.getElement( i ) < this->columns &&
              this->columnIndexes.getElement( i ) != this->getPaddingIndex() )
       {
          const Index column = this->columnIndexes.getElement( i );
          str << " Col:" << column << "->" << this->values.getElement( i ) << "\t";
-         i++;
+         i += step;
       }
       str << endl;
    }
@@ -669,6 +654,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlHost >
 
       template< typename Real,
                 typename Index >
+      __cuda_callable__
       static Index getRowBegin( const tnlEllpackMatrix< Real, Device, Index >& matrix,
                                 const Index row )
       {
@@ -677,6 +663,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlHost >
 
       template< typename Real,
                 typename Index >
+      __cuda_callable__
       static Index getRowEnd( const tnlEllpackMatrix< Real, Device, Index >& matrix,
                                 const Index row )
       {
@@ -685,6 +672,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlHost >
 
       template< typename Real,
                 typename Index >
+      __cuda_callable__
       static Index getElementStep( const tnlEllpackMatrix< Real, Device, Index >& matrix )
       {
          return 1;
@@ -698,6 +686,9 @@ class tnlEllpackMatrixDeviceDependentCode< tnlHost >
                                  const InVector& inVector,
                                  OutVector& outVector )
       {
+#ifdef HAVE_OPENMP
+#pragma omp parallel for
+#endif           
          for( Index row = 0; row < matrix.getRows(); row ++ )
             outVector[ row ] = matrix.rowVectorProduct( row, inVector );
       }
@@ -712,9 +703,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda >
 
       template< typename Real,
                 typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       static Index getRowBegin( const tnlEllpackMatrix< Real, Device, Index >& matrix,
                                 const Index row )
       {
@@ -723,9 +712,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda >
 
       template< typename Real,
                 typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       static Index getRowEnd( const tnlEllpackMatrix< Real, Device, Index >& matrix,
                                 const Index row )
       {
@@ -734,9 +721,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda >
 
       template< typename Real,
                 typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       static Index getElementStep( const tnlEllpackMatrix< Real, Device, Index >& matrix )
       {
          return matrix.alignedRows;
@@ -754,7 +739,4 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda >
       }
 };
 
-
-
-
 #endif /* TNLELLPACKMATRIX_IMPL_H_ */
diff --git a/src/matrices/tnlMatrix.h b/src/matrices/tnlMatrix.h
index ad3e9960da5243883653201588e43f1cda415364..7c0bfb8e452bab36171029bee6b07f0a45ffd647 100644
--- a/src/matrices/tnlMatrix.h
+++ b/src/matrices/tnlMatrix.h
@@ -32,7 +32,7 @@ class tnlMatrix : public virtual tnlObject
    typedef Real RealType;
    typedef Device DeviceType;
    typedef Index IndexType;
-   typedef tnlVector< IndexType, DeviceType, IndexType > RowLengthsVector;
+   typedef tnlVector< IndexType, DeviceType, IndexType > CompressedRowsLengthsVector;
    typedef tnlVector< RealType, DeviceType, IndexType > ValuesVector;
 
    tnlMatrix();
@@ -40,11 +40,11 @@ class tnlMatrix : public virtual tnlObject
    virtual bool setDimensions( const IndexType rows,
                                const IndexType columns );
 
-   virtual bool setRowLengths( const RowLengthsVector& rowLengths ) = 0;
+   virtual bool setCompressedRowsLengths( const CompressedRowsLengthsVector& rowLengths ) = 0;
 
    virtual IndexType getRowLength( const IndexType row ) const = 0;
 
-   virtual void getRowLengths( tnlVector< IndexType, DeviceType, IndexType >& rowLengths ) const;
+   virtual void getCompressedRowsLengths( tnlVector< IndexType, DeviceType, IndexType >& rowLengths ) const;
 
    template< typename Real2, typename Device2, typename Index2 >
    bool setLike( const tnlMatrix< Real2, Device2, Index2 >& matrix );
@@ -55,14 +55,10 @@ class tnlMatrix : public virtual tnlObject
 
    void reset();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getRows() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getColumns() const;
 
    /****
@@ -104,7 +100,7 @@ class tnlMatrix : public virtual tnlObject
 
    template< typename Matrix >
    bool copyFrom( const Matrix& matrix,
-                  const RowLengthsVector& rowLengths );
+                  const CompressedRowsLengthsVector& rowLengths );
 
    virtual bool save( tnlFile& file ) const;
 
diff --git a/src/matrices/tnlMatrixReader.h b/src/matrices/tnlMatrixReader.h
index 52215c565e48011b3235f703508ba2e4023f2cdc..8675f584e6aaa2abd5b2cb88bfa42cf2d77ca1da 100644
--- a/src/matrices/tnlMatrixReader.h
+++ b/src/matrices/tnlMatrixReader.h
@@ -44,7 +44,7 @@ class tnlMatrixReader
 
    static bool readMtxFileHostMatrix( std::istream& file,
                                       Matrix& matrix,
-                                      typename Matrix::RowLengthsVector& rowLengths,
+                                      typename Matrix::CompressedRowsLengthsVector& rowLengths,
                                       bool verbose );
 
 
@@ -68,7 +68,7 @@ class tnlMatrixReader
                               bool& symmetricMatrix,
                               bool verbose );
 
-   static bool computeRowLengthsFromMtxFile( std::istream& file,
+   static bool computeCompressedRowsLengthsFromMtxFile( std::istream& file,
                                              tnlVector< int, tnlHost, int >& rowLengths,
                                              const int columns,
                                              const int rows,
diff --git a/src/matrices/tnlMatrixReader_impl.h b/src/matrices/tnlMatrixReader_impl.h
index 6603692e31132c0de0fef57805c2000f2fcfd11f..24ce946f19d7b8a6ee3e602ada0930ddcaad328e 100644
--- a/src/matrices/tnlMatrixReader_impl.h
+++ b/src/matrices/tnlMatrixReader_impl.h
@@ -52,7 +52,7 @@ bool tnlMatrixReader< Matrix >::readMtxFile( std::istream& file,
 template< typename Matrix >
 bool tnlMatrixReader< Matrix >::readMtxFileHostMatrix( std::istream& file,
                                                        Matrix& matrix,
-                                                       typename Matrix::RowLengthsVector& rowLengths,
+                                                       typename Matrix::CompressedRowsLengthsVector& rowLengths,
                                                        bool verbose )
 {
    IndexType rows, columns;
@@ -69,10 +69,10 @@ bool tnlMatrixReader< Matrix >::readMtxFileHostMatrix( std::istream& file,
       return false;
    }
 
-   if( ! computeRowLengthsFromMtxFile( file, rowLengths, columns, rows, symmetricMatrix, verbose ) )
+   if( ! computeCompressedRowsLengthsFromMtxFile( file, rowLengths, columns, rows, symmetricMatrix, verbose ) )
       return false;
 
-   if( ! matrix.setRowLengths( rowLengths ) )
+   if( ! matrix.setCompressedRowsLengths( rowLengths ) )
       return false;
 
    if( ! readMatrixElementsFromMtxFile( file, matrix, symmetricMatrix, verbose ) )
@@ -126,8 +126,8 @@ bool tnlMatrixReader< Matrix >::verifyMtxFile( std::istream& file,
    long int fileSize = file.tellg();
    if( verbose )
       cout << " Verifying the matrix elements ... " << processedElements << " / " << matrix.getNumberOfMatrixElements()
-           << " -> " << timer.GetTime()
-           << " sec. i.e. " << fileSize / ( timer.GetTime() * ( 1 << 20 ))  << "MB/s." << endl;
+           << " -> " << timer.getTime()
+           << " sec. i.e. " << fileSize / ( timer.getTime() * ( 1 << 20 ))  << "MB/s." << endl;
    return true;
 }
 
@@ -144,8 +144,6 @@ bool tnlMatrixReader< Matrix >::findLineByElement( std::istream& file,
    bool dimensionsLine( false );
    lineNumber = 0;
    tnlTimerRT timer;
-   IndexType currentRow, currentColumn;
-   RealType value;
    while( line.getLine( file ) )
    {
       lineNumber++;
@@ -260,7 +258,7 @@ bool tnlMatrixReader< Matrix >::readMtxHeader( std::istream& file,
 }
 
 template< typename Matrix >
-bool tnlMatrixReader< Matrix >::computeRowLengthsFromMtxFile( std::istream& file,
+bool tnlMatrixReader< Matrix >::computeCompressedRowsLengthsFromMtxFile( std::istream& file,
                                                               tnlVector< int, tnlHost, int >& rowLengths,
                                                               const int columns,
                                                               const int rows,
@@ -314,8 +312,8 @@ bool tnlMatrixReader< Matrix >::computeRowLengthsFromMtxFile( std::istream& file
    long int fileSize = file.tellg();
    if( verbose )
       cout << " Counting the matrix elements ... " << numberOfElements / 1000
-           << " thousands  -> " << timer.GetTime()
-           << " sec. i.e. " << fileSize / ( timer.GetTime() * ( 1 << 20 ))  << "MB/s." << endl;
+           << " thousands  -> " << timer.getTime()
+           << " sec. i.e. " << fileSize / ( timer.getTime() * ( 1 << 20 ))  << "MB/s." << endl;
    return true;
 }
 
@@ -357,8 +355,8 @@ bool tnlMatrixReader< Matrix >::readMatrixElementsFromMtxFile( std::istream& fil
    long int fileSize = file.tellg();
    if( verbose )
       cout << " Reading the matrix elements ... " << processedElements << " / " << matrix.getNumberOfMatrixElements()
-              << " -> " << timer.GetTime()
-              << " sec. i.e. " << fileSize / ( timer.GetTime() * ( 1 << 20 ))  << "MB/s." << endl;
+              << " -> " << timer.getTime()
+              << " sec. i.e. " << fileSize / ( timer.getTime() * ( 1 << 20 ))  << "MB/s." << endl;
    return true;
 }
 
@@ -391,7 +389,7 @@ class tnlMatrixReaderDeviceDependentCode< tnlHost >
                             Matrix& matrix,
                             bool verbose )
    {
-      typename Matrix::RowLengthsVector rowLengths;
+      typename Matrix::CompressedRowsLengthsVector rowLengths;
       return tnlMatrixReader< Matrix >::readMtxFileHostMatrix( file, matrix, rowLengths, verbose );
    }
 };
@@ -407,17 +405,17 @@ class tnlMatrixReaderDeviceDependentCode< tnlCuda >
                             bool verbose )
    {
       typedef typename Matrix::HostType HostMatrixType;
-      typedef typename HostMatrixType::RowLengthsVector RowLengthsVector;
+      typedef typename HostMatrixType::CompressedRowsLengthsVector CompressedRowsLengthsVector;
 
       HostMatrixType hostMatrix;
-      RowLengthsVector rowLengthsVector;
+      CompressedRowsLengthsVector rowLengthsVector;
       if( ! tnlMatrixReader< HostMatrixType >::readMtxFileHostMatrix( file, hostMatrix, rowLengthsVector, verbose ) )
          return false;
 
-      typename Matrix::RowLengthsVector cudaRowLengthsVector;
-      cudaRowLengthsVector.setLike( rowLengthsVector );
-      cudaRowLengthsVector = rowLengthsVector;
-      if( ! matrix.copyFrom( hostMatrix, cudaRowLengthsVector ) )
+      typename Matrix::CompressedRowsLengthsVector cudaCompressedRowsLengthsVector;
+      cudaCompressedRowsLengthsVector.setLike( rowLengthsVector );
+      cudaCompressedRowsLengthsVector = rowLengthsVector;
+      if( ! matrix.copyFrom( hostMatrix, cudaCompressedRowsLengthsVector ) )
          return false;
       return true;
    }
diff --git a/src/matrices/tnlMatrixSetter.h b/src/matrices/tnlMatrixSetter.h
index 98c6400f1e9a1c460511cf1cd849a87cddf5deb1..19e4b082b90e19b7111ca46a892c61ecb61c4e42 100644
--- a/src/matrices/tnlMatrixSetter.h
+++ b/src/matrices/tnlMatrixSetter.h
@@ -20,7 +20,7 @@
 
 template< typename DifferentialOperator,
           typename BoundaryConditions,
-          typename RowLengthsVector >
+          typename CompressedRowsLengthsVector >
 class tnlMatrixSetterTraversalUserData
 {
    public:
@@ -29,11 +29,11 @@ class tnlMatrixSetterTraversalUserData
 
       const BoundaryConditions* boundaryConditions;
 
-      RowLengthsVector* rowLengths;
+      CompressedRowsLengthsVector* rowLengths;
 
       tnlMatrixSetterTraversalUserData( const DifferentialOperator& differentialOperator,
                                         const BoundaryConditions& boundaryConditions,
-                                        RowLengthsVector& rowLengths )
+                                        CompressedRowsLengthsVector& rowLengths )
       : differentialOperator( &differentialOperator ),
         boundaryConditions( &boundaryConditions ),
         rowLengths( &rowLengths )
@@ -44,22 +44,22 @@ class tnlMatrixSetterTraversalUserData
 template< typename Mesh,
           typename DifferentialOperator,
           typename BoundaryConditions,
-          typename RowLengthsVector >
+          typename CompressedRowsLengthsVector >
 class tnlMatrixSetter
 {
    public:
    typedef Mesh MeshType;
    typedef typename MeshType::DeviceType DeviceType;
-   typedef typename RowLengthsVector::RealType IndexType;
+   typedef typename CompressedRowsLengthsVector::RealType IndexType;
    typedef tnlMatrixSetterTraversalUserData< DifferentialOperator,
                                              BoundaryConditions,
-                                             RowLengthsVector > TraversalUserData;
+                                             CompressedRowsLengthsVector > TraversalUserData;
 
    template< int EntityDimensions >
-   void getRowLengths( const MeshType& mesh,
+   void getCompressedRowsLengths( const MeshType& mesh,
                        DifferentialOperator& differentialOperator,
                        BoundaryConditions& boundaryConditions,
-                       RowLengthsVector& rowLengths ) const;
+                       CompressedRowsLengthsVector& rowLengths ) const;
 
 
    class TraversalBoundaryEntitiesProcessor
@@ -67,9 +67,7 @@ class tnlMatrixSetter
       public:
 
          template< int EntityDimension >
-#ifdef HAVE_CUDA
-         __device__ __host__
-#endif
+         __cuda_callable__
          static void processEntity( const MeshType& mesh,
                                     TraversalUserData& userData,
                                     const IndexType index )
@@ -85,9 +83,7 @@ class tnlMatrixSetter
       public:
 
          template< int EntityDimensions >
-#ifdef HAVE_CUDA
-         __device__ __host__
-#endif
+         __cuda_callable__
          static void processEntity( const MeshType& mesh,
                                     TraversalUserData& userData,
                                     const IndexType index )
@@ -106,34 +102,32 @@ template< int Dimensions,
           typename Index,
           typename DifferentialOperator,
           typename BoundaryConditions,
-          typename RowLengthsVector >
+          typename CompressedRowsLengthsVector >
 class tnlMatrixSetter< tnlGrid< Dimensions, Real, Device, Index >,
                        DifferentialOperator,
                        BoundaryConditions,
-                       RowLengthsVector >
+                       CompressedRowsLengthsVector >
 {
    public:
    typedef tnlGrid< Dimensions, Real, Device, Index > MeshType;
    typedef typename MeshType::DeviceType DeviceType;
-   typedef typename RowLengthsVector::RealType IndexType;
+   typedef typename CompressedRowsLengthsVector::RealType IndexType;
    typedef typename MeshType::CoordinatesType CoordinatesType;
    typedef tnlMatrixSetterTraversalUserData< DifferentialOperator,
                                              BoundaryConditions,
-                                             RowLengthsVector > TraversalUserData;
+                                             CompressedRowsLengthsVector > TraversalUserData;
 
    template< int EntityDimensions >
-   void getRowLengths( const MeshType& mesh,
+   void getCompressedRowsLengths( const MeshType& mesh,
                        const DifferentialOperator& differentialOperator,
                        const BoundaryConditions& boundaryConditions,
-                       RowLengthsVector& rowLengths ) const;
+                       CompressedRowsLengthsVector& rowLengths ) const;
 
    class TraversalBoundaryEntitiesProcessor
    {
       public:
 
-#ifdef HAVE_CUDA
-         __device__ __host__
-#endif
+         __cuda_callable__
          static void processCell( const MeshType& mesh,
                                   TraversalUserData& userData,
                                   const IndexType index,
@@ -143,15 +137,25 @@ class tnlMatrixSetter< tnlGrid< Dimensions, Real, Device, Index >,
                      userData.boundaryConditions->getLinearSystemRowLength( mesh, index, coordinates );
          }
 
+         __cuda_callable__
+         static void processFace( const MeshType& mesh,
+                                  TraversalUserData& userData,
+                                  const IndexType index,
+                                  const CoordinatesType& coordinates )
+         {
+             //printf("Matrix setter: Index = %d \n", index );
+            ( *userData.rowLengths )[ index ] =
+                     userData.boundaryConditions->getLinearSystemRowLength( mesh, index, coordinates );
+         }
+         
+
    };
 
    class TraversalInteriorEntitiesProcessor
    {
       public:
 
-#ifdef HAVE_CUDA
-         __device__ __host__
-#endif
+         __cuda_callable__
          static void processCell( const MeshType& mesh,
                                   TraversalUserData& userData,
                                   const IndexType index,
@@ -160,6 +164,18 @@ class tnlMatrixSetter< tnlGrid< Dimensions, Real, Device, Index >,
             ( *userData.rowLengths )[ index ] =
                      userData.differentialOperator->getLinearSystemRowLength( mesh, index, coordinates );
          }
+         
+         __cuda_callable__
+         static void processFace( const MeshType& mesh,
+                                  TraversalUserData& userData,
+                                  const IndexType index,
+                                  const CoordinatesType& coordinates )
+         {
+            // printf("Matrix setter: Index = %d \n", index );
+            ( *userData.rowLengths )[ index ] =
+                     userData.differentialOperator->getLinearSystemRowLength( mesh, index, coordinates );
+         }
+         
 
    };
 
diff --git a/src/matrices/tnlMatrixSetter_impl.h b/src/matrices/tnlMatrixSetter_impl.h
index 9ac7e7fcf1e88ce8384a75151921a4c7c774bf7d..2314f9e5692a785ce62985e920533198639dfcc0 100644
--- a/src/matrices/tnlMatrixSetter_impl.h
+++ b/src/matrices/tnlMatrixSetter_impl.h
@@ -23,14 +23,14 @@
 template< typename Mesh,
           typename DifferentialOperator,
           typename BoundaryConditions,
-          typename RowLengthsVector >
+          typename CompressedRowsLengthsVector >
    template< int EntityDimensions >
 void
-tnlMatrixSetter< Mesh, DifferentialOperator, BoundaryConditions, RowLengthsVector >::
-getRowLengths( const Mesh& mesh,
+tnlMatrixSetter< Mesh, DifferentialOperator, BoundaryConditions, CompressedRowsLengthsVector >::
+getCompressedRowsLengths( const Mesh& mesh,
                DifferentialOperator& differentialOperator,
                BoundaryConditions& boundaryConditions,
-               RowLengthsVector& rowLengths ) const
+               CompressedRowsLengthsVector& rowLengths ) const
 {
    if( DeviceType::DeviceType == tnlHostDevice )
    {
@@ -49,8 +49,8 @@ getRowLengths( const Mesh& mesh,
    {
       DifferentialOperator* kernelDifferentialOperator = tnlCuda::passToDevice( differentialOperator );
       BoundaryConditions* kernelBoundaryConditions = tnlCuda::passToDevice( boundaryConditions );
-      RowLengthsVector* kernelRowLengths = tnlCuda::passToDevice( rowLengths );
-      TraversalUserData userData( *kernelDifferentialOperator, *kernelBoundaryConditions, *kernelRowLengths );
+      CompressedRowsLengthsVector* kernelCompressedRowsLengths = tnlCuda::passToDevice( rowLengths );
+      TraversalUserData userData( *kernelDifferentialOperator, *kernelBoundaryConditions, *kernelCompressedRowsLengths );
       checkCudaDevice;
       tnlTraverser< MeshType, EntityDimensions > meshTraversal;
       meshTraversal.template processBoundaryEntities< TraversalUserData,
@@ -65,7 +65,7 @@ getRowLengths( const Mesh& mesh,
       checkCudaDevice;
       tnlCuda::freeFromDevice( kernelDifferentialOperator );
       tnlCuda::freeFromDevice( kernelBoundaryConditions );
-      tnlCuda::freeFromDevice( kernelRowLengths );
+      tnlCuda::freeFromDevice( kernelCompressedRowsLengths );
       checkCudaDevice;
    }
 }
@@ -76,14 +76,14 @@ template< int Dimensions,
           typename Index,
           typename DifferentialOperator,
           typename BoundaryConditions,
-          typename RowLengthsVector >
+          typename CompressedRowsLengthsVector >
    template< int EntityDimensions >
 void
-tnlMatrixSetter< tnlGrid< Dimensions, Real, Device, Index >, DifferentialOperator, BoundaryConditions, RowLengthsVector >::
-getRowLengths( const MeshType& mesh,
+tnlMatrixSetter< tnlGrid< Dimensions, Real, Device, Index >, DifferentialOperator, BoundaryConditions, CompressedRowsLengthsVector >::
+getCompressedRowsLengths( const MeshType& mesh,
                const DifferentialOperator& differentialOperator,
                const BoundaryConditions& boundaryConditions,
-               RowLengthsVector& rowLengths ) const
+               CompressedRowsLengthsVector& rowLengths ) const
 {
    if( DeviceType::DeviceType == ( int ) tnlHostDevice )
    {
@@ -102,8 +102,8 @@ getRowLengths( const MeshType& mesh,
    {
       DifferentialOperator* kernelDifferentialOperator = tnlCuda::passToDevice( differentialOperator );
       BoundaryConditions* kernelBoundaryConditions = tnlCuda::passToDevice( boundaryConditions );
-      RowLengthsVector* kernelRowLengths = tnlCuda::passToDevice( rowLengths );
-      TraversalUserData userData( *kernelDifferentialOperator, *kernelBoundaryConditions, *kernelRowLengths );
+      CompressedRowsLengthsVector* kernelCompressedRowsLengths = tnlCuda::passToDevice( rowLengths );
+      TraversalUserData userData( *kernelDifferentialOperator, *kernelBoundaryConditions, *kernelCompressedRowsLengths );
       checkCudaDevice;
       tnlTraverser< MeshType, EntityDimensions > meshTraversal;
       meshTraversal.template processBoundaryEntities< TraversalUserData,
@@ -118,7 +118,7 @@ getRowLengths( const MeshType& mesh,
       checkCudaDevice;
       tnlCuda::freeFromDevice( kernelDifferentialOperator );
       tnlCuda::freeFromDevice( kernelBoundaryConditions );
-      tnlCuda::freeFromDevice( kernelRowLengths );
+      tnlCuda::freeFromDevice( kernelCompressedRowsLengths );
       checkCudaDevice;
    }
 }
diff --git a/src/matrices/tnlMatrix_impl.h b/src/matrices/tnlMatrix_impl.h
index 72674a61c503abb5afec6a2f232fda848993a08c..5400f91eea9a2ef6a92a82adda7ef0829aebd483 100644
--- a/src/matrices/tnlMatrix_impl.h
+++ b/src/matrices/tnlMatrix_impl.h
@@ -46,7 +46,7 @@ template< typename Real,
 template< typename Real,
           typename Device,
           typename Index >
-void tnlMatrix< Real, Device, Index >::getRowLengths( tnlVector< IndexType, DeviceType, IndexType >& rowLengths ) const
+void tnlMatrix< Real, Device, Index >::getCompressedRowsLengths( tnlVector< IndexType, DeviceType, IndexType >& rowLengths ) const
 {
    rowLengths.setSize( this->getRows() );
    for( IndexType row = 0; row < this->getRows(); row++ )
@@ -67,9 +67,7 @@ bool tnlMatrix< Real, Device, Index >::setLike( const tnlMatrix< Real2, Device2,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlMatrix< Real, Device, Index >::getRows() const
 {
    return this->rows;
@@ -78,9 +76,7 @@ Index tnlMatrix< Real, Device, Index >::getRows() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlMatrix< Real, Device, Index >::getColumns() const
 {
    return this->columns;
@@ -100,13 +96,13 @@ template< typename Real,
           typename Index >
    template< typename Matrix >
 bool tnlMatrix< Real, Device, Index >::copyFrom( const Matrix& matrix,
-                                                 const RowLengthsVector& rowLengths )
+                                                 const CompressedRowsLengthsVector& rowLengths )
 {
    /*tnlStaticAssert( DeviceType::DeviceType == tnlHostDevice, );
    tnlStaticAssert( DeviceType::DeviceType == Matrix:DeviceType::DeviceType, );*/
 
    this->setLike( matrix );
-   if( ! this->setRowLengths( rowLengths ) )
+   if( ! this->setCompressedRowsLengths( rowLengths ) )
       return false;
    tnlVector< RealType, tnlHost, IndexType > values;
    tnlVector< IndexType, tnlHost, IndexType > columns;
@@ -131,8 +127,8 @@ tnlMatrix< Real, Device, Index >& tnlMatrix< Real, Device, Index >::operator = (
    this->setLike( m );
 
    tnlVector< IndexType, DeviceType, IndexType > rowLengths;
-   m.getRowLengths( rowLengths );
-   this->setRowLengths( rowLengths );
+   m.getCompressedRowsLengths( rowLengths );
+   this->setCompressedRowsLengths( rowLengths );
 
    tnlVector< RealType, DeviceType, IndexType > rowValues;
    tnlVector< IndexType, DeviceType, IndexType > rowColumns;
@@ -249,7 +245,7 @@ void tnlMatrixVectorProductCuda( const Matrix& matrix,
                                  const InVector& inVector,
                                  OutVector& outVector )
 {
-#ifdef HAVE_CUDA
+#ifdef HAVE_CUDA    
    typedef typename Matrix::IndexType IndexType;
    Matrix* kernel_this = tnlCuda::passToDevice( matrix );
    InVector* kernel_inVector = tnlCuda::passToDevice( inVector );
@@ -266,6 +262,7 @@ void tnlMatrixVectorProductCuda( const Matrix& matrix,
                                        kernel_inVector,
                                        kernel_outVector,
                                        gridIdx );
+      checkCudaDevice;
    }
    tnlCuda::freeFromDevice( kernel_this );
    tnlCuda::freeFromDevice( kernel_inVector );
diff --git a/src/matrices/tnlMultidiagonalMatrix.h b/src/matrices/tnlMultidiagonalMatrix.h
index 8c82a66d0bf2af8fd7ac8cf565298caa3384020b..b09f548aa62a12682b730d21f927368ca4c4498d 100644
--- a/src/matrices/tnlMultidiagonalMatrix.h
+++ b/src/matrices/tnlMultidiagonalMatrix.h
@@ -33,7 +33,7 @@ class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index >
    typedef Real RealType;
    typedef Device DeviceType;
    typedef Index IndexType;
-   typedef typename tnlMatrix< Real, Device, Index >::RowLengthsVector RowLengthsVector;
+   typedef typename tnlMatrix< Real, Device, Index >::CompressedRowsLengthsVector CompressedRowsLengthsVector;
    typedef tnlMultidiagonalMatrix< Real, Device, Index > ThisType;
    typedef tnlMultidiagonalMatrix< Real, tnlHost, Index > HostType;
    typedef tnlMultidiagonalMatrix< Real, tnlCuda, Index > CudaType;
@@ -50,7 +50,7 @@ class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index >
    bool setDimensions( const IndexType rows,
                        const IndexType columns );
 
-   bool setRowLengths( const RowLengthsVector& rowLengths );
+   bool setCompressedRowsLengths( const CompressedRowsLengthsVector& rowLengths );
 
    IndexType getRowLength( const IndexType row ) const;
 
@@ -80,9 +80,7 @@ class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index >
 
    void setValue( const RealType& v );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value );
@@ -91,9 +89,7 @@ class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index >
                     const IndexType column,
                     const RealType& value );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value,
@@ -105,9 +101,7 @@ class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index >
                     const RealType& thisElementMultiplicator = 1.0 );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setRowFast( const IndexType row,
                     const IndexType* columns,
                     const RealType* values,
@@ -119,9 +113,7 @@ class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index >
                 const IndexType numberOfElements );
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addRowFast( const IndexType row,
                     const IndexType* columns,
                     const RealType* values,
@@ -134,18 +126,14 @@ class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index >
                 const IndexType numberOfElements,
                 const RealType& thisElementMultiplicator = 1.0 );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    RealType getElementFast( const IndexType row,
                             const IndexType column ) const;
 
    RealType getElement( const IndexType row,
                         const IndexType column ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void getRowFast( const IndexType row,
                     IndexType* columns,
                     RealType* values ) const;
@@ -154,20 +142,14 @@ class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index >
                 IndexType* columns,
                 RealType* values ) const;*/
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    MatrixRow getRow( const IndexType rowIndex );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const MatrixRow getRow( const IndexType rowIndex ) const;
 
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
@@ -207,9 +189,7 @@ class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index >
                          const IndexType column,
                          IndexType& index ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool getElementIndexFast( const IndexType row,
                              const IndexType column,
                              IndexType& index ) const;
diff --git a/src/matrices/tnlMultidiagonalMatrixRow.h b/src/matrices/tnlMultidiagonalMatrixRow.h
index cf5f92e97b59d99d10fff526974bad932d5ec0b9..0ec3b8b19ef7d5370e7dffcd807628ea2ed95501 100644
--- a/src/matrices/tnlMultidiagonalMatrixRow.h
+++ b/src/matrices/tnlMultidiagonalMatrixRow.h
@@ -23,14 +23,10 @@ class tnlMultidiagonalMatrixRow
 {
    public:
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       tnlMultidiagonalMatrixRow();
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       tnlMultidiagonalMatrixRow( Real* values,
                                  Index* diagonals,
                                  const Index maxRowLength,
@@ -38,9 +34,7 @@ class tnlMultidiagonalMatrixRow
                                  const Index columns,
                                  const Index step );
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       void bind( Real* values,
                  Index* diagonals,
                  const Index maxRowLength,
@@ -48,9 +42,7 @@ class tnlMultidiagonalMatrixRow
                  const Index columns,
                  const Index step );
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       void setElement( const Index& elementIndex,
                        const Index& column,
                        const Real& value );
diff --git a/src/matrices/tnlMultidiagonalMatrixRow_impl.h b/src/matrices/tnlMultidiagonalMatrixRow_impl.h
index c9f8135e55faf6f93cf6ded58d0ba7cf13e840aa..3c2864b4e34fa7fcbe0be40d12d618a71c573531 100644
--- a/src/matrices/tnlMultidiagonalMatrixRow_impl.h
+++ b/src/matrices/tnlMultidiagonalMatrixRow_impl.h
@@ -19,9 +19,7 @@
 #define TNLMULTIDIAGONALMATRIXROW_IMPL_H_
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 tnlMultidiagonalMatrixRow< Real, Index >::
 tnlMultidiagonalMatrixRow()
 : values( 0 ),
@@ -34,9 +32,7 @@ tnlMultidiagonalMatrixRow()
 }
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 tnlMultidiagonalMatrixRow< Real, Index >::
 tnlMultidiagonalMatrixRow( Real* values,
                            Index* diagonals,
@@ -54,9 +50,7 @@ tnlMultidiagonalMatrixRow( Real* values,
 }
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlMultidiagonalMatrixRow< Real, Index >::
 bind( Real* values,
@@ -75,9 +69,7 @@ bind( Real* values,
 }
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlMultidiagonalMatrixRow< Real, Index >::
 setElement( const Index& elementIndex,
diff --git a/src/matrices/tnlMultidiagonalMatrix_impl.h b/src/matrices/tnlMultidiagonalMatrix_impl.h
index 9a5b1a135de96714a268a7d181185a771d49a768..c64660b8f3087ffae4e7876a02f7ffec62cf652d 100644
--- a/src/matrices/tnlMultidiagonalMatrix_impl.h
+++ b/src/matrices/tnlMultidiagonalMatrix_impl.h
@@ -75,7 +75,7 @@ bool tnlMultidiagonalMatrix< Real, Device, Index >::setDimensions( const IndexTy
 template< typename Real,
           typename Device,
           typename Index >
-bool tnlMultidiagonalMatrix< Real, Device, Index >::setRowLengths( const RowLengthsVector& rowLengths )
+bool tnlMultidiagonalMatrix< Real, Device, Index >::setCompressedRowsLengths( const CompressedRowsLengthsVector& rowLengths )
 {
    /****
     * TODO: implement some check here similar to the one in the tridiagonal matrix
@@ -222,9 +222,7 @@ void tnlMultidiagonalMatrix< Real, Device, Index >::setValue( const RealType& v
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlMultidiagonalMatrix< Real, Device, Index > :: setElementFast( const IndexType row,
                                                                       const IndexType column,
                                                                       const Real& value )
@@ -254,9 +252,7 @@ bool tnlMultidiagonalMatrix< Real, Device, Index > :: setElement( const IndexTyp
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlMultidiagonalMatrix< Real, Device, Index > :: addElementFast( const IndexType row,
                                                                       const IndexType column,
                                                                       const RealType& value,
@@ -288,9 +284,7 @@ bool tnlMultidiagonalMatrix< Real, Device, Index > :: addElement( const IndexTyp
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlMultidiagonalMatrix< Real, Device, Index > :: setRowFast( const IndexType row,
                                                                   const IndexType* columns,
                                                                   const RealType* values,
@@ -314,9 +308,7 @@ bool tnlMultidiagonalMatrix< Real, Device, Index > :: setRow( const IndexType ro
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlMultidiagonalMatrix< Real, Device, Index > :: addRowFast( const IndexType row,
                                                                   const IndexType* columns,
                                                                   const RealType* values,
@@ -380,9 +372,7 @@ bool tnlMultidiagonalMatrix< Real, Device, Index > :: addRow( const IndexType ro
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real tnlMultidiagonalMatrix< Real, Device, Index >::getElementFast( const IndexType row,
                                                                     const IndexType column ) const
 {
@@ -408,9 +398,7 @@ Real tnlMultidiagonalMatrix< Real, Device, Index >::getElement( const IndexType
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlMultidiagonalMatrix< Real, Device, Index >::getRowFast( const IndexType row,
                                                                 IndexType* columns,
                                                                 RealType* values ) const
@@ -431,9 +419,7 @@ void tnlMultidiagonalMatrix< Real, Device, Index >::getRowFast( const IndexType
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlMultidiagonalMatrix< Real, Device, Index >::MatrixRow
 tnlMultidiagonalMatrix< Real, Device, Index >::
 getRow( const IndexType rowIndex )
@@ -464,9 +450,7 @@ getRow( const IndexType rowIndex )
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlMultidiagonalMatrix< Real, Device, Index >::MatrixRow
 tnlMultidiagonalMatrix< Real, Device, Index >::
 getRow( const IndexType rowIndex ) const
@@ -497,9 +481,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename Vector::RealType tnlMultidiagonalMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row,
                                                                                            const Vector& vector ) const
 {
@@ -710,9 +692,7 @@ bool tnlMultidiagonalMatrix< Real, Device, Index >::getElementIndex( const Index
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlMultidiagonalMatrix< Real, Device, Index >::getElementIndexFast( const IndexType row,
                                                                          const IndexType column,
                                                                          Index& index ) const
@@ -748,6 +728,7 @@ class tnlMultidiagonalMatrixDeviceDependentCode< tnlHost >
       typedef tnlHost Device;
 
       template< typename Index >
+      __cuda_callable__
       static Index getElementIndex( const Index rows,
                                     const Index diagonals,
                                     const Index row,
@@ -764,6 +745,9 @@ class tnlMultidiagonalMatrixDeviceDependentCode< tnlHost >
                                  const InVector& inVector,
                                  OutVector& outVector )
       {
+#ifdef HAVE_OPENMP
+#pragma omp parallel for
+#endif           
          for( Index row = 0; row < matrix.getRows(); row ++ )
             outVector[ row ] = matrix.rowVectorProduct( row, inVector );
       }
@@ -777,9 +761,7 @@ class tnlMultidiagonalMatrixDeviceDependentCode< tnlCuda >
       typedef tnlCuda Device;
 
       template< typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       static Index getElementIndex( const Index rows,
                                     const Index diagonals,
                                     const Index row,
diff --git a/src/matrices/tnlSlicedEllpackMatrix.h b/src/matrices/tnlSlicedEllpackMatrix.h
index 6c5bc2c11fa94df593d38572ad22b4a6723d8ba7..a0b573dec7288ecf4787ad604b9083c7cbc0fd11 100644
--- a/src/matrices/tnlSlicedEllpackMatrix.h
+++ b/src/matrices/tnlSlicedEllpackMatrix.h
@@ -2,7 +2,7 @@
                           tnlSlicedEllpackMatrix.h  -  description
                              -------------------
     begin                : Dec 8, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
+    copyright            : (C) 2013 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
@@ -15,6 +15,17 @@
  *                                                                         *
  ***************************************************************************/
 
+/***
+ * Authors:
+ * Oberhuber Tomas, tomas.oberhuber@fjfi.cvut.cz
+ * Vacata Jan
+ * 
+ * The algorithm/method was published in:
+ *  Oberhuber T., Suzuki A., Vacata J., New Row-grouped CSR format for storing
+ *  the sparse matrices on GPU with implementation in CUDA, Acta Technica, 2011,
+ *  vol. 56, no. 4, pp. 447-466.
+ */
+
 #ifndef TNLSLICEDELLPACKMATRIX_H_
 #define TNLSLICEDELLPACKMATRIX_H_
 
@@ -35,7 +46,7 @@ template< typename Real,
           typename Index,
           int SliceSize >
 __global__ void tnlSlicedEllpackMatrix_computeMaximalRowLengthInSlices_CudaKernel( tnlSlicedEllpackMatrix< Real, tnlCuda, Index, SliceSize >* matrix,
-                                                                                   const typename tnlSlicedEllpackMatrix< Real, tnlCuda, Index, SliceSize >::RowLengthsVector* rowLengths,
+                                                                                   const typename tnlSlicedEllpackMatrix< Real, tnlCuda, Index, SliceSize >::CompressedRowsLengthsVector* rowLengths,
                                                                                    int gridIdx );
 #endif
 
@@ -50,7 +61,7 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
    typedef Real RealType;
    typedef Device DeviceType;
    typedef Index IndexType;
-   typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::RowLengthsVector RowLengthsVector;
+   typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::CompressedRowsLengthsVector CompressedRowsLengthsVector;
    typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
    typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
    typedef tnlSlicedEllpackMatrix< Real, Device, Index > ThisType;
@@ -69,7 +80,7 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
    bool setDimensions( const IndexType rows,
                        const IndexType columns );
 
-   bool setRowLengths( const RowLengthsVector& rowLengths );
+   bool setCompressedRowsLengths( const CompressedRowsLengthsVector& rowLengths );
 
    IndexType getRowLength( const IndexType row ) const;
 
@@ -84,9 +95,7 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
    template< typename Real2, typename Device2, typename Index2 >
    bool operator != ( const tnlSlicedEllpackMatrix< Real2, Device2, Index2 >& matrix ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value );
@@ -95,9 +104,7 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                     const IndexType column,
                     const RealType& value );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value,
@@ -108,9 +115,7 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                     const RealType& value,
                     const RealType& thisElementMultiplicator = 1.0 );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setRowFast( const IndexType row,
                     const IndexType* columnIndexes,
                     const RealType* values,
@@ -121,9 +126,7 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                 const RealType* values,
                 const IndexType elements );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addRowFast( const IndexType row,
                     const IndexType* columns,
                     const RealType* values,
@@ -136,36 +139,26 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                 const IndexType numberOfElements,
                 const RealType& thisElementMultiplicator = 1.0 );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    RealType getElementFast( const IndexType row,
                             const IndexType column ) const;
 
    RealType getElement( const IndexType row,
                         const IndexType column ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void getRowFast( const IndexType row,
                     IndexType* columns,
                     RealType* values ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    MatrixRow getRow( const IndexType rowIndex );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const MatrixRow getRow( const IndexType rowIndex ) const;
 
    template< typename Vector >
-   #ifdef HAVE_CUDA
-      __device__ __host__
-   #endif
+   __cuda_callable__
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
@@ -201,19 +194,19 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
 
    protected:
 
-   tnlVector< Index, Device, Index > slicePointers, sliceRowLengths;
+   tnlVector< Index, Device, Index > slicePointers, sliceCompressedRowsLengths;
 
    typedef tnlSlicedEllpackMatrixDeviceDependentCode< DeviceType > DeviceDependentCode;
    friend class tnlSlicedEllpackMatrixDeviceDependentCode< DeviceType >;
 #ifdef HAVE_CUDA
    /*friend __global__ void tnlSlicedEllpackMatrix_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize >( tnlSlicedEllpackMatrix< Real, tnlCuda, Index, SliceSize >* matrix,
-                                                                                      const typename tnlSlicedEllpackMatrix< Real, tnlCuda, Index, SliceSize >::RowLengthsVector* rowLengths,
+                                                                                      const typename tnlSlicedEllpackMatrix< Real, tnlCuda, Index, SliceSize >::CompressedRowsLengthsVector* rowLengths,
                                                                                       int gridIdx );
     */
    // TODO: The friend declaration above does not work because of __global__ storage specifier. Therefore we declare the following method as public. Fix this, when possible.
 
    public:
-   __device__ void computeMaximalRowLengthInSlicesCuda( const RowLengthsVector& rowLengths,
+   __device__ void computeMaximalRowLengthInSlicesCuda( const CompressedRowsLengthsVector& rowLengths,
                                                         const IndexType sliceIdx );
 
 #endif
diff --git a/src/matrices/tnlSlicedEllpackMatrix_impl.h b/src/matrices/tnlSlicedEllpackMatrix_impl.h
index 3ac2cbedd7e7d15a35f2ddda574d1f3fb17adb5f..a5944cc1f66823ebf696e77b7b2b1d16decdd46f 100644
--- a/src/matrices/tnlSlicedEllpackMatrix_impl.h
+++ b/src/matrices/tnlSlicedEllpackMatrix_impl.h
@@ -69,12 +69,12 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::setRowLengths( const RowLengthsVector& rowLengths )
+bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::setCompressedRowsLengths( const CompressedRowsLengthsVector& rowLengths )
 {
    tnlAssert( this->getRows() > 0, );
    tnlAssert( this->getColumns() > 0, );
    const IndexType slices = roundUpDivision( this->rows, SliceSize );
-   if( ! this->sliceRowLengths.setSize( slices ) ||
+   if( ! this->sliceCompressedRowsLengths.setSize( slices ) ||
        ! this->slicePointers.setSize( slices + 1 ) )
       return false;
 
@@ -93,7 +93,7 @@ template< typename Real,
 Index tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::getRowLength( const IndexType row ) const
 {
    const IndexType slice = roundUpDivision( row, SliceSize );
-   return this->sliceRowLengths.getElement( slice );
+   return this->sliceCompressedRowsLengths.getElement( slice );
 }
 
 template< typename Real,
@@ -107,7 +107,7 @@ bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::setLike( const tn
 {
    if( !tnlSparseMatrix< Real, Device, Index >::setLike( matrix ) ||
        ! this->slicePointers.setLike( matrix.slicePointers ) ||
-       ! this->sliceRowLengths.setLike( matrix.sliceRowLengths ) )
+       ! this->sliceCompressedRowsLengths.setLike( matrix.sliceCompressedRowsLengths ) )
       return false;
    return true;
 }
@@ -120,7 +120,7 @@ void tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::reset()
 {
    tnlSparseMatrix< Real, Device, Index >::reset();
    this->slicePointers.reset();
-   this->sliceRowLengths.reset();
+   this->sliceCompressedRowsLengths.reset();
 }
 
 template< typename Real,
@@ -160,9 +160,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::setElementFast( const IndexType row,
                                                                                const IndexType column,
                                                                                const Real& value )
@@ -186,9 +184,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::addElementFast( const IndexType row,
                                                                                const IndexType column,
                                                                                const RealType& value,
@@ -285,16 +281,14 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize > :: setRowFast( const IndexType row,
                                                                              const IndexType* columnIndexes,
                                                                              const RealType* values,
                                                                              const IndexType elements )
 {
    const IndexType sliceIdx = row / SliceSize;
-   const IndexType rowLength = this->sliceRowLengths[ sliceIdx ];
+   const IndexType rowLength = this->sliceCompressedRowsLengths[ sliceIdx ];
    if( elements > rowLength )
       return false;
 
@@ -328,7 +322,7 @@ bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize > :: setRow( const I
                                                                          const IndexType elements )
 {
    const IndexType sliceIdx = row / SliceSize;
-   const IndexType rowLength = this->sliceRowLengths.getElement( sliceIdx );
+   const IndexType rowLength = this->sliceCompressedRowsLengths.getElement( sliceIdx );
    if( elements > rowLength )
       return false;
 
@@ -356,9 +350,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize > :: addRowFast( const IndexType row,
                                                                              const IndexType* columns,
                                                                              const RealType* values,
@@ -387,9 +379,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::getElementFast( const IndexType row,
                                                                                const IndexType column ) const
 {
@@ -431,9 +421,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::getRowFast( const IndexType row,
                                                                            IndexType* columns,
                                                                            RealType* values ) const
@@ -454,9 +442,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::MatrixRow
 tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::
 getRow( const IndexType rowIndex )
@@ -466,7 +452,7 @@ getRow( const IndexType rowIndex )
    const IndexType slice = roundUpDivision( rowIndex, SliceSize );
    return MatrixRow( &this->columnIndexes[ rowBegin ],
                      &this->values[ rowBegin ],
-                     this->sliceRowLengths[ slice ],
+                     this->sliceCompressedRowsLengths[ slice ],
                      step );
 }
 
@@ -474,9 +460,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::MatrixRow
 tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::
 getRow( const IndexType rowIndex ) const
@@ -486,7 +470,7 @@ getRow( const IndexType rowIndex ) const
    const IndexType slice = roundUpDivision( rowIndex, SliceSize );
    return MatrixRow( &this->columnIndexes[ rowBegin ],
                      &this->values[ rowBegin ],
-                     this->sliceRowLengths[ slice ],
+                     this->sliceCompressedRowsLengths[ slice ],
                      step );
 }
 
@@ -495,9 +479,7 @@ template< typename Real,
           typename Index,
           int SliceSize >
   template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename Vector::RealType tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::rowVectorProduct( const IndexType row,
                                                                                                       const Vector& vector ) const
 {
@@ -574,7 +556,7 @@ bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::performSORIterati
    RealType sum( 0.0 );
 
    /*const IndexType sliceIdx = row / SliceSize;
-   const IndexType rowLength = this->sliceRowLengths[ sliceIdx ];
+   const IndexType rowLength = this->sliceCompressedRowsLengths[ sliceIdx ];
    IndexType elementPtr = this->slicePointers[ sliceIdx ] +
                           rowLength * ( row - sliceIdx * SliceSize );
    const IndexType rowEnd( elementPtr + rowLength );*/
@@ -607,7 +589,7 @@ bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::save( tnlFile& fi
 {
    if( ! tnlSparseMatrix< Real, Device, Index >::save( file ) ||
        ! this->slicePointers.save( file ) ||
-       ! this->sliceRowLengths.save( file ) )
+       ! this->sliceCompressedRowsLengths.save( file ) )
       return false;
    return true;
 }
@@ -620,7 +602,7 @@ bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::load( tnlFile& fi
 {
    if( ! tnlSparseMatrix< Real, Device, Index >::load( file ) ||
        ! this->slicePointers.load( file ) ||
-       ! this->sliceRowLengths.load( file ) )
+       ! this->sliceCompressedRowsLengths.load( file ) )
       return false;
    return true;
 }
@@ -653,7 +635,7 @@ void tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::print( ostream& s
    {
       str <<"Row: " << row << " -> ";
       const IndexType sliceIdx = row / SliceSize;
-      const IndexType rowLength = this->sliceRowLengths.getElement( sliceIdx );
+      const IndexType rowLength = this->sliceCompressedRowsLengths.getElement( sliceIdx );
       IndexType elementPtr = this->slicePointers.getElement( sliceIdx ) +
                              rowLength * ( row - sliceIdx * SliceSize );
       const IndexType rowEnd( elementPtr + rowLength );
@@ -674,7 +656,7 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-__device__ void tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::computeMaximalRowLengthInSlicesCuda( const RowLengthsVector& rowLengths,
+__device__ void tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::computeMaximalRowLengthInSlicesCuda( const CompressedRowsLengthsVector& rowLengths,
                                                                                                                const IndexType sliceIdx )
 {
    Index rowIdx = sliceIdx * SliceSize;
@@ -688,7 +670,7 @@ __device__ void tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::comput
       rowIdx++;
       rowInSliceIdx++;
    }
-   this->sliceRowLengths[ sliceIdx ] = maxRowLength;
+   this->sliceCompressedRowsLengths[ sliceIdx ] = maxRowLength;
    this->slicePointers[ sliceIdx ] = maxRowLength * SliceSize;
    if( threadIdx.x == 0 )
       this->slicePointers[ this->slicePointers.getSize() - 1 ] = 0;
@@ -714,7 +696,7 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlHost >
       {
          const Index sliceIdx = row / SliceSize;
          const Index slicePointer = matrix.slicePointers.getElement( sliceIdx );
-         const Index rowLength = matrix.sliceRowLengths.getElement( sliceIdx );
+         const Index rowLength = matrix.sliceCompressedRowsLengths.getElement( sliceIdx );
 
          rowBegin = slicePointer + rowLength * ( row - sliceIdx * SliceSize );
          rowEnd = rowBegin + rowLength;
@@ -724,6 +706,7 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlHost >
       template< typename Real,
                 typename Index,
                 int SliceSize >
+      __cuda_callable__
       static void initRowTraverseFast( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
                                        const Index row,
                                        Index& rowBegin,
@@ -732,7 +715,7 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlHost >
       {
          const Index sliceIdx = row / SliceSize;
          const Index slicePointer = matrix.slicePointers[ sliceIdx ];
-         const Index rowLength = matrix.sliceRowLengths[ sliceIdx ];
+         const Index rowLength = matrix.sliceCompressedRowsLengths[ sliceIdx ];
 
          rowBegin = slicePointer + rowLength * ( row - sliceIdx * SliceSize );
          rowEnd = rowBegin + rowLength;
@@ -744,7 +727,7 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlHost >
                 typename Index,
                 int SliceSize >
       static bool computeMaximalRowLengthInSlices( tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
-                                                   const typename tnlSlicedEllpackMatrix< Real, Device, Index >::RowLengthsVector& rowLengths )
+                                                   const typename tnlSlicedEllpackMatrix< Real, Device, Index >::CompressedRowsLengthsVector& rowLengths )
       {
          Index row( 0 ), slice( 0 ), sliceRowLength( 0 );
          while( row < matrix.getRows() )
@@ -752,14 +735,14 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlHost >
             sliceRowLength = Max( rowLengths.getElement( row++ ), sliceRowLength );
             if( row % SliceSize == 0 )
             {
-               matrix.sliceRowLengths.setElement( slice, sliceRowLength );
+               matrix.sliceCompressedRowsLengths.setElement( slice, sliceRowLength );
                matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize );
                sliceRowLength = 0;
             }
          }
          if( row % SliceSize != 0 )
          {
-            matrix.sliceRowLengths.setElement( slice, sliceRowLength );
+            matrix.sliceCompressedRowsLengths.setElement( slice, sliceRowLength );
             matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize );
          }
          matrix.slicePointers.setElement( matrix.slicePointers.getSize() - 1, 0 );
@@ -775,6 +758,9 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlHost >
                                  const InVector& inVector,
                                  OutVector& outVector )
       {
+#ifdef HAVE_OPENMP
+#pragma omp parallel for
+#endif           
          for( Index row = 0; row < matrix.getRows(); row ++ )
             outVector[ row ] = matrix.rowVectorProduct( row, inVector );
       }
@@ -786,7 +772,7 @@ template< typename Real,
           typename Index,
           int SliceSize >
 __global__ void tnlSlicedEllpackMatrix_computeMaximalRowLengthInSlices_CudaKernel( tnlSlicedEllpackMatrix< Real, tnlCuda, Index, SliceSize >* matrix,
-                                                                                   const typename tnlSlicedEllpackMatrix< Real, tnlCuda, Index, SliceSize >::RowLengthsVector* rowLengths,
+                                                                                   const typename tnlSlicedEllpackMatrix< Real, tnlCuda, Index, SliceSize >::CompressedRowsLengthsVector* rowLengths,
                                                                                    int gridIdx )
 {
    const Index sliceIdx = gridIdx * tnlCuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
@@ -812,7 +798,7 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda >
       {
          const Index sliceIdx = row / SliceSize;
          const Index slicePointer = matrix.slicePointers.getElement( sliceIdx );
-         const Index rowLength = matrix.sliceRowLengths.getElement( sliceIdx );
+         const Index rowLength = matrix.sliceCompressedRowsLengths.getElement( sliceIdx );
 
          rowBegin = slicePointer + row - sliceIdx * SliceSize;
          rowEnd = rowBegin + rowLength * SliceSize;
@@ -822,9 +808,7 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda >
       template< typename Real,
                 typename Index,
                 int SliceSize >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       static void initRowTraverseFast( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
                                        const Index row,
                                        Index& rowBegin,
@@ -833,7 +817,7 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda >
       {
          const Index sliceIdx = row / SliceSize;
          const Index slicePointer = matrix.slicePointers[ sliceIdx ];
-         const Index rowLength = matrix.sliceRowLengths[ sliceIdx ];
+         const Index rowLength = matrix.sliceCompressedRowsLengths[ sliceIdx ];
 
          rowBegin = slicePointer + row - sliceIdx * SliceSize;
          rowEnd = rowBegin + rowLength * SliceSize;
@@ -845,13 +829,13 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda >
                 typename Index,
                 int SliceSize >
       static bool computeMaximalRowLengthInSlices( tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
-                                                   const typename tnlSlicedEllpackMatrix< Real, Device, Index >::RowLengthsVector& rowLengths )
+                                                   const typename tnlSlicedEllpackMatrix< Real, Device, Index >::CompressedRowsLengthsVector& rowLengths )
       {
 #ifdef HAVE_CUDA
          typedef tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize > Matrix;
-         typedef typename Matrix::RowLengthsVector RowLengthsVector;
+         typedef typename Matrix::CompressedRowsLengthsVector CompressedRowsLengthsVector;
          Matrix* kernel_matrix = tnlCuda::passToDevice( matrix );
-         RowLengthsVector* kernel_rowLengths = tnlCuda::passToDevice( rowLengths );
+         CompressedRowsLengthsVector* kernel_rowLengths = tnlCuda::passToDevice( rowLengths );
          const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize );
          dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
          const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x );
@@ -886,6 +870,4 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda >
 
 };
 
-
-
 #endif /* TNLSLICEDELLPACKMATRIX_IMPL_H_ */
diff --git a/src/matrices/tnlSparseMatrix.h b/src/matrices/tnlSparseMatrix.h
index adad7932e4b4fdd11d3de2cfbdd111631be5c573..0c8f6a35a5ace48568efee9e1c92b8356a37f8b6 100644
--- a/src/matrices/tnlSparseMatrix.h
+++ b/src/matrices/tnlSparseMatrix.h
@@ -31,7 +31,7 @@ class tnlSparseMatrix : public tnlMatrix< Real, Device, Index >
    typedef Real RealType;
    typedef Device DeviceType;
    typedef Index IndexType;
-   typedef typename tnlMatrix< RealType, DeviceType, IndexType >::RowLengthsVector RowLengthsVector;
+   typedef typename tnlMatrix< RealType, DeviceType, IndexType >::CompressedRowsLengthsVector CompressedRowsLengthsVector;
    typedef typename tnlMatrix< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
    typedef tnlVector< IndexType, DeviceType, IndexType > ColumnIndexesVector;
    typedef tnlMatrix< Real, Device, Index > BaseType;
@@ -39,7 +39,7 @@ class tnlSparseMatrix : public tnlMatrix< Real, Device, Index >
 
    tnlSparseMatrix();
 
-   virtual bool setRowLengths( const RowLengthsVector& rowLengths ) = 0;
+   virtual bool setCompressedRowsLengths( const CompressedRowsLengthsVector& rowLengths ) = 0;
 
    template< typename Real2, typename Device2, typename Index2 >
    bool setLike( const tnlSparseMatrix< Real2, Device2, Index2 >& matrix );
@@ -50,9 +50,7 @@ class tnlSparseMatrix : public tnlMatrix< Real, Device, Index >
 
    IndexType getMaxRowLength() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getPaddingIndex() const;
 
    void reset();
diff --git a/src/matrices/tnlSparseMatrixRow.h b/src/matrices/tnlSparseMatrixRow.h
index 74a8506333ec10164856593308e98485e1e273be..7290ffb41ee56299094a6acac25f1651339b8c3a 100644
--- a/src/matrices/tnlSparseMatrixRow.h
+++ b/src/matrices/tnlSparseMatrixRow.h
@@ -24,30 +24,22 @@ class tnlSparseMatrixRow
 {
    public:
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       tnlSparseMatrixRow();
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       tnlSparseMatrixRow( Index* columns,
                           Real* values,
                           const Index length,
                           const Index step );
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       void bind( Index* columns,
                  Real* values,
                  const Index length,
                  const Index step );
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       void setElement( const Index& elementIndex,
                        const Index& column,
                        const Real& value );
diff --git a/src/matrices/tnlSparseMatrixRow_impl.h b/src/matrices/tnlSparseMatrixRow_impl.h
index 296c910cd10989d8b1b5a3e4e5c3c5f690c2a8b7..79f171f158dd1e2362ef42ad163bf77fe5805bb4 100644
--- a/src/matrices/tnlSparseMatrixRow_impl.h
+++ b/src/matrices/tnlSparseMatrixRow_impl.h
@@ -19,9 +19,7 @@
 #define TNLSPARSEMATRIXROW_IMPL_H_
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 tnlSparseMatrixRow< Real, Index >::
 tnlSparseMatrixRow()
 : values( 0 ),
@@ -32,9 +30,7 @@ tnlSparseMatrixRow()
 }
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 tnlSparseMatrixRow< Real, Index >::
 tnlSparseMatrixRow( Index* columns,
                     Real* values,
@@ -48,9 +44,7 @@ tnlSparseMatrixRow( Index* columns,
 }
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlSparseMatrixRow< Real, Index >::
 bind( Index* columns,
@@ -65,9 +59,7 @@ bind( Index* columns,
 }
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlSparseMatrixRow< Real, Index >::
 setElement( const Index& elementIndex,
@@ -77,6 +69,7 @@ setElement( const Index& elementIndex,
    tnlAssert( this->columns, );
    tnlAssert( this->values, );
    tnlAssert( this->step > 0,);
+   //printf( "elementIndex = %d length = %d \n", elementIndex, this->length );
    tnlAssert( elementIndex >= 0 && elementIndex < this->length,
               cerr << "elementIndex = " << elementIndex << " this->length = " << this->length );
 
diff --git a/src/matrices/tnlSparseMatrix_impl.h b/src/matrices/tnlSparseMatrix_impl.h
index 6ec90d0e9414dd62b33c7f3eacf28a75836bdd87..830b54252abb0f7350cf52669578b430319a404d 100644
--- a/src/matrices/tnlSparseMatrix_impl.h
+++ b/src/matrices/tnlSparseMatrix_impl.h
@@ -74,9 +74,7 @@ getMaxRowLength() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlSparseMatrix< Real, Device, Index >::getPaddingIndex() const
 {
    return this->getColumns();
diff --git a/src/matrices/tnlTridiagonalMatrix.h b/src/matrices/tnlTridiagonalMatrix.h
index 25ddda40985582b73e8e9f51558dc2cac16e0268..91e29af727ef69fa1bbbefa2e8aed10c6475a61a 100644
--- a/src/matrices/tnlTridiagonalMatrix.h
+++ b/src/matrices/tnlTridiagonalMatrix.h
@@ -35,7 +35,7 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
    typedef Real RealType;
    typedef Device DeviceType;
    typedef Index IndexType;
-   typedef typename tnlMatrix< Real, Device, Index >::RowLengthsVector RowLengthsVector;
+   typedef typename tnlMatrix< Real, Device, Index >::CompressedRowsLengthsVector CompressedRowsLengthsVector;
    typedef tnlTridiagonalMatrix< Real, Device, Index > ThisType;
    typedef tnlTridiagonalMatrix< Real, tnlHost, Index > HostType;
    typedef tnlTridiagonalMatrix< Real, tnlCuda, Index > CudaType;
@@ -51,7 +51,7 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
    bool setDimensions( const IndexType rows,
                        const IndexType columns );
 
-   bool setRowLengths( const RowLengthsVector& rowLengths );
+   bool setCompressedRowsLengths( const CompressedRowsLengthsVector& rowLengths );
 
    IndexType getRowLength( const IndexType row ) const;
 
@@ -76,9 +76,7 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
 
    void setValue( const RealType& v );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value );
@@ -87,9 +85,7 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
                     const IndexType column,
                     const RealType& value );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addElementFast( const IndexType row,
                         const IndexType column,
                         const RealType& value,
@@ -100,9 +96,7 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
                     const RealType& value,
                     const RealType& thisElementMultiplicator = 1.0 );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool setRowFast( const IndexType row,
                     const IndexType* columns,
                     const RealType* values,
@@ -113,9 +107,7 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
                 const RealType* values,
                 const IndexType elements );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool addRowFast( const IndexType row,
                     const IndexType* columns,
                     const RealType* values,
@@ -128,36 +120,26 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
                 const IndexType elements,
                 const RealType& thisRowMultiplicator = 1.0 );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    RealType getElementFast( const IndexType row,
                             const IndexType column ) const;
 
    RealType getElement( const IndexType row,
                         const IndexType column ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void getRowFast( const IndexType row,
                     IndexType* columns,
                     RealType* values ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    MatrixRow getRow( const IndexType rowIndex );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const MatrixRow getRow( const IndexType rowIndex ) const;
 
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
@@ -182,9 +164,7 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
 #endif   
 
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void performSORIteration( const Vector& b,
                              const IndexType row,
                              Vector& x,
@@ -202,9 +182,7 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
 
    protected:
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getElementIndex( const IndexType row,
                               const IndexType column ) const;
 
diff --git a/src/matrices/tnlTridiagonalMatrixRow.h b/src/matrices/tnlTridiagonalMatrixRow.h
index 81bac3f54f39d963760347192f7e2029a7d77b8f..4872e67fccb81cb87c0cf668f25402ac8275d8df 100644
--- a/src/matrices/tnlTridiagonalMatrixRow.h
+++ b/src/matrices/tnlTridiagonalMatrixRow.h
@@ -23,30 +23,22 @@ class tnlTridiagonalMatrixRow
 {
    public:
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       tnlTridiagonalMatrixRow();
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       tnlTridiagonalMatrixRow( Real* values,
                                const Index row,
                                const Index columns,
                                const Index step );
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       void bind( Real* values,
                  const Index row,
                  const Index columns,
                  const Index step );
 
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       void setElement( const Index& elementIndex,
                        const Index& column,
                        const Real& value );
diff --git a/src/matrices/tnlTridiagonalMatrixRow_impl.h b/src/matrices/tnlTridiagonalMatrixRow_impl.h
index b7f364b6117c504b13761847842fe3111713b7bf..c9cd1682fe491c3a6f07e735d28f0541ea9a65f6 100644
--- a/src/matrices/tnlTridiagonalMatrixRow_impl.h
+++ b/src/matrices/tnlTridiagonalMatrixRow_impl.h
@@ -19,9 +19,7 @@
 #define TNLTRIDIAGONALMATRIXROW_IMPL_H_
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 tnlTridiagonalMatrixRow< Real, Index >::
 tnlTridiagonalMatrixRow()
 : values( 0 ),
@@ -32,9 +30,7 @@ tnlTridiagonalMatrixRow()
 }
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 tnlTridiagonalMatrixRow< Real, Index >::
 tnlTridiagonalMatrixRow( Real* values,
                          const Index row,
@@ -48,9 +44,7 @@ tnlTridiagonalMatrixRow( Real* values,
 }
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlTridiagonalMatrixRow< Real, Index >::
 bind( Real* values,
@@ -65,9 +59,7 @@ bind( Real* values,
 }
 
 template< typename Real, typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlTridiagonalMatrixRow< Real, Index >::
 setElement( const Index& elementIndex,
diff --git a/src/matrices/tnlTridiagonalMatrix_impl.h b/src/matrices/tnlTridiagonalMatrix_impl.h
index 2f379c920b079b998cc37b8de5c224676083adcc..56b7aee702867da7cceec2e106a11ac7819da866 100644
--- a/src/matrices/tnlTridiagonalMatrix_impl.h
+++ b/src/matrices/tnlTridiagonalMatrix_impl.h
@@ -67,7 +67,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setDimensions( const IndexType
 template< typename Real,
           typename Device,
           typename Index >
-bool tnlTridiagonalMatrix< Real, Device, Index >::setRowLengths( const RowLengthsVector& rowLengths )
+bool tnlTridiagonalMatrix< Real, Device, Index >::setCompressedRowsLengths( const CompressedRowsLengthsVector& rowLengths )
 {
    if( rowLengths[ 0 ] > 2 )
       return false;
@@ -189,9 +189,7 @@ void tnlTridiagonalMatrix< Real, Device, Index >::setValue( const RealType& v )
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlTridiagonalMatrix< Real, Device, Index >::setElementFast( const IndexType row,
                                                                   const IndexType column,
                                                                   const RealType& value )
@@ -214,9 +212,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setElement( const IndexType ro
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlTridiagonalMatrix< Real, Device, Index >::addElementFast( const IndexType row,
                                                                   const IndexType column,
                                                                   const RealType& value,
@@ -243,9 +239,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::addElement( const IndexType ro
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlTridiagonalMatrix< Real, Device, Index >::setRowFast( const IndexType row,
                                                               const IndexType* columns,
                                                               const RealType* values,
@@ -276,9 +270,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setRow( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlTridiagonalMatrix< Real, Device, Index >::addRowFast( const IndexType row,
                                                               const IndexType* columns,
                                                               const RealType* values,
@@ -329,9 +321,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::addRow( const IndexType row,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real tnlTridiagonalMatrix< Real, Device, Index >::getElementFast( const IndexType row,
                                                                   const IndexType column ) const
 {
@@ -354,9 +344,7 @@ Real tnlTridiagonalMatrix< Real, Device, Index >::getElement( const IndexType ro
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlTridiagonalMatrix< Real, Device, Index >::getRowFast( const IndexType row,
                                                               IndexType* columns,
                                                               RealType* values ) const
@@ -377,9 +365,7 @@ void tnlTridiagonalMatrix< Real, Device, Index >::getRowFast( const IndexType ro
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlTridiagonalMatrix< Real, Device, Index >::MatrixRow
 tnlTridiagonalMatrix< Real, Device, Index >::
 getRow( const IndexType rowIndex )
@@ -399,9 +385,7 @@ getRow( const IndexType rowIndex )
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlTridiagonalMatrix< Real, Device, Index >::MatrixRow
 tnlTridiagonalMatrix< Real, Device, Index >::
 getRow( const IndexType rowIndex ) const
@@ -414,9 +398,7 @@ template< typename Real,
           typename Device,
           typename Index >
 template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename Vector::RealType tnlTridiagonalMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row,
                                                                                          const Vector& vector ) const
 {
@@ -547,9 +529,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void tnlTridiagonalMatrix< Real, Device, Index >::performSORIteration( const Vector& b,
                                                                        const IndexType row,
                                                                        Vector& x,
@@ -625,9 +605,7 @@ void tnlTridiagonalMatrix< Real, Device, Index >::print( ostream& str ) const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlTridiagonalMatrix< Real, Device, Index >::getElementIndex( const IndexType row,
                                                                     const IndexType column ) const
 {
@@ -647,6 +625,7 @@ class tnlTridiagonalMatrixDeviceDependentCode< tnlHost >
       typedef tnlHost Device;
 
       template< typename Index >
+      __cuda_callable__
       static Index getElementIndex( const Index rows,
                                     const Index row,
                                     const Index column )
@@ -657,6 +636,7 @@ class tnlTridiagonalMatrixDeviceDependentCode< tnlHost >
       template< typename Vector,
                 typename Index,
                 typename ValuesType  >
+      __cuda_callable__
       static typename Vector::RealType rowVectorProduct( const Index rows,
                                                          const ValuesType& values,
                                                          const Index row,
@@ -682,6 +662,9 @@ class tnlTridiagonalMatrixDeviceDependentCode< tnlHost >
                                  const InVector& inVector,
                                  OutVector& outVector )
       {
+#ifdef HAVE_OPENMP
+#pragma omp parallel for
+#endif           
          for( Index row = 0; row < matrix.getRows(); row ++ )
             outVector[ row ] = matrix.rowVectorProduct( row, inVector );
       }
@@ -695,9 +678,7 @@ class tnlTridiagonalMatrixDeviceDependentCode< tnlCuda >
       typedef tnlCuda Device;
 
       template< typename Index >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       static Index getElementIndex( const Index rows,
                                     const Index row,
                                     const Index column )
@@ -708,9 +689,7 @@ class tnlTridiagonalMatrixDeviceDependentCode< tnlCuda >
       template< typename Vector,
                 typename Index,
                 typename ValuesType >
-#ifdef HAVE_CUDA
-      __device__
-#endif
+      __cuda_callable__
       static typename Vector::RealType rowVectorProduct( const Index rows,
                                                          const ValuesType& values,
                                                          const Index row,
diff --git a/src/mesh/CMakeLists.txt b/src/mesh/CMakeLists.txt
index 0381314ac754930402690625546276abc5af10fe..e8ce9f1ba659cf569a2e81a280e2a18106af2c05 100755
--- a/src/mesh/CMakeLists.txt
+++ b/src/mesh/CMakeLists.txt
@@ -19,6 +19,7 @@ SET( headers tnlGrid.h
              tnlMeshReaderNetgen.h
              tnlMeshWriterNetgen.h
              tnlMeshInitializer.h
+             tnlMeshIntegrityChecker.h
              tnlMeshEntityInitializer.h
              tnlMeshSuperentityInitializerLayer.h
              tnlTraverser.h
diff --git a/src/mesh/config/tnlMeshConfigBase.h b/src/mesh/config/tnlMeshConfigBase.h
index cc96c8fbd8be790276e9b94e31bb11c637587940..0bbe28780da9633632b14f039a7699950a77e22e 100644
--- a/src/mesh/config/tnlMeshConfigBase.h
+++ b/src/mesh/config/tnlMeshConfigBase.h
@@ -24,13 +24,15 @@
  * It means that each mesh entity stores its index in its
  * mesh storage layer.
  */
-template< int WorldDimensions,
+template< typename Cell,
+          int WorldDimensions = Cell::dimensions,
           typename Real = double,
           typename GlobalIndex = int,
           typename LocalIndex = GlobalIndex,
           typename Id = void >
 struct tnlMeshConfigBase
 {
+   typedef Cell        CellTag;
    typedef Real        RealType;
    typedef GlobalIndex GlobalIndexType;
    typedef LocalIndex  LocalIndexType;
@@ -43,6 +45,7 @@ struct tnlMeshConfigBase
       return tnlString( "tnlMeshConfigBase< >");
    };
 
+   tnlStaticAssert( WorldDimensions >= Cell::dimensions, "The number of the cell dimensions cannot be larger than the world dimension." );
 };
 
 /****
diff --git a/src/mesh/layers/tnlMeshStorageLayer.h b/src/mesh/layers/tnlMeshStorageLayer.h
index 31722cd2a54fd79310af654e1fd23c518b8ec3e7..405a9799de11082f7dbdb7520d3b5489e5bd57f4 100644
--- a/src/mesh/layers/tnlMeshStorageLayer.h
+++ b/src/mesh/layers/tnlMeshStorageLayer.h
@@ -266,6 +266,13 @@ class tnlMeshStorageLayer< ConfigTag,
       this->vertices.setElement( entityIndex, entity );
    }
 
+   VertexType& getEntity( DimensionsTraits,
+                          const GlobalIndexType entityIndex )
+   {
+      return this->vertices[ entityIndex ];
+   }
+
+   
    const VertexType& getEntity( DimensionsTraits,
                                 const GlobalIndexType entityIndex ) const
    {
diff --git a/src/mesh/tnlDummyMesh.h b/src/mesh/tnlDummyMesh.h
index 51f70c0def3322257f5faf03b7110e42297dc745..9935628b02a6305b9f827363f216aee495f93b64 100644
--- a/src/mesh/tnlDummyMesh.h
+++ b/src/mesh/tnlDummyMesh.h
@@ -28,7 +28,7 @@ class tnlDummyMesh
    typedef Index IndexType;
 
 
-   const Real& getParametricStep(){};
+   const Real& getParametricStep(){ return 0.0; }
 
    template< typename GridFunction >
    typename GridFunction::RealType getDifferenceAbsMax( const GridFunction& f1,
@@ -39,23 +39,23 @@ class tnlDummyMesh
                                                         const GridFunction& f2,
                                                         const typename GridFunction::RealType& p ) const { return 0.0; }
 
-   bool save( tnlFile& file ) const{};
+   bool save( tnlFile& file ) const { return true; }
 
    //! Method for restoring the object from a file
-   bool load( tnlFile& file ){};
+   bool load( tnlFile& file ) { return true; }
 
-   bool save( const tnlString& fileName ) const{};
+   bool save( const tnlString& fileName ) const { return true; }
 
-   bool load( const tnlString& fileName ){};
+   bool load( const tnlString& fileName ) { return true; }
 
    bool writeMesh( const tnlString& fileName,
-                   const tnlString& format ) const{};
+                   const tnlString& format ) const { return true; }
 
 
    template< typename MeshFunction >
    bool write( const MeshFunction& function,
                 const tnlString& fileName,
-                const tnlString& format ) const{}
+                const tnlString& format ) const { return true; }
 };
 
 
diff --git a/src/mesh/tnlGrid1D.h b/src/mesh/tnlGrid1D.h
index 5c6ebf15478ee01c3814d121556ee0f1bc107e29..6f171e48ee05eb7954836a922f5270f4215c32a4 100644
--- a/src/mesh/tnlGrid1D.h
+++ b/src/mesh/tnlGrid1D.h
@@ -49,78 +49,50 @@ class tnlGrid< 1, Real, Device, Index > : public tnlObject
 
    void setDimensions( const CoordinatesType& dimensions );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const CoordinatesType& getDimensions() const;
 
    void setDomain( const VertexType& origin,
                    const VertexType& proportions );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const VertexType& getOrigin() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const VertexType& getProportions() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const VertexType& getCellProportions() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getCellIndex( const CoordinatesType& cellCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    CoordinatesType getCellCoordinates( const Index cellIndex ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getVertexIndex( const CoordinatesType& vertexCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    CoordinatesType getVertexCoordinates( const Index vertexCoordinates ) const;
 
    template< int dx >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getCellNextToCell( const IndexType& cellIndex ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHx() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxSquare() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxSquareInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    RealType getSmallestSpaceStep() const;
 
    /****
@@ -131,9 +103,7 @@ class tnlGrid< 1, Real, Device, Index > : public tnlObject
 #else
    template< typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Vertex getCellCenter( const CoordinatesType& cellCoordinates ) const;
 
 #ifdef HAVE_NOT_CXX11
@@ -141,9 +111,7 @@ class tnlGrid< 1, Real, Device, Index > : public tnlObject
 #else
    template< typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Vertex getCellCenter( const IndexType& cellIndex ) const;
 
 #ifdef HAVE_NOT_CXX11
@@ -151,39 +119,25 @@ class tnlGrid< 1, Real, Device, Index > : public tnlObject
 #else
    template< typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Vertex getVertex( const CoordinatesType& vertexCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getNumberOfCells() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getNumberOfFaces() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getNumberOfVertices() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool isBoundaryCell( const CoordinatesType& cellCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool isBoundaryCell( const IndexType& cellIndex ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool isBoundaryVertex( const CoordinatesType& vertexCoordinates ) const;
 
    template< typename GridFunction >
diff --git a/src/mesh/tnlGrid1D_impl.h b/src/mesh/tnlGrid1D_impl.h
index f572bfd533ce17b1a5d267b2f9aeb38b9ca8d3f9..17d1db403faa3bb75ca678a5c7b63faa3cf632a6 100644
--- a/src/mesh/tnlGrid1D_impl.h
+++ b/src/mesh/tnlGrid1D_impl.h
@@ -109,9 +109,7 @@ void tnlGrid< 1, Real, Device, Index > :: setDimensions( const CoordinatesType&
 template< typename Real,
           typename Device,
           typename Index  >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlGrid< 1, Real, Device, Index >::CoordinatesType&
    tnlGrid< 1, Real, Device, Index > :: getDimensions() const
 {
@@ -132,9 +130,7 @@ void tnlGrid< 1, Real, Device, Index > :: setDomain( const VertexType& origin,
 template< typename Real,
           typename Device,
           typename Index  >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlGrid< 1, Real, Device, Index > :: VertexType& 
   tnlGrid< 1, Real, Device, Index > :: getOrigin() const
 {
@@ -144,9 +140,7 @@ const typename tnlGrid< 1, Real, Device, Index > :: VertexType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlGrid< 1, Real, Device, Index > :: VertexType& 
    tnlGrid< 1, Real, Device, Index > :: getProportions() const
 {
@@ -156,9 +150,7 @@ const typename tnlGrid< 1, Real, Device, Index > :: VertexType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlGrid< 1, Real, Device, Index > :: VertexType& 
    tnlGrid< 1, Real, Device, Index > :: getCellProportions() const
 {
@@ -168,9 +160,7 @@ const typename tnlGrid< 1, Real, Device, Index > :: VertexType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 1, Real, Device, Index > :: getCellIndex( const CoordinatesType& cellCoordinates ) const
 {
    tnlAssert( cellCoordinates.x() >= 0 && cellCoordinates.x() < this->getDimensions().x(),
@@ -183,9 +173,7 @@ Index tnlGrid< 1, Real, Device, Index > :: getCellIndex( const CoordinatesType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlGrid< 1, Real, Device, Index > :: CoordinatesType
 tnlGrid< 1, Real, Device, Index > :: getCellCoordinates( const Index cellIndex ) const
 {
@@ -199,9 +187,7 @@ tnlGrid< 1, Real, Device, Index > :: getCellCoordinates( const Index cellIndex )
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 1, Real, Device, Index > :: getVertexIndex( const CoordinatesType& vertexCoordinates ) const
 {
    tnlAssert( vertexCoordinates.x() >= 0 && vertexCoordinates.x() < this->getDimensions().x() + 1,
@@ -214,9 +200,7 @@ Index tnlGrid< 1, Real, Device, Index > :: getVertexIndex( const CoordinatesType
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlGrid< 1, Real, Device, Index > :: CoordinatesType
 tnlGrid< 1, Real, Device, Index > :: getVertexCoordinates( const Index vertexIndex ) const
 {
@@ -231,9 +215,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< int dx >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 1, Real, Device, Index > :: getCellNextToCell( const IndexType& cellIndex ) const
 {
    tnlAssert( cellIndex + dx >= 0 &&
@@ -248,9 +230,7 @@ Index tnlGrid< 1, Real, Device, Index > :: getCellNextToCell( const IndexType& c
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 1, Real, Device, Index > :: getHx() const
 {
    return this->hx;
@@ -259,9 +239,7 @@ const Real& tnlGrid< 1, Real, Device, Index > :: getHx() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 1, Real, Device, Index > :: getHxSquare() const
 {
    return this->hxSquare;
@@ -270,9 +248,7 @@ const Real& tnlGrid< 1, Real, Device, Index > :: getHxSquare() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 1, Real, Device, Index > :: getHxInverse() const
 {
    return this->hxInverse;
@@ -281,9 +257,7 @@ const Real& tnlGrid< 1, Real, Device, Index > :: getHxInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 1, Real, Device, Index > :: getHxSquareInverse() const
 {
    return this->hxSquareInverse;
@@ -292,9 +266,7 @@ const Real& tnlGrid< 1, Real, Device, Index > :: getHxSquareInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real tnlGrid< 1, Real, Device, Index > :: getSmallestSpaceStep() const
 {
    return this->hx;
@@ -304,9 +276,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vertex >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Vertex tnlGrid< 1, Real, Device, Index >::getCellCenter( const CoordinatesType& cellCoordinates ) const
 {
    tnlAssert( cellCoordinates.x() >= 0 && cellCoordinates.x() < this->getDimensions().x(),
@@ -320,9 +290,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vertex >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Vertex tnlGrid< 1, Real, Device, Index >::getCellCenter( const IndexType& cellIndex ) const
 {
    tnlAssert( cellIndex >= 0 && cellIndex < this->getNumberOfCells(),
@@ -336,9 +304,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vertex >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Vertex tnlGrid< 1, Real, Device, Index >::getVertex( const CoordinatesType& vertexCoordinates ) const
 {
    tnlAssert( vertexCoordinates.x() >= 0 && vertexCoordinates.x() < this->getDimensions().x() + 1,
@@ -351,9 +317,7 @@ Vertex tnlGrid< 1, Real, Device, Index >::getVertex( const CoordinatesType& vert
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 1, Real, Device, Index > :: getNumberOfCells() const
 {
    return this->numberOfCells;
@@ -362,9 +326,7 @@ Index tnlGrid< 1, Real, Device, Index > :: getNumberOfCells() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 1, Real, Device, Index > :: getNumberOfFaces() const
 {
    return this->numberOfVertices;
@@ -373,9 +335,7 @@ Index tnlGrid< 1, Real, Device, Index > :: getNumberOfFaces() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 1, Real, Device, Index > :: getNumberOfVertices() const
 {
    return this->numberOfVertices;
@@ -384,9 +344,7 @@ Index tnlGrid< 1, Real, Device, Index > :: getNumberOfVertices() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlGrid< 1, Real, Device, Index > :: isBoundaryCell( const CoordinatesType& cellCoordinates ) const
 {
    tnlAssert( cellCoordinates.x() >= 0 && cellCoordinates.x() < this->getDimensions().x(),
@@ -401,9 +359,7 @@ bool tnlGrid< 1, Real, Device, Index > :: isBoundaryCell( const CoordinatesType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool
 tnlGrid< 1, Real, Device, Index >::
 isBoundaryCell( const IndexType& cellIndex ) const
@@ -418,9 +374,7 @@ isBoundaryCell( const IndexType& cellIndex ) const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlGrid< 1, Real, Device, Index > :: isBoundaryVertex( const CoordinatesType& vertexCoordinates ) const
 {
    tnlAssert( vertexCoordinates.x() >= 0 && vertexCoordinates.x() < this->getDimensions().x() + 1,
diff --git a/src/mesh/tnlGrid2D.h b/src/mesh/tnlGrid2D.h
index 0d11cd0e959c3e680cdc8696f384610af7f6d258..286b7f08730884f31a5b50bb9d805a83140de80c 100644
--- a/src/mesh/tnlGrid2D.h
+++ b/src/mesh/tnlGrid2D.h
@@ -49,131 +49,83 @@ class tnlGrid< 2, Real, Device, Index > : public tnlObject
 
    void setDimensions( const CoordinatesType& dimensions );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const CoordinatesType& getDimensions() const;
 
    void setDomain( const VertexType& origin,
                    const VertexType& proportions );
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const VertexType& getOrigin() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const VertexType& getProportions() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const VertexType& getCellProportions() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getCellIndex( const CoordinatesType& cellCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    CoordinatesType getCellCoordinates( const IndexType cellIndex ) const;
 
    template< int nx, int ny >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getFaceIndex( const CoordinatesType& faceCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    CoordinatesType getFaceCoordinates( const Index faceIndex, int& nx, int& ny ) const;
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getVertexIndex( const CoordinatesType& vertexCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    CoordinatesType getVertexCoordinates( const Index vertexIndex ) const;
 
    template< int dx, int dy >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getCellNextToCell( const IndexType& cellIndex ) const;
 
    template< int nx, int ny >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getFaceNextToCell( const IndexType& cellIndex ) const;
 
    template< int nx, int ny >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getCellNextToFace( const IndexType& cellIndex ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHx() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxSquare() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxSquareInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHy() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHySquare() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHyInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHySquareInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxHy() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxHyInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    RealType getSmallestSpaceStep() const;
 
 
@@ -185,9 +137,7 @@ class tnlGrid< 2, Real, Device, Index > : public tnlObject
 #else
    template< typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Vertex getCellCenter( const CoordinatesType& cellCoordinates ) const;
 
 #ifdef HAVE_NOT_CXX11
@@ -195,9 +145,7 @@ class tnlGrid< 2, Real, Device, Index > : public tnlObject
 #else
    template< typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Vertex getCellCenter( const IndexType& cellIndex ) const;
 
 
@@ -206,9 +154,7 @@ class tnlGrid< 2, Real, Device, Index > : public tnlObject
 #else
    template< int nx, int ny, typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Vertex getFaceCenter( const CoordinatesType& faceCoordinates ) const;
 
 #ifdef HAVE_NOT_CXX11
@@ -216,14 +162,10 @@ class tnlGrid< 2, Real, Device, Index > : public tnlObject
 #else
    template< typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Vertex getVertex( const CoordinatesType& vertexCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getNumberOfCells() const;
 
 #ifdef HAVE_NOT_CXX11
@@ -233,35 +175,23 @@ class tnlGrid< 2, Real, Device, Index > : public tnlObject
    template< int nx = 1,
              int ny = 1 >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getNumberOfFaces() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getNumberOfVertices() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool isBoundaryCell( const CoordinatesType& cellCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool isBoundaryCell( const IndexType& cellIndex ) const;
 
    template< int nx, int ny >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool isBoundaryFace( const CoordinatesType& faceCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool isBoundaryVertex( const CoordinatesType& vertexCoordinates ) const;
 
    template< typename GridFunction >
@@ -302,6 +232,7 @@ class tnlGrid< 2, Real, Device, Index > : public tnlObject
 
    protected:
 
+   __cuda_callable__
    void computeSpaceSteps();
 
    CoordinatesType dimensions;
diff --git a/src/mesh/tnlGrid2D_impl.h b/src/mesh/tnlGrid2D_impl.h
index 5ab7dff83755d30377a6f8faa5fb30b4701b1176..632fc6de2c0aef346ae38a4036a291bf466d40dc 100644
--- a/src/mesh/tnlGrid2D_impl.h
+++ b/src/mesh/tnlGrid2D_impl.h
@@ -76,6 +76,7 @@ tnlString tnlGrid< 2, Real, Device, Index > :: getSerializationTypeVirtual() con
 template< typename Real,
           typename Device,
           typename Index >
+__cuda_callable__
 void tnlGrid< 2, Real, Device, Index > :: computeSpaceSteps()
 {
    if( this->getDimensions().x() > 0 && this->getDimensions().y() > 0 )
@@ -124,9 +125,7 @@ void tnlGrid< 2, Real, Device, Index > :: setDimensions( const CoordinatesType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlGrid< 2, Real, Device, Index >::CoordinatesType&
 tnlGrid< 2, Real, Device, Index > :: getDimensions() const
 {
@@ -147,9 +146,7 @@ void tnlGrid< 2, Real, Device, Index > :: setDomain( const VertexType& origin,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlGrid< 2, Real, Device, Index >::VertexType&
 tnlGrid< 2, Real, Device, Index >::getOrigin() const
 {
@@ -159,9 +156,7 @@ tnlGrid< 2, Real, Device, Index >::getOrigin() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlGrid< 2, Real, Device, Index > :: VertexType&
    tnlGrid< 2, Real, Device, Index > :: getProportions() const
 {
@@ -171,9 +166,7 @@ const typename tnlGrid< 2, Real, Device, Index > :: VertexType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlGrid< 2, Real, Device, Index > :: VertexType&
 tnlGrid< 2, Real, Device, Index > :: getCellProportions() const
 {
@@ -183,9 +176,7 @@ tnlGrid< 2, Real, Device, Index > :: getCellProportions() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 2, Real, Device, Index > :: getCellIndex( const CoordinatesType& cellCoordinates ) const
 {
    tnlAssert( cellCoordinates.x() >= 0 && cellCoordinates.x() < this->getDimensions().x(),
@@ -203,9 +194,7 @@ Index tnlGrid< 2, Real, Device, Index > :: getCellIndex( const CoordinatesType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlGrid< 2, Real, Device, Index >::CoordinatesType
 tnlGrid< 2, Real, Device, Index >::getCellCoordinates( const Index cellIndex ) const
 {
@@ -220,9 +209,7 @@ template< typename Real,
           typename Device,
           typename Index >
 template< int nx, int ny >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 2, Real, Device, Index >::getFaceIndex( const CoordinatesType& faceCoordinates ) const
 {
    tnlStaticAssert( nx >= 0 && ny >= 0 && nx + ny == 1, "Wrong template parameters nx or ny." );
@@ -252,15 +239,13 @@ Index tnlGrid< 2, Real, Device, Index >::getFaceIndex( const CoordinatesType& fa
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 typename tnlGrid< 2, Real, Device, Index >::CoordinatesType
 tnlGrid< 2, Real, Device, Index >::getFaceCoordinates( const Index faceIndex, int& nx, int& ny ) const
 {
-   tnlAssert( faceIndex >= 0 && faceIndex < this->getNumberOfFaces(),
+   tnlAssert( faceIndex >= 0 && faceIndex < ( this->template getNumberOfFaces< 1, 1 >() ),
               cerr << " faceIndex = " << faceIndex
-                   << " this->getNumberOfFaces() = " << this->getNumberOfFaces()
+                   << " this->getNumberOfFaces() = " << ( this->template getNumberOfFaces< 1, 1 >() )
                    << " this->getName() " << this->getName(); );
    if( faceIndex < this->numberOfNxFaces )
    {
@@ -279,9 +264,7 @@ tnlGrid< 2, Real, Device, Index >::getFaceCoordinates( const Index faceIndex, in
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 2, Real, Device, Index > :: getVertexIndex( const CoordinatesType& vertexCoordinates ) const
 {
    tnlAssert( vertexCoordinates.x() >= 0 && vertexCoordinates.x() < this->getDimensions().x() + 1,
@@ -298,9 +281,7 @@ Index tnlGrid< 2, Real, Device, Index > :: getVertexIndex( const CoordinatesType
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlGrid< 2, Real, Device, Index > :: CoordinatesType
 tnlGrid< 2, Real, Device, Index > :: getVertexCoordinates( const Index vertexIndex ) const
 {
@@ -316,9 +297,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< int dx, int dy >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 2, Real, Device, Index >::getCellNextToCell( const IndexType& cellIndex ) const
 {
    const IndexType result = cellIndex + dx + dy * this->getDimensions().x();
@@ -336,9 +315,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< int nx, int ny >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 2, Real, Device, Index >::getFaceNextToCell( const IndexType& cellIndex ) const
 {
    tnlAssert( nx * ny == 0 && nx + ny != 0,
@@ -350,11 +327,11 @@ Index tnlGrid< 2, Real, Device, Index >::getFaceNextToCell( const IndexType& cel
    if( ny )
       result = this->numberOfNxFaces + cellIndex + ( ny + ( ny < 0 ) ) * this->getDimensions().x();
    tnlAssert( result >= 0 &&
-              result < this->getNumberOfFaces(),
+              result < ( this->template getNumberOfFaces< 1, 1 >() ),
               cerr << " cellIndex = " << cellIndex
                    << " nx = " << nx
                    << " ny = " << ny
-                   << " this->getNumberOfCells() = " << this->getNumberOfCells()
+                   << " this->getNumberOfCells() = " << ( this->getNumberOfCells() )
                    << " this->getName() " << this->getName(); );
    return result;
 }
@@ -363,9 +340,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< int nx, int ny >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 2, Real, Device, Index >::getCellNextToFace( const IndexType& faceIndex ) const
 {
    tnlAssert( abs( nx ) + abs( ny ) == 1,
@@ -394,9 +369,7 @@ Index tnlGrid< 2, Real, Device, Index >::getCellNextToFace( const IndexType& fac
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 2, Real, Device, Index > :: getHx() const
 {
    return this->hx;
@@ -405,9 +378,7 @@ const Real& tnlGrid< 2, Real, Device, Index > :: getHx() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 2, Real, Device, Index > :: getHxSquare() const
 {
    return this->hxSquare;
@@ -416,9 +387,7 @@ const Real& tnlGrid< 2, Real, Device, Index > :: getHxSquare() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 2, Real, Device, Index > :: getHxInverse() const
 {
    return this->hxInverse;
@@ -427,9 +396,7 @@ const Real& tnlGrid< 2, Real, Device, Index > :: getHxInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 2, Real, Device, Index > :: getHxSquareInverse() const
 {
    return this->hxSquareInverse;
@@ -438,9 +405,7 @@ const Real& tnlGrid< 2, Real, Device, Index > :: getHxSquareInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 2, Real, Device, Index > :: getHy() const
 {
    return this->hy;
@@ -449,9 +414,7 @@ const Real& tnlGrid< 2, Real, Device, Index > :: getHy() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 2, Real, Device, Index > :: getHySquare() const
 {
    return this->hySquare;
@@ -460,9 +423,7 @@ const Real& tnlGrid< 2, Real, Device, Index > :: getHySquare() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 2, Real, Device, Index > :: getHyInverse() const
 {
    return this->hyInverse;
@@ -471,9 +432,7 @@ const Real& tnlGrid< 2, Real, Device, Index > :: getHyInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 2, Real, Device, Index > :: getHySquareInverse() const
 {
    return this->hySquareInverse;
@@ -482,9 +441,7 @@ const Real& tnlGrid< 2, Real, Device, Index > :: getHySquareInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 2, Real, Device, Index > :: getHxHy() const
 {
    return this->hxhy;
@@ -493,9 +450,7 @@ const Real& tnlGrid< 2, Real, Device, Index > :: getHxHy() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 2, Real, Device, Index > :: getHxHyInverse() const
 {
    return this->hxhyInverse;
@@ -504,9 +459,7 @@ const Real& tnlGrid< 2, Real, Device, Index > :: getHxHyInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real tnlGrid< 2, Real, Device, Index > :: getSmallestSpaceStep() const
 {
    return Min( this->hx, this->hy );
@@ -516,9 +469,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vertex >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Vertex tnlGrid< 2, Real, Device, Index > :: getCellCenter( const CoordinatesType& cellCoordinates ) const
 {
    tnlAssert( cellCoordinates.x() >= 0 && cellCoordinates.x() < this->getDimensions().x(),
@@ -538,9 +489,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vertex >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Vertex tnlGrid< 2, Real, Device, Index >::getCellCenter( const IndexType& cellIndex ) const
 {
    tnlAssert( cellIndex >= 0 && cellIndex < this->getNumberOfCells(),
@@ -554,9 +503,7 @@ template< typename Real,
           typename Device,
           typename Index >
 template< int nx, int ny, typename Vertex >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Vertex tnlGrid< 2, Real, Device, Index > :: getFaceCenter( const CoordinatesType& faceCoordinates ) const
 {
    tnlStaticAssert( nx >= 0 && ny >= 0 && nx + ny == 1, "Wrong template parameters nx or ny." );
@@ -570,8 +517,8 @@ Vertex tnlGrid< 2, Real, Device, Index > :: getFaceCenter( const CoordinatesType
                  cerr << "faceCoordinates.y() = " << faceCoordinates.y()
                       << " this->getDimensions().y() = " << this->getDimensions().y()
                       << " this->getName() = " << this->getName(); );
-      return Vertex( this->origin.x() + faceCoordinates.x() * this->cellProportions().x(),
-                     this->origin.y() + ( faceCoordinates.y() + 0.5 ) * this->cellProportions().y() );
+      return Vertex( this->origin.x() + faceCoordinates.x() * this->cellProportions.x(),
+                     this->origin.y() + ( faceCoordinates.y() + 0.5 ) * this->cellProportions.y() );
    }
    if( ny )
    {
@@ -583,8 +530,8 @@ Vertex tnlGrid< 2, Real, Device, Index > :: getFaceCenter( const CoordinatesType
                  cerr << "faceCoordinates.y() = " << faceCoordinates.y()
                       << " this->getDimensions().y() + 1 = " << this->getDimensions().y() + 1
                       << " this->getName() = " << this->getName(); );
-      return Vertex( this->origin.x() + ( faceCoordinates.x() + 0.5 ) * this->cellProportions().x(),
-                     this->origin.y() + faceCoordinates.y() * this->cellProportions().y() );
+      return Vertex( this->origin.x() + ( faceCoordinates.x() + 0.5 ) * this->cellProportions.x(),
+                     this->origin.y() + faceCoordinates.y() * this->cellProportions.y() );
    }
 }
 
@@ -593,9 +540,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vertex >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Vertex tnlGrid< 2, Real, Device, Index >::getVertex( const CoordinatesType& vertexCoordinates ) const
 {
    tnlAssert( vertexCoordinates.x() >= 0 && vertexCoordinates.x() < this->getDimensions().x() + 1,
@@ -614,9 +559,7 @@ Vertex tnlGrid< 2, Real, Device, Index >::getVertex( const CoordinatesType& vert
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 2, Real, Device, Index > :: getNumberOfCells() const
 {
    return this->numberOfCells;
@@ -627,9 +570,7 @@ template< typename Real,
           typename Index >
    template< int nx,
              int ny >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 2, Real, Device, Index > :: getNumberOfFaces() const
 {
    return nx * this->numberOfNxFaces + ny * this->numberOfNyFaces;
@@ -638,9 +579,7 @@ Index tnlGrid< 2, Real, Device, Index > :: getNumberOfFaces() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 2, Real, Device, Index > :: getNumberOfVertices() const
 {
    return this->numberOfVertices;
@@ -649,9 +588,7 @@ Index tnlGrid< 2, Real, Device, Index > :: getNumberOfVertices() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlGrid< 2, Real, Device, Index > :: isBoundaryCell( const CoordinatesType& cellCoordinates ) const
 {
    tnlAssert( cellCoordinates.x() >= 0 && cellCoordinates.x() < this->getDimensions().x(),
@@ -673,9 +610,7 @@ bool tnlGrid< 2, Real, Device, Index > :: isBoundaryCell( const CoordinatesType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool
 tnlGrid< 2, Real, Device, Index >::
 isBoundaryCell( const IndexType& cellIndex ) const
@@ -692,9 +627,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< int nx, int ny >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 bool tnlGrid< 2, Real, Device, Index > :: isBoundaryFace( const CoordinatesType& faceCoordinates ) const
 {
    tnlStaticAssert( nx >= 0 && ny >= 0 && nx + ny == 1, "Wrong template parameters nx or ny." );
@@ -729,9 +662,7 @@ bool tnlGrid< 2, Real, Device, Index > :: isBoundaryFace( const CoordinatesType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlGrid< 2, Real, Device, Index > :: isBoundaryVertex( const CoordinatesType& vertexCoordinates ) const
 {
    tnlAssert( vertexCoordinates.x() >= 0 && vertexCoordinates.x() < this->getDimensions().x() + 1,
diff --git a/src/mesh/tnlGrid3D.h b/src/mesh/tnlGrid3D.h
index fb2d0ac080b16f8bfcd652a6250fc4dfd3e9d48c..9db36a288cd37b1bae31beaaa814fdbb45c78d5b 100644
--- a/src/mesh/tnlGrid3D.h
+++ b/src/mesh/tnlGrid3D.h
@@ -49,181 +49,113 @@ class tnlGrid< 3, Real, Device, Index > : public tnlObject
 
    void setDimensions( const CoordinatesType& );
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const CoordinatesType& getDimensions() const;
 
    void setDomain( const VertexType& origin,
                    const VertexType& proportions );
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const VertexType& getOrigin() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const VertexType& getProportions() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const VertexType& getCellProportions() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getCellIndex( const CoordinatesType& cellCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    CoordinatesType getCellCoordinates( const IndexType cellIndex ) const;
 
    template< int nx, int ny, int nz >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getFaceIndex( const CoordinatesType& faceCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    CoordinatesType getFaceCoordinates( const Index faceIndex, int& nx, int& ny, int& nz ) const;
 
    template< int dx, int dy, int dz >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getEdgeIndex( const CoordinatesType& edgeCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    CoordinatesType getEdgeCoordinates( const Index edgeIndex, int& dx, int& dy, int& dz ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getVertexIndex( const CoordinatesType& vertexCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    CoordinatesType getVertexCoordinates( const Index vertexIndex ) const;
 
    template< int dx, int dy, int dz >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getCellNextToCell( const IndexType& cellIndex ) const;
 
    template< int nx, int ny, int nz >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getFaceNextToCell( const IndexType& cellIndex ) const;
 
    template< int nx, int ny, int nz >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    IndexType getCellNextToFace( const IndexType& cellIndex ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHx() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxSquare() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxSquareInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHy() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHySquare() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHyInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHySquareInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHz() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHzSquare() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHzInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHzSquareInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxHy() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxHz() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHyHz() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxHyInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHxHzInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    const RealType& getHyHzInverse() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    RealType getSmallestSpaceStep() const;
 
    /****
@@ -234,9 +166,7 @@ class tnlGrid< 3, Real, Device, Index > : public tnlObject
 #else
    template< typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Vertex getCellCenter( const CoordinatesType& cellCoordinates ) const;
 
 #ifdef HAVE_NOT_CXX11
@@ -244,9 +174,7 @@ class tnlGrid< 3, Real, Device, Index > : public tnlObject
 #else
    template< typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Vertex getCellCenter( const IndexType& cellIndex ) const;
 
 #ifdef HAVE_NOT_CXX11
@@ -254,9 +182,7 @@ class tnlGrid< 3, Real, Device, Index > : public tnlObject
 #else
    template< int nx, int ny, int nz, typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Vertex getFaceCenter( const CoordinatesType& faceCoordinates ) const;
 
 #ifdef HAVE_NOT_CXX11
@@ -264,9 +190,7 @@ class tnlGrid< 3, Real, Device, Index > : public tnlObject
 #else
    template< int dx, int dy, int dz, typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Vertex getEdgeCenter( const CoordinatesType& edgeCoordinates ) const;
 
 #ifdef HAVE_NOT_CXX11
@@ -274,14 +198,10 @@ class tnlGrid< 3, Real, Device, Index > : public tnlObject
 #else
    template< typename Vertex = VertexType >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Vertex getVertex( const CoordinatesType& vertexCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getNumberOfCells() const;
 
 #ifdef HAVE_NOT_CXX11
@@ -293,9 +213,7 @@ class tnlGrid< 3, Real, Device, Index > : public tnlObject
              int ny = 1,
              int nz = 1 >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getNumberOfFaces() const;
 
 #ifdef HAVE_NOT_CXX11
@@ -307,41 +225,27 @@ class tnlGrid< 3, Real, Device, Index > : public tnlObject
              int dy = 1,
              int dz = 1 >
 #endif
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getNumberOfEdges() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getNumberOfVertices() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool isBoundaryCell( const CoordinatesType& cellCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool isBoundaryCell( const IndexType& cellIndex ) const;
 
    template< int nx, int ny, int nz >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool isBoundaryFace( const CoordinatesType& faceCoordinates ) const;
 
    template< int dx, int dy, int dz >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool isBoundaryEdge( const CoordinatesType& edgeCoordinates ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    bool isBoundaryVertex( const CoordinatesType& vertexCoordinates ) const;
 
    template< typename GridFunction >
diff --git a/src/mesh/tnlGrid3D_impl.h b/src/mesh/tnlGrid3D_impl.h
index 16daf5783152f2b0a6f4f8a5533cd637a2f7a9d3..147f038e595789a7d6ce4d0d4d4b74aa0fb18a4c 100644
--- a/src/mesh/tnlGrid3D_impl.h
+++ b/src/mesh/tnlGrid3D_impl.h
@@ -154,9 +154,7 @@ void tnlGrid< 3, Real, Device, Index > :: setDimensions( const CoordinatesType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlGrid< 3, Real, Device, Index > :: CoordinatesType&
    tnlGrid< 3, Real, Device, Index > :: getDimensions() const
 {
@@ -177,9 +175,7 @@ void tnlGrid< 3, Real, Device, Index > :: setDomain( const VertexType& origin,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlGrid< 3, Real, Device, Index >::VertexType&
 tnlGrid< 3, Real, Device, Index >::getOrigin() const
 {
@@ -189,9 +185,7 @@ tnlGrid< 3, Real, Device, Index >::getOrigin() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlGrid< 3, Real, Device, Index > :: VertexType&
    tnlGrid< 3, Real, Device, Index > :: getProportions() const
 {
@@ -201,9 +195,7 @@ const typename tnlGrid< 3, Real, Device, Index > :: VertexType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const typename tnlGrid< 3, Real, Device, Index > :: VertexType&
    tnlGrid< 3, Real, Device, Index > :: getCellProportions() const
 {
@@ -251,9 +243,7 @@ template< typename Real,
           typename Device,
           typename Index >
 template< int nx, int ny, int nz >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 3, Real, Device, Index >::getFaceIndex( const CoordinatesType& faceCoordinates ) const
 {
    tnlStaticAssert( nx >= 0 && ny >= 0 && nz >= 0 && nx + ny + nz == 1, "Wrong template parameters nx or ny or nz." );
@@ -312,9 +302,7 @@ Index tnlGrid< 3, Real, Device, Index >::getFaceIndex( const CoordinatesType& fa
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 typename tnlGrid< 3, Real, Device, Index >::CoordinatesType
 tnlGrid< 3, Real, Device, Index >::getFaceCoordinates( const Index faceIndex, int& nx, int& ny, int& nz ) const
 {
@@ -356,9 +344,7 @@ template< typename Real,
           typename Device,
           typename Index >
 template< int dx, int dy, int dz >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 3, Real, Device, Index > :: getEdgeIndex( const CoordinatesType& edgeCoordinates ) const
 {
    tnlStaticAssert( dx >= 0 && dy >= 0 && dz >= 0 && dx + dy + dz == 1, "Wrong template parameters dx or dy or dz.");
@@ -417,9 +403,7 @@ Index tnlGrid< 3, Real, Device, Index > :: getEdgeIndex( const CoordinatesType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 typename tnlGrid< 3, Real, Device, Index > :: CoordinatesType
 tnlGrid< 3, Real, Device, Index > :: getEdgeCoordinates( const Index edgeIndex, int& dx, int& dy, int& dz ) const
 {
@@ -463,9 +447,7 @@ tnlGrid< 3, Real, Device, Index > :: getEdgeCoordinates( const Index edgeIndex,
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 3, Real, Device, Index > :: getVertexIndex( const CoordinatesType& vertexCoordinates ) const
 {
    tnlAssert( vertexCoordinates.x() >= 0 && vertexCoordinates.x() < this->getDimensions().x() + 1,
@@ -487,9 +469,7 @@ Index tnlGrid< 3, Real, Device, Index > :: getVertexIndex( const CoordinatesType
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 typename tnlGrid< 3, Real, Device, Index > :: CoordinatesType
 tnlGrid< 3, Real, Device, Index > :: getVertexCoordinates( const Index vertexIndex ) const
 {
@@ -509,9 +489,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< int dx, int dy, int dz >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 3, Real, Device, Index > :: getCellNextToCell( const IndexType& cellIndex ) const
 {
    tnlAssert( cellIndex + dx >= 0 &&
@@ -535,9 +513,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< int nx, int ny, int nz >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 3, Real, Device, Index >::getFaceNextToCell( const IndexType& cellIndex ) const
 {
    tnlAssert( nx * ny * nz == 0 && nx + ny + nz != 0,
@@ -566,9 +542,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< int nx, int ny, int nz >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 3, Real, Device, Index >::getCellNextToFace( const IndexType& faceIndex ) const
 {
    tnlAssert( abs( nx ) + abs( ny ) + abs( nz ) == 1,
@@ -616,9 +590,7 @@ Index tnlGrid< 3, Real, Device, Index >::getCellNextToFace( const IndexType& fac
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHx() const
 {
    return this->hx;
@@ -627,9 +599,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHx() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHxSquare() const
 {
    return this->hxSquare;
@@ -638,9 +608,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHxSquare() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHxInverse() const
 {
    return this->hxInverse;
@@ -649,9 +617,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHxInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHxSquareInverse() const
 {
    return this->hxSquareInverse;
@@ -660,9 +626,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHxSquareInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHy() const
 {
    return this->hy;
@@ -671,9 +635,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHy() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHySquare() const
 {
    return this->hySquare;
@@ -682,9 +644,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHySquare() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHyInverse() const
 {
    return this->hyInverse;
@@ -693,9 +653,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHyInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHySquareInverse() const
 {
    return this->hySquareInverse;
@@ -704,9 +662,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHySquareInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHz() const
 {
    return this->hz;
@@ -715,9 +671,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHz() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHzSquare() const
 {
    return this->hzSquare;
@@ -726,9 +680,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHzSquare() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHzInverse() const
 {
    return this->hzInverse;
@@ -737,9 +689,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHzInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHzSquareInverse() const
 {
    return this->hzSquareInverse;
@@ -748,9 +698,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHzSquareInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHxHy() const
 {
    return this->hxhy;
@@ -759,9 +707,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHxHy() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHxHz() const
 {
    return this->hxhz;
@@ -770,9 +716,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHxHz() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHyHz() const
 {
    return this->hyhz;
@@ -781,9 +725,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHyHz() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHxHyInverse() const
 {
    return this->hxhyInverse;
@@ -792,9 +734,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHxHyInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHxHzInverse() const
 {
    return this->hxhzInverse;
@@ -803,9 +743,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHxHzInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 const Real& tnlGrid< 3, Real, Device, Index > :: getHyHzInverse() const
 {
    return this->hyhzInverse;
@@ -814,9 +752,7 @@ const Real& tnlGrid< 3, Real, Device, Index > :: getHyHzInverse() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real tnlGrid< 3, Real, Device, Index > :: getSmallestSpaceStep() const
 {
    return Min( this->hx, Min( this->hy, this->hz ) );
@@ -827,9 +763,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vertex >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Vertex tnlGrid< 3, Real, Device, Index > :: getCellCenter( const CoordinatesType& cellCoordinates ) const
 {
    tnlAssert( cellCoordinates.x() >= 0 && cellCoordinates.x() < this->getDimensions().x(),
@@ -855,9 +789,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vertex >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Vertex tnlGrid< 3, Real, Device, Index >::getCellCenter( const IndexType& cellIndex ) const
 {
    tnlAssert( cellIndex >= 0 && cellIndex < this->getNumberOfCells(),
@@ -871,9 +803,7 @@ template< typename Real,
           typename Device,
           typename Index >
 template< int nx, int ny, int nz, typename Vertex >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Vertex tnlGrid< 3, Real, Device, Index > :: getFaceCenter( const CoordinatesType& faceCoordinates ) const
 {
    tnlStaticAssert( nx >= 0 && ny >= 0 && nz >= 0 && nx + ny + nz == 1, "Wrong template parameters nx or ny or nz." );
@@ -938,9 +868,7 @@ template< typename Real,
           typename Device,
           typename Index >
 template< int dx, int dy, int dz, typename Vertex >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Vertex tnlGrid< 3, Real, Device, Index > :: getEdgeCenter( const CoordinatesType& edgeCoordinates ) const
 {
    tnlStaticAssert( dx >= 0 && dy >= 0 && dz >= 0 && dx + dy + dz == 1, "Wrong template parameters nx or ny or nz." );
@@ -1004,9 +932,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< typename Vertex >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Vertex tnlGrid< 3, Real, Device, Index >::getVertex( const CoordinatesType& vertexCoordinates ) const
 {
    tnlAssert( vertexCoordinates.x() >= 0 && vertexCoordinates.x() < this->getDimensions().x() + 1,
@@ -1030,9 +956,7 @@ Vertex tnlGrid< 3, Real, Device, Index >::getVertex( const CoordinatesType& vert
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 3, Real, Device, Index > :: getNumberOfCells() const
 {
    return this->numberOfCells;
@@ -1042,9 +966,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< int nx, int ny, int nz >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 3, Real, Device, Index > :: getNumberOfFaces() const
 {
    return nx * this->numberOfNxFaces +
@@ -1056,9 +978,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< int dx, int dy, int dz >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 3, Real, Device, Index > :: getNumberOfEdges() const
 {
    return dx * this->numberOfDxEdges +
@@ -1069,9 +989,7 @@ Index tnlGrid< 3, Real, Device, Index > :: getNumberOfEdges() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index tnlGrid< 3, Real, Device, Index > :: getNumberOfVertices() const
 {
    return numberOfVertices;
@@ -1080,9 +998,7 @@ Index tnlGrid< 3, Real, Device, Index > :: getNumberOfVertices() const
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlGrid< 3, Real, Device, Index > :: isBoundaryCell( const CoordinatesType& cellCoordinates ) const
 {
    tnlAssert( cellCoordinates.x() >= 0 && cellCoordinates.x() < this->getDimensions().x(),
@@ -1109,9 +1025,7 @@ bool tnlGrid< 3, Real, Device, Index > :: isBoundaryCell( const CoordinatesType&
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool
 tnlGrid< 3, Real, Device, Index >::
 isBoundaryCell( const IndexType& cellIndex ) const
@@ -1127,9 +1041,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< int nx, int ny, int nz >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 bool tnlGrid< 3, Real, Device, Index > :: isBoundaryFace( const CoordinatesType& faceCoordinates ) const
 {
    tnlStaticAssert( nx >= 0 && ny >= 0 && nz >=0 && nx + ny + nz == 1, "Wrong template parameters nx or ny or nz." );
@@ -1185,9 +1097,7 @@ template< typename Real,
           typename Device,
           typename Index >
    template< int dx, int dy, int dz >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 bool tnlGrid< 3, Real, Device, Index >::isBoundaryEdge( const CoordinatesType& edgeCoordinates ) const
 {
    tnlStaticAssert( dx >= 0 && dy >= 0 && dz >= 0 && dx + dy + dz == 1, "Wrong template parameters nx or ny or nz." );
@@ -1254,9 +1164,7 @@ bool tnlGrid< 3, Real, Device, Index >::isBoundaryEdge( const CoordinatesType& e
 template< typename Real,
           typename Device,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 bool tnlGrid< 3, Real, Device, Index > :: isBoundaryVertex( const CoordinatesType& vertexCoordinates ) const
 {
    tnlAssert( vertexCoordinates.x() >= 0 && vertexCoordinates.x() < this->getDimensions().x() + 1,
diff --git a/src/mesh/tnlGrid_impl.cpp b/src/mesh/tnlGrid_impl.cpp
index 28a56bf44788c1a05e5e221f2ee1b1ddacd5aa47..fbf9ba34508088c78cc555c7d4fb4201758c54c4 100644
--- a/src/mesh/tnlGrid_impl.cpp
+++ b/src/mesh/tnlGrid_impl.cpp
@@ -21,32 +21,48 @@
 
 template class tnlGrid< 1, float,  tnlHost, int >;
 template class tnlGrid< 1, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlGrid< 1, float,  tnlHost, long int >;
 template class tnlGrid< 1, double, tnlHost, long int >;
+#endif
+
 template class tnlGrid< 2, float,  tnlHost, int >;
 template class tnlGrid< 2, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlGrid< 2, float,  tnlHost, long int >;
 template class tnlGrid< 2, double, tnlHost, long int >;
+#endif
+
 template class tnlGrid< 3, float,  tnlHost, int >;
 template class tnlGrid< 3, double, tnlHost, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlGrid< 3, float,  tnlHost, long int >;
 template class tnlGrid< 3, double, tnlHost, long int >;
+#endif
 
 #ifdef HAVE_CUDA
 #endif
 
 template class tnlGrid< 1, float,  tnlCuda, int >;
 template class tnlGrid< 1, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlGrid< 1, float,  tnlCuda, long int >;
 template class tnlGrid< 1, double, tnlCuda, long int >;
+#endif
+
 template class tnlGrid< 2, float,  tnlCuda, int >;
 template class tnlGrid< 2, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlGrid< 2, float,  tnlCuda, long int >;
 template class tnlGrid< 2, double, tnlCuda, long int >;
+#endif
+
 template class tnlGrid< 3, float,  tnlCuda, int >;
 template class tnlGrid< 3, double, tnlCuda, int >;
+#ifdef INSTANTIATE_LONG_INT
 template class tnlGrid< 3, float,  tnlCuda, long int >;
 template class tnlGrid< 3, double, tnlCuda, long int >;
+#endif
 
 #endif
 
diff --git a/src/mesh/tnlMesh.h b/src/mesh/tnlMesh.h
index 92d61ed7e60d0d5a4950a5b607ccd84f7028cfe4..6455c7d03ec37827c8b53239f7247adbac1a0c9c 100644
--- a/src/mesh/tnlMesh.h
+++ b/src/mesh/tnlMesh.h
@@ -26,14 +26,11 @@ template< typename ConfigTag >
 class tnlMesh : public tnlObject,
                 public tnlMeshStorageLayers< ConfigTag >
 {
-   //template<typename, typename, typename> friend class InitializerLayer;
-   //friend class IOReader<ConfigTag>;
-
-   typedef tnlMeshStorageLayers<ConfigTag>        BaseType;
+   typedef tnlMeshStorageLayers< ConfigTag >                BaseType;
 
    public:
-   typedef ConfigTag                              Config;
-   typedef typename tnlMeshTraits< ConfigTag >::PointType PointType;
+   typedef ConfigTag                                        Config;
+   typedef typename tnlMeshTraits< ConfigTag >::PointType   PointType;
    enum { dimensions = tnlMeshTraits< ConfigTag >::meshDimensions };
 
    /*~tnlMesh()
diff --git a/src/mesh/tnlMeshEntity.h b/src/mesh/tnlMeshEntity.h
index 1bfeb07b300ceb4bd61436fd23af0f2590a59407..5abc50fb815beb72cec5338473f2f378152eef22 100644
--- a/src/mesh/tnlMeshEntity.h
+++ b/src/mesh/tnlMeshEntity.h
@@ -36,6 +36,10 @@ class tnlMeshEntity
                              typename ConfigTag::GlobalIndexType >
 {
    public:
+      
+      // TODO: This is only because of STD lib bug in tnlIndexedSet
+      tnlMeshEntity( const tnlMeshEntity& entity ) {}
+      tnlMeshEntity() {}
 
    static tnlString getType()
    {
@@ -296,6 +300,11 @@ class tnlMeshEntity< ConfigTag, tnlMeshVertexTag >
 {
    public:
 
+      // TODO: This is only because of STD lib bug in tnlIndexedSet
+      tnlMeshEntity( const tnlMeshEntity& entity ) {}
+      tnlMeshEntity() {}
+
+      
    static tnlString getType()
    {
       return tnlString( "tnlMesh< " ) +
@@ -453,7 +462,7 @@ ostream& operator <<( ostream& str, const tnlMeshEntity< ConfigTag, EntityTag >&
 
 /****
  * This tells the compiler that theMeshEntity is a type with a dynamic memory allocation.
- * It is necessary for the loading and the saving of the mesh enities arrays.
+ * It is necessary for the loading and the saving of the mesh entities arrays.
  */
 template< typename ConfigTag,
           typename EntityTag >
diff --git a/examples/simple-solver/simpleProblemConfig.h b/src/mesh/tnlMeshEntityIntegrityChecker.h
similarity index 64%
rename from examples/simple-solver/simpleProblemConfig.h
rename to src/mesh/tnlMeshEntityIntegrityChecker.h
index c29d6afc4732ea1fea47ec20977722fd3221ab58..028e140f8b17cbd540d3c946ee2ac6b35104abc0 100644
--- a/examples/simple-solver/simpleProblemConfig.h
+++ b/src/mesh/tnlMeshEntityIntegrityChecker.h
@@ -1,7 +1,7 @@
 /***************************************************************************
-                          simpleProblemConfig.h  -  description
+                          tnlMeshEntityIntegrityChecker.h  -  description
                              -------------------
-    begin                : Jul 8, 2014
+    begin                : Mar 20, 2014
     copyright            : (C) 2014 by Tomas Oberhuber
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
@@ -15,20 +15,20 @@
  *                                                                         *
  ***************************************************************************/
 
-#ifndef SIMPLEPROBLEMCONFIG_H_
-#define SIMPLEPROBLEMCONFIG_H_
+#ifndef TNLMESHENTITYINTEGRITYCHECKER_H_
+#define TNLMESHENTITYINTEGRITYCHECKER_H_
 
-#include <config/tnlConfigDescription.h>
-
-template< typename ConfigTag >
-class simpleProblemConfig
+template< typename MeshEntity >
+class tnlMeshEntityIntegrityChecker
 {
    public:
-      static void configSetup( tnlConfigDescription& config )
+
+      static bool checkEntity( const MeshEntity& entity )
       {
-         config.addDelimiter( "Simple solver settings:" );
-         config.addEntry        < tnlString > ( "problem-name", "This defines particular problem.", "simpl" );
+         return true;
       }
+
 };
 
-#endif /* SIMPLESOLVERCONFIG_H_ */
+
+#endif /* TNLMESHENTITYINTEGRITYCHECKER_H_ */
diff --git a/examples/make-project/main.cpp b/src/mesh/tnlMeshEntityIntegrityCheckerLayer.h
similarity index 56%
rename from examples/make-project/main.cpp
rename to src/mesh/tnlMeshEntityIntegrityCheckerLayer.h
index 80da9581b78d20815510d1b1c7b6ad4ebf7148a5..0b221403a8cb8150edfe2032bc7a4a88688afde4 100644
--- a/examples/make-project/main.cpp
+++ b/src/mesh/tnlMeshEntityIntegrityCheckerLayer.h
@@ -1,8 +1,8 @@
 /***************************************************************************
-                          main.cpp  -  description
+                          tnlMeshEntityIntegrityCheckerLayer.h  -  description
                              -------------------
-    begin                : Jan 12, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
+    begin                : Mar 23, 2014
+    copyright            : (C) 2014 by Tomas Oberhuber
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
@@ -15,26 +15,10 @@
  *                                                                         *
  ***************************************************************************/
 
-#include "program-name-conf.h"
-#include <config/tnlConfigDescription.h>
-#include <config/tnlParameterContainer.h>
+#ifndef TNLMESHENTITYINTEGRITYCHECKERLAYER_H_
+#define TNLMESHENTITYINTEGRITYCHECKERLAYER_H_
 
-int main( int argc, char* argv[] )
-{
-   tnlParameterContainer parameters;
-   tnlConfigDescription conf_desc;
-   if( conf_desc.parseConfigDescription( CONFIG_FILE ) != 0 )
-      return EXIT_FAILURE;
-   if( ! parseCommandLine( argc, argv, conf_desc, parameters ) )
-   {
-      conf_desc.printUsage( argv[ 0 ] );
-      return EXIT_FAILURE;
-   }
 
-   /****
-    * Write your code here
-    */
-   return EXIT_SUCCESS;
-}
 
 
+#endif /* TNLMESHENTITYINTEGRITYCHECKERLAYER_H_ */
diff --git a/src/mesh/tnlMeshInitializer.h b/src/mesh/tnlMeshInitializer.h
index 0cee8179edb3fa1b92a9d533d66a493368f37796..897aa5901432704f8b71ed7f1917a6009230e01b 100644
--- a/src/mesh/tnlMeshInitializer.h
+++ b/src/mesh/tnlMeshInitializer.h
@@ -46,6 +46,15 @@ class tnlMeshInitializer
 
    public:
 
+   tnlMeshInitializer()
+   : verbose( false )
+   {}
+
+   void setVerbose( bool verbose )
+   {
+      this->verbose = verbose;
+   }
+
    bool initMesh( MeshType& mesh )
    {
       //cout << "======= Starting mesh initiation ========" << endl;
@@ -53,13 +62,17 @@ class tnlMeshInitializer
       if( ! this->checkCells() )
          return false;
       //cout << "========= Creating entities =============" << endl;
-      this->createEntitiesFromCells();
+      this->createEntitiesFromCells( this->verbose );
       this->createEntityInitializers();
       //cout << "====== Initiating entities ==============" << endl;
       this->initEntities( *this );
       //cout << "Mesh initiation done..." << endl;
       return true;
    }
+
+   protected:
+
+   bool verbose;
 };
 
 template< typename ConfigTag >
@@ -95,21 +108,32 @@ class tnlMeshInitializerLayer< ConfigTag,
    bool checkCells()
    {
       typedef typename tnlMeshEntity< ConfigTag, EntityTag >::template SubentitiesTraits< 0 >::LocalIndexType LocalIndexType;
+      const GlobalIndexType numberOfVertices( this->getMesh().getNumberOfVertices() );
       for( GlobalIndexType cell = 0;
            cell < this->getMesh().getNumberOfCells();
            cell++ )
          for( LocalIndexType i = 0;
               i < this->getMesh().getCell( cell ).getNumberOfVertices();
               i++ )
+         {
             if( this->getMesh().getCell( cell ).getVerticesIndices()[ i ] == - 1 )
             {
                cerr << "The cell number " << cell << " does not have properly set vertex index number " << i << "." << endl;
                return false;
             }
+            if( this->getMesh().getCell( cell ).getVerticesIndices()[ i ] >= numberOfVertices )
+            {
+               cerr << "The cell number " << cell << " does not have properly set vertex index number " << i
+                    << ". The index " << this->getMesh().getCell( cell ).getVerticesIndices()[ i ]
+                    << " is higher than the number of all vertices ( " << numberOfVertices
+                    << " )." << endl;
+               return false;
+            }
+         }
       return true;
    }
 
-   void createEntitiesFromCells()
+   void createEntitiesFromCells( bool verbose )
    {
       //cout << " Creating entities with " << DimensionsTraits::value << " dimensions..." << endl;
       cellInitializerContainer.setSize( this->getMesh().getNumberOfCells() );
@@ -117,12 +141,15 @@ class tnlMeshInitializerLayer< ConfigTag,
            cell < this->getMesh().getNumberOfCells();
            cell++ )
       {
-         //cout << "  Creating the cell number " << cell << endl;
+         if( verbose )
+            cout << "  Creating the cell number " << cell << "            \r " << flush;
          CellInitializerType& cellInitializer = cellInitializerContainer[ cell ];
 
          cellInitializer.init( this->getMesh().getCell( cell ), cell );
          BaseType::createEntitiesFromCells( cellInitializer );
       }
+      if( verbose )
+         cout << endl;
 
    }
 
diff --git a/src/mesh/tnlMeshIntegrityChecker.h b/src/mesh/tnlMeshIntegrityChecker.h
new file mode 100644
index 0000000000000000000000000000000000000000..86f0de4dfb5f4507afc9da6f5631d67857685b43
--- /dev/null
+++ b/src/mesh/tnlMeshIntegrityChecker.h
@@ -0,0 +1,42 @@
+/***************************************************************************
+                          tnlMeshIntegrityChecker.h  -  description
+                             -------------------
+    begin                : Mar 20, 2014
+    copyright            : (C) 2014 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#ifndef TNLMESHINTEGRITYCHECKER_H_
+#define TNLMESHINTEGRITYCHECKER_H_
+
+#include <mesh/tnlMesh.h>
+#include <mesh/tnlMeshIntegrityCheckerLayer.h>
+
+template< typename MeshType >
+class tnlMeshIntegrityChecker
+: public tnlMeshIntegrityCheckerLayer< MeshType,
+                                       tnlDimensionsTraits< MeshType::Config::CellTag::dimensions > >
+{
+      typedef tnlDimensionsTraits< MeshType::Config::CellTag::dimensions > DimensionsTraits;
+      typedef tnlMeshIntegrityCheckerLayer< MeshType, DimensionsTraits > BaseType;
+
+   public:
+      static bool checkMesh( const MeshType& mesh )
+      {
+         if( ! BaseType::checkEntities( mesh ) )
+            return false;
+         return true;
+      }
+};
+
+
+#endif /* TNLMESHINTEGRITYCHECKER_H_ */
diff --git a/src/mesh/tnlMeshIntegrityCheckerLayer.h b/src/mesh/tnlMeshIntegrityCheckerLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..a67a8442300973408e484f2ddd9e727827ae8bad
--- /dev/null
+++ b/src/mesh/tnlMeshIntegrityCheckerLayer.h
@@ -0,0 +1,107 @@
+/***************************************************************************
+                          tnlMeshIntegrityCheckerLayer.h  -  description
+                             -------------------
+    begin                : Mar 21, 2014
+    copyright            : (C) 2014 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#ifndef TNLMESHINTEGRITYCHECKERLAYER_H_
+#define TNLMESHINTEGRITYCHECKERLAYER_H_
+
+#include <mesh/traits/tnlMeshEntitiesTraits.h>
+#include <mesh/traits/tnlDimensionsTraits.h>
+#include <mesh/traits/tnlStorageTraits.h>
+
+template< typename MeshType,
+          typename DimensionsTraits,
+          typename EntityStorageTag = typename tnlMeshEntitiesTraits< typename MeshType::Config,
+                                                                      DimensionsTraits >::EntityStorageTag >
+class tnlMeshIntegrityCheckerLayer;
+
+template< typename MeshType,
+          typename DimensionsTraits >
+class tnlMeshIntegrityCheckerLayer< MeshType,
+                                    DimensionsTraits,
+                                    tnlStorageTraits< true > >
+   : public tnlMeshIntegrityCheckerLayer< MeshType,
+                                          typename DimensionsTraits::Previous >
+{
+   public:
+      typedef tnlMeshIntegrityCheckerLayer< MeshType, 
+                                            typename DimensionsTraits::Previous >     BaseType;
+      enum { dimensions = DimensionsTraits::value };
+
+      static bool checkEntities( const MeshType& mesh )
+      {         
+         typedef typename MeshType::template EntitiesTraits< dimensions >::ContainerType ContainerType;
+         typedef typename ContainerType::IndexType                                       GlobalIndexType;
+         cout << "Checking entities with " << dimensions << " dimensions ..." << endl;
+         for( GlobalIndexType entityIdx = 0;
+              entityIdx < mesh.template getNumberOfEntities< dimensions >();
+              entityIdx++ )
+         {
+            cout << "Entity no. " << entityIdx << "               \r" << flush;
+         }
+         cout << endl;
+         if( ! BaseType::checkEntities( mesh ) )
+            return false;
+         return true;
+      }
+};
+
+template< typename MeshType >
+class tnlMeshIntegrityCheckerLayer< MeshType,
+                                    tnlDimensionsTraits< 0 >,
+                                    tnlStorageTraits< true > >
+{
+   public:
+      enum { dimensions = 0 };
+
+      static bool checkEntities( const MeshType& mesh )
+      {
+         typedef typename MeshType::template EntitiesTraits< dimensions >::ContainerType ContainerType;
+         typedef typename ContainerType::IndexType                                       GlobalIndexType;
+         cout << "Checking entities with " << dimensions << " dimensions ..." << endl;
+         for( GlobalIndexType entityIdx = 0;
+              entityIdx < mesh.template getNumberOfEntities< dimensions >();
+              entityIdx++ )
+         {
+            cout << "Entity no. " << entityIdx << "          \r" << flush;
+         }
+         cout << endl;
+         return true;
+      }
+
+};
+
+template< typename MeshType,
+          typename DimensionsTraits >
+class tnlMeshIntegrityCheckerLayer< MeshType,
+                                    DimensionsTraits,
+                                    tnlStorageTraits< false > >
+   : public tnlMeshIntegrityCheckerLayer< MeshType,
+                                          typename DimensionsTraits::Previous >
+{
+
+};
+
+template< typename MeshType >
+class tnlMeshIntegrityCheckerLayer< MeshType,
+                                    tnlDimensionsTraits< 0 >,
+                                    tnlStorageTraits< false > >
+{
+
+};
+
+
+#endif /* TNLMESHINTEGRITYCHECKERLAYER_H_ */
diff --git a/src/mesh/tnlMeshReaderNetgen.h b/src/mesh/tnlMeshReaderNetgen.h
index 10d1e4d3e6c4bf9dffbb2e36801f12c1da8b8760..e694b10269bf30046d1fbe2f3f06acdedda3a799 100644
--- a/src/mesh/tnlMeshReaderNetgen.h
+++ b/src/mesh/tnlMeshReaderNetgen.h
@@ -135,11 +135,15 @@ class tnlMeshReaderNetgen
        */
        typedef typename MeshType::template EntitiesTraits< dimensions >::GlobalIndexType CellIndexType;
        if( ! inputFile )
+       {
+          cerr << "I cannot read the mesh cells." << endl;
           return false;
+       }
        getline( inputFile, line );
+       iss.clear();
        iss.str( line );
-       CellIndexType numberOfCells;
-       iss >> numberOfCells;
+       CellIndexType numberOfCells=atoi( line.data() );
+       //iss >> numberOfCells; // TODO: I do not know why this does not work
        if( ! mesh.template setNumberOfEntities< dimensions >( numberOfCells ) )
        {
           cerr << "I am not able to allocate enough memory for " << numberOfCells << " cells." << endl;
@@ -156,9 +160,8 @@ class tnlMeshReaderNetgen
           {
              VertexIndexType vertexIdx;
              iss >> vertexIdx;
-             mesh.template getEntity< dimensions >( i ).setVertexIndex( cellVertex, vertexIdx );
+             mesh.template getEntity< dimensions >( i ).setVertexIndex( cellVertex, vertexIdx - 1 );
           }
-          cout << endl;
           if( verbose )
              cout << numberOfCells << " cells expected ... " << i+1 << "/" << numberOfCells << "                 \r" << flush;
        }
diff --git a/src/mesh/tnlMeshWriterVTKLegacy.h b/src/mesh/tnlMeshWriterVTKLegacy.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab5d84c7a0e9dae2e398c1a590e1949c77a89aaf
--- /dev/null
+++ b/src/mesh/tnlMeshWriterVTKLegacy.h
@@ -0,0 +1,80 @@
+/***************************************************************************
+                          tnlMeshWriterVTKLegacy.h  -  description
+                             -------------------
+    begin                : Mar 20, 2014
+    copyright            : (C) 2014 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+
+#ifndef TNLMESHWRITERVTKLEGACY_H_
+#define TNLMESHWRITERVTKLEGACY_H_
+
+#include <fstream>
+#include <istream>
+#include <sstream>
+#include <iomanip>
+
+using namespace std;
+
+class tnlMeshWriterVTKLegacy
+{
+   public:
+
+   template< typename MeshType >
+   static bool write( const tnlString& fileName,
+                          MeshType& mesh,
+                          bool verbose )
+   {
+      if( MeshType::dimensions > 3 )
+      {
+         cerr << "You try to write mesh with " << MeshType::dimensions
+              << " dimensions but VTK legacy format supports only 1D, 2D and 3D meshes." << endl;
+         return false;
+      }
+      fstream outputFile;
+      outputFile.open( fileName.getString(), ios::out );
+      if( ! outputFile )
+      {
+         cerr << "I am not able to open the output file " << fileName << "." << endl;
+         return false;
+      }
+      outputFile << setprecision( 6 );
+      outputFile << fixed;
+
+      if( ! writeMesh( outputFile, mesh, verbose ) )
+         return false;
+   }
+
+   template< typename MeshType >
+   static bool writeMesh( ostream& file,
+                          MeshType& mesh,
+                          bool verbose )
+   {
+      file << "# vtk DataFile Version 2.0" << endl;
+      file << "TNL Mesh" << endl;
+      file << "ASCII" << endl;
+      file << "DATASET UNSTRUCTURED_GRID" << endl;
+      file << endl;
+      file << "POINTS " << mesh.template getNumberOfEntities< 0 >() << " float" << endl;
+      for( int i = 0; i < mesh.template getNumberOfEntities< 0 >(); i++ )
+      {
+         file << mesh.template getEntity< 0 >( i ).getPoint();
+      }
+   }
+
+
+};
+
+
+
+#endif /* TNLMESHWRITERVTKLEGACY_H_ */
diff --git a/src/mesh/tnlTraverser_Grid2D_impl.h b/src/mesh/tnlTraverser_Grid2D_impl.h
index 60105b57ee83fd213e4f93f7a31426b78280b6ab..4d8efa2df82898134c560b3c9458f6d71b313dfe 100644
--- a/src/mesh/tnlTraverser_Grid2D_impl.h
+++ b/src/mesh/tnlTraverser_Grid2D_impl.h
@@ -97,16 +97,20 @@ processBoundaryEntities( const GridType& grid,
    for( coordinates.x() = 0; coordinates.x() < xSize; coordinates.x() ++ )
    {
       coordinates.y() = 0;
-      EntitiesProcessor::processFace< 0, 1 >( grid, userData, grid.getFaceIndex< 0, 1 >( coordinates ), coordinates );
+      EntitiesProcessor::processFace( grid, userData, grid.template getFaceIndex< 0, 1 >( coordinates ), coordinates );
+      //cout << "Boundary face coordinates = " << coordinates << " index = " << grid.template getFaceIndex< 0, 1 >( coordinates ) << endl;
       coordinates.y() = ySize;
-      EntitiesProcessor::processFace< 0, 1 >( grid, userData, grid.getFaceIndex< 0, 1 >( coordinates ), coordinates );
+      EntitiesProcessor::processFace( grid, userData, grid.template getFaceIndex< 0, 1 >( coordinates ), coordinates );
+      //cout << "Boundary face coordinates = " << coordinates << " index = " << grid.template getFaceIndex< 0, 1 >( coordinates ) << endl;
    }
    for( coordinates.y() = 0; coordinates.y() < ySize; coordinates.y() ++ )
    {
       coordinates.x() = 0;
-      EntitiesProcessor::processFace< 1, 0 >( grid, userData, grid.getFaceIndex< 1, 0 >( coordinates ), coordinates );
-      coordinates.x() = ySize;
-      EntitiesProcessor::processFace< 1, 0 >( grid, userData, grid.getFaceIndex< 1, 0 >( coordinates ), coordinates );
+      EntitiesProcessor::processFace( grid, userData, grid.template getFaceIndex< 1, 0 >( coordinates ), coordinates );
+      //cout << "Boundary face coordinates = " << coordinates << " index = " << grid.template getFaceIndex< 1, 0 >( coordinates ) << endl;
+      coordinates.x() = xSize;
+      EntitiesProcessor::processFace( grid, userData, grid.template getFaceIndex< 1, 0 >( coordinates ), coordinates );
+      //cout << "Boundary face coordinates = " << coordinates << " index = " << grid.template getFaceIndex< 1, 0 >( coordinates ) << endl;
    }
 
 }
@@ -130,17 +134,23 @@ processInteriorEntities( const GridType& grid,
 #ifdef HAVE_OPENMP
 //#pragma omp parallel for
 #endif
-   for( coordinates.y() = 1; coordinates.y() < ySize - 1; coordinates.y() ++ )
+
+   //cout << "< 1, 0 >" << endl;
+   for( coordinates.y() = 0; coordinates.y() < ySize; coordinates.y() ++ )
       for( coordinates.x() = 1; coordinates.x() < xSize; coordinates.x() ++ )
       {
          const IndexType index = grid.template getFaceIndex< 1, 0 >( coordinates );
-         EntitiesProcessor::processFace< 1, 0 >( grid, userData, index, coordinates );
+         EntitiesProcessor::processFace( grid, userData, index, coordinates );
+         //cout << "Interior face coordinates = " << coordinates << " index = " << grid.template getFaceIndex< 1, 0 >( coordinates ) << endl;
       }
+
+   //cout << "<  0, 1 >" << endl;
    for( coordinates.y() = 1; coordinates.y() < ySize; coordinates.y() ++ )
-      for( coordinates.x() = 1; coordinates.x() < xSize - 1; coordinates.x() ++ )
+      for( coordinates.x() = 0; coordinates.x() < xSize; coordinates.x() ++ )
       {
          const IndexType index = grid.template getFaceIndex< 0, 1 >( coordinates );
-         EntitiesProcessor::processFace< 0, 1 >( grid, userData, index, coordinates );
+         EntitiesProcessor::processFace( grid, userData, index, coordinates );
+         //cout << "Interior face coordinates = " << coordinates << " index = " << grid.template getFaceIndex< 0, 1 >( coordinates ) << endl;
       }
 }
 
@@ -157,6 +167,25 @@ processBoundaryEntities( const GridType& grid,
    /****
     * Traversing boundary vertices
     */
+   CoordinatesType coordinates;
+   const IndexType& xSize = grid.getDimensions().x();
+   const IndexType& ySize = grid.getDimensions().y();
+
+   for( coordinates.x() = 0; coordinates.x() <= xSize; coordinates.x() ++ )
+   {
+      coordinates.y() = 0;
+      EntitiesProcessor::processVertex( grid, userData, grid.getVertexIndex( coordinates ), coordinates );
+      coordinates.y() = ySize;
+      EntitiesProcessor::processVertex( grid, userData, grid.getVertexIndex( coordinates ), coordinates );
+   }
+   for( coordinates.y() = 1; coordinates.y() <= ySize; coordinates.y() ++ )
+   {
+      coordinates.x() = 0;
+      EntitiesProcessor::processVertex( grid, userData, grid.getVertexIndex( coordinates ), coordinates );
+      coordinates.x() = xSize;
+      EntitiesProcessor::processVertex( grid, userData, grid.getVertexIndex( coordinates ), coordinates );
+   }
+   
 }
 
 template< typename Real,
@@ -171,8 +200,20 @@ processInteriorEntities( const GridType& grid,
    /****
     * Traversing interior vertices
     */
-}
+   CoordinatesType coordinates;
+   const IndexType& xSize = grid.getDimensions().x();
+   const IndexType& ySize = grid.getDimensions().y();
 
+#ifdef HAVE_OPENMP
+//#pragma omp parallel for
+#endif
+   for( coordinates.y() = 1; coordinates.y() < ySize; coordinates.y() ++ )
+      for( coordinates.x() = 1; coordinates.x() < xSize; coordinates.x() ++ )
+      {
+         const IndexType index = grid.getVertexIndex( coordinates );
+         EntitiesProcessor::processVertex( grid, userData, index, coordinates );
+      }  
+}
 
 /***
  *
@@ -249,6 +290,145 @@ __global__ void tnlTraverserGrid2DInteriorCells( const tnlGrid< 2, Real, tnlCuda
    }
 }
 
+template< typename Real,
+          typename Index,
+          typename UserData,
+          typename EntitiesProcessor,
+          int nx,
+          int ny >
+__global__ void tnlTraverserGrid2DBoundaryFaces( const tnlGrid< 2, Real, tnlCuda, Index >* grid,
+                                                 UserData* userData,
+                                                 const Index gridXIdx,
+                                                 const Index gridYIdx )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef tnlGrid< 2, Real, tnlCuda, Index > GridType;
+   typedef typename GridType::CoordinatesType CoordinatesType;
+
+   const IndexType& xSize = grid->getDimensions().x();
+   const IndexType& ySize = grid->getDimensions().y();
+
+   CoordinatesType faceCoordinates( ( gridXIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x,
+                                    ( gridYIdx * tnlCuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y );
+
+   if( faceCoordinates.x() < grid->getDimensions().x() + nx &&
+       faceCoordinates.y() < grid->getDimensions().y() + ny )
+   {
+      if( grid->template isBoundaryFace< nx, ny >( faceCoordinates ) )
+      {
+         //printf( "Processing boundary conditions at %d %d \n", cellCoordinates.x(), cellCoordinates.y() );
+         EntitiesProcessor::processFace( *grid,
+                                         *userData,
+                                         grid->template getFaceIndex< nx, ny >( faceCoordinates ),
+                                         faceCoordinates );
+      }
+   }
+}
+
+template< typename Real,
+          typename Index,
+          typename UserData,
+          typename EntitiesProcessor,
+          int nx,
+          int ny >
+__global__ void tnlTraverserGrid2DInteriorFaces( const tnlGrid< 2, Real, tnlCuda, Index >* grid,
+                                                 UserData* userData,
+                                                 const Index gridXIdx,
+                                                 const Index gridYIdx )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef tnlGrid< 2, Real, tnlCuda, Index > GridType;
+   typedef typename GridType::CoordinatesType CoordinatesType;
+
+   const IndexType& xSize = grid->getDimensions().x();
+   const IndexType& ySize = grid->getDimensions().y();
+
+   CoordinatesType faceCoordinates( ( gridXIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x,
+                                    ( gridYIdx * tnlCuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y );
+
+   if( faceCoordinates.x() < grid->getDimensions().x() + nx &&
+       faceCoordinates.y() < grid->getDimensions().y() + ny )
+   {
+      if( ! grid->template isBoundaryFace< nx, ny >( faceCoordinates ) )
+      {
+         //printf( "Processing interior conditions at %d %d \n", cellCoordinates.x(), cellCoordinates.y() );
+         EntitiesProcessor::processFace( *grid,
+                                         *userData,
+                                         grid->template getFaceIndex< nx, ny >( faceCoordinates ),
+                                         faceCoordinates );
+      }
+   }
+}
+
+template< typename Real,
+          typename Index,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void tnlTraverserGrid2DBoundaryVertices( const tnlGrid< 2, Real, tnlCuda, Index >* grid,
+                                                    UserData* userData,
+                                                    const Index gridXIdx,
+                                                    const Index gridYIdx )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef tnlGrid< 2, Real, tnlCuda, Index > GridType;
+   typedef typename GridType::CoordinatesType CoordinatesType;
+
+   const IndexType& xSize = grid->getDimensions().x();
+   const IndexType& ySize = grid->getDimensions().y();
+
+   CoordinatesType vertexCoordinates( ( gridXIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x,
+                                      ( gridYIdx * tnlCuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y );
+
+   if( vertexCoordinates.x() <= grid->getDimensions().x() &&
+       vertexCoordinates.y() <= grid->getDimensions().y() )
+   {
+      if( grid->isBoundaryVertex( vertexCoordinates ) )
+      {
+         EntitiesProcessor::processVertex( *grid,
+                                           *userData,
+                                           grid->getVertexIndex( vertexCoordinates ),
+                                           vertexCoordinates );
+      }
+   }
+}
+
+template< typename Real,
+          typename Index,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void tnlTraverserGrid2DInteriorVertices( const tnlGrid< 2, Real, tnlCuda, Index >* grid,
+                                                    UserData* userData,
+                                                    const Index gridXIdx,
+                                                    const Index gridYIdx )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef tnlGrid< 2, Real, tnlCuda, Index > GridType;
+   typedef typename GridType::CoordinatesType CoordinatesType;
+
+   const IndexType& xSize = grid->getDimensions().x();
+   const IndexType& ySize = grid->getDimensions().y();
+
+   CoordinatesType vertexCoordinates( ( gridXIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x,
+                                      ( gridYIdx * tnlCuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y );
+
+   if( vertexCoordinates.x() <= grid->getDimensions().x() &&
+       vertexCoordinates.y() <= grid->getDimensions().y() )
+   {
+      if( ! grid->isBoundaryVertex( vertexCoordinates ) )
+      {
+         EntitiesProcessor::processVertex( *grid,
+                                           *userData,
+                                           grid->getVertexIndex( vertexCoordinates ),
+                                           vertexCoordinates );
+      }
+   }
+}
+
+
 #endif
 
 template< typename Real,
@@ -284,9 +464,9 @@ processBoundaryEntities( const GridType& grid,
                                          kernelUserData,
                                          gridXIdx,
                                          gridYIdx );
+         checkCudaDevice;
       }
-   cudaThreadSynchronize();
-   checkCudaDevice;
+   cudaThreadSynchronize();   
 #endif
 }
 
@@ -322,8 +502,8 @@ processInteriorEntities( const GridType& grid,
                                          kernelUserData,
                                          gridXIdx,
                                          gridYIdx );
+         checkCudaDevice;
       }
-   checkCudaDevice;
    tnlCuda::freeFromDevice( kernelGrid );
    tnlCuda::freeFromDevice( kernelUserData );
 #endif
@@ -338,9 +518,61 @@ tnlTraverser< tnlGrid< 2, Real, tnlCuda, Index >, 1 >::
 processBoundaryEntities( const GridType& grid,
                          UserData& userData ) const
 {
+#ifdef HAVE_CUDA
+
    /****
-    * Traversing boundary faces
+    * Boundary conditions
     */
+   GridType* kernelGrid = tnlCuda::passToDevice( grid );
+   UserData* kernelUserData = tnlCuda::passToDevice( userData );
+
+   dim3 cudaBlockSize( 16, 16 );
+   dim3 cudaBlocks;
+   IndexType cudaXGrids, cudaYGrids;
+
+   /****
+    * < 1, 0 > faces
+    */
+   cudaBlocks.x = tnlCuda::getNumberOfBlocks( grid.getDimensions().x() + 1, cudaBlockSize.x );
+   cudaBlocks.y = tnlCuda::getNumberOfBlocks( grid.getDimensions().y(), cudaBlockSize.y );
+   cudaXGrids = tnlCuda::getNumberOfGrids( cudaBlocks.x );
+   cudaYGrids = tnlCuda::getNumberOfGrids( cudaBlocks.y );
+   for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
+      for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ )
+      {
+         tnlTraverserGrid2DBoundaryFaces< Real, Index, UserData, EntitiesProcessor, 1, 0 >
+                                        <<< cudaBlocks, cudaBlockSize >>>
+                                       ( kernelGrid,
+                                         kernelUserData,
+                                         gridXIdx,
+                                         gridYIdx );
+         checkCudaDevice;
+      }
+   cudaThreadSynchronize();
+   
+
+   /****
+    * < 0, 1 > faces
+    */
+   cudaBlocks.x = tnlCuda::getNumberOfBlocks( grid.getDimensions().x(), cudaBlockSize.x );
+   cudaBlocks.y = tnlCuda::getNumberOfBlocks( grid.getDimensions().y() + 1, cudaBlockSize.y );
+   cudaXGrids = tnlCuda::getNumberOfGrids( cudaBlocks.x );
+   cudaYGrids = tnlCuda::getNumberOfGrids( cudaBlocks.y );
+   for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
+      for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ )
+      {
+         tnlTraverserGrid2DBoundaryFaces< Real, Index, UserData, EntitiesProcessor, 0, 1 >
+                                        <<< cudaBlocks, cudaBlockSize >>>
+                                       ( kernelGrid,
+                                         kernelUserData,
+                                         gridXIdx,
+                                         gridYIdx );
+         checkCudaDevice;
+      }
+   cudaThreadSynchronize();
+   
+#endif
+
 }
 
 template< typename Real,
@@ -355,6 +587,59 @@ processInteriorEntities( const GridType& grid,
    /****
     * Traversing interior faces
     */
+#ifdef HAVE_CUDA
+
+   /****
+    * Traversing interior faces
+    */
+   GridType* kernelGrid = tnlCuda::passToDevice( grid );
+   UserData* kernelUserData = tnlCuda::passToDevice( userData );
+
+   dim3 cudaBlockSize( 16, 16 );
+   dim3 cudaBlocks;
+   IndexType cudaXGrids, cudaYGrids;
+
+   /****
+    * < 1, 0 > faces
+    */
+   cudaBlocks.x = tnlCuda::getNumberOfBlocks( grid.getDimensions().x() + 1, cudaBlockSize.x );
+   cudaBlocks.y = tnlCuda::getNumberOfBlocks( grid.getDimensions().y(), cudaBlockSize.y );
+   cudaXGrids = tnlCuda::getNumberOfGrids( cudaBlocks.x );
+   cudaYGrids = tnlCuda::getNumberOfGrids( cudaBlocks.y );
+   for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
+      for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ )
+      {
+         tnlTraverserGrid2DInteriorFaces< Real, Index, UserData, EntitiesProcessor, 1, 0 >
+                                        <<< cudaBlocks, cudaBlockSize >>>
+                                       ( kernelGrid,
+                                         kernelUserData,
+                                         gridXIdx,
+                                         gridYIdx );
+         checkCudaDevice;
+      }
+   cudaThreadSynchronize();
+   
+
+   /****
+    * < 0, 1 > faces
+    */
+   cudaBlocks.x = tnlCuda::getNumberOfBlocks( grid.getDimensions().x(), cudaBlockSize.x );
+   cudaBlocks.y = tnlCuda::getNumberOfBlocks( grid.getDimensions().y() + 1, cudaBlockSize.y );
+   cudaXGrids = tnlCuda::getNumberOfGrids( cudaBlocks.x );
+   cudaYGrids = tnlCuda::getNumberOfGrids( cudaBlocks.y );
+   for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
+      for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ )
+      {
+         tnlTraverserGrid2DInteriorFaces< Real, Index, UserData, EntitiesProcessor, 0, 1 >
+                                        <<< cudaBlocks, cudaBlockSize >>>
+                                       ( kernelGrid,
+                                         kernelUserData,
+                                         gridXIdx,
+                                         gridYIdx );
+         checkCudaDevice;
+      }
+   cudaThreadSynchronize();
+#endif
 }
 
 template< typename Real,
@@ -366,9 +651,33 @@ tnlTraverser< tnlGrid< 2, Real, tnlCuda, Index >, 0 >::
 processBoundaryEntities( const GridType& grid,
                          UserData& userData ) const
 {
+#ifdef HAVE_CUDA
    /****
-    * Boundary interior vertices
+    * Traversing boundary vertices    
     */
+   GridType* kernelGrid = tnlCuda::passToDevice( grid );
+   UserData* kernelUserData = tnlCuda::passToDevice( userData );
+
+   dim3 cudaBlockSize( 16, 16 );
+   dim3 cudaBlocks;
+   cudaBlocks.x = tnlCuda::getNumberOfBlocks( grid.getDimensions().x() + 1, cudaBlockSize.x );
+   cudaBlocks.y = tnlCuda::getNumberOfBlocks( grid.getDimensions().y() + 1, cudaBlockSize.y );
+   const IndexType cudaXGrids = tnlCuda::getNumberOfGrids( cudaBlocks.x );
+   const IndexType cudaYGrids = tnlCuda::getNumberOfGrids( cudaBlocks.y );
+
+   for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
+      for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ )
+      {
+         tnlTraverserGrid2DBoundaryVertices< Real, Index, UserData, EntitiesProcessor >
+                                           <<< cudaBlocks, cudaBlockSize >>>
+                                          ( kernelGrid,
+                                            kernelUserData,
+                                            gridXIdx,
+                                            gridYIdx );
+         checkCudaDevice;
+      }
+   cudaThreadSynchronize();   
+#endif
 }
 
 
@@ -381,9 +690,33 @@ tnlTraverser< tnlGrid< 2, Real, tnlCuda, Index >, 0 >::
 processInteriorEntities( const GridType& grid,
                          UserData& userData ) const
 {
+#ifdef HAVE_CUDA
    /****
-    * Traversing interior vertices
+    * Traversing interior vertices    
     */
+   GridType* kernelGrid = tnlCuda::passToDevice( grid );
+   UserData* kernelUserData = tnlCuda::passToDevice( userData );
+
+   dim3 cudaBlockSize( 16, 16 );
+   dim3 cudaBlocks;
+   cudaBlocks.x = tnlCuda::getNumberOfBlocks( grid.getDimensions().x() + 1, cudaBlockSize.x );
+   cudaBlocks.y = tnlCuda::getNumberOfBlocks( grid.getDimensions().y() + 1, cudaBlockSize.y );
+   const IndexType cudaXGrids = tnlCuda::getNumberOfGrids( cudaBlocks.x );
+   const IndexType cudaYGrids = tnlCuda::getNumberOfGrids( cudaBlocks.y );
+
+   for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
+      for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ )
+      {
+         tnlTraverserGrid2DInteriorVertices< Real, Index, UserData, EntitiesProcessor >
+                                           <<< cudaBlocks, cudaBlockSize >>>
+                                          ( kernelGrid,
+                                            kernelUserData,
+                                            gridXIdx,
+                                            gridYIdx );
+         checkCudaDevice;
+      }
+   cudaThreadSynchronize();   
+#endif
 }
 
 
diff --git a/src/mesh/tnlTraverser_Grid3D_impl.h b/src/mesh/tnlTraverser_Grid3D_impl.h
index 9cd074ca5fbad055e2e0f5c6420df935b767835b..739e12cd5e58f3e459911f118af61fb421e76cca 100644
--- a/src/mesh/tnlTraverser_Grid3D_impl.h
+++ b/src/mesh/tnlTraverser_Grid3D_impl.h
@@ -161,6 +161,37 @@ processBoundaryEntities( const GridType& grid,
    /****
     * Traversing boundary vertices
     */
+   CoordinatesType coordinates;
+   const IndexType& xSize = grid.getDimensions().x();
+   const IndexType& ySize = grid.getDimensions().y();
+   const IndexType& zSize = grid.getDimensions().z();
+
+   for( coordinates.y() = 0; coordinates.y() <= ySize; coordinates.y() ++ )
+      for( coordinates.x() = 0; coordinates.x() <= xSize; coordinates.x() ++ )
+      {
+         coordinates.z() = 0;
+         EntitiesProcessor::processVertex( grid, userData, grid.getVertexIndex( coordinates ), coordinates );
+         coordinates.z() = zSize;
+         EntitiesProcessor::processVertex( grid, userData, grid.getVertexIndex( coordinates ), coordinates );
+      }
+
+   for( coordinates.z() = 1; coordinates.z() < zSize; coordinates.z() ++ ) // interior z only: z == 0 and z == zSize planes were processed above
+      for( coordinates.x() = 0; coordinates.x() <= xSize; coordinates.x() ++ )
+      {
+         coordinates.y() = 0;
+         EntitiesProcessor::processVertex( grid, userData, grid.getVertexIndex( coordinates ), coordinates );
+         coordinates.y() = ySize;
+         EntitiesProcessor::processVertex( grid, userData, grid.getVertexIndex( coordinates ), coordinates );
+      }
+
+   for( coordinates.z() = 1; coordinates.z() < zSize; coordinates.z() ++ ) // interior z and y only: the remaining
+      for( coordinates.y() = 1; coordinates.y() < ySize; coordinates.y() ++ ) // edges/corners were covered by the two nests above
+      {
+         coordinates.x() = 0;
+         EntitiesProcessor::processVertex( grid, userData, grid.getVertexIndex( coordinates ), coordinates );
+         coordinates.x() = xSize;
+         EntitiesProcessor::processVertex( grid, userData, grid.getVertexIndex( coordinates ), coordinates );
+      }
 }
 
 template< typename Real,
@@ -175,6 +206,21 @@ processInteriorEntities( const GridType& grid,
    /****
     * Traversing interior vertices
     */
+   CoordinatesType coordinates;
+   const IndexType& xSize = grid.getDimensions().x();
+   const IndexType& ySize = grid.getDimensions().y();
+   const IndexType& zSize = grid.getDimensions().z();
+
+#ifdef HAVE_OPENMP
+//#pragma omp parallel for
+#endif
+   for( coordinates.z() = 1; coordinates.z() < zSize; coordinates.z() ++ )
+      for( coordinates.y() = 1; coordinates.y() < ySize; coordinates.y() ++ )
+         for( coordinates.x() = 1; coordinates.x() < xSize; coordinates.x() ++ )
+         {
+            const IndexType index = grid.getVertexIndex( coordinates );
+            EntitiesProcessor::processVertex( grid, userData, index, coordinates );
+         }
 }
 
 
@@ -260,6 +306,81 @@ __global__ void tnlTraverserGrid3DInteriorCells( const tnlGrid< 3, Real, tnlCuda
    }
 }
 
+template< typename Real,
+          typename Index,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void tnlTraverserGrid3DBoundaryVertices( const tnlGrid< 3, Real, tnlCuda, Index >* grid,
+                                                    UserData* userData,
+                                                    const Index gridXIdx,
+                                                    const Index gridYIdx,
+                                                    const Index gridZIdx )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef tnlGrid< 3, Real, tnlCuda, Index > GridType;
+   typedef typename GridType::CoordinatesType CoordinatesType;
+
+   const IndexType& xSize = grid->getDimensions().x();
+   const IndexType& ySize = grid->getDimensions().y();
+   const IndexType& zSize = grid->getDimensions().z();
+
+   CoordinatesType vertexCoordinates( ( gridXIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x,
+                                      ( gridYIdx * tnlCuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y,
+                                      ( gridZIdx * tnlCuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z );
+
+   if( vertexCoordinates.x() <= grid->getDimensions().x() && // <=: there are size + 1 vertices per axis
+       vertexCoordinates.y() <= grid->getDimensions().y() && // (the launcher sizes the CUDA grid for size + 1)
+       vertexCoordinates.z() <= grid->getDimensions().z() )
+   {
+      if( grid->isBoundaryVertex( vertexCoordinates ) )
+      {
+         EntitiesProcessor::processVertex( *grid,
+                                           *userData,
+                                           grid->getVertexIndex( vertexCoordinates ),
+                                           vertexCoordinates );
+      }
+   }
+}
+
+template< typename Real,
+          typename Index,
+          typename UserData,
+          typename EntitiesProcessor >
+__global__ void tnlTraverserGrid3DInteriorVertices( const tnlGrid< 3, Real, tnlCuda, Index >* grid,
+                                                    UserData* userData,
+                                                    const Index gridXIdx,
+                                                    const Index gridYIdx,
+                                                    const Index gridZIdx )
+{
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef tnlGrid< 3, Real, tnlCuda, Index > GridType;
+   typedef typename GridType::CoordinatesType CoordinatesType;
+
+   const IndexType& xSize = grid->getDimensions().x();
+   const IndexType& ySize = grid->getDimensions().y();
+   const IndexType& zSize = grid->getDimensions().z();
+
+   CoordinatesType vertexCoordinates( ( gridXIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x,
+                                      ( gridYIdx * tnlCuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y,
+                                      ( gridZIdx * tnlCuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z );
+
+   if( vertexCoordinates.x() <= grid->getDimensions().x() && // <=: there are size + 1 vertices per axis,
+       vertexCoordinates.y() <= grid->getDimensions().y() && // consistent with the boundary-vertex kernel
+       vertexCoordinates.z() <= grid->getDimensions().z() )
+   {
+      if( ! grid->isBoundaryVertex( vertexCoordinates ) )
+      {
+         EntitiesProcessor::processVertex( *grid,
+                                           *userData,
+                                           grid->getVertexIndex( vertexCoordinates ),
+                                           vertexCoordinates );
+      }
+   }
+}
+
+
 #endif
 
 template< typename Real,
@@ -419,6 +540,35 @@ processBoundaryEntities( const GridType& grid,
    /****
     * Traversing boundary vertices
     */
+#ifdef HAVE_CUDA
+   GridType* kernelGrid = tnlCuda::passToDevice( grid );
+   UserData* kernelUserData = tnlCuda::passToDevice( userData );
+
+   dim3 cudaBlockSize( 8, 8, 4 );
+   dim3 cudaBlocks;
+   cudaBlocks.x = tnlCuda::getNumberOfBlocks( grid.getDimensions().x() + 1, cudaBlockSize.x );
+   cudaBlocks.y = tnlCuda::getNumberOfBlocks( grid.getDimensions().y() + 1, cudaBlockSize.y );
+   cudaBlocks.z = tnlCuda::getNumberOfBlocks( grid.getDimensions().z() + 1, cudaBlockSize.z );
+   const IndexType cudaXGrids = tnlCuda::getNumberOfGrids( cudaBlocks.x );
+   const IndexType cudaYGrids = tnlCuda::getNumberOfGrids( cudaBlocks.y );
+   const IndexType cudaZGrids = tnlCuda::getNumberOfGrids( cudaBlocks.z );
+
+   for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
+      for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ )
+         for( IndexType gridZIdx = 0; gridZIdx < cudaZGrids; gridZIdx ++ )
+         {
+            tnlTraverserGrid3DBoundaryVertices< Real, Index, UserData, EntitiesProcessor >
+                                              <<< cudaBlocks, cudaBlockSize >>>
+                                             ( kernelGrid,
+                                               kernelUserData,
+                                               gridXIdx,
+                                               gridYIdx,
+                                               gridZIdx );
+         }
+   cudaThreadSynchronize();
+   checkCudaDevice;
+#endif
+   
 }
 
 template< typename Real,
@@ -433,6 +583,35 @@ processInteriorEntities( const GridType& grid,
    /****
     * Traversing interior vertices
     */
+#ifdef HAVE_CUDA
+   GridType* kernelGrid = tnlCuda::passToDevice( grid );
+   UserData* kernelUserData = tnlCuda::passToDevice( userData );
+
+   dim3 cudaBlockSize( 8, 8, 4 );
+   dim3 cudaBlocks;
+   cudaBlocks.x = tnlCuda::getNumberOfBlocks( grid.getDimensions().x() + 1, cudaBlockSize.x );
+   cudaBlocks.y = tnlCuda::getNumberOfBlocks( grid.getDimensions().y() + 1, cudaBlockSize.y );
+   cudaBlocks.z = tnlCuda::getNumberOfBlocks( grid.getDimensions().z() + 1, cudaBlockSize.z );
+   const IndexType cudaXGrids = tnlCuda::getNumberOfGrids( cudaBlocks.x );
+   const IndexType cudaYGrids = tnlCuda::getNumberOfGrids( cudaBlocks.y );
+   const IndexType cudaZGrids = tnlCuda::getNumberOfGrids( cudaBlocks.z );
+
+   for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
+      for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ )
+         for( IndexType gridZIdx = 0; gridZIdx < cudaZGrids; gridZIdx ++ )
+         {
+            tnlTraverserGrid3DInteriorVertices< Real, Index, UserData, EntitiesProcessor >
+                                              <<< cudaBlocks, cudaBlockSize >>>
+                                             ( kernelGrid,
+                                               kernelUserData,
+                                               gridXIdx,
+                                               gridYIdx,
+                                               gridZIdx );
+         }
+   cudaThreadSynchronize();
+   checkCudaDevice;
+#endif
+   
 }
 
 
diff --git a/src/operators/CMakeLists.txt b/src/operators/CMakeLists.txt
index 8d6c37b489616968ce31e07a0909d17c6dd1c381..6ddc8f02ab8b75b4304fdb98e540094caadfbdfb 100755
--- a/src/operators/CMakeLists.txt
+++ b/src/operators/CMakeLists.txt
@@ -12,7 +12,9 @@ SET( headers tnlFiniteDifferences.h
              tnlNeumannBoundaryConditions_impl.h
              tnlAnalyticNeumannBoundaryConditions.h
              tnlAnalyticNeumannBoundaryConditions_impl.h
-             tnlExactOperatorEvaluator.h )
+             tnlExactOperatorEvaluator.h
+             tnlOperatorEnumerator.h
+             tnlOperatorEnumerator_impl.h )
              
 SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/operators )
 
diff --git a/src/operators/diffusion/tnlExactLinearDiffusion.h b/src/operators/diffusion/tnlExactLinearDiffusion.h
index 17250df6356a0fa03932fade2a9b694f3077f276..28018f0684c68958eccfb10ca897ad277ac20536 100644
--- a/src/operators/diffusion/tnlExactLinearDiffusion.h
+++ b/src/operators/diffusion/tnlExactLinearDiffusion.h
@@ -18,7 +18,7 @@
 #ifndef TNLEXACTLINEARDIFFUSION_H_
 #define TNLEXACTLINEARDIFFUSION_H_
 
-#include <functions/tnlFunctionType.h>
+#include <functors/tnlFunctionType.h>
 
 template< int Dimensions >
 class tnlExactLinearDiffusion
@@ -38,9 +38,7 @@ class tnlExactLinearDiffusion< 1 >
 #else   
       template< typename Function, typename Vertex, typename Real = typename Vertex::RealType >
 #endif      
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       static Real getValue( const Function& function,
                             const Vertex& v,
                             const Real& time = 0.0 );
@@ -60,9 +58,7 @@ class tnlExactLinearDiffusion< 2 >
 #else   
       template< typename Function, typename Vertex, typename Real = typename Vertex::RealType >
 #endif
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif      
+      __cuda_callable__
       static Real getValue( const Function& function,
                             const Vertex& v,
                             const Real& time = 0.0 );
@@ -82,9 +78,7 @@ class tnlExactLinearDiffusion< 3 >
 #else   
       template< typename Function, typename Vertex, typename Real = typename Vertex::RealType >
 #endif
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       static Real getValue( const Function& function,
                             const Vertex& v,
                             const Real& time = 0.0 );
diff --git a/src/operators/diffusion/tnlExactLinearDiffusion_impl.h b/src/operators/diffusion/tnlExactLinearDiffusion_impl.h
index 19d3dc7663aeb31efe1ee80656dc2ab096526a72..33bd68192b922c91c7b42f7be98028c644b78b9e 100644
--- a/src/operators/diffusion/tnlExactLinearDiffusion_impl.h
+++ b/src/operators/diffusion/tnlExactLinearDiffusion_impl.h
@@ -26,9 +26,7 @@ getType()
 }
 
 template< typename Function, typename Vertex, typename Real >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlExactLinearDiffusion< 1 >::
 getValue( const Function& function,
@@ -46,9 +44,7 @@ getType()
 }
 
 template< typename Function, typename Vertex, typename Real >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlExactLinearDiffusion< 2 >::
 getValue( const Function& function,
@@ -67,9 +63,7 @@ getType()
 }
 
 template< typename Function, typename Vertex, typename Real >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlExactLinearDiffusion< 3 >::
 getValue( const Function& function,
diff --git a/src/operators/diffusion/tnlLinearDiffusion.h b/src/operators/diffusion/tnlLinearDiffusion.h
index 35eebca181a439adf938f69974ca3ad0f77f9d63..5702538a80e59b4101429e2ad2e4d1c8fbe9d47e 100644
--- a/src/operators/diffusion/tnlLinearDiffusion.h
+++ b/src/operators/diffusion/tnlLinearDiffusion.h
@@ -1,3 +1,20 @@
+/***************************************************************************
+                          tnlLinearDiffusion.h  -  description
+                             -------------------
+    begin                : Aug 8, 2014
+    copyright            : (C) 2014 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
 #ifndef TNLLINEARDIFFUSION_H
 #define	TNLLINEARDIFFUSION_H
 
@@ -20,37 +37,32 @@ template< typename MeshReal,
           typename Index >
 class tnlLinearDiffusion< tnlGrid< 1,MeshReal, Device, MeshIndex >, Real, Index >
 {
-   public: 
-   
-   typedef tnlGrid< 1, MeshReal, Device, MeshIndex > MeshType;
-   typedef typename MeshType::CoordinatesType CoordinatesType;
-   typedef Real RealType;
-   typedef Device DeviceType;
-   typedef Index IndexType;
-
-   static tnlString getType();
+   public:    
    
-   template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Real getValue( const MeshType& mesh,
-                  const IndexType cellIndex,
-                  const CoordinatesType& coordinates,
-                  const Vector& u,
-                  const RealType& time ) const;
-
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Index getLinearSystemRowLength( const MeshType& mesh,
-                                   const IndexType& index,
-                                   const CoordinatesType& coordinates ) const;
-
-   template< typename Vector, typename MatrixRow >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      typedef tnlGrid< 1, MeshReal, Device, MeshIndex > MeshType;
+      typedef typename MeshType::CoordinatesType CoordinatesType;
+      typedef Real RealType;
+      typedef Device DeviceType;
+      typedef Index IndexType;
+      enum { Dimensions = MeshType::Dimensions };
+
+      static tnlString getType();
+
+      template< typename Vector >
+      __cuda_callable__
+      Real getValue( const MeshType& mesh,
+                     const IndexType cellIndex,
+                     const CoordinatesType& coordinates,
+                     const Vector& u,
+                     const RealType& time ) const;
+
+      __cuda_callable__
+      Index getLinearSystemRowLength( const MeshType& mesh,
+                                      const IndexType& index,
+                                      const CoordinatesType& coordinates ) const;
+
+      template< typename Vector, typename MatrixRow >
+      __cuda_callable__
       void updateLinearSystem( const RealType& time,
                                const RealType& tau,
                                const MeshType& mesh,
@@ -72,35 +84,30 @@ class tnlLinearDiffusion< tnlGrid< 2, MeshReal, Device, MeshIndex >, Real, Index
 {
    public: 
    
-   typedef tnlGrid< 2, MeshReal, Device, MeshIndex > MeshType;
-   typedef typename MeshType::CoordinatesType CoordinatesType;
-   typedef Real RealType;
-   typedef Device DeviceType;
-   typedef Index IndexType;
-
-   static tnlString getType();
-   
-   template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Real getValue( const MeshType& mesh,
-                  const IndexType cellIndex,
-                  const CoordinatesType& coordinates,
-                  const Vector& u,
-                  const Real& time ) const;
-
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Index getLinearSystemRowLength( const MeshType& mesh,
-                                   const IndexType& index,
-                                   const CoordinatesType& coordinates ) const;
-
-   template< typename Vector, typename MatrixRow >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      typedef tnlGrid< 2, MeshReal, Device, MeshIndex > MeshType;
+      typedef typename MeshType::CoordinatesType CoordinatesType;
+      typedef Real RealType;
+      typedef Device DeviceType;
+      typedef Index IndexType;
+      enum { Dimensions = MeshType::Dimensions };
+
+      static tnlString getType();
+
+      template< typename Vector >
+      __cuda_callable__
+      Real getValue( const MeshType& mesh,
+                     const IndexType cellIndex,
+                     const CoordinatesType& coordinates,
+                     const Vector& u,
+                     const Real& time ) const;
+
+      __cuda_callable__
+      Index getLinearSystemRowLength( const MeshType& mesh,
+                                      const IndexType& index,
+                                      const CoordinatesType& coordinates ) const;
+
+      template< typename Vector, typename MatrixRow >
+      __cuda_callable__
       void updateLinearSystem( const RealType& time,
                                const RealType& tau,
                                const MeshType& mesh,
@@ -121,35 +128,30 @@ class tnlLinearDiffusion< tnlGrid< 3, MeshReal, Device, MeshIndex >, Real, Index
 {
    public: 
    
-   typedef tnlGrid< 3, MeshReal, Device, MeshIndex > MeshType;
-   typedef typename MeshType::CoordinatesType CoordinatesType;
-   typedef Real RealType;
-   typedef Device DeviceType;
-   typedef Index IndexType;
-
-   static tnlString getType();
-   
-   template< typename Vector >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Real getValue( const MeshType& mesh,
-                  const IndexType cellIndex,
-                  const CoordinatesType& coordinates,
-                  const Vector& u,
-                  const Real& time ) const;
-
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Index getLinearSystemRowLength( const MeshType& mesh,
-                                   const IndexType& index,
-                                   const CoordinatesType& coordinates ) const;
-
-   template< typename Vector, typename MatrixRow >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      typedef tnlGrid< 3, MeshReal, Device, MeshIndex > MeshType;
+      typedef typename MeshType::CoordinatesType CoordinatesType;
+      typedef Real RealType;
+      typedef Device DeviceType;
+      typedef Index IndexType;
+      enum { Dimensions = MeshType::Dimensions };
+
+      static tnlString getType();
+
+      template< typename Vector >
+      __cuda_callable__
+      Real getValue( const MeshType& mesh,
+                     const IndexType cellIndex,
+                     const CoordinatesType& coordinates,
+                     const Vector& u,
+                     const Real& time ) const;
+
+      __cuda_callable__
+      Index getLinearSystemRowLength( const MeshType& mesh,
+                                      const IndexType& index,
+                                      const CoordinatesType& coordinates ) const;
+
+      template< typename Vector, typename MatrixRow >
+      __cuda_callable__
       void updateLinearSystem( const RealType& time,
                                const RealType& tau,
                                const MeshType& mesh,
diff --git a/src/operators/diffusion/tnlLinearDiffusion_impl.h b/src/operators/diffusion/tnlLinearDiffusion_impl.h
index 87b02fe2f743a344370433caea26b8dca4baab7f..c7fcc8a0fd5ead0b57f3b2aff8c9553cb78fe547 100644
--- a/src/operators/diffusion/tnlLinearDiffusion_impl.h
+++ b/src/operators/diffusion/tnlLinearDiffusion_impl.h
@@ -1,3 +1,19 @@
+/***************************************************************************
+                          tnlLinearDiffusion_impl.h  -  description
+                             -------------------
+    begin                : Aug 8, 2014
+    copyright            : (C) 2014 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
 
 #ifndef TNLLINEARDIFFUSION_IMP_H
 #define	TNLLINEARDIFFUSION_IMP_H
@@ -26,9 +42,7 @@ template< typename MeshReal,
           typename Real,
           typename Index >
 template< typename Vector >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlLinearDiffusion< tnlGrid< 1, MeshReal, Device, MeshIndex >, Real, Index >::
 getValue( const MeshType& mesh,
@@ -37,7 +51,7 @@ getValue( const MeshType& mesh,
           const Vector& u,
           const Real& time ) const
 {
-   return ( u[ mesh.template getCellNextToCell< - 1 >( cellIndex ) ]
+   return ( u[ mesh.template getCellNextToCell< -1 >( cellIndex ) ]
             - 2.0 * u[ cellIndex ]
             + u[ mesh.template getCellNextToCell< 1 >( cellIndex ) ] ) * mesh.getHxSquareInverse();
 }
@@ -47,9 +61,7 @@ template< typename MeshReal,
           typename MeshIndex,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index
 tnlLinearDiffusion< tnlGrid< 1, MeshReal, Device, MeshIndex >, Real, Index >::
 getLinearSystemRowLength( const MeshType& mesh,
@@ -64,10 +76,8 @@ template< typename MeshReal,
           typename MeshIndex,
           typename Real,
           typename Index >
-   template< typename Vector, typename MatrixRow >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+   template< typename Vector, typename Matrix >
+__cuda_callable__
 void
 tnlLinearDiffusion< tnlGrid< 1, MeshReal, Device, MeshIndex >, Real, Index >::
 updateLinearSystem( const RealType& time,
@@ -77,8 +87,9 @@ updateLinearSystem( const RealType& time,
                     const CoordinatesType& coordinates,
                     Vector& u,
                     Vector& b,
-                    MatrixRow& matrixRow ) const
+                    Matrix& matrix ) const
 {
+   typename Matrix::MatrixRow matrixRow = matrix.getRow( index );
    const RealType lambdaX = tau * mesh.getHxSquareInverse();
    //printf( "tau = %f lambda = %f dx_sqr = %f dx = %f, \n", tau, lambdaX, mesh.getHxSquareInverse(), mesh.getHx() );
    matrixRow.setElement( 0, mesh.template getCellNextToCell< -1 >( index ),     - lambdaX );
@@ -107,9 +118,7 @@ template< typename MeshReal,
           typename MeshIndex,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index
 tnlLinearDiffusion< tnlGrid< 2, MeshReal, Device, MeshIndex >, Real, Index >::
 getLinearSystemRowLength( const MeshType& mesh,
@@ -126,9 +135,7 @@ template< typename MeshReal,
           typename Real,
           typename Index >
 template< typename Vector >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlLinearDiffusion< tnlGrid< 2, MeshReal, Device, MeshIndex >, Real, Index >::
 getValue( const MeshType& mesh,
@@ -150,10 +157,8 @@ template< typename MeshReal,
           typename MeshIndex,
           typename Real,
           typename Index >
-   template< typename Vector, typename MatrixRow >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+   template< typename Vector, typename Matrix >
+__cuda_callable__
 void
 tnlLinearDiffusion< tnlGrid< 2, MeshReal, Device, MeshIndex >, Real, Index >::
 updateLinearSystem( const RealType& time,
@@ -163,8 +168,9 @@ updateLinearSystem( const RealType& time,
                     const CoordinatesType& coordinates,
                     Vector& u,
                     Vector& b,
-                    MatrixRow& matrixRow ) const
+                    Matrix& matrix ) const
 {
+   typename Matrix::MatrixRow matrixRow = matrix.getRow( index );
    const RealType lambdaX = tau * mesh.getHxSquareInverse();
    const RealType lambdaY = tau * mesh.getHySquareInverse();
    matrixRow.setElement( 0, mesh.template getCellNextToCell< 0, -1 >( index ), -lambdaY );
@@ -196,9 +202,7 @@ template< typename MeshReal,
           typename Real,
           typename Index >
 template< typename Vector >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+__cuda_callable__
 Real
 tnlLinearDiffusion< tnlGrid< 3, MeshReal, Device, MeshIndex >, Real, Index >::
 getValue( const MeshType& mesh,
@@ -223,9 +227,7 @@ template< typename MeshReal,
           typename MeshIndex,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index
 tnlLinearDiffusion< tnlGrid< 3, MeshReal, Device, MeshIndex >, Real, Index >::
 getLinearSystemRowLength( const MeshType& mesh,
@@ -240,10 +242,8 @@ template< typename MeshReal,
           typename MeshIndex,
           typename Real,
           typename Index >
-   template< typename Vector, typename MatrixRow >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+   template< typename Vector, typename Matrix >
+__cuda_callable__
 void
 tnlLinearDiffusion< tnlGrid< 3, MeshReal, Device, MeshIndex >, Real, Index >::
 updateLinearSystem( const RealType& time,
@@ -253,8 +253,9 @@ updateLinearSystem( const RealType& time,
                     const CoordinatesType& coordinates,
                     Vector& u,
                     Vector& b,
-                    MatrixRow& matrixRow ) const
+                    Matrix& matrix ) const
 {
+   typename Matrix::MatrixRow matrixRow = matrix.getRow( index );
    const RealType lambdaX = tau * mesh.getHxSquareInverse();
    const RealType lambdaY = tau * mesh.getHySquareInverse();
    const RealType lambdaZ = tau * mesh.getHzSquareInverse();
@@ -267,6 +268,4 @@ updateLinearSystem( const RealType& time,
    matrixRow.setElement( 6, mesh.template getCellNextToCell< 0, 0, 1 >( index ),   -lambdaZ );
 }
 
-
-
 #endif	/* TNLLINEARDIFFUSION_IMP_H */
diff --git a/src/operators/tnlAnalyticDirichletBoundaryConditions.h b/src/operators/tnlAnalyticDirichletBoundaryConditions.h
index 3c0855c48bec35ab2e9ea35310f99005a6e42341..25aa0d383ea569739dd64d8315e1046f58b9bcc2 100644
--- a/src/operators/tnlAnalyticDirichletBoundaryConditions.h
+++ b/src/operators/tnlAnalyticDirichletBoundaryConditions.h
@@ -21,7 +21,7 @@
 
 #include <core/vectors/tnlStaticVector.h>
 #include <config/tnlParameterContainer.h>
-#include <functions/tnlConstantFunction.h>
+#include <functors/tnlConstantFunction.h>
 #include <core/vectors/tnlSharedVector.h>
 
 template< typename Mesh,
@@ -45,50 +45,44 @@ class tnlAnalyticDirichletBoundaryConditions< tnlGrid< Dimensions, MeshReal, Dev
 {
    public:
    
-   typedef tnlGrid< Dimensions, MeshReal, Device, MeshIndex > MeshType;
-   typedef Real RealType;
-   typedef Device DeviceType;
-   typedef Index IndexType;
-   typedef tnlAnalyticDirichletBoundaryConditions< MeshType, Function, Real, Index > ThisType;
+      typedef tnlGrid< Dimensions, MeshReal, Device, MeshIndex > MeshType;
+      typedef Real RealType;
+      typedef Device DeviceType;
+      typedef Index IndexType;
+      typedef tnlAnalyticDirichletBoundaryConditions< MeshType, Function, Real, Index > ThisType;
 
-   typedef tnlSharedVector< RealType, DeviceType, IndexType > SharedVector;
-   typedef tnlVector< RealType, DeviceType, IndexType> DofVectorType;
-   typedef tnlStaticVector< Dimensions, RealType > VertexType;
-   typedef typename MeshType::CoordinatesType CoordinatesType;
+      typedef tnlSharedVector< RealType, DeviceType, IndexType > SharedVector;
+      typedef tnlVector< RealType, DeviceType, IndexType> DofVectorType;
+      typedef tnlStaticVector< Dimensions, RealType > VertexType;
+      typedef typename MeshType::CoordinatesType CoordinatesType;
 
-   static void configSetup( tnlConfigDescription& config,
-                            const tnlString& prefix = "" );
-            
-   bool setup( const tnlParameterContainer& parameters,
-               const tnlString& prefix = "" );
+      static void configSetup( tnlConfigDescription& config,
+                               const tnlString& prefix = "" );
 
-   void setFunction( const Function& function );
+      bool setup( const tnlParameterContainer& parameters,
+                  const tnlString& prefix = "" );
 
-   Function& getFunction();
+      void setFunction( const Function& function );
 
-   const Function& getFunction() const;
+      Function& getFunction();
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   void setBoundaryConditions( const RealType& time,
-                               const MeshType& mesh,
-                               const IndexType index,
-                               const CoordinatesType& coordinates,
-                               DofVectorType& u,
-                               DofVectorType& fu ) const;
-
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
-   Index getLinearSystemRowLength( const MeshType& mesh,
-                                   const IndexType& index,
-                                   const CoordinatesType& coordinates ) const;
-
-   template< typename MatrixRow >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+      const Function& getFunction() const;
+
+      __cuda_callable__
+      void setBoundaryConditions( const RealType& time,
+                                  const MeshType& mesh,
+                                  const IndexType index,
+                                  const CoordinatesType& coordinates,
+                                  DofVectorType& u,
+                                  DofVectorType& fu ) const;
+
+      __cuda_callable__
+      Index getLinearSystemRowLength( const MeshType& mesh,
+                                      const IndexType& index,
+                                      const CoordinatesType& coordinates ) const;
+
+      template< typename MatrixRow >
+      __cuda_callable__
       void updateLinearSystem( const RealType& time,
                                const MeshType& mesh,
                                const IndexType& index,
@@ -99,7 +93,7 @@ class tnlAnalyticDirichletBoundaryConditions< tnlGrid< Dimensions, MeshReal, Dev
 
    protected:
 
-   Function function;
+      Function function;
 };
 
 template< typename Mesh,
diff --git a/src/operators/tnlAnalyticDirichletBoundaryConditions_impl.h b/src/operators/tnlAnalyticDirichletBoundaryConditions_impl.h
index 2a67471b80ad60574b7d09243ea7a8834f2937dc..da90d69aa256688d3e3c32d2da87601f9597009b 100644
--- a/src/operators/tnlAnalyticDirichletBoundaryConditions_impl.h
+++ b/src/operators/tnlAnalyticDirichletBoundaryConditions_impl.h
@@ -97,9 +97,7 @@ template< int Dimensions,
           typename Function,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlAnalyticDirichletBoundaryConditions< tnlGrid< Dimensions, MeshReal, Device, MeshIndex >, Function, Real, Index >::
 setBoundaryConditions( const RealType& time,
@@ -120,9 +118,7 @@ template< int Dimensions,
           typename Function,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index
 tnlAnalyticDirichletBoundaryConditions< tnlGrid< Dimensions, MeshReal, Device, MeshIndex >, Function, Real, Index >::
 getLinearSystemRowLength( const MeshType& mesh,
@@ -139,10 +135,8 @@ template< int Dimensions,
           typename Function,
           typename Real,
           typename Index >
-   template< typename MatrixRow >          
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+   template< typename Matrix >          
+__cuda_callable__
 void
 tnlAnalyticDirichletBoundaryConditions< tnlGrid< Dimensions, MeshReal, Device, MeshIndex >, Function, Real, Index >::
 updateLinearSystem( const RealType& time,
@@ -151,8 +145,9 @@ updateLinearSystem( const RealType& time,
                     const CoordinatesType& coordinates,
                     DofVectorType& u,
                     DofVectorType& b,
-                    MatrixRow& matrixRow ) const
+                    Matrix& matrix ) const
 {
+   typename Matrix::MatrixRow matrixRow = matrix.getRow( index );
    matrixRow.setElement( 0, index, 1.0 );
    b[ index ] = function.getValue( mesh.template getCellCenter< VertexType >( coordinates ), time );
 }
diff --git a/src/operators/tnlAnalyticNeumannBoundaryConditions.h b/src/operators/tnlAnalyticNeumannBoundaryConditions.h
index 7a90a3763d658cbbeb51facb873aa24dc2e9bb28..cdee63045408aa4b91aa9f360203ea31ae18e7db 100644
--- a/src/operators/tnlAnalyticNeumannBoundaryConditions.h
+++ b/src/operators/tnlAnalyticNeumannBoundaryConditions.h
@@ -79,9 +79,7 @@ class tnlAnalyticNeumannBoundaryConditions< tnlGrid< 1, MeshReal, Device, MeshIn
    typedef typename MeshType::CoordinatesType CoordinatesType;
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void setBoundaryConditions( const RealType& time,
                                const MeshType& mesh,
                                const IndexType index,
@@ -89,17 +87,13 @@ class tnlAnalyticNeumannBoundaryConditions< tnlGrid< 1, MeshReal, Device, MeshIn
                                DofVectorType& u,
                                DofVectorType& fu ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getLinearSystemRowLength( const MeshType& mesh,
                                    const IndexType& index,
                                    const CoordinatesType& coordinates ) const;
 
    template< typename MatrixRow >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
       void updateLinearSystem( const RealType& time,
                                const MeshType& mesh,
                                const IndexType& index,
@@ -135,9 +129,7 @@ class tnlAnalyticNeumannBoundaryConditions< tnlGrid< 2, MeshReal, Device, MeshIn
    typedef tnlStaticVector< 2, RealType > VertexType;
    typedef typename MeshType::CoordinatesType CoordinatesType;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__  
    void setBoundaryConditions( const RealType& time,
                                const MeshType& mesh,
                                const IndexType index,
@@ -145,17 +137,13 @@ class tnlAnalyticNeumannBoundaryConditions< tnlGrid< 2, MeshReal, Device, MeshIn
                                DofVectorType& u,
                                DofVectorType& fu ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getLinearSystemRowLength( const MeshType& mesh,
                                    const IndexType& index,
                                    const CoordinatesType& coordinates ) const;
 
    template< typename MatrixRow >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
       void updateLinearSystem( const RealType& time,
                                const MeshType& mesh,
                                const IndexType& index,
@@ -191,9 +179,7 @@ class tnlAnalyticNeumannBoundaryConditions< tnlGrid< 3, MeshReal, Device, MeshIn
    typedef typename MeshType::CoordinatesType CoordinatesType;
 
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void setBoundaryConditions( const RealType& time,
                                const MeshType& mesh,
                                const IndexType index,
@@ -201,17 +187,13 @@ class tnlAnalyticNeumannBoundaryConditions< tnlGrid< 3, MeshReal, Device, MeshIn
                                DofVectorType& u,
                                DofVectorType& fu ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getLinearSystemRowLength( const MeshType& mesh,
                                    const IndexType& index,
                                    const CoordinatesType& coordinates ) const;
 
    template< typename MatrixRow >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
       void updateLinearSystem( const RealType& time,
                                const MeshType& mesh,
                                const IndexType& index,
diff --git a/src/operators/tnlAnalyticNeumannBoundaryConditions_impl.h b/src/operators/tnlAnalyticNeumannBoundaryConditions_impl.h
index a2e47cbe14b8f11a51719250b3528867962d4bfd..5e65dde3f5512db20d5c2d5390f49195940c431d 100644
--- a/src/operators/tnlAnalyticNeumannBoundaryConditions_impl.h
+++ b/src/operators/tnlAnalyticNeumannBoundaryConditions_impl.h
@@ -72,9 +72,7 @@ template< typename MeshReal,
           typename Function,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlAnalyticNeumannBoundaryConditions< tnlGrid< 1, MeshReal, Device, MeshIndex >, Function, Real, Index >::
 setBoundaryConditions( const RealType& time,
@@ -98,9 +96,7 @@ template< typename MeshReal,
           typename Function,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index
 tnlAnalyticNeumannBoundaryConditions< tnlGrid< 1, MeshReal, Device, MeshIndex >, Function, Real, Index >::
 getLinearSystemRowLength( const MeshType& mesh,
@@ -116,10 +112,8 @@ template< typename MeshReal,
           typename Function,
           typename Real,
           typename Index >
-   template< typename MatrixRow >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+   template< typename Matrix >
+__cuda_callable__
 void
 tnlAnalyticNeumannBoundaryConditions< tnlGrid< 1, MeshReal, Device, MeshIndex >, Function, Real, Index >::
 updateLinearSystem( const RealType& time,
@@ -128,8 +122,9 @@ updateLinearSystem( const RealType& time,
                     const CoordinatesType& coordinates,
                     DofVectorType& u,
                     DofVectorType& b,
-                    MatrixRow& matrixRow ) const
+                    Matrix& matrix ) const
 {
+   typename Matrix::MatrixRow matrixRow = matrix.getRow( index );
    const Real functionValue = this->function.getValue( mesh.template getCellCenter< VertexType >( coordinates ), time );
    if( coordinates.x() == 0 )
    {
@@ -155,9 +150,7 @@ template< typename MeshReal,
           typename Function,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlAnalyticNeumannBoundaryConditions< tnlGrid< 2, MeshReal, Device, MeshIndex >, Function, Real, Index >::
 setBoundaryConditions( const RealType& time,
@@ -197,9 +190,7 @@ template< typename MeshReal,
           typename Function,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index
 tnlAnalyticNeumannBoundaryConditions< tnlGrid< 2, MeshReal, Device, MeshIndex >, Function, Real, Index >::
 getLinearSystemRowLength( const MeshType& mesh,
@@ -215,10 +206,8 @@ template< typename MeshReal,
           typename Function,
           typename Real,
           typename Index >
-   template< typename MatrixRow >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+   template< typename Matrix >
+__cuda_callable__
 void
 tnlAnalyticNeumannBoundaryConditions< tnlGrid< 2, MeshReal, Device, MeshIndex >, Function, Real, Index >::
 updateLinearSystem( const RealType& time,
@@ -227,8 +216,9 @@ updateLinearSystem( const RealType& time,
                     const CoordinatesType& coordinates,
                     DofVectorType& u,
                     DofVectorType& b,
-                    MatrixRow& matrixRow ) const
+                    Matrix& matrix ) const
 {
+   typename Matrix::MatrixRow matrixRow = matrix.getRow( index );
    const Real functionValue = this->function.getValue( mesh.template getCellCenter< VertexType >( coordinates ), time );
    if( coordinates.x() == 0 )
    {
@@ -265,9 +255,7 @@ template< typename MeshReal,
           typename Function,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlAnalyticNeumannBoundaryConditions< tnlGrid< 3, MeshReal, Device, MeshIndex >, Function, Real, Index >::
 setBoundaryConditions( const RealType& time,
@@ -318,9 +306,7 @@ template< typename MeshReal,
           typename Function,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index
 tnlAnalyticNeumannBoundaryConditions< tnlGrid< 3, MeshReal, Device, MeshIndex >, Function, Real, Index >::
 getLinearSystemRowLength( const MeshType& mesh,
@@ -336,10 +322,8 @@ template< typename MeshReal,
           typename Function,
           typename Real,
           typename Index >
-   template< typename MatrixRow >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+   template< typename Matrix >
+__cuda_callable__
 void
 tnlAnalyticNeumannBoundaryConditions< tnlGrid< 3, MeshReal, Device, MeshIndex >, Function, Real, Index >::
 updateLinearSystem( const RealType& time,
@@ -348,8 +332,9 @@ updateLinearSystem( const RealType& time,
                     const CoordinatesType& coordinates,
                     DofVectorType& u,
                     DofVectorType& b,
-                    MatrixRow& matrixRow ) const
+                    Matrix& matrix ) const
 {
+   typename Matrix::MatrixRow matrixRow = matrix.getRow( index );
    const Real functionValue = this->function.getValue( mesh.template getCellCenter< VertexType >( coordinates ), time );
    if( coordinates.x() == 0 )
    {
diff --git a/src/operators/tnlDirichletBoundaryConditions.h b/src/operators/tnlDirichletBoundaryConditions.h
index ae300ee8762d2c598ed7c4393dcec57cfd893e42..8ece2bddfa0701d74dedf24a9aa4a65a44a632e3 100644
--- a/src/operators/tnlDirichletBoundaryConditions.h
+++ b/src/operators/tnlDirichletBoundaryConditions.h
@@ -59,9 +59,7 @@ class tnlDirichletBoundaryConditions< tnlGrid< Dimensions, MeshReal, Device, Mes
 
    const Vector& getVector() const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void setBoundaryConditions( const RealType& time,
                                const MeshType& mesh,
                                const IndexType index,
@@ -69,17 +67,13 @@ class tnlDirichletBoundaryConditions< tnlGrid< Dimensions, MeshReal, Device, Mes
                                DofVectorType& u,
                                DofVectorType& fu ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getLinearSystemRowLength( const MeshType& mesh,
                                    const IndexType& index,
                                    const CoordinatesType& coordinates ) const;
 
    template< typename MatrixRow >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
       void updateLinearSystem( const RealType& time,
                                const MeshType& mesh,
                                const IndexType& index,
diff --git a/src/operators/tnlDirichletBoundaryConditions_impl.h b/src/operators/tnlDirichletBoundaryConditions_impl.h
index fd910a0ba58841654a61e7a582d6b02c025329f5..c392491807698e5e81a522c0b27f1d1237bb5b6f 100644
--- a/src/operators/tnlDirichletBoundaryConditions_impl.h
+++ b/src/operators/tnlDirichletBoundaryConditions_impl.h
@@ -90,9 +90,7 @@ template< int Dimensions,
           typename Vector,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlDirichletBoundaryConditions< tnlGrid< Dimensions, MeshReal, Device, MeshIndex >, Vector, Real, Index >::
 setBoundaryConditions( const RealType& time,
@@ -113,9 +111,7 @@ template< int Dimensions,
           typename Vector,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index
 tnlDirichletBoundaryConditions< tnlGrid< Dimensions, MeshReal, Device, MeshIndex >, Vector, Real, Index >::
 getLinearSystemRowLength( const MeshType& mesh,
@@ -132,10 +128,8 @@ template< int Dimensions,
           typename Vector,
           typename Real,
           typename Index >
-   template< typename MatrixRow >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+   template< typename Matrix >
+__cuda_callable__
 void
 tnlDirichletBoundaryConditions< tnlGrid< Dimensions, MeshReal, Device, MeshIndex >, Vector, Real, Index >::
 updateLinearSystem( const RealType& time,
@@ -144,8 +138,9 @@ updateLinearSystem( const RealType& time,
                     const CoordinatesType& coordinates,
                     DofVectorType& u,
                     DofVectorType& b,
-                    MatrixRow& matrixRow ) const
+                    Matrix& matrix ) const
 {
+   typename Matrix::MatrixRow matrixRow = matrix.getRow( index );
    matrixRow.setElement( 0, index, 1.0 );
    b[ index ] = this->vector[ index ];
 }
diff --git a/src/operators/tnlNeumannBoundaryConditions.h b/src/operators/tnlNeumannBoundaryConditions.h
index e7bfc28da9116094aecdd5c8b36840ea8038e88f..0bebe6b0c1f47b0b40225ea7b9ee8f95dbe80915 100644
--- a/src/operators/tnlNeumannBoundaryConditions.h
+++ b/src/operators/tnlNeumannBoundaryConditions.h
@@ -76,9 +76,7 @@ class tnlNeumannBoundaryConditions< tnlGrid< 1, MeshReal, Device, MeshIndex >, V
    typedef tnlStaticVector< 1, RealType > VertexType;
    typedef typename MeshType::CoordinatesType CoordinatesType;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void setBoundaryConditions( const RealType& time,
                                const MeshType& mesh,
                                const IndexType index,
@@ -86,17 +84,13 @@ class tnlNeumannBoundaryConditions< tnlGrid< 1, MeshReal, Device, MeshIndex >, V
                                DofVectorType& u,
                                DofVectorType& fu ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getLinearSystemRowLength( const MeshType& mesh,
                                    const IndexType& index,
                                    const CoordinatesType& coordinates ) const;
 
    template< typename MatrixRow >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
       void updateLinearSystem( const RealType& time,
                                const MeshType& mesh,
                                const IndexType& index,
@@ -131,9 +125,7 @@ class tnlNeumannBoundaryConditions< tnlGrid< 2, MeshReal, Device, MeshIndex >, V
    typedef tnlStaticVector< 2, RealType > VertexType;
    typedef typename MeshType::CoordinatesType CoordinatesType;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void setBoundaryConditions( const RealType& time,
                                const MeshType& mesh,
                                const IndexType index,
@@ -141,17 +133,13 @@ class tnlNeumannBoundaryConditions< tnlGrid< 2, MeshReal, Device, MeshIndex >, V
                                DofVectorType& u,
                                DofVectorType& fu ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getLinearSystemRowLength( const MeshType& mesh,
                                    const IndexType& index,
                                    const CoordinatesType& coordinates ) const;
 
    template< typename MatrixRow >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
       void updateLinearSystem( const RealType& time,
                                const MeshType& mesh,
                                const IndexType& index,
@@ -186,9 +174,7 @@ class tnlNeumannBoundaryConditions< tnlGrid< 3, MeshReal, Device, MeshIndex >, V
    typedef tnlStaticVector< 3, RealType > VertexType;
    typedef typename MeshType::CoordinatesType CoordinatesType;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    void setBoundaryConditions( const RealType& time,
                                const MeshType& mesh,
                                const IndexType index,
@@ -196,17 +182,13 @@ class tnlNeumannBoundaryConditions< tnlGrid< 3, MeshReal, Device, MeshIndex >, V
                                DofVectorType& u,
                                DofVectorType& fu ) const;
 
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
    Index getLinearSystemRowLength( const MeshType& mesh,
                                    const IndexType& index,
                                    const CoordinatesType& coordinates ) const;
 
    template< typename MatrixRow >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+   __cuda_callable__
       void updateLinearSystem( const RealType& time,
                                const MeshType& mesh,
                                const IndexType& index,
diff --git a/src/operators/tnlNeumannBoundaryConditions_impl.h b/src/operators/tnlNeumannBoundaryConditions_impl.h
index c80cad00c99853d5d08febb04154cc3cca198907..a31150150df867883f956884a9a9a8c55072b1ab 100644
--- a/src/operators/tnlNeumannBoundaryConditions_impl.h
+++ b/src/operators/tnlNeumannBoundaryConditions_impl.h
@@ -50,9 +50,7 @@ template< typename MeshReal,
           typename Vector,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlNeumannBoundaryConditions< tnlGrid< 1, MeshReal, Device, MeshIndex >, Vector, Real, Index >::
 setBoundaryConditions( const RealType& time,
@@ -75,9 +73,7 @@ template< typename MeshReal,
           typename Vector,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index
 tnlNeumannBoundaryConditions< tnlGrid< 1, MeshReal, Device, MeshIndex >, Vector, Real, Index >::
 getLinearSystemRowLength( const MeshType& mesh,
@@ -93,10 +89,8 @@ template< typename MeshReal,
           typename Vector,
           typename Real,
           typename Index >
-   template< typename MatrixRow >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+   template< typename Matrix >
+__cuda_callable__
 void
 tnlNeumannBoundaryConditions< tnlGrid< 1, MeshReal, Device, MeshIndex >, Vector, Real, Index >::
 updateLinearSystem( const RealType& time,
@@ -105,8 +99,9 @@ updateLinearSystem( const RealType& time,
                     const CoordinatesType& coordinates,
                     DofVectorType& u,
                     DofVectorType& b,
-                    MatrixRow& matrixRow ) const
+                    Matrix& matrix ) const
 {
+   typename Matrix::MatrixRow matrixRow = matrix.getRow( index );
    if( coordinates.x() == 0 )
    {
       matrixRow.setElement( 0, index, 1.0 );
@@ -130,9 +125,7 @@ template< typename MeshReal,
           typename Vector,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlNeumannBoundaryConditions< tnlGrid< 2, MeshReal, Device, MeshIndex >, Vector, Real, Index >::
 setBoundaryConditions( const RealType& time,
@@ -171,9 +164,7 @@ template< typename MeshReal,
           typename Vector,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index
 tnlNeumannBoundaryConditions< tnlGrid< 2, MeshReal, Device, MeshIndex >, Vector, Real, Index >::
 getLinearSystemRowLength( const MeshType& mesh,
@@ -189,10 +180,8 @@ template< typename MeshReal,
           typename Vector,
           typename Real,
           typename Index >
-   template< typename MatrixRow >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+   template< typename Matrix >
+__cuda_callable__
 void
 tnlNeumannBoundaryConditions< tnlGrid< 2, MeshReal, Device, MeshIndex >, Vector, Real, Index >::
 updateLinearSystem( const RealType& time,
@@ -201,8 +190,9 @@ updateLinearSystem( const RealType& time,
                     const CoordinatesType& coordinates,
                     DofVectorType& u,
                     DofVectorType& b,
-                    MatrixRow& matrixRow ) const
+                    Matrix& matrix ) const
 {
+   typename Matrix::MatrixRow matrixRow = matrix.getRow( index );
    if( coordinates.x() == 0 )
    {
       matrixRow.setElement( 0, index,                            1.0 );
@@ -238,9 +228,7 @@ template< typename MeshReal,
           typename Vector,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 void
 tnlNeumannBoundaryConditions< tnlGrid< 3, MeshReal, Device, MeshIndex >, Vector, Real, Index >::
 setBoundaryConditions( const RealType& time,
@@ -289,9 +277,7 @@ template< typename MeshReal,
           typename Vector,
           typename Real,
           typename Index >
-#ifdef HAVE_CUDA
-   __device__ __host__
-#endif
+__cuda_callable__
 Index
 tnlNeumannBoundaryConditions< tnlGrid< 3, MeshReal, Device, MeshIndex >, Vector, Real, Index >::
 getLinearSystemRowLength( const MeshType& mesh,
@@ -307,10 +293,8 @@ template< typename MeshReal,
           typename Vector,
           typename Real,
           typename Index >
-   template< typename MatrixRow >
-#ifdef HAVE_CUDA
-__device__ __host__
-#endif
+   template< typename Matrix >
+__cuda_callable__
 void
 tnlNeumannBoundaryConditions< tnlGrid< 3, MeshReal, Device, MeshIndex >, Vector, Real, Index >::
 updateLinearSystem( const RealType& time,
@@ -319,8 +303,9 @@ updateLinearSystem( const RealType& time,
                     const CoordinatesType& coordinates,
                     DofVectorType& u,
                     DofVectorType& b,
-                    MatrixRow& matrixRow ) const
+                    Matrix& matrix ) const
 {
+   typename Matrix::MatrixRow matrixRow = matrix.getRow( index );
    if( coordinates.x() == 0 )
    {
       matrixRow.setElement( 0, index,                            1.0 );
diff --git a/src/operators/tnlOperatorEnumerator.h b/src/operators/tnlOperatorEnumerator.h
new file mode 100644
index 0000000000000000000000000000000000000000..050a23b72d489d1e977df6fe0b1f3783b5f3e6ef
--- /dev/null
+++ b/src/operators/tnlOperatorEnumerator.h
@@ -0,0 +1,175 @@
+/***************************************************************************
+                          tnlOperatorEnumerator.h  -  description
+                             -------------------
+    begin                : Mar 8, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+#ifndef SRC_OPERATORS_TNLOPERATORENUMERATOR_H_
+#define SRC_OPERATORS_TNLOPERATORENUMERATOR_H_
+
+//#include <_operators/tnlOperatorAdapter.h>
+
+template< typename Operator,
+          typename DofVector >
+class tnlOperatorEnumeratorTraverserUserData
+{
+   public:
+
+      typedef typename DofVector::RealType RealType;
+
+      const RealType *time;
+
+      const Operator* _operator;
+
+      DofVector *u;
+
+      const RealType* _operatorCoefficient;
+
+      const RealType* dofVectorCoefficient;
+
+      tnlOperatorEnumeratorTraverserUserData( const RealType& time,
+                                              const Operator& _operator,
+                                              DofVector& u,
+                                              const RealType& _operatorCoefficient,
+                                              const RealType& dofVectorCoefficient )
+      : time( &time ),
+        _operator( &_operator ),
+        u( &u ),
+        _operatorCoefficient( &_operatorCoefficient ),
+        dofVectorCoefficient( &dofVectorCoefficient )
+      {};
+};
+
+
+template< typename Mesh,
+          typename Operator,
+          typename DofVector >
+class tnlOperatorEnumerator
+{
+   public:
+      typedef Mesh MeshType;
+      typedef typename DofVector::RealType RealType;
+      typedef typename DofVector::DeviceType DeviceType;
+      typedef typename DofVector::IndexType IndexType;
+      typedef tnlOperatorEnumeratorTraverserUserData< Operator,
+                                                      DofVector > TraverserUserData;
+
+      template< int EntityDimensions >
+      void enumerate( const MeshType& mesh,
+                      const Operator& _operator,
+                      DofVector& u,
+                      const RealType& _operatorCoefficient = 1.0,
+                      const RealType& dofVectorCoefficient = 0.0,
+                      const RealType& time = 0.0 ) const;
+
+
+      class TraverserEntitiesProcessor
+      {
+         public:
+
+            template< int EntityDimensions >
+#ifdef HAVE_CUDA
+            __host__ __device__
+#endif
+            static void processEntity( const MeshType& mesh,
+                                       TraverserUserData& userData,
+                                       const IndexType index )
+            {
+               //typedef tnlOperatorAdapter< MeshType, Operator > OperatorAdapter;
+               ( *userData.u )[ index ] =
+                        ( *userData.dofVectorCoefficient ) * ( *userData.u )[ index ] +
+                        ( *userData._operatorCoefficient ) * userData._operator ->getValue( mesh,
+                                                                                            index,
+                                                                                            *userData.time );
+            }
+
+      };
+
+};
+
+template< int Dimensions,
+          typename Real,
+          typename Device,
+          typename Index,
+          typename Operator,
+          typename DofVector >
+class tnlOperatorEnumerator< tnlGrid< Dimensions, Real, Device, Index >,
+                             Operator,
+                             DofVector >
+{
+   public:
+
+      typedef tnlGrid< Dimensions, Real, Device, Index > MeshType;
+      typedef typename MeshType::RealType RealType;
+      typedef typename MeshType::DeviceType DeviceType;
+      typedef typename MeshType::IndexType IndexType;
+      typedef typename MeshType::CoordinatesType CoordinatesType;
+      typedef tnlOperatorEnumeratorTraverserUserData< Operator,
+                                                      DofVector > TraverserUserData;
+
+      template< int EntityDimensions >
+      void enumerate( const MeshType& mesh,
+                      const Operator& _operator,
+                      DofVector& u,
+                      const RealType& _operatorCoefficient = 1.0,
+                      const RealType& dofVectorCoefficient = 0.0,
+                      const RealType& time = 0.0 ) const;
+
+      class TraverserEntitiesProcessor
+      {
+         public:
+
+         typedef typename MeshType::VertexType VertexType;
+
+#ifdef HAVE_CUDA
+            __host__ __device__
+#endif
+            static void processCell( const MeshType& mesh,
+                                     TraverserUserData& userData,
+                                     const IndexType index,
+                                     const CoordinatesType& coordinates )
+            {
+               //printf( "Enumerator::processCell mesh =%p \n", &mesh );
+               //typedef tnlOperatorAdapter< MeshType, Operator > OperatorAdapter;
+               ( *userData.u )[ index ] =
+                        ( *userData.dofVectorCoefficient ) * ( *userData.u )[ index ] +
+                        ( *userData._operatorCoefficient ) * userData._operator->getValue( mesh,
+                                                                                           index,
+                                                                                           coordinates,
+                                                                                           *userData.time );
+
+            }
+
+#ifdef HAVE_CUDA
+            __host__ __device__
+#endif
+            static void processFace( const MeshType& mesh,
+                                     TraverserUserData& userData,
+                                     const IndexType index,
+                                     const CoordinatesType& coordinates )
+            {
+               //typedef tnlOperatorAdapter< MeshType, Operator > OperatorAdapter;
+               ( *userData.u )[ index ] =
+                        ( *userData.dofVectorCoefficient ) * ( *userData.u )[ index ] +
+                        ( *userData._operatorCoefficient ) * userData._operator->getValue( mesh,
+                                                                                           index,
+                                                                                           coordinates,
+                                                                                           *userData.time );
+            }
+      };
+
+};
+
+#include <operators/tnlOperatorEnumerator_impl.h>
+
+#endif /* SRC_OPERATORS_TNLOPERATORENUMERATOR_H_ */
diff --git a/src/operators/tnlOperatorEnumerator_impl.h b/src/operators/tnlOperatorEnumerator_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..ffe3f21a7a7235cd0c63c545400dd2bf86afa901
--- /dev/null
+++ b/src/operators/tnlOperatorEnumerator_impl.h
@@ -0,0 +1,141 @@
+/***************************************************************************
+                          tnlOperatorEnumerator_impl.h  -  description
+                             -------------------
+    begin                : Mar 8, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+#ifndef SRC_OPERATORS_TNLOPERATORENUMERATOR_IMPL_H_
+#define SRC_OPERATORS_TNLOPERATORENUMERATOR_IMPL_H_
+
+#include <operators/tnlOperatorEnumerator.h>
+#include <mesh/tnlTraverser_Grid1D.h>
+#include <mesh/tnlTraverser_Grid2D.h>
+#include <mesh/tnlTraverser_Grid3D.h>
+
+template< typename Mesh,
+          typename Operator,
+          typename DofVector >
+   template< int EntityDimensions >
+void
+tnlOperatorEnumerator< Mesh, Operator, DofVector >::
+enumerate( const MeshType& mesh,
+           const Operator& _operator,
+           DofVector& u,
+           const RealType& _operatorCoefficient,
+           const RealType& dofVectorCoefficient,
+           const RealType& time ) const
+
+{
+   if( DeviceType::DeviceType == tnlHostDevice )
+   {
+      TraverserUserData userData( time, _operator, u, _operatorCoefficient, dofVectorCoefficient );
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+
+   }
+   if( DeviceType::DeviceType == tnlCudaDevice )
+   {
+      RealType* kernelTime = tnlCuda::passToDevice( time );
+      Operator* kernelOperator = tnlCuda::passToDevice( _operator );
+      DofVector* kernelU = tnlCuda::passToDevice( u );
+      RealType* kernelOperatorCoefficient = tnlCuda::passToDevice( _operatorCoefficient );
+      RealType* kernelDofVectorCoefficient = tnlCuda::passToDevice( dofVectorCoefficient );
+      TraverserUserData userData( *kernelTime, *kernelOperator, *kernelU, *kernelOperatorCoefficient, *kernelDofVectorCoefficient );
+      checkCudaDevice;
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+
+      checkCudaDevice;
+      tnlCuda::freeFromDevice( kernelTime );
+      tnlCuda::freeFromDevice( kernelOperator );
+      tnlCuda::freeFromDevice( kernelU );
+      tnlCuda::freeFromDevice( kernelOperatorCoefficient );
+      tnlCuda::freeFromDevice( kernelDofVectorCoefficient );
+      checkCudaDevice;
+   }
+}
+
+template< int Dimensions,
+          typename Real,
+          typename Device,
+          typename Index,
+          typename Operator,
+          typename DofVector >
+   template< int EntityDimensions >
+void
+tnlOperatorEnumerator< tnlGrid< Dimensions, Real, Device, Index >, Operator, DofVector  >::
+enumerate( const tnlGrid< Dimensions, Real, Device, Index >& mesh,
+           const Operator& _operator,
+           DofVector& u,
+           const RealType& _operatorCoefficient,
+           const RealType& dofVectorCoefficient,
+           const RealType& time ) const
+{
+   if( DeviceType::DeviceType == tnlHostDevice )
+   {
+      TraverserUserData userData( time, _operator, u, _operatorCoefficient, dofVectorCoefficient );
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+
+   }
+   if( DeviceType::DeviceType == tnlCudaDevice )
+   {
+      RealType* kernelTime = tnlCuda::passToDevice( time );
+      Operator* kernelOperator = tnlCuda::passToDevice( _operator );
+      DofVector* kernelU = tnlCuda::passToDevice( u );
+      RealType* kernelOperatorCoefficient = tnlCuda::passToDevice( _operatorCoefficient );
+      RealType* kernelDofVectorCoefficient = tnlCuda::passToDevice( dofVectorCoefficient );
+      TraverserUserData userData( *kernelTime, *kernelOperator, *kernelU, *kernelOperatorCoefficient, *kernelDofVectorCoefficient );
+      checkCudaDevice;
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserEntitiesProcessor >
+                                                    ( mesh,
+                                                      userData );
+
+      checkCudaDevice;
+      tnlCuda::freeFromDevice( kernelTime );
+      tnlCuda::freeFromDevice( kernelOperator );
+      tnlCuda::freeFromDevice( kernelU );
+      tnlCuda::freeFromDevice( kernelOperatorCoefficient );
+      tnlCuda::freeFromDevice( kernelDofVectorCoefficient );
+      checkCudaDevice;
+   }
+}
+
+#endif /* SRC_OPERATORS_TNLOPERATORENUMERATOR_IMPL_H_ */
diff --git a/src/problems/tnlHeatEquationEocProblem.h b/src/problems/tnlHeatEquationEocProblem.h
index e37868c262b1b69d4d753354871cdf68eb5e9421..199404347430d996f2ad9e54d0b45f8368e0bdab 100644
--- a/src/problems/tnlHeatEquationEocProblem.h
+++ b/src/problems/tnlHeatEquationEocProblem.h
@@ -2,7 +2,7 @@
                           tnlHeatEquationEocProblem.h  -  description
                              -------------------
     begin                : Nov 22, 2014
-    copyright            : (C) 2014 by oberhuber
+    copyright            : (C) 2014 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
@@ -15,6 +15,13 @@
  *                                                                         *
  ***************************************************************************/
 
+/***
+ * Authors:
+ * Oberhuber Tomas, tomas.oberhuber@fjfi.cvut.cz
+ * Szekely Ondrej, ondra.szekely@gmail.com
+ */
+
+
 #ifndef TNLHEATEQUATIONEOCPROBLEM_H_
 #define TNLHEATEQUATIONEOCPROBLEM_H_
 
diff --git a/src/problems/tnlHeatEquationEocProblem_impl.h b/src/problems/tnlHeatEquationEocProblem_impl.h
index e7dd68285cb3f7f4675de7510fb5992a475c6645..0193cfd37dd5519548d2008a971e7605fadaf2f5 100644
--- a/src/problems/tnlHeatEquationEocProblem_impl.h
+++ b/src/problems/tnlHeatEquationEocProblem_impl.h
@@ -2,7 +2,7 @@
                           tnlHeatEquationEocProblem_impl.h  -  description
                              -------------------
     begin                : Nov 22, 2014
-    copyright            : (C) 2014 by oberhuber
+    copyright            : (C) 2014 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
@@ -15,6 +15,13 @@
  *                                                                         *
  ***************************************************************************/
 
+/***
+ * Authors:
+ * Oberhuber Tomas, tomas.oberhuber@fjfi.cvut.cz
+ * Szekely Ondrej, ondra.szekely@gmail.com
+ */
+
+
 #ifndef TNLHEATEQUATIONEOCPROBLEM_IMPL_H_
 #define TNLHEATEQUATIONEOCPROBLEM_IMPL_H_
 
diff --git a/src/problems/tnlHeatEquationEocRhs.h b/src/problems/tnlHeatEquationEocRhs.h
index 4133915989f7f129eca6911e2787a3f35f349c0a..1a95822f6685c2759d4414eb006b5b42da6f69c3 100644
--- a/src/problems/tnlHeatEquationEocRhs.h
+++ b/src/problems/tnlHeatEquationEocRhs.h
@@ -2,7 +2,7 @@
                           tnlHeatEquationEocRhs.h  -  description
                              -------------------
     begin                : Sep 8, 2014
-    copyright            : (C) 2014 by oberhuber
+    copyright            : (C) 2014 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
@@ -15,10 +15,16 @@
  *                                                                         *
  ***************************************************************************/
 
+/***
+ * Authors:
+ * Oberhuber Tomas, tomas.oberhuber@fjfi.cvut.cz
+ * Szekely Ondrej, ondra.szekely@gmail.com
+ */
+
 #ifndef TNLHEATEQUATIONEOCRHS_H_
 #define TNLHEATEQUATIONEOCRHS_H_
 
-#include <functions/tnlFunctionType.h>
+#include <functors/tnlFunctionType.h>
 
 template< typename ExactOperator,
           typename TestFunction >
@@ -39,9 +45,7 @@ class tnlHeatEquationEocRhs
 
       template< typename Vertex,
                 typename Real >
-#ifdef HAVE_CUDA
-      __device__ __host__
-#endif
+      __cuda_callable__
       Real getValue( const Vertex& vertex,
                      const Real& time ) const
       {
diff --git a/src/problems/tnlHeatEquationProblem.h b/src/problems/tnlHeatEquationProblem.h
index d6f5375e8db0207dd92007834531b6be7a2644d0..7e32aa65f97a2dae146be50687b6714a6ff3fca9 100644
--- a/src/problems/tnlHeatEquationProblem.h
+++ b/src/problems/tnlHeatEquationProblem.h
@@ -2,7 +2,7 @@
                           tnlHeatEquationProblem.h  -  description
                              -------------------
     begin                : Feb 23, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
+    copyright            : (C) 2013 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
@@ -15,11 +15,19 @@
  *                                                                         *
  ***************************************************************************/
 
+/***
+ * Authors:
+ * Oberhuber Tomas, tomas.oberhuber@fjfi.cvut.cz
+ * Szekely Ondrej, ondra.szekely@gmail.com
+ */
+
+
 #ifndef TNLHEATEQUATIONPROBLEM_H_
 #define TNLHEATEQUATIONPROBLEM_H_
 
 #include <problems/tnlPDEProblem.h>
 #include <operators/diffusion/tnlLinearDiffusion.h>
+#include <matrices/tnlEllpackMatrix.h>
 
 template< typename Mesh,
           typename BoundaryCondition,
@@ -37,9 +45,11 @@ class tnlHeatEquationProblem : public tnlPDEProblem< Mesh,
       typedef typename Mesh::DeviceType DeviceType;
       typedef typename DifferentialOperator::IndexType IndexType;
       typedef tnlPDEProblem< Mesh, RealType, DeviceType, IndexType > BaseType;
+      typedef tnlCSRMatrix< RealType, DeviceType, IndexType > MatrixType;
 
       using typename BaseType::MeshType;
       using typename BaseType::DofVectorType;
+      using typename BaseType::MeshDependentDataType;
 
       static tnlString getTypeStatic();
 
@@ -53,17 +63,17 @@ class tnlHeatEquationProblem : public tnlPDEProblem< Mesh,
       bool setInitialCondition( const tnlParameterContainer& parameters,
                                 const MeshType& mesh,
                                 DofVectorType& dofs,
-                                DofVectorType& auxDofs );
+                                MeshDependentDataType& meshDependentData );
 
-      template< typename MatrixType >
+      template< typename Matrix >
       bool setupLinearSystem( const MeshType& mesh,
-                              MatrixType& matrix );
+                              Matrix& matrix );
 
       bool makeSnapshot( const RealType& time,
                          const IndexType& step,
                          const MeshType& mesh,
                          DofVectorType& dofs,
-                         DofVectorType& auxDofs );
+                         MeshDependentDataType& meshDependentData );
 
       IndexType getDofs( const MeshType& mesh ) const;
 
@@ -74,16 +84,17 @@ class tnlHeatEquationProblem : public tnlPDEProblem< Mesh,
                            const RealType& tau,
                            const MeshType& mesh,
                            DofVectorType& _u,
-                           DofVectorType& _fu );
+			   DofVectorType& _fu,
+                           MeshDependentDataType& meshDependentData );
 
-      template< typename MatrixType >
+      template< typename Matrix >
       void assemblyLinearSystem( const RealType& time,
                                  const RealType& tau,
                                  const MeshType& mesh,
-                                 DofVectorType& dofs,
-                                 DofVectorType& auxDofs,
-                                 MatrixType& matrix,
-                                 DofVectorType& rightHandSide );
+                                 DofVectorType& dofs,                                 
+                                 Matrix& matrix,
+                                 DofVectorType& rightHandSide,
+				 MeshDependentDataType& meshDependentData );
 
 
       protected:
diff --git a/src/problems/tnlHeatEquationProblem_impl.h b/src/problems/tnlHeatEquationProblem_impl.h
index 0a79298ba0fc3df6436de9370d2eb2a0be4d0b18..5953cdf6566ccdf0a9a4f1c5bf30968fefc059cd 100644
--- a/src/problems/tnlHeatEquationProblem_impl.h
+++ b/src/problems/tnlHeatEquationProblem_impl.h
@@ -2,7 +2,7 @@
                           tnlHeatEquationProblem_impl.h  -  description
                              -------------------
     begin                : Mar 10, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
+    copyright            : (C) 2013 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
@@ -15,6 +15,12 @@
  *                                                                         *
  ***************************************************************************/
 
+/***
+ * Authors:
+ * Oberhuber Tomas, tomas.oberhuber@fjfi.cvut.cz
+ * Szekely Ondrej, ondra.szekely@gmail.com
+ */
+
 #ifndef TNLHEATEQUATIONPROBLEM_IMPL_H_
 #define TNLHEATEQUATIONPROBLEM_IMPL_H_
 
@@ -24,6 +30,7 @@
 #include <core/tnlLogger.h>
 #include <solvers/pde/tnlExplicitUpdater.h>
 #include <solvers/pde/tnlLinearSystemAssembler.h>
+#include <solvers/pde/tnlBackwardTimeDiscretisation.h>
 
 
 template< typename Mesh,
@@ -108,7 +115,7 @@ tnlHeatEquationProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOper
 setInitialCondition( const tnlParameterContainer& parameters,
                      const MeshType& mesh,
                      DofVectorType& dofs,
-                     DofVectorType& auxiliaryDofs )
+                     MeshDependentDataType& meshDependentData )
 {
    this->bindDofs( mesh, dofs );
    const tnlString& initialConditionFile = parameters.getParameter< tnlString >( "initial-condition" );
@@ -124,24 +131,24 @@ template< typename Mesh,
           typename BoundaryCondition,
           typename RightHandSide,
           typename DifferentialOperator >
-   template< typename MatrixType >          
+   template< typename Matrix >          
 bool
 tnlHeatEquationProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::
 setupLinearSystem( const MeshType& mesh,
-                   MatrixType& matrix )
+                   Matrix& matrix )
 {
    const IndexType dofs = this->getDofs( mesh );
-   typedef typename MatrixType::RowLengthsVector RowLengthsVectorType;
-   RowLengthsVectorType rowLengths;
+   typedef typename Matrix::CompressedRowsLengthsVector CompressedRowsLengthsVectorType;
+   CompressedRowsLengthsVectorType rowLengths;
    if( ! rowLengths.setSize( dofs ) )
       return false;
-   tnlMatrixSetter< MeshType, DifferentialOperator, BoundaryCondition, RowLengthsVectorType > matrixSetter;
-   matrixSetter.template getRowLengths< Mesh::Dimensions >( mesh,
+   tnlMatrixSetter< MeshType, DifferentialOperator, BoundaryCondition, CompressedRowsLengthsVectorType > matrixSetter;
+   matrixSetter.template getCompressedRowsLengths< Mesh::Dimensions >( mesh,
                                                             differentialOperator,
                                                             boundaryCondition,
                                                             rowLengths );
    matrix.setDimensions( dofs, dofs );
-   if( ! matrix.setRowLengths( rowLengths ) )
+   if( ! matrix.setCompressedRowsLengths( rowLengths ) )
       return false;
    return true;
    //return tnlMultidiagonalMatrixSetter< Mesh >::setupMatrix( mesh, matrix );
@@ -157,7 +164,7 @@ makeSnapshot( const RealType& time,
               const IndexType& step,
               const MeshType& mesh,
               DofVectorType& dofs,
-              DofVectorType& auxiliaryDofs )
+              MeshDependentDataType& meshDependentData )
 {
    cout << endl << "Writing output at time " << time << " step " << step << "." << endl;
 
@@ -180,7 +187,8 @@ getExplicitRHS( const RealType& time,
                 const RealType& tau,
                 const MeshType& mesh,
                 DofVectorType& u,
-                DofVectorType& fu )
+		DofVectorType& fu,
+                MeshDependentDataType& meshDependentData )
 {
    /****
     * If you use an explicit solver like tnlEulerSolver or tnlMersonSolver, you
@@ -212,18 +220,24 @@ template< typename Mesh,
           typename BoundaryCondition,
           typename RightHandSide,
           typename DifferentialOperator >
-    template< typename MatrixType >          
+    template< typename Matrix >          
 void
 tnlHeatEquationProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::
 assemblyLinearSystem( const RealType& time,
                       const RealType& tau,
                       const MeshType& mesh,
-                      DofVectorType& u,
-                      DofVectorType& auxDofs,
-                      MatrixType& matrix,
-                      DofVectorType& b )
+                      DofVectorType& u,                      
+                      Matrix& matrix,
+                      DofVectorType& b,
+		      MeshDependentDataType& meshDependentData )
 {
-   tnlLinearSystemAssembler< Mesh, DofVectorType, DifferentialOperator, BoundaryCondition, RightHandSide, MatrixType > systemAssembler;
+   tnlLinearSystemAssembler< Mesh,
+                             DofVectorType,
+                             DifferentialOperator,
+                             BoundaryCondition,
+                             RightHandSide,
+                             tnlBackwardTimeDiscretisation,
+                             Matrix > systemAssembler;
    systemAssembler.template assembly< Mesh::Dimensions >( time,
                                                           tau,
                                                           mesh,
@@ -237,6 +251,26 @@ assemblyLinearSystem( const RealType& time,
    cout << endl << b << endl;
    cout << endl << u << endl;
    abort();*/
+   /*cout << "Matrix multiplication test ..." << endl;
+   tnlVector< RealType, DeviceType, IndexType > y;
+   y.setLike( u );
+   tnlTimerRT timer;
+   timer.reset();
+   timer.start();
+   for( int i = 0; i < 100; i++ )
+      matrix.vectorProduct( u, y );
+   timer.stop();
+   cout << "The time is " << timer.getTime();
+   cout << "Scalar product test ..." << endl;
+   timer.reset();
+   RealType a;
+   timer.start();
+   for( int i = 0; i < 100; i++ )
+      a = y.scalarProduct( u );
+   timer.stop();
+   cout << "The time is " << timer.getTime();
+   cout << endl;
+   abort();*/
 }
 
 #endif /* TNLHEATEQUATIONPROBLEM_IMPL_H_ */
diff --git a/src/problems/tnlPDEProblem.h b/src/problems/tnlPDEProblem.h
index 5ea0699400d2e4ac6d9dc0d4b0dfa4d3ae5b9b55..7be3d7f0a161a2c0a7295961c73c4d94bad6bb16 100644
--- a/src/problems/tnlPDEProblem.h
+++ b/src/problems/tnlPDEProblem.h
@@ -37,6 +37,7 @@ class tnlPDEProblem : public tnlProblem< Real, Device, Index >
       typedef Mesh MeshType;
       typedef tnlVector< RealType, DeviceType, IndexType> DofVectorType;
       typedef tnlCSRMatrix< RealType, DeviceType, IndexType > MatrixType;
+      typedef tnlVector< RealType, DeviceType, IndexType > MeshDependentDataType;
 
       /****
        * This means that the time stepper will be set from the command line arguments.
@@ -50,23 +51,23 @@ class tnlPDEProblem : public tnlProblem< Real, Device, Index >
       void writeProlog( tnlLogger& logger,
                         const tnlParameterContainer& parameters ) const;
 
-      IndexType getAuxiliaryDofs( const MeshType& mesh ) const;
+      bool setMeshDependentData( const MeshType& mesh,
+                                 MeshDependentDataType& meshDependentData );
 
-      void bindAuxiliaryDofs( const MeshType& mesh,
-                              DofVectorType& auxiliaryDofs );
+      void bindMeshDependentData( const MeshType& mesh,
+                                  MeshDependentDataType& meshDependentData );
 
       bool preIterate( const RealType& time,
                        const RealType& tau,
                        const MeshType& mesh,
                        DofVectorType& dofs,
-                       DofVectorType& auxDofs );
+                       MeshDependentDataType& meshDependentData );
 
       bool postIterate( const RealType& time,
                         const RealType& tau,
                         const MeshType& mesh,
                         DofVectorType& dofs,
-                        DofVectorType& auxDofs );
-
+                        MeshDependentDataType& meshDependentData );
 
       tnlSolverMonitor< RealType, IndexType >* getSolverMonitor();
 
diff --git a/src/problems/tnlPDEProblem_impl.h b/src/problems/tnlPDEProblem_impl.h
index 33e05b13dd535aac8b2e3f091f5fe4772f94e9e9..9954e2b1f4294c348fd4f197f7e3dfb6e08f273d 100644
--- a/src/problems/tnlPDEProblem_impl.h
+++ b/src/problems/tnlPDEProblem_impl.h
@@ -58,14 +58,15 @@ template< typename Mesh,
           typename Real,
           typename Device,
           typename Index >
-typename tnlPDEProblem< Mesh, Real, Device, Index >::IndexType
+bool
 tnlPDEProblem< Mesh, Real, Device, Index >::
-getAuxiliaryDofs( const MeshType& mesh ) const
+setMeshDependentData( const MeshType& mesh,
+                      MeshDependentDataType& meshDependentData )
 {
    /****
-    * Set-up DOFs and supporting grid functions which will not appear in the discrete solver
+    * Set-up auxiliary data depending on the numerical mesh
     */
-   return 0;
+   return true;
 }
 
 template< typename Mesh,
@@ -74,8 +75,8 @@ template< typename Mesh,
           typename Index >
 void
 tnlPDEProblem< Mesh, Real, Device, Index >::
-bindAuxiliaryDofs( const MeshType& mesh,
-                   DofVectorType& auxiliaryDofs )
+bindMeshDependentData( const MeshType& mesh,
+                       MeshDependentDataType& meshDependentData )
 {
 }
 
diff --git a/src/solvers/CMakeLists.txt b/src/solvers/CMakeLists.txt
index cd08601fdbdeaa3478c1da158037a21d095d8bc9..befbe5500a3fb236123bae2e3c4130577217de98 100755
--- a/src/solvers/CMakeLists.txt
+++ b/src/solvers/CMakeLists.txt
@@ -6,8 +6,8 @@ ADD_SUBDIRECTORY( preconditioners )
 
 SET( headers tnlIterativeSolver.h
              tnlIterativeSolver_impl.h
-             tnlConfigTags.h
-             tnlFastBuildConfig.h
+             tnlBuildConfigTags.h
+             tnlFastBuildConfigTag.h
              tnlMeshTypeResolver.h
              tnlMeshTypeResolver_impl.h
              tnlSolver.h
diff --git a/src/solvers/linear/krylov/tnlBICGStabSolver_impl.h b/src/solvers/linear/krylov/tnlBICGStabSolver_impl.h
index d6c521c7b64405e8d8a2285d7c03a4ae61686878..4919d56bd612cee98420efcaa2136d0a4b9202f2 100644
--- a/src/solvers/linear/krylov/tnlBICGStabSolver_impl.h
+++ b/src/solvers/linear/krylov/tnlBICGStabSolver_impl.h
@@ -50,7 +50,7 @@ tnlBICGStabSolver< Matrix, Preconditioner >::
 configSetup( tnlConfigDescription& config,
              const tnlString& prefix )
 {
-   tnlIterativeSolver< RealType, IndexType >::configSetup( config, prefix );
+   //tnlIterativeSolver< RealType, IndexType >::configSetup( config, prefix );
 }
 
 template< typename Matrix,
@@ -142,7 +142,7 @@ bool tnlBICGStabSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vect
       /****
        * s_j = r_j - alpha_j * A p_j
        */
-      s. alphaXPlusBetaZ( 1.0, r, -alpha, Ap );
+      s.addVectors( r, 1.0, Ap, -alpha );
 
       /****
        * omega_j = ( A s_j, s_j ) / ( A s_j, A s_j )
@@ -164,12 +164,12 @@ bool tnlBICGStabSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vect
       /****
        * x_{j+1} = x_j + alpha_j * p_j + omega_j * s_j
        */
-      x. alphaXPlusBetaZPlusY( alpha, p, omega, s );
+      x.addVectors( p, alpha, s, omega );
       
       /****
        * r_{j+1} = s_j - omega_j * A * s_j
        */
-      r. alphaXPlusBetaZ( 1.0, s, -omega, As );
+      r.addVectors( s, 1.0, As, -omega );
 
       /****
        * beta = alpha_j / omega_j * ( r_{j+1}, r^ast_0 ) / ( r_j, r^ast_0 )
@@ -183,7 +183,9 @@ bool tnlBICGStabSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vect
       /****
        * p_{j+1} = r_{j+1} + beta_j * ( p_j - omega_j * A p_j )
        */
-      RealType residue = computeBICGStabNewP( p, r, beta, omega, Ap );
+      p.addVectors( r, 1.0, Ap, -beta * omega, beta );
+      RealType residue = r.lpNorm( 2.0 );
+      //RealType residue = computeBICGStabNewP( p, r, beta, omega, Ap );
 
       residue /= bNorm;
       this->setResidue( residue );
@@ -213,64 +215,12 @@ bool tnlBICGStabSolver< Matrix, Preconditioner > :: setSize( IndexType size )
        ! As. setSize( size ) ||
        ! M_tmp. setSize( size ) )
    {
-      cerr << "I am not able to allocated all supporting arrays for the BICGStab solver." << endl;
+      cerr << "I am not able to allocate all supporting arrays for the BICGStab solver." << endl;
       return false;
    }
    return true;
 
 };
 
-template< typename RealType,
-          typename Vector >
-RealType computeBICGStabNewPHost( Vector& p,
-                                  const Vector&r,
-                                  const RealType& beta,
-                                  const RealType& omega,
-                                  const Vector& Ap )
-{
-   typedef typename Vector :: IndexType IndexType;
-   const IndexType& size = p. getSize();
-   RealType residue( 0.0 );
-   for( IndexType i = 0; i < size; i ++ )
-   {
-      p[ i ] = r[ i ] + beta * ( p[ i ] - omega * Ap[ i ] );
-      residue += r[ i ] * r[ i ];
-   }
-   return residue;
-}
-
-template< typename RealType,
-          typename Vector >
-RealType computeBICGStabNewPCuda( Vector& p,
-                                  const Vector&r,
-                                  const RealType& beta,
-                                  const RealType& omega,
-                                  const Vector& Ap )
-{
-   abort();
-}
-
-
-template< typename RealType,
-          typename Vector >
-RealType computeBICGStabNewP( Vector& p,
-                              const Vector&r,
-                              const RealType& beta,
-                              const RealType& omega,
-                              const Vector& Ap )
-{
-   typedef typename Vector::DeviceType DeviceType;
-   tnlAssert( DeviceType::getDevice() == tnlHostDevice ||
-              DeviceType::getDevice() == tnlCudaDevice, );
-   switch( DeviceType::getDevice() )
-   {
-      case tnlHostDevice:
-         return computeBICGStabNewPHost( p, r, beta, omega, Ap );
-      case tnlCudaDevice:
-         return computeBICGStabNewPCuda( p, r, beta, omega, Ap );
-   }
-   return 0.0;
-}
-
 
 #endif
diff --git a/src/solvers/linear/krylov/tnlCGSolver_impl.h b/src/solvers/linear/krylov/tnlCGSolver_impl.h
index 70271b6561ac8586759b267f73c250ff58bbf528..ae599af1bbb4e6e51307bee5b9cf6d7df1a92efd 100644
--- a/src/solvers/linear/krylov/tnlCGSolver_impl.h
+++ b/src/solvers/linear/krylov/tnlCGSolver_impl.h
@@ -40,7 +40,7 @@ tnlCGSolver< Matrix, Preconditioner >::
 configSetup( tnlConfigDescription& config,
              const tnlString& prefix )
 {
-   tnlIterativeSolver< RealType, IndexType >::configSetup( config, prefix );
+   //tnlIterativeSolver< RealType, IndexType >::configSetup( config, prefix );
 }
 
 template< typename Matrix,
diff --git a/src/solvers/linear/krylov/tnlGMRESSolver_impl.h b/src/solvers/linear/krylov/tnlGMRESSolver_impl.h
index 011e7e466178036ebb2299659f4a21dafed80a95..cbf970a2297203cbd94c2f2f4f8d44ff7c3e2e26 100644
--- a/src/solvers/linear/krylov/tnlGMRESSolver_impl.h
+++ b/src/solvers/linear/krylov/tnlGMRESSolver_impl.h
@@ -55,8 +55,8 @@ tnlGMRESSolver< Matrix, Preconditioner >::
 configSetup( tnlConfigDescription& config,
              const tnlString& prefix )
 {
-   tnlIterativeSolver< RealType, IndexType >::configSetup( config, prefix );
-   config.addEntry< int >( prefix + "gmres-restarting", "Number of iterations after which the GMRES restarts.", 10 );
+   //tnlIterativeSolver< RealType, IndexType >::configSetup( config, prefix );
+   config.addEntry< int >( prefix + "gmres-restarting", "Number of iterations after which the GMRES restarts.", 10 );   
 }
 
 template< typename Matrix,
@@ -68,6 +68,7 @@ setup( const tnlParameterContainer& parameters,
 {
    tnlIterativeSolver< RealType, IndexType >::setup( parameters, prefix );
    this->setRestarting( parameters.getParameter< int >( "gmres-restarting" ) );
+   return true;
 }
 
 template< typename Matrix,
@@ -105,10 +106,11 @@ bool tnlGMRESSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vector&
            << ". Please set some positive value using the SetRestarting method." << endl;
       return false;
    }
-   if( ! setSize( matrix -> getRows(), restarting ) ) return false;
-
-
-   IndexType i, j = 1, k, l;
+   if( ! setSize( matrix -> getRows(), restarting ) )
+   {
+       cerr << "I am not able to allocate enough memory for the GMRES solver. You may try to decrease the restarting parameter." << endl;
+       return false;
+   }
 
    IndexType _size = size;
 
@@ -132,7 +134,7 @@ bool tnlGMRESSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vector&
       normb = _M_tmp. lpNorm( ( RealType ) 2.0 );
 
       matrix -> vectorProduct( x, _M_tmp );
-      _M_tmp. alphaXPlusBetaY( ( RealType ) 1.0, b, -1.0 );
+      _M_tmp.addVector( b, ( RealType ) 1.0, -1.0 );
       /*for( i = 0; i < size; i ++ )
          M_tmp[ i ] = b[ i ] - M_tmp[ i ];*/
 
@@ -141,11 +143,16 @@ bool tnlGMRESSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vector&
    }
    else
    {
-      matrix -> vectorProduct( x, _r );
+      matrix -> vectorProduct( x, _r );      
       normb = b. lpNorm( ( RealType ) 2.0 );
-      _r. alphaXPlusBetaY( ( RealType ) 1.0, b, -1.0 );
+      _r. addVector( b, ( RealType ) 1.0, -1.0 );
       beta = _r. lpNorm( ( RealType ) 2.0 );
+      //cout << "x = " << x << endl;
    }
+   
+    //cout << "norm b = " << normb << endl;
+    //cout << " beta = " << beta << endl;
+
 
    if( normb == 0.0 ) normb = 1.0;
 
@@ -153,13 +160,13 @@ bool tnlGMRESSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vector&
    this->setResidue( beta / normb );
 
    tnlSharedVector< RealType, DeviceType, IndexType > vi;
-   vi. setName( "tnlGMRESSolver::vi" );
+   //vi. setName( "tnlGMRESSolver::vi" );
    tnlSharedVector< RealType, DeviceType, IndexType > vk;
-   vk. setName( "tnlGMRESSolver::vk" );
+   //vk. setName( "tnlGMRESSolver::vk" );
    while( this->nextIteration() )
    {
       const IndexType m = restarting;
-      for( i = 0; i < m + 1; i ++ )
+      for( IndexType i = 0; i < m + 1; i ++ )
          H[ i ] = s[ i ] = cs[ i ] = sn[ i ] = 0.0;
 
       /****
@@ -181,7 +188,7 @@ bool tnlGMRESSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vector&
       /****
        * Starting m-loop
        */
-      for( i = 0; i < m && this->getIterations() <= this->getMaxIterations(); i++ )
+      for( IndexType i = 0; i < m && this->nextIteration(); i++ )
       {
          vi. bind( &( _v. getData()[ i * size ] ), size );
          /****
@@ -194,37 +201,49 @@ bool tnlGMRESSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vector&
          }
          else
              matrix -> vectorProduct( vi, _w );
-
-         for( k = 0; k <= i; k++ )
-         {
-            vk. bind( &( _v. getData()[ k * _size ] ), _size );
-            /***
-             * H_{k,i} = ( w, v_k )
-             */
-            RealType H_k_i = _w. scalarProduct( vk );
-            H[ k + i * ( m + 1 ) ] = H_k_i;
-
-            /****
-             * w = w - H_{k,i} v_k
-             */
-            _w. addVector( vk, -H_k_i );
-         }
+         
+         //cout << " i = " << i << " vi = " << vi << endl;
+
+         for( IndexType k = 0; k <= i; k++ )
+            H[ k + i * ( m + 1 ) ] = 0.0;
+         for( IndexType l = 0; l < 2; l++ )
+            for( IndexType k = 0; k <= i; k++ )
+            {
+               vk. bind( &( _v. getData()[ k * _size ] ), _size );
+               /***
+                * H_{k,i} = ( w, v_k )
+                */
+               RealType H_k_i = _w. scalarProduct( vk );
+               H[ k + i * ( m + 1 ) ] += H_k_i;           
+
+               /****
+                * w = w - H_{k,i} v_k
+                */
+               _w. addVector( vk, -H_k_i );
+
+               //cout << "H_ki = " << H_k_i << endl;
+               //cout << "w = " << _w << endl;
+            }
          /***
           * H_{i+1,i} = |w|
           */
          RealType normw = _w. lpNorm( ( RealType ) 2.0 );
          H[ i + 1 + i * ( m + 1 ) ] = normw;
 
+         //cout << "normw = " << normw << endl;
+         
          /***
           * v_{i+1} = w / |w|
           */
          vi. bind( &( _v. getData()[ ( i + 1 ) * size ] ), size );
          vi. addVector( _w, ( RealType ) 1.0 / normw );
+         
+         //cout << "vi = " << vi << endl;
 
          /****
           * Applying the Givens rotations
           */
-         for( k = 0; k < i; k++ )
+         for( IndexType k = 0; k < i; k++ )
             applyPlaneRotation( H[ k + i * ( m + 1 )],
                                 H[ k + 1 + i * ( m + 1 ) ],
                                 cs[ k ],
@@ -246,13 +265,13 @@ bool tnlGMRESSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vector&
          this->setResidue( fabs( s[ i + 1 ] ) / normb );
          this->refreshSolverMonitor();
 
-         if( this->getResidue() < this->getConvergenceResidue() )
+         /*if( this->getResidue() < this->getConvergenceResidue() )
          {
             update( i, m, _H, _s, _v, x );
             return true;
          }
          if( ! this->nextIteration() )
-            return false;
+            return false;*/
       }
       update( m - 1, m, _H, _s, _v, x );
 
@@ -263,19 +282,24 @@ bool tnlGMRESSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vector&
       if( preconditioner )
       {
          matrix -> vectorProduct( x, _M_tmp );
-         for( i = 0; i < _size; i ++ )
+         for( IndexType i = 0; i < _size; i ++ )
             M_tmp[ i ] = b[ i ] - M_tmp[ i ];
          //preconditioner -> solve( M_tmp, r );
-         for( i = 0; i < _size; i ++ )
+         for( IndexType i = 0; i < _size; i ++ )
             beta += r[ i ] * r[ i ];
       }
       else
       {
          matrix -> vectorProduct( x, _r );
-         _r. alphaXPlusBetaY( ( RealType ) 1.0, b, -1.0 );
+         _r.addVector( b, ( RealType ) 1.0, -1.0 );
          beta = _r. lpNorm( ( RealType ) 2.0 );
       }
       this->setResidue( beta / normb );
+
+      //cout << " x = " << x << endl;
+      //cout << " beta = " << beta << endl;
+      //cout << "residue = " << beta / normb << endl;
+
    }
    this->refreshSolverMonitor();
    return this->checkConvergence();
diff --git a/src/solvers/linear/krylov/tnlTFQMRSolver_impl.h b/src/solvers/linear/krylov/tnlTFQMRSolver_impl.h
index cc3c51bf2bd9a085702ded5398168741948c6de2..b9f5fefea64720287e51b7b1d2ed7cbaf60c8144 100644
--- a/src/solvers/linear/krylov/tnlTFQMRSolver_impl.h
+++ b/src/solvers/linear/krylov/tnlTFQMRSolver_impl.h
@@ -49,7 +49,7 @@ tnlTFQMRSolver< Matrix, Preconditioner >::
 configSetup( tnlConfigDescription& config,
              const tnlString& prefix )
 {
-   tnlIterativeSolver< RealType, IndexType >::configSetup( config, prefix );
+   //tnlIterativeSolver< RealType, IndexType >::configSetup( config, prefix );
 }
 
 template< typename Matrix,
@@ -59,7 +59,7 @@ tnlTFQMRSolver< Matrix, Preconditioner >::
 setup( const tnlParameterContainer& parameters,
        const tnlString& prefix )
 {
-   tnlIterativeSolver< RealType, IndexType >::setup( parameters, prefix );
+   return tnlIterativeSolver< RealType, IndexType >::setup( parameters, prefix );
 }
 
 template< typename Matrix,
@@ -99,7 +99,7 @@ bool tnlTFQMRSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vector&
    }
    else*/
    {
-      r. alphaXPlusBetaY( -1.0, b, -1.0 );
+      r. addVector( b, -1.0, -1.0 );
       w = u = r;
       matrix -> vectorProduct( u, v );
       d. setValue( 0.0 );
@@ -127,7 +127,7 @@ bool tnlTFQMRSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vector&
       w.addVector( Au, -alpha );
       //cerr << "alpha = " << alpha << endl;
       //cerr << "theta * theta / alpha * eta = " << theta * theta / alpha * eta << endl;
-      d. alphaXPlusBetaY( 1.0, u, theta * theta / alpha * eta );
+      d. addVector( u, 1.0, theta * theta / alpha * eta );
       theta = w. lpNorm( 2.0 ) / tau;
       const RealType c = sqrt( 1.0 + theta * theta );
       tau = tau * theta * c;
@@ -143,7 +143,7 @@ bool tnlTFQMRSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vector&
          Au.addVector( v, beta );
          u.addVector( w, 1.0, beta );
          matrix -> vectorProduct( u, Au_new );
-         v.alphaXPlusBetaZ( 1.0, Au_new, beta, Au );
+         v.addVectors( Au_new, 1.0, Au, beta );
       }
       
       //this -> setResidue( residue );
diff --git a/src/solvers/linear/stationary/tnlSORSolver_impl.h b/src/solvers/linear/stationary/tnlSORSolver_impl.h
index 4de51ef56fabd9c8508b486a5bf9f4eff2443bb6..419e190d55462ec8da4eb0d40b18f1f4c032e285 100644
--- a/src/solvers/linear/stationary/tnlSORSolver_impl.h
+++ b/src/solvers/linear/stationary/tnlSORSolver_impl.h
@@ -39,7 +39,7 @@ tnlSORSolver< Matrix, Preconditioner >::
 configSetup( tnlConfigDescription& config,
              const tnlString& prefix )
 {
-   tnlIterativeSolver< RealType, IndexType >::configSetup( config, prefix );
+   //tnlIterativeSolver< RealType, IndexType >::configSetup( config, prefix );
    config.addEntry< double >( prefix + "sor-omega", "Relaxation parameter of the SOR method.", 1.0 );
 }
 
diff --git a/src/solvers/ode/tnlEulerSolver_impl.h b/src/solvers/ode/tnlEulerSolver_impl.h
index 6e7881cb940b3aa34169d46627d704461c7a6488..2522596e58c280c4f66307d238b7e487963ff9f7 100644
--- a/src/solvers/ode/tnlEulerSolver_impl.h
+++ b/src/solvers/ode/tnlEulerSolver_impl.h
@@ -48,7 +48,7 @@ template< typename Problem >
 void tnlEulerSolver< Problem > :: configSetup( tnlConfigDescription& config,
                                                const tnlString& prefix )
 {
-   tnlExplicitSolver< Problem >::configSetup( config, prefix );
+   //tnlExplicitSolver< Problem >::configSetup( config, prefix );
    config.addEntry< double >( prefix + "euler-cfl", "Coefficient C in the Courant–Friedrichs–Lewy condition.", 0.0 );
 };
 
diff --git a/src/solvers/ode/tnlExplicitSolver_impl.h b/src/solvers/ode/tnlExplicitSolver_impl.h
index 11a49ae9f2d231dd2ea110cb68b1da3896014615..ad35537d4a4bca62ab60fc60157e55b83797a2c1 100644
--- a/src/solvers/ode/tnlExplicitSolver_impl.h
+++ b/src/solvers/ode/tnlExplicitSolver_impl.h
@@ -41,7 +41,7 @@ tnlExplicitSolver< Problem >::
 configSetup( tnlConfigDescription& config,
              const tnlString& prefix )
 {
-   tnlIterativeSolver< typename Problem::RealType, typename Problem::IndexType >::configSetup( config, prefix );
+   //tnlIterativeSolver< typename Problem::RealType, typename Problem::IndexType >::configSetup( config, prefix );
 }
 
 template< typename Problem >
diff --git a/src/solvers/ode/tnlMersonSolver_impl.h b/src/solvers/ode/tnlMersonSolver_impl.h
index 88d69280c3f6e6bfb6532203fe9f424627970d4a..95241e39acf21bc1868a09115d3a50fdf7b1bac5 100644
--- a/src/solvers/ode/tnlMersonSolver_impl.h
+++ b/src/solvers/ode/tnlMersonSolver_impl.h
@@ -111,7 +111,7 @@ template< typename Problem >
 void tnlMersonSolver< Problem > :: configSetup( tnlConfigDescription& config,
                                                 const tnlString& prefix )
 {
-   tnlExplicitSolver< Problem >::configSetup( config, prefix );
+   //tnlExplicitSolver< Problem >::configSetup( config, prefix );
    config.addEntry< double >( prefix + "merson-adaptivity", "Time step adaptivity controlling coefficient (the smaller the more precise the computation is, zero means no adaptivity).", 1.0e-4 );
 };
 
diff --git a/src/solvers/ode/tnlODESolverMonitor_impl.h b/src/solvers/ode/tnlODESolverMonitor_impl.h
index 9a104aa018c51916e0035cd6daadcce6b2fcfab3..4eed193d24e5ff432b24563418dca31fa4167f98 100644
--- a/src/solvers/ode/tnlODESolverMonitor_impl.h
+++ b/src/solvers/ode/tnlODESolverMonitor_impl.h
@@ -28,7 +28,7 @@ tnlODESolverMonitor< RealType, IndexType> :: tnlODESolverMonitor()
 template< typename RealType, typename IndexType >
 void tnlODESolverMonitor< RealType, IndexType> :: refresh()
 {
-   if( this -> verbose > 0 && this -> refreshing % this -> outputPeriod == 0 )
+   if( this -> verbose > 0 && this -> getIterations() % this -> refreshRate == 0 )
    {
       // TODO: add EST
       //cout << " EST: " << estimated;
@@ -41,7 +41,7 @@ void tnlODESolverMonitor< RealType, IndexType> :: refresh()
        /*double flops = ( double ) tnl_flops_counter. getFlops();
        if( flops )
        {
-         cout << " GFLOPS:  " << setw( 8 ) << 1.0e-9 * flops / rt_timer -> GetTime();
+         cout << " GFLOPS:  " << setw( 8 ) << 1.0e-9 * flops / rt_timer -> getTime();
        }*/
        cout << "   \r" << flush;
     }
diff --git a/src/solvers/pde/CMakeLists.txt b/src/solvers/pde/CMakeLists.txt
index 80a4500775ce37e32e6c62b2292b06e80bde0c55..6521acc8b569c435f86b6f1597275dceee6c9db1 100755
--- a/src/solvers/pde/CMakeLists.txt
+++ b/src/solvers/pde/CMakeLists.txt
@@ -7,6 +7,8 @@ SET( headers tnlPDESolver.h
              tnlSemiImplicitTimeStepper.h
              tnlSemiImplicitTimeStepper_impl.h
              tnlLinearSystemAssembler.h
-             tnlLinearSystemAssembler_impl.h  )
+             tnlLinearSystemAssembler_impl.h
+             tnlBackwardTimeDiscretisation.h
+             tnlNoTimeDiscretisation.h  )
              
 INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/solvers/pde )
\ No newline at end of file
diff --git a/src/solvers/pde/tnlBackwardTimeDiscretisation.h b/src/solvers/pde/tnlBackwardTimeDiscretisation.h
new file mode 100644
index 0000000000000000000000000000000000000000..4c0f11218e1ce22db648d17946f0692ccfef36ff
--- /dev/null
+++ b/src/solvers/pde/tnlBackwardTimeDiscretisation.h
@@ -0,0 +1,44 @@
+/***************************************************************************
+                          tnlBackwardTimeDiscretisation.h  -  description
+                             -------------------
+    begin                : Apr 4, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+
+#ifndef TNLBACKWARDTIMEDISCRETISATION_H
+#define	TNLBACKWARDTIMEDISCRETISATION_H
+
+#include <core/tnlCuda.h>
+
+class tnlBackwardTimeDiscretisation
+{
+    public:        
+        
+        template< typename RealType,
+                  typename IndexType,
+                  typename MatrixType >
+        __cuda_callable__ static void applyTimeDiscretisation( MatrixType& matrix,
+                                                               RealType& b,
+                                                               const IndexType index,
+                                                               const RealType& u,
+                                                               const RealType& tau,
+                                                               const RealType& rhs )
+        {
+            b += u + tau * rhs;
+            matrix.addElementFast( index, index, 1.0, 1.0 );
+        }
+};
+
+#endif	/* TNLBACKWARDTIMEDISCRETISATION_H */
+
diff --git a/src/solvers/pde/tnlExplicitTimeStepper.h b/src/solvers/pde/tnlExplicitTimeStepper.h
index 1f4bf0af0fede03b042dfaf61d395c0301a18f26..9d3dc11a2754673c08fc008b6ebeabb0caa354fd 100644
--- a/src/solvers/pde/tnlExplicitTimeStepper.h
+++ b/src/solvers/pde/tnlExplicitTimeStepper.h
@@ -21,6 +21,8 @@
 #include <solvers/ode/tnlODESolverMonitor.h>
 #include <config/tnlConfigDescription.h>
 #include <config/tnlParameterContainer.h>
+#include <core/tnlTimerRT.h>
+#include <core/tnlLogger.h>
 
 
 template< typename Problem,
@@ -36,6 +38,7 @@ class tnlExplicitTimeStepper
    typedef typename Problem::IndexType IndexType;
    typedef typename Problem::MeshType MeshType;
    typedef typename ProblemType::DofVectorType DofVectorType;
+   typedef typename ProblemType::MeshDependentDataType MeshDependentDataType;
 
    tnlExplicitTimeStepper();
 
@@ -61,12 +64,14 @@ class tnlExplicitTimeStepper
                const RealType& stopTime,
                const MeshType& mesh,
                DofVectorType& dofVector,
-               DofVectorType& auxiliaryDofVector );
+               MeshDependentDataType& meshDependentData );
 
    void getExplicitRHS( const RealType& time,
                         const RealType& tau,
                         DofVectorType& _u,
                         DofVectorType& _fu );
+   
+   bool writeEpilog( tnlLogger& logger );
 
    protected:
 
@@ -78,7 +83,9 @@ class tnlExplicitTimeStepper
 
    RealType timeStep;
 
-   DofVectorType* auxiliaryDofs;
+   MeshDependentDataType* meshDependentData;
+   
+   tnlTimerRT explicitUpdaterTimer;
 };
 
 #include <solvers/pde/tnlExplicitTimeStepper_impl.h>
diff --git a/src/solvers/pde/tnlExplicitTimeStepper_impl.h b/src/solvers/pde/tnlExplicitTimeStepper_impl.h
index 7f8359d0a3fcd14cfe9a7f76ff42f56d3324917d..e47e0c562eeaa1f0040abc80e4763897330dc04c 100644
--- a/src/solvers/pde/tnlExplicitTimeStepper_impl.h
+++ b/src/solvers/pde/tnlExplicitTimeStepper_impl.h
@@ -18,6 +18,9 @@
 #ifndef TNLEXPLICITTIMESTEPPER_IMPL_H_
 #define TNLEXPLICITTIMESTEPPER_IMPL_H_
 
+#include "tnlExplicitTimeStepper.h"
+
+
 template< typename Problem,
           template < typename OdeProblem > class OdeSolver >
 tnlExplicitTimeStepper< Problem, OdeSolver >::
@@ -53,6 +56,7 @@ bool
 tnlExplicitTimeStepper< Problem, OdeSolver >::
 init( const MeshType& mesh )
 {
+   this->explicitUpdaterTimer.reset();
    return true;
 }
 
@@ -106,7 +110,7 @@ solve( const RealType& time,
        const RealType& stopTime,
        const MeshType& mesh,
        DofVectorType& dofVector,
-       DofVectorType& auxiliaryDofVector )
+       MeshDependentDataType& meshDependentData )
 {
    tnlAssert( this->odeSolver, );
    this->odeSolver->setTau( this -> timeStep );
@@ -116,33 +120,37 @@ solve( const RealType& time,
    if( this->odeSolver->getMinIterations() )
       this->odeSolver->setMaxTau( ( stopTime - time ) / ( typename OdeSolver< Problem >::RealType ) this->odeSolver->getMinIterations() );
    this->mesh = &mesh;
-   this->auxiliaryDofs = &auxiliaryDofVector;
+   this->meshDependentData = &meshDependentData;
    return this->odeSolver->solve( dofVector );
 }
 
 template< typename Problem,
           template < typename OdeProblem > class OdeSolver >
-void tnlExplicitTimeStepper< Problem, OdeSolver >::getExplicitRHS( const RealType& time,
-                                                                   const RealType& tau,
-                                                                   DofVectorType& u,
-                                                                   DofVectorType& fu )
+void
+tnlExplicitTimeStepper< Problem, OdeSolver >::
+getExplicitRHS( const RealType& time,
+                const RealType& tau,
+                DofVectorType& u,
+                DofVectorType& fu )
 {
    if( ! this->problem->preIterate( time,
                                     tau,
                                     *( this->mesh),
                                     u,
-                                    *( this->auxiliaryDofs ) ) )
+                                    *( this->meshDependentData ) ) )
    {
       cerr << endl << "Preiteration failed." << endl;
       return;
       //return false; // TODO: throw exception
    }
-   this->problem->getExplicitRHS( time, tau, *( this->mesh ), u, fu );
+   this->explicitUpdaterTimer.start();   
+   this->problem->getExplicitRHS( time, tau, *( this->mesh ), u, fu, *( this->meshDependentData ) );
+   this->explicitUpdaterTimer.stop();
    if( ! this->problem->postIterate( time,
                                      tau,
                                      *( this->mesh ),
                                      u,
-                                     *( this->auxiliaryDofs ) ) )
+                                     *( this->meshDependentData ) ) )
    {
       cerr << endl << "Postiteration failed." << endl;
       return;
@@ -150,4 +158,14 @@ void tnlExplicitTimeStepper< Problem, OdeSolver >::getExplicitRHS( const RealTyp
    }
 }
 
+template< typename Problem,
+          template < typename OdeProblem > class OdeSolver >
+bool
+tnlExplicitTimeStepper< Problem, OdeSolver >::
+writeEpilog( tnlLogger& logger )
+{
+   logger.writeParameter< double >( "Explicit update computation time:", this->explicitUpdaterTimer.getTime() );
+   return true;
+}
+
 #endif /* TNLEXPLICITTIMESTEPPER_IMPL_H_ */
diff --git a/src/solvers/pde/tnlExplicitUpdater.h b/src/solvers/pde/tnlExplicitUpdater.h
index 01432ace7fba41d153cf9f8b0520a959018e54f7..de64bd1563afba9109e1178bbb84dc1a0533da04 100644
--- a/src/solvers/pde/tnlExplicitUpdater.h
+++ b/src/solvers/pde/tnlExplicitUpdater.h
@@ -18,14 +18,14 @@
 #ifndef TNLEXPLICITUPDATER_H_
 #define TNLEXPLICITUPDATER_H_
 
-#include <functions/tnlFunctionAdapter.h>
+#include <functors/tnlFunctorAdapter.h>
 
 template< typename Real,
           typename DofVector,
           typename DifferentialOperator,
           typename BoundaryConditions,
           typename RightHandSide >
-class tnlExplicitUpdaterTraversalUserData
+class tnlExplicitUpdaterTraverserUserData
 {
    public:
 
@@ -39,7 +39,7 @@ class tnlExplicitUpdaterTraversalUserData
 
       DofVector *u, *fu;
 
-      tnlExplicitUpdaterTraversalUserData( const Real& time,
+      tnlExplicitUpdaterTraverserUserData( const Real& time,
                                            const DifferentialOperator& differentialOperator,
                                            const BoundaryConditions& boundaryConditions,
                                            const RightHandSide& rightHandSide,
@@ -67,11 +67,11 @@ class tnlExplicitUpdater
       typedef typename DofVector::RealType RealType;
       typedef typename DofVector::DeviceType DeviceType;
       typedef typename DofVector::IndexType IndexType;
-      typedef tnlExplicitUpdaterTraversalUserData< RealType,
+      typedef tnlExplicitUpdaterTraverserUserData< RealType,
                                                    DofVector,
                                                    DifferentialOperator,
                                                    BoundaryConditions,
-                                                   RightHandSide > TraversalUserData;
+                                                   RightHandSide > TraverserUserData;
 
       template< int EntityDimensions >
       void update( const RealType& time,
@@ -82,7 +82,7 @@ class tnlExplicitUpdater
                    DofVector& u,
                    DofVector& fu ) const;
 
-      class TraversalBoundaryEntitiesProcessor
+      class TraverserBoundaryEntitiesProcessor
       {
          public:
 
@@ -91,7 +91,7 @@ class tnlExplicitUpdater
             __host__ __device__
 #endif
             static void processEntity( const MeshType& mesh,
-                                       TraversalUserData& userData,
+                                       TraverserUserData& userData,
                                        const IndexType index )
             {
                userData.boundaryConditions->setBoundaryConditions( *userData.time,
@@ -103,7 +103,7 @@ class tnlExplicitUpdater
 
       };
 
-      class TraversalInteriorEntitiesProcessor
+      class TraverserInteriorEntitiesProcessor
       {
          public:
 
@@ -112,14 +112,14 @@ class tnlExplicitUpdater
             __host__ __device__
 #endif
             static void processEntity( const MeshType& mesh,
-                                       TraversalUserData& userData,
+                                       TraverserUserData& userData,
                                        const IndexType index )
             {
                (* userData.fu )[ index ] = userData.differentialOperator->getValue( mesh,
                                                                                     index,
                                                                                     *userData.u,
                                                                                     *userData.time );
-               typedef tnlFunctionAdapter< MeshType, RightHandSide > FunctionAdapter;
+               typedef tnlFunctorAdapter< MeshType, RightHandSide > FunctionAdapter;
                ( *userData.fu )[ index ] += FunctionAdapter::getValue( mesh,
                                                                        *userData.rightHandSide,
                                                                        index,
@@ -151,11 +151,11 @@ class tnlExplicitUpdater< tnlGrid< Dimensions, Real, Device, Index >,
       typedef typename MeshType::DeviceType DeviceType;
       typedef typename MeshType::IndexType IndexType;
       typedef typename MeshType::CoordinatesType CoordinatesType;
-      typedef tnlExplicitUpdaterTraversalUserData< RealType,
+      typedef tnlExplicitUpdaterTraverserUserData< RealType,
                                                    DofVector,
                                                    DifferentialOperator,
                                                    BoundaryConditions,
-                                                   RightHandSide > TraversalUserData;
+                                                   RightHandSide > TraverserUserData;
       
       template< int EntityDimensions >
       void update( const RealType& time,
@@ -166,7 +166,7 @@ class tnlExplicitUpdater< tnlGrid< Dimensions, Real, Device, Index >,
                    DofVector& u,
                    DofVector& fu ) const;
 
-      class TraversalBoundaryEntitiesProcessor
+      class TraverserBoundaryEntitiesProcessor
       {
          public:
 
@@ -178,7 +178,7 @@ class tnlExplicitUpdater< tnlGrid< Dimensions, Real, Device, Index >,
             __host__ __device__
 #endif
             static void processCell( const MeshType& mesh,
-                                     TraversalUserData& userData,
+                                     TraverserUserData& userData,
                                      const IndexType index,
                                      const CoordinatesType& coordinates )
             {
@@ -190,9 +190,26 @@ class tnlExplicitUpdater< tnlGrid< Dimensions, Real, Device, Index >,
                                                                    *userData.fu );
             }
 
+#ifdef HAVE_CUDA
+            __host__ __device__
+#endif
+            static void processFace( const MeshType& mesh,
+                                     TraverserUserData& userData,
+                                     const IndexType index,
+                                     const CoordinatesType& coordinates )
+            {
+               userData.boundaryConditions->setBoundaryConditions( *userData.time,
+                                                                   mesh,
+                                                                   index,
+                                                                   coordinates,
+                                                                   *userData.u,
+                                                                   *userData.fu );
+            }
+
+
       };
 
-      class TraversalInteriorEntitiesProcessor
+      class TraverserInteriorEntitiesProcessor
       {
          public:
 
@@ -202,7 +219,7 @@ class tnlExplicitUpdater< tnlGrid< Dimensions, Real, Device, Index >,
             __host__ __device__
 #endif
             static void processCell( const MeshType& mesh,
-                                     TraversalUserData& userData,
+                                     TraverserUserData& userData,
                                      const IndexType index,
                                      const CoordinatesType& coordinates )
             {
@@ -212,7 +229,7 @@ class tnlExplicitUpdater< tnlGrid< Dimensions, Real, Device, Index >,
                                                                                    *userData.u,
                                                                                    *userData.time );
 
-               typedef tnlFunctionAdapter< MeshType, RightHandSide > FunctionAdapter;
+               typedef tnlFunctorAdapter< MeshType, RightHandSide > FunctionAdapter;
                ( * userData.fu )[ index ] += FunctionAdapter::getValue( mesh,
                                                                         *userData.rightHandSide,
                                                                         index,
@@ -220,6 +237,27 @@ class tnlExplicitUpdater< tnlGrid< Dimensions, Real, Device, Index >,
                                                                         *userData.time );
             }
 
+#ifdef HAVE_CUDA
+            __host__ __device__
+#endif
+            static void processFace( const MeshType& mesh,
+                                     TraverserUserData& userData,
+                                     const IndexType index,
+                                     const CoordinatesType& coordinates )
+            {
+               ( *userData.fu)[ index ] = userData.differentialOperator->getValue( mesh,
+                                                                                   index,
+                                                                                   coordinates,
+                                                                                   *userData.u,
+                                                                                   *userData.time );
+
+               typedef tnlFunctorAdapter< MeshType, RightHandSide > FunctionAdapter;
+               ( * userData.fu )[ index ] += FunctionAdapter::getValue( mesh,
+                                                                        *userData.rightHandSide,
+                                                                        index,
+                                                                        coordinates,
+                                                                        *userData.time );
+            }
       };
 
 };
diff --git a/src/solvers/pde/tnlExplicitUpdater_impl.h b/src/solvers/pde/tnlExplicitUpdater_impl.h
index 172ea8700831ec1c7833d44331a87c6cdc27864b..35f80644ca7f3bec21172bd3ab12819410e8f432 100644
--- a/src/solvers/pde/tnlExplicitUpdater_impl.h
+++ b/src/solvers/pde/tnlExplicitUpdater_impl.h
@@ -40,14 +40,14 @@ update( const RealType& time,
 {
    if( DeviceType::DeviceType == tnlHostDevice )
    {
-      TraversalUserData userData( time, differentialOperator, boundaryConditions, rightHandSide, u, fu );
-      tnlTraverser< MeshType, EntityDimensions > meshTraversal;
-      meshTraversal.template processBoundaryEntities< TraversalUserData,
-                                                      TraversalBoundaryEntitiesProcessor >
+      TraverserUserData userData( time, differentialOperator, boundaryConditions, rightHandSide, u, fu );
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserBoundaryEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
-      meshTraversal.template processInteriorEntities< TraversalUserData,
-                                                      TraversalInteriorEntitiesProcessor >
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserInteriorEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
 
@@ -60,15 +60,15 @@ update( const RealType& time,
       RightHandSide* kernelRightHandSide = tnlCuda::passToDevice( rightHandSide );
       DofVector* kernelU = tnlCuda::passToDevice( u );
       DofVector* kernelFu = tnlCuda::passToDevice( fu );
-      TraversalUserData userData( *kernelTime, *kernelDifferentialOperator, *kernelBoundaryConditions, *kernelRightHandSide, *kernelU, *kernelFu );
+      TraverserUserData userData( *kernelTime, *kernelDifferentialOperator, *kernelBoundaryConditions, *kernelRightHandSide, *kernelU, *kernelFu );
       checkCudaDevice;
-      tnlTraverser< MeshType, EntityDimensions > meshTraversal;
-      meshTraversal.template processBoundaryEntities< TraversalUserData,
-                                                      TraversalBoundaryEntitiesProcessor >
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserBoundaryEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
-      meshTraversal.template processInteriorEntities< TraversalUserData,
-                                                      TraversalInteriorEntitiesProcessor >
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserInteriorEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
 
@@ -105,14 +105,14 @@ update( const RealType& time,
 
    if( ( tnlDeviceEnum ) DeviceType::DeviceType == tnlHostDevice )
    {
-      TraversalUserData userData( time, differentialOperator, boundaryConditions, rightHandSide, u, fu );
-      tnlTraverser< MeshType, EntityDimensions > meshTraversal;
-      meshTraversal.template processBoundaryEntities< TraversalUserData,
-                                                      TraversalBoundaryEntitiesProcessor >
+      TraverserUserData userData( time, differentialOperator, boundaryConditions, rightHandSide, u, fu );
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserBoundaryEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
-      meshTraversal.template processInteriorEntities< TraversalUserData,
-                                                      TraversalInteriorEntitiesProcessor >
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserInteriorEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
 
@@ -127,14 +127,14 @@ update( const RealType& time,
       DofVector* kernelU = tnlCuda::passToDevice( u );
       DofVector* kernelFu = tnlCuda::passToDevice( fu );
       checkCudaDevice;
-      TraversalUserData userData( *kernelTime, *kernelDifferentialOperator, *kernelBoundaryConditions, *kernelRightHandSide, *kernelU, *kernelFu );
-      tnlTraverser< MeshType, EntityDimensions > meshTraversal;
-      meshTraversal.template processBoundaryEntities< TraversalUserData,
-                                                      TraversalBoundaryEntitiesProcessor >
+      TraverserUserData userData( *kernelTime, *kernelDifferentialOperator, *kernelBoundaryConditions, *kernelRightHandSide, *kernelU, *kernelFu );
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserBoundaryEntitiesProcessor >
                                                    ( mesh,
                                                      userData );
-      meshTraversal.template processInteriorEntities< TraversalUserData,
-                                                      TraversalInteriorEntitiesProcessor >
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserInteriorEntitiesProcessor >
                                                    ( mesh,
                                                      userData );
 
diff --git a/src/solvers/pde/tnlLinearSystemAssembler.h b/src/solvers/pde/tnlLinearSystemAssembler.h
index 0898e1e2938534beaf071b4297a65fbe10a27ba4..065f2cf3d9f5781a353457b6659f0567bdbb8fe5 100644
--- a/src/solvers/pde/tnlLinearSystemAssembler.h
+++ b/src/solvers/pde/tnlLinearSystemAssembler.h
@@ -18,13 +18,15 @@
 #ifndef TNLLINEARSYSTEMASSEMBLER_H_
 #define TNLLINEARSYSTEMASSEMBLER_H_
 
+#include <functors/tnlFunctorAdapter.h>
+
 template< typename Real,
           typename DofVector,
           typename DifferentialOperator,
           typename BoundaryConditions,
           typename RightHandSide,
           typename Matrix >
-class tnlLinearSystemAssemblerTraversalUserData
+class tnlLinearSystemAssemblerTraverserUserData
 {
    public:
       typedef Matrix MatrixType;
@@ -44,8 +46,11 @@ class tnlLinearSystemAssemblerTraversalUserData
 
       Matrix *matrix;
 
-      tnlLinearSystemAssemblerTraversalUserData( const Real& time,
+      const Real* timeDiscretisationCoefficient;
+
+      tnlLinearSystemAssemblerTraverserUserData( const Real& time,
                                                  const Real& tau,
+                                                 const Real& timeDiscretisationCoefficient,
                                                  const DifferentialOperator& differentialOperator,
                                                  const BoundaryConditions& boundaryConditions,
                                                  const RightHandSide& rightHandSide,
@@ -54,6 +59,7 @@ class tnlLinearSystemAssemblerTraversalUserData
                                                  DofVector& b )
       : time( &time ),
         tau( &tau ),
+        timeDiscretisationCoefficient( &timeDiscretisationCoefficient ),
         differentialOperator( &differentialOperator ),
         boundaryConditions( &boundaryConditions ),
         rightHandSide( &rightHandSide ),
@@ -72,6 +78,7 @@ template< typename Mesh,
           typename DifferentialOperator,
           typename BoundaryConditions,
           typename RightHandSide,
+          typename TimeDiscretisation,
           typename Matrix >
 class tnlLinearSystemAssembler
 {
@@ -81,12 +88,12 @@ class tnlLinearSystemAssembler
    typedef typename DofVector::DeviceType DeviceType;
    typedef typename DofVector::IndexType IndexType;
    typedef Matrix MatrixType;
-   typedef tnlLinearSystemAssemblerTraversalUserData< RealType,
+   typedef tnlLinearSystemAssemblerTraverserUserData< RealType,
                                                       DofVector,
                                                       DifferentialOperator,
                                                       BoundaryConditions,
                                                       RightHandSide,
-                                                      MatrixType > TraversalUserData;
+                                                      MatrixType > TraverserUserData;
 
    template< int EntityDimensions >
    void assembly( const RealType& time,
@@ -99,7 +106,7 @@ class tnlLinearSystemAssembler
                   MatrixType& matrix,
                   DofVector& b ) const;
 
-   class TraversalBoundaryEntitiesProcessor
+   class TraverserBoundaryEntitiesProcessor
    {
       public:
 
@@ -108,25 +115,20 @@ class tnlLinearSystemAssembler
          __host__ __device__
 #endif
          static void processEntity( const MeshType& mesh,
-                                    TraversalUserData& userData,
+                                    TraverserUserData& userData,
                                     const IndexType index )
          {
-            typename MatrixType::MatrixRow matrixRow = userData.matrix->getRow( index );
-            userData.boundaryConditions->updateLinearSystem( *userData.time,
-                                                            mesh,
-                                                            index,
-                                                            *userData.u,
-                                                            *userData.b,
-                                                            matrixRow );
-            /*userData.matrix->setRowFast( index,
-                                         userData.columns->getData(),
-                                         userData.values->getData(),
-                                         rowLength );*/
+            userData.boundaryConditions->updateLinearSystem( *userData.time + *userData.tau,
+                                                             mesh,
+                                                             index,
+                                                             *userData.u,
+                                                             *userData.b,
+                                                             *userData.matrix );
          }
 
    };
 
-   class TraversalInteriorEntitiesProcessor
+   class TraverserInteriorEntitiesProcessor
    {
       public:
 
@@ -135,31 +137,36 @@ class tnlLinearSystemAssembler
          __host__ __device__
 #endif
          static void processEntity( const MeshType& mesh,
-                                    TraversalUserData& userData,
+                                    TraverserUserData& userData,
                                     const IndexType index )
          {
-            typedef tnlFunctionAdapter< MeshType, RightHandSide > FunctionAdapter;
-            ( *userData.b )[ index ] = ( *userData.u )[ index ] +
+            typedef tnlFunctorAdapter< MeshType, RightHandSide > FunctionAdapter;
+            ( *userData.b )[ index ] = 0.0;/*( *userData.u )[ index ] +
                      ( *userData.tau ) * FunctionAdapter::getValue( mesh,
-                                                             *userData.rightHandSide,
-                                                             index,
-                                                             *userData.time );
+                                                                    *userData.rightHandSide,
+                                                                    index,
+                                                                    *userData.time );*/
 
-            typename MatrixType::MatrixRow matrixRow = userData.matrix->getRow( index );
             userData.differentialOperator->updateLinearSystem( *userData.time,
                                                                *userData.tau,
                                                                mesh,
                                                                index,
                                                                *userData.u,
                                                                *userData.b,
-                                                               matrixRow );
-            /*userData.matrix->setRowFast( index,
-                                         userData.columns->getData(),
-                                         userData.values->getData(),
-                                         rowLength );*/
-            userData.matrix->addElement( index, index, 1.0, 1.0 );
+                                                               *userData.matrix );
+            //userData.matrix->addElement( index, index, 1.0, 1.0 );
+            const RealType& rhs = FunctionAdapter::getValue( mesh,
+                                                             *userData.rightHandSide,
+                                                             index,
+                                                             *userData.time );
+            TimeDiscretisation::applyTimeDiscretisation( *userData.matrix,
+                                                         ( *userData.b )[ index ],
+                                                         index,
+                                                         ( *userData.u )[ index ],
+                                                         ( *userData.tau ),
+                                                         rhs );
+            
          }
-
    };
 };
 
@@ -171,12 +178,14 @@ template< int Dimensions,
           typename DifferentialOperator,
           typename BoundaryConditions,
           typename RightHandSide,
+          typename TimeDiscretisation,
           typename Matrix >
 class tnlLinearSystemAssembler< tnlGrid< Dimensions, Real, Device, Index >,
                                 DofVector,
                                 DifferentialOperator,
                                 BoundaryConditions,
                                 RightHandSide,
+                                TimeDiscretisation,
                                 Matrix >
 {
    public:
@@ -186,12 +195,15 @@ class tnlLinearSystemAssembler< tnlGrid< Dimensions, Real, Device, Index >,
    typedef typename DofVector::IndexType IndexType;
    typedef Matrix MatrixType;
    typedef typename MeshType::CoordinatesType CoordinatesType;
-   typedef tnlLinearSystemAssemblerTraversalUserData< RealType,
+   typedef tnlLinearSystemAssemblerTraverserUserData< RealType,
                                                       DofVector,
                                                       DifferentialOperator,
                                                       BoundaryConditions,
                                                       RightHandSide,
-                                                      MatrixType > TraversalUserData;
+                                                      MatrixType > TraverserUserData;
+
+   tnlLinearSystemAssembler()
+   : timeDiscretisationCoefficient( 1.0 ){}
 
    template< int EntityDimensions >
    void assembly( const RealType& time,
@@ -204,7 +216,15 @@ class tnlLinearSystemAssembler< tnlGrid< Dimensions, Real, Device, Index >,
                   MatrixType& matrix,
                   DofVector& b ) const;
 
-   class TraversalBoundaryEntitiesProcessor
+   /****
+    * TODO: Fix this. Somehow.
+    */
+   void setTimeDiscretisationCoefficient( const Real& c )
+   {
+      this->timeDiscretisationCoefficient = c;
+   }
+
+   class TraverserBoundaryEntitiesProcessor
    {
       public:
 
@@ -212,45 +232,71 @@ class tnlLinearSystemAssembler< tnlGrid< Dimensions, Real, Device, Index >,
          __host__ __device__
 #endif
          static void processCell( const MeshType& mesh,
-                                  TraversalUserData& userData,
+                                  TraverserUserData& userData,
                                   const IndexType index,
                                   const CoordinatesType& coordinates )
          {
             //printf( "index = %d \n", index );
-            typename MatrixType::MatrixRow matrixRow = userData.matrix->getRow( index );
+             ( *userData.b )[ index ] = 0.0;           
+            userData.boundaryConditions->updateLinearSystem( *userData.time + *userData.tau,
+                                                             mesh,
+                                                             index,
+                                                             coordinates,
+                                                             *userData.u,
+                                                             *userData.b,
+                                                             *userData.matrix );
+         }
+
+#ifdef HAVE_CUDA
+         __host__ __device__
+#endif
+         static void processFace( const MeshType& mesh,
+                                  TraverserUserData& userData,
+                                  const IndexType index,
+                                  const CoordinatesType& coordinates )
+         {
+            //printf( "index = %d \n", index );
+            // printf("Matrix assembler: Index = %d \n", index );
+            ( *userData.b )[ index ] = 0.0;
             userData.boundaryConditions->updateLinearSystem( *userData.time,
                                                              mesh,
                                                              index,
                                                              coordinates,
                                                              *userData.u,
                                                              *userData.b,
-                                                             matrixRow );
+                                                             *userData.matrix );
+            //printf( "BC: index = %d, b = %f \n", index, ( *userData.b )[ index ] );
          }
 
+
    };
 
-   class TraversalInteriorEntitiesProcessor
+   class TraverserInteriorEntitiesProcessor
    {
       public:
 
+      /****
+       *
+       * TODO: FIX THIS. The assembler is not designed properly for the stationary problems!!!
+       *
+       */
 #ifdef HAVE_CUDA
          __host__ __device__
 #endif
          static void processCell( const MeshType& mesh,
-                                  TraversalUserData& userData,
+                                  TraverserUserData& userData,
                                   const IndexType index,
                                   const CoordinatesType& coordinates )
          {
             //printf( "index = %d \n", index );
-            typedef tnlFunctionAdapter< MeshType, RightHandSide > FunctionAdapter;
-            ( *userData.b )[ index ] = ( *userData.u )[ index ] +
+            typedef tnlFunctorAdapter< MeshType, RightHandSide > FunctionAdapter;
+            ( *userData.b )[ index ] = 0.0; /*( *userData.timeDiscretisationCoefficient) * ( *userData.u )[ index ] +
                                   ( *userData.tau ) * FunctionAdapter::getValue( mesh,
                                                              *userData.rightHandSide,
                                                              index,
                                                              coordinates,
-                                                             *userData.time );
+                                                             *userData.time );*/
             
-            typename MatrixType::MatrixRow matrixRow = userData.matrix->getRow( index );
             userData.differentialOperator->updateLinearSystem( *userData.time,
                                                                *userData.tau,
                                                                mesh,
@@ -258,11 +304,77 @@ class tnlLinearSystemAssembler< tnlGrid< Dimensions, Real, Device, Index >,
                                                                coordinates,
                                                                *userData.u,
                                                                *userData.b,
-                                                               matrixRow );
-            userData.matrix->addElementFast( index, index, 1.0, 1.0 );
+                                                               *userData.matrix );
+            /*if( *userData.timeDiscretisationCoefficient != 0.0 )
+               userData.matrix->addElementFast( index,
+                                                index,
+                                                *userData.timeDiscretisationCoefficient,
+                                                1.0 );*/
+            
+            const RealType& rhs = FunctionAdapter::getValue( mesh,
+                                                             *userData.rightHandSide,
+                                                             index,
+                                                             coordinates,
+                                                             *userData.time );
+            TimeDiscretisation::applyTimeDiscretisation( *userData.matrix,
+                                                         ( *userData.b )[ index ],
+                                                         index,
+                                                         ( *userData.u )[ index ],
+                                                         ( *userData.tau ),
+                                                         rhs );
+            //printf( "IC: index = %d, b = %f \n", index, ( *userData.b )[ index ] );
          }
 
+#ifdef HAVE_CUDA
+         __host__ __device__
+#endif
+         static void processFace( const MeshType& mesh,
+                                  TraverserUserData& userData,
+                                  const IndexType index,
+                                  const CoordinatesType& coordinates )
+         {
+            //printf( "index = %d \n", index );
+            // printf("Matrix assembler: Index = %d \n", index );
+            typedef tnlFunctorAdapter< MeshType, RightHandSide > FunctionAdapter;
+            ( *userData.b )[ index ] = 0.0; /*( *userData.timeDiscretisationCoefficient) * ( *userData.u )[ index ] +
+                                  ( *userData.tau ) * FunctionAdapter::getValue( mesh,
+                                                             *userData.rightHandSide,
+                                                             index,
+                                                             coordinates,
+                                                             *userData.time );*/
+
+            userData.differentialOperator->updateLinearSystem( *userData.time,
+                                                               *userData.tau,
+                                                               mesh,
+                                                               index,
+                                                               coordinates,
+                                                               *userData.u,
+                                                               *userData.b,
+                                                               *userData.matrix );
+            /*if( *userData.timeDiscretisationCoefficient != 0.0 )
+               userData.matrix->addElementFast( index,
+                                                index,
+                                                *userData.timeDiscretisationCoefficient,
+                                                1.0 );*/
+            
+            const RealType& rhs = FunctionAdapter::getValue( mesh,
+                                                             *userData.rightHandSide,
+                                                             index,
+                                                             coordinates,
+                                                             *userData.time );
+            TimeDiscretisation::applyTimeDiscretisation( *userData.matrix,
+                                                         ( *userData.b )[ index ],
+                                                         index,
+                                                         ( *userData.u )[ index ],
+                                                         ( *userData.tau ),
+                                                         rhs );
+
+         }
    };
+
+   protected:
+
+   Real timeDiscretisationCoefficient;
 };
 
 #include <solvers/pde/tnlLinearSystemAssembler_impl.h>
diff --git a/src/solvers/pde/tnlLinearSystemAssembler_impl.h b/src/solvers/pde/tnlLinearSystemAssembler_impl.h
index d520ea2b6304a772096557767dbb8719b5f076b5..a884e636311136b1b0f56b0bdf1c3ffa492c2862 100644
--- a/src/solvers/pde/tnlLinearSystemAssembler_impl.h
+++ b/src/solvers/pde/tnlLinearSystemAssembler_impl.h
@@ -27,10 +27,11 @@ template< typename Mesh,
           typename DifferentialOperator,
           typename BoundaryConditions,
           typename RightHandSide,
+          typename TimeDiscretisation,
           typename Matrix >
    template< int EntityDimensions >
 void
-tnlLinearSystemAssembler< Mesh, DofVector, DifferentialOperator, BoundaryConditions, RightHandSide, Matrix >::
+tnlLinearSystemAssembler< Mesh, DofVector, DifferentialOperator, BoundaryConditions, RightHandSide, TimeDiscretisation, Matrix >::
 assembly( const RealType& time,
           const RealType& tau,
           const Mesh& mesh,
@@ -43,21 +44,21 @@ assembly( const RealType& time,
 {
    const IndexType maxRowLength = matrix.getMaxRowLength();
    tnlAssert( maxRowLength > 0, );
-   typedef typename TraversalUserData::RowValuesType RowValuesType;
-   typedef typename TraversalUserData::RowColumnsType RowColumnsType;
+   typedef typename TraverserUserData::RowValuesType RowValuesType;
+   typedef typename TraverserUserData::RowColumnsType RowColumnsType;
    RowValuesType values;
    RowColumnsType columns;
 
    if( DeviceType::DeviceType == tnlHostDevice )
    {
-      TraversalUserData userData( time, tau, differentialOperator, boundaryConditions, rightHandSide, u, matrix, b );
-      tnlTraverser< MeshType, EntityDimensions > meshTraversal;
-      meshTraversal.template processBoundaryEntities< TraversalUserData,
-                                                      TraversalBoundaryEntitiesProcessor >
+      TraverserUserData userData( time, tau, differentialOperator, boundaryConditions, rightHandSide, u, matrix, b );
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserBoundaryEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
-      meshTraversal.template processInteriorEntities< TraversalUserData,
-                                                      TraversalInteriorEntitiesProcessor >
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserInteriorEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
    }
@@ -71,15 +72,15 @@ assembly( const RealType& time,
       DofVector* kernelU = tnlCuda::passToDevice( u );
       DofVector* kernelB = tnlCuda::passToDevice( b );
       MatrixType* kernelMatrix = tnlCuda::passToDevice( matrix );
-      TraversalUserData userData( *kernelTime, *kernelTau, *kernelDifferentialOperator, *kernelBoundaryConditions, *kernelRightHandSide, *kernelU, *kernelMatrix, *kernelB );
+      TraverserUserData userData( *kernelTime, *kernelTau, *kernelDifferentialOperator, *kernelBoundaryConditions, *kernelRightHandSide, *kernelU, *kernelMatrix, *kernelB );
       checkCudaDevice;
-      tnlTraverser< MeshType, EntityDimensions > meshTraversal;
-      meshTraversal.template processBoundaryEntities< TraversalUserData,
-                                                      TraversalBoundaryEntitiesProcessor >
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserBoundaryEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
-      meshTraversal.template processInteriorEntities< TraversalUserData,
-                                                      TraversalInteriorEntitiesProcessor >
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserInteriorEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
 
@@ -104,10 +105,11 @@ template< int Dimensions,
           typename DifferentialOperator,
           typename BoundaryConditions,
           typename RightHandSide,
+          typename TimeDiscretisation,
           typename Matrix >
    template< int EntityDimensions >
 void
-tnlLinearSystemAssembler< tnlGrid< Dimensions, Real, Device, Index >, DofVector, DifferentialOperator, BoundaryConditions, RightHandSide, Matrix >::
+tnlLinearSystemAssembler< tnlGrid< Dimensions, Real, Device, Index >, DofVector, DifferentialOperator, BoundaryConditions, RightHandSide, TimeDiscretisation, Matrix >::
 assembly( const RealType& time,
           const RealType& tau,
           const tnlGrid< Dimensions, Real, Device, Index >& mesh,
@@ -123,14 +125,22 @@ assembly( const RealType& time,
 
    if( ( tnlDeviceEnum ) DeviceType::DeviceType == tnlHostDevice )
    {
-      TraversalUserData userData( time, tau, differentialOperator, boundaryConditions, rightHandSide, u, matrix, b );
-      tnlTraverser< MeshType, EntityDimensions > meshTraversal;
-      meshTraversal.template processBoundaryEntities< TraversalUserData,
-                                                      TraversalBoundaryEntitiesProcessor >
+      TraverserUserData userData( time,
+                                  tau,
+                                  this->timeDiscretisationCoefficient,
+                                  differentialOperator,
+                                  boundaryConditions,
+                                  rightHandSide,
+                                  u,
+                                  matrix,
+                                  b );
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserBoundaryEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
-      meshTraversal.template processInteriorEntities< TraversalUserData,
-                                                      TraversalInteriorEntitiesProcessor >
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserInteriorEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
    }
@@ -138,21 +148,31 @@ assembly( const RealType& time,
    {
       RealType* kernelTime = tnlCuda::passToDevice( time );
       RealType* kernelTau = tnlCuda::passToDevice( tau );
+      RealType timeDiscretisationCoefficient = this->timeDiscretisationCoefficient; // retyping between different floating point types, TODO check it
+      RealType* kernelTimeDiscretisationCoefficient = tnlCuda::passToDevice( timeDiscretisationCoefficient );
       DifferentialOperator* kernelDifferentialOperator = tnlCuda::passToDevice( differentialOperator );
       BoundaryConditions* kernelBoundaryConditions = tnlCuda::passToDevice( boundaryConditions );
       RightHandSide* kernelRightHandSide = tnlCuda::passToDevice( rightHandSide );
       DofVector* kernelU = tnlCuda::passToDevice( u );
       DofVector* kernelB = tnlCuda::passToDevice( b );
       MatrixType* kernelMatrix = tnlCuda::passToDevice( matrix );
-      TraversalUserData userData( *kernelTime, *kernelTau, *kernelDifferentialOperator, *kernelBoundaryConditions, *kernelRightHandSide, *kernelU, *kernelMatrix, *kernelB );
+      TraverserUserData userData( *kernelTime,
+                                  *kernelTau,
+                                  *kernelTimeDiscretisationCoefficient,
+                                  *kernelDifferentialOperator,
+                                  *kernelBoundaryConditions,
+                                  *kernelRightHandSide,
+                                  *kernelU,
+                                  *kernelMatrix,
+                                  *kernelB );
       checkCudaDevice;
-      tnlTraverser< MeshType, EntityDimensions > meshTraversal;
-      meshTraversal.template processBoundaryEntities< TraversalUserData,
-                                                      TraversalBoundaryEntitiesProcessor >
+      tnlTraverser< MeshType, EntityDimensions > meshTraverser;
+      meshTraverser.template processBoundaryEntities< TraverserUserData,
+                                                      TraverserBoundaryEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
-      meshTraversal.template processInteriorEntities< TraversalUserData,
-                                                      TraversalInteriorEntitiesProcessor >
+      meshTraverser.template processInteriorEntities< TraverserUserData,
+                                                      TraverserInteriorEntitiesProcessor >
                                                     ( mesh,
                                                       userData );
 
diff --git a/src/solvers/pde/tnlNoTimeDiscretisation.h b/src/solvers/pde/tnlNoTimeDiscretisation.h
new file mode 100644
index 0000000000000000000000000000000000000000..27965ea03706c74da4cd1512ebfc1f732032bca3
--- /dev/null
+++ b/src/solvers/pde/tnlNoTimeDiscretisation.h
@@ -0,0 +1,42 @@
+/***************************************************************************
+                          tnlNoTimeDiscretisation.h  -  description
+                             -------------------
+    begin                : Apr 4, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#ifndef TNLNOTIMEDISCRETISATION_H
+#define	TNLNOTIMEDISCRETISATION_H
+
+#include <core/tnlCuda.h>
+
+class tnlNoTimeDiscretisation
+{
+    public:
+        
+        template< typename RealType,
+                  typename IndexType,
+                  typename MatrixType >
+        __cuda_callable__ static void applyTimeDiscretisation( MatrixType& matrix,
+                                                               RealType& b,
+                                                               const IndexType index,
+                                                               const RealType& u,
+                                                               const RealType& tau,
+                                                               const RealType& rhs )
+        {
+            b += rhs;
+        };
+};
+
+#endif	/* TNLNOTIMEDISCRETISATION_H */
+
diff --git a/src/solvers/pde/tnlPDESolver.h b/src/solvers/pde/tnlPDESolver.h
index d72f56fb726446e66a549143caa13fc48b12962f..87c1f18e5b36a0814455703c9d689a54004f043b 100644
--- a/src/solvers/pde/tnlPDESolver.h
+++ b/src/solvers/pde/tnlPDESolver.h
@@ -22,6 +22,7 @@
 #include <config/tnlConfigDescription.h>
 #include <config/tnlParameterContainer.h>
 #include <solvers/tnlSolverMonitor.h>
+#include <core/tnlLogger.h>
 
 template< typename Problem,
           typename TimeStepper >
@@ -29,71 +30,78 @@ class tnlPDESolver : public tnlObject
 {
    public:
 
-   typedef typename TimeStepper::RealType RealType;
-   typedef typename TimeStepper::DeviceType DeviceType;
-   typedef typename TimeStepper::IndexType IndexType;
-   typedef Problem ProblemType;
-   typedef typename ProblemType::MeshType MeshType;
-   typedef typename ProblemType::DofVectorType DofVectorType;
-   
-   tnlPDESolver();
+      typedef typename TimeStepper::RealType RealType;
+      typedef typename TimeStepper::DeviceType DeviceType;
+      typedef typename TimeStepper::IndexType IndexType;
+      typedef Problem ProblemType;
+      typedef typename ProblemType::MeshType MeshType;
+      typedef typename ProblemType::DofVectorType DofVectorType;
+      typedef typename ProblemType::MeshDependentDataType MeshDependentDataType;
 
-   static void configSetup( tnlConfigDescription& config,
-                            const tnlString& prefix = "" );
+      tnlPDESolver();
 
-   bool setup( const tnlParameterContainer& parameters,
-              const tnlString& prefix = "" );
+      static void configSetup( tnlConfigDescription& config,
+                               const tnlString& prefix = "" );
 
-   bool writeProlog( tnlLogger& logger,
-                     const tnlParameterContainer& parameters );
+      bool setup( const tnlParameterContainer& parameters,
+                 const tnlString& prefix = "" );
 
-   void setTimeStepper( TimeStepper& timeStepper );
+      bool writeProlog( tnlLogger& logger,
+                        const tnlParameterContainer& parameters );
 
-   void setProblem( ProblemType& problem );
+      void setTimeStepper( TimeStepper& timeStepper );
 
-   bool setFinalTime( const RealType& finalT );
+      void setProblem( ProblemType& problem );
 
-   const RealType& getFinalTine() const;
+      void setInitialTime( const RealType& initialT );
 
-   bool setTimeStep( const RealType& timeStep );
+      const RealType& getInitialTime() const;
 
-   const RealType& getTimeStep() const;
+      bool setFinalTime( const RealType& finalT );
 
-   bool setTimeStepOrder( const RealType& timeStepOrder );
+      const RealType& getFinalTime() const;
 
-   const RealType& getTimeStepOrder() const;
+      bool setTimeStep( const RealType& timeStep );
 
-   bool setSnapshotPeriod( const RealType& period );
-   
-   const RealType& getSnapshotPeriod() const;
+      const RealType& getTimeStep() const;
 
-   void setIoRtTimer( tnlTimerRT& ioRtTimer);
+      bool setTimeStepOrder( const RealType& timeStepOrder );
 
-   void setComputeRtTimer( tnlTimerRT& computeRtTimer );
+      const RealType& getTimeStepOrder() const;
 
-   void setIoCpuTimer( tnlTimerCPU& ioCpuTimer );
+      bool setSnapshotPeriod( const RealType& period );
 
-   void setComputeCpuTimer( tnlTimerCPU& computeCpuTimer );
+      const RealType& getSnapshotPeriod() const;
 
-   bool solve();
+      void setIoRtTimer( tnlTimerRT& ioRtTimer);
+
+      void setComputeRtTimer( tnlTimerRT& computeRtTimer );
+
+      void setIoCpuTimer( tnlTimerCPU& ioCpuTimer );
+
+      void setComputeCpuTimer( tnlTimerCPU& computeCpuTimer );
+
+      bool solve();
+
+      bool writeEpilog( tnlLogger& logger ) const;
 
    protected:
 
-   MeshType mesh;
+      MeshType mesh;
 
-   DofVectorType dofs;
+      DofVectorType dofs;
 
-   DofVectorType auxiliaryDofs;
+      MeshDependentDataType meshDependentData;
 
-   TimeStepper* timeStepper;
+      TimeStepper* timeStepper;
 
-   RealType finalTime, snapshotPeriod, timeStep, timeStepOrder;
+      RealType initialTime, finalTime, snapshotPeriod, timeStep, timeStepOrder;
 
-   ProblemType* problem;
+      ProblemType* problem;
 
-   tnlTimerRT *ioRtTimer, *computeRtTimer;
+      tnlTimerRT *ioRtTimer, *computeRtTimer;
 
-   tnlTimerCPU *ioCpuTimer, *computeCpuTimer;
+      tnlTimerCPU *ioCpuTimer, *computeCpuTimer;
 
 };
 
diff --git a/src/solvers/pde/tnlPDESolver_impl.h b/src/solvers/pde/tnlPDESolver_impl.h
index d8d1f3c1f9c408dfa9ac8609647abe78c4fe6754..b289344ab6f26a459906247d40da533d52052aeb 100644
--- a/src/solvers/pde/tnlPDESolver_impl.h
+++ b/src/solvers/pde/tnlPDESolver_impl.h
@@ -23,6 +23,7 @@ template< typename Problem,
 tnlPDESolver< Problem, TimeStepper >::
 tnlPDESolver()
 : timeStepper( 0 ),
+  initialTime( 0.0 ),
   finalTime( 0.0 ),
   snapshotPeriod( 0.0 ),
   timeStep( 1.0 ),
@@ -34,7 +35,7 @@ tnlPDESolver()
   computeCpuTimer( 0 )
 {
    this->dofs.setName( "dofs" );
-   this->auxiliaryDofs.setName( "auxiliaryDofs" );
+   this->meshDependentData.setName( "meshDependentData" );
 }
 
 template< typename Problem,
@@ -46,6 +47,7 @@ configSetup( tnlConfigDescription& config,
 {
    config.addEntry< tnlString >( prefix + "initial-condition", "File name with the initial condition.", "init.tnl" );
    config.addRequiredEntry< double >( prefix + "final-time", "Stop time of the time dependent problem." );
+   config.addEntry< double >( prefix + "initial-time", "Initial time of the time dependent problem.", 0 );
    config.addRequiredEntry< double >( prefix + "snapshot-period", "Time period for writing the problem status.");
    config.addEntry< double >( "time-step", "The time step for the time discretisation.", 1.0 );
    config.addEntry< double >( "time-step-order", "The time step is set to time-step*pow( space-step, time-step-order).", 0.0 );
@@ -77,8 +79,7 @@ setup( const tnlParameterContainer& parameters,
     */
    tnlAssert( problem->getDofs( this->mesh ) != 0, );
    cout << "Allocating dofs ... ";
-   if( ! this->dofs.setSize( problem->getDofs( this->mesh ) ) ||
-       ! this->auxiliaryDofs.setSize( problem->getAuxiliaryDofs( this->mesh ) ) )
+   if( ! this->dofs.setSize( problem->getDofs( this->mesh ) ) )
    {
       cerr << endl;
       cerr << "I am not able to allocate DOFs (degrees of freedom)." << endl;
@@ -86,17 +87,20 @@ setup( const tnlParameterContainer& parameters,
    }
    cout << " [ OK ]" << endl;
    this->dofs.setValue( 0.0 );
-   if( this->auxiliaryDofs.getSize() != 0 )
-      this->auxiliaryDofs.setValue( 0.0 );
-   this->problem->bindDofs( mesh, this->dofs );
-   this->problem->bindAuxiliaryDofs( mesh, this->auxiliaryDofs );
+   this->problem->bindDofs( this->mesh, this->dofs );
+   
+   /****
+    * Set mesh dependent data
+    */
+   this->problem->setMeshDependentData( this->mesh, this->meshDependentData );
+   this->problem->bindMeshDependentData( this->mesh, this->meshDependentData );
    
    /***
     * Set-up the initial condition
     */
    cout << "Setting up the initial condition ... ";
    typedef typename Problem :: DofVectorType DofVectorType;
-   if( ! this->problem->setInitialCondition( parameters, mesh, this->dofs, this->auxiliaryDofs ) )
+   if( ! this->problem->setInitialCondition( parameters, this->mesh, this->dofs, this->meshDependentData ) )
       return false;
    cout << " [ OK ]" << endl;
 
@@ -104,6 +108,7 @@ setup( const tnlParameterContainer& parameters,
     * Initialize the time discretisation
     */
    this->setFinalTime( parameters.getParameter< double >( "final-time" ) );
+   this->setInitialTime( parameters.getParameter< double >( "initial-time" ) );
    this->setSnapshotPeriod( parameters.getParameter< double >( "snapshot-period" ) );
    this->setTimeStep( parameters.getParameter< double >( "time-step") );
    this->setTimeStepOrder( parameters.getParameter< double >( "time-step-order" ) );
@@ -123,7 +128,8 @@ writeProlog( tnlLogger& logger,
    mesh.writeProlog( logger );
    logger.writeSeparator();
    logger.writeParameter< tnlString >( "Time discretisation:", "time-discretisation", parameters );
-   logger.writeParameter< double >( "Initial time step:", "time-step", parameters );
+   logger.writeParameter< double >( "Initial time step:", this->timeStep * pow( mesh.getSmallestSpaceStep(), this->timeStepOrder ) );
+   logger.writeParameter< double >( "Initial time:", "initial-time", parameters );
    logger.writeParameter< double >( "Final time:", "final-time", parameters );
    logger.writeParameter< double >( "Snapshot period:", "snapshot-period", parameters );
    const tnlString& solverName = parameters. getParameter< tnlString >( "discrete-solver" );
@@ -134,12 +140,16 @@ writeProlog( tnlLogger& logger,
       logger.writeParameter< double >( "Omega:", "sor-omega", parameters, 1 );
    if( solverName == "gmres" )
       logger.writeParameter< int >( "Restarting:", "gmres-restarting", parameters, 1 );
+   logger.writeParameter< double >( "Convergence residue:", "convergence-residue", parameters );
+   logger.writeParameter< double >( "Divergence residue:", "divergence-residue", parameters );
+   logger.writeParameter< int >( "Maximal number of iterations:", "max-iterations", parameters );
+   logger.writeParameter< int >( "Minimal number of iterations:", "min-iterations", parameters );
    logger.writeSeparator();
    logger.writeParameter< tnlString >( "Real type:", "real-type", parameters, 0 );
    logger.writeParameter< tnlString >( "Index type:", "index-type", parameters, 0 );
    logger.writeParameter< tnlString >( "Device:", "device", parameters, 0 );
    logger.writeSeparator();
-   logger.writeSystemInformation();
+   logger.writeSystemInformation( parameters );
    logger.writeSeparator();
    logger.writeCurrentTime( "Started at:" );
    return true;
@@ -163,15 +173,34 @@ setProblem( ProblemType& problem )
    this->problem = &problem;
 }
 
+template< typename Problem,
+          typename TimeStepper >
+void
+tnlPDESolver< Problem, TimeStepper >::
+setInitialTime( const RealType& initialTime )
+{
+   this->initialTime = initialTime;
+}
+
+template< typename Problem,
+          typename TimeStepper >
+const typename TimeStepper :: RealType&
+tnlPDESolver< Problem, TimeStepper >::
+getInitialTime() const
+{
+   return this->initialTime;
+}
+
+
 template< typename Problem,
           typename TimeStepper >
 bool
 tnlPDESolver< Problem, TimeStepper >::
 setFinalTime( const RealType& finalTime )
 {
-   if( finalTime <= 0 )
+   if( finalTime <= this->initialTime )
    {
-      cerr << "Final time for tnlPDESolver must be positive value." << endl;
+      cerr << "Final time for tnlPDESolver must larger than the initial time which is now " << this->initialTime << "." << endl;
       return false;
    }
    this->finalTime = finalTime;
@@ -182,7 +211,7 @@ template< typename Problem,
           typename TimeStepper >
 const typename TimeStepper :: RealType&
 tnlPDESolver< Problem, TimeStepper >::
-getFinalTine() const
+getFinalTime() const
 {
    return this->finalTime;
 }
@@ -287,7 +316,9 @@ void tnlPDESolver< Problem, TimeStepper > :: setComputeCpuTimer( tnlTimerCPU& co
 }
 
 template< typename Problem, typename TimeStepper >
-bool tnlPDESolver< Problem, TimeStepper > :: solve()
+bool
+tnlPDESolver< Problem, TimeStepper >::
+solve()
 {
    tnlAssert( timeStepper != 0,
               cerr << "No time stepper was set in tnlPDESolver with name " << this -> getName() );
@@ -299,47 +330,56 @@ bool tnlPDESolver< Problem, TimeStepper > :: solve()
       cerr << "No snapshot tau was set in tnlPDESolver " << this -> getName() << "." << endl;
       return false;
    }
-   RealType t( 0.0 );
+   RealType t( this->initialTime );
    IndexType step( 0 );
-   IndexType allSteps = ceil( this->finalTime / this->snapshotPeriod );
-   this->timeStepper->setProblem( * ( this->problem ) );
-   this->timeStepper->init( mesh );
-   this->problem->bindDofs( mesh, this->dofs );
-   this->problem->bindAuxiliaryDofs( mesh, this->auxiliaryDofs );
+   IndexType allSteps = ceil( ( this->finalTime - this->initialTime ) / this->snapshotPeriod );
 
-   if( ! this->problem->makeSnapshot( t, step, mesh, this->dofs, this->auxiliaryDofs ) )
+   if( ! this->problem->makeSnapshot( t, step, mesh, this->dofs, this->meshDependentData ) )
    {
       cerr << "Making the snapshot failed." << endl;
       return false;
    }
-   timeStepper->setTimeStep( this->timeStep * pow( mesh.getSmallestSpaceStep(), this->timeStepOrder ) );
+
+   /****
+    * Initialize the time stepper
+    */
+   this->timeStepper->setProblem( * ( this->problem ) );
+   this->timeStepper->init( this->mesh );
+   this->timeStepper->setTimeStep( this->timeStep * pow( mesh.getSmallestSpaceStep(), this->timeStepOrder ) );
    while( step < allSteps )
    {
       RealType tau = Min( this -> snapshotPeriod,
                           this -> finalTime - t );
-      if( ! this->timeStepper->solve( t, t + tau, mesh, this->dofs, this->auxiliaryDofs ) )
+      if( ! this->timeStepper->solve( t, t + tau, mesh, this->dofs, this->meshDependentData ) )
          return false;
       step ++;
       t += tau;
 
-      this->ioRtTimer->Continue();
-      this->ioCpuTimer->Continue();
-      this->computeRtTimer->Stop();
-      this->computeCpuTimer->Stop();
+      this->ioRtTimer->start();
+      this->ioCpuTimer->start();
+      this->computeRtTimer->stop();
+      this->computeCpuTimer->stop();
 
-      if( ! this->problem->makeSnapshot( t, step, mesh, this->dofs, this->auxiliaryDofs ) )
+      if( ! this->problem->makeSnapshot( t, step, mesh, this->dofs, this->meshDependentData ) )
       {
          cerr << "Making the snapshot failed." << endl;
          return false;
       }
 
-      this-> ioRtTimer->Stop();
-      this-> ioCpuTimer->Stop();
-      this-> computeRtTimer->Continue();
-      this-> computeCpuTimer->Continue();
-
+      this-> ioRtTimer->stop();
+      this-> ioCpuTimer->stop();
+      this-> computeRtTimer->start();
+      this-> computeCpuTimer->start();
    }
    return true;
 }
 
+template< typename Problem, typename TimeStepper >
+bool
+tnlPDESolver< Problem, TimeStepper >::
+writeEpilog( tnlLogger& logger ) const
+{
+   return this->timeStepper->writeEpilog( logger );
+}
+
 #endif /* TNLPDESOLVER_IMPL_H_ */
diff --git a/src/solvers/pde/tnlSemiImplicitTimeStepper.h b/src/solvers/pde/tnlSemiImplicitTimeStepper.h
index 1e2a71477f83b5d736bb8d3337e2f9ac7dac0316..ab23a123dd6639794142e0dad15bc25206a41bb8 100644
--- a/src/solvers/pde/tnlSemiImplicitTimeStepper.h
+++ b/src/solvers/pde/tnlSemiImplicitTimeStepper.h
@@ -18,6 +18,9 @@
 #ifndef TNLSEMIIMPLICITTIMESTEPPER_H_
 #define TNLSEMIIMPLICITTIMESTEPPER_H_
 
+#include <core/tnlTimerRT.h>
+#include <core/tnlLogger.h>
+
 template< typename Problem,
           typename LinearSystemSolver >
 class tnlSemiImplicitTimeStepper
@@ -30,6 +33,7 @@ class tnlSemiImplicitTimeStepper
    typedef typename Problem::IndexType IndexType;
    typedef typename Problem::MeshType MeshType;
    typedef typename ProblemType::DofVectorType DofVectorType;
+   typedef typename ProblemType::MeshDependentDataType MeshDependentDataType;
    typedef LinearSystemSolver LinearSystemSolverType;
    typedef typename ProblemType::MatrixType MatrixType;
 
@@ -59,7 +63,9 @@ class tnlSemiImplicitTimeStepper
                const RealType& stopTime,
                const MeshType& mesh,
                DofVectorType& dofVector,
-               DofVectorType& auxiliaryDofVector );
+               MeshDependentDataType& meshDependentData );
+   
+   bool writeEpilog( tnlLogger& logger );
 
    protected:
 
@@ -73,6 +79,8 @@ class tnlSemiImplicitTimeStepper
 
    RealType timeStep;
 
+   tnlTimerRT linearSystemAssemblerTimer, linearSystemSolverTimer;
+   
    bool verbose;
 };
 
diff --git a/src/solvers/pde/tnlSemiImplicitTimeStepper_impl.h b/src/solvers/pde/tnlSemiImplicitTimeStepper_impl.h
index 7aafb69904916a5d3011d72f2fb9999482b55636..57c87ae1407ba0faed345ad8bae1aae1218f90b5 100644
--- a/src/solvers/pde/tnlSemiImplicitTimeStepper_impl.h
+++ b/src/solvers/pde/tnlSemiImplicitTimeStepper_impl.h
@@ -73,6 +73,8 @@ init( const MeshType& mesh )
    }
    if( ! this->rightHandSide.setSize( this->matrix.getRows() ) )
       return false;
+   this->linearSystemAssemblerTimer.reset();
+   this->linearSystemSolverTimer.reset();
    return true;
 }
 
@@ -134,7 +136,7 @@ solve( const RealType& time,
        const RealType& stopTime,
        const MeshType& mesh,
        DofVectorType& dofVector,
-       DofVectorType& auxiliaryDofVector )
+       MeshDependentDataType& meshDependentData )
 {
    tnlAssert( this->problem != 0, );
    RealType t = time;
@@ -147,34 +149,38 @@ solve( const RealType& time,
                                        currentTau,
                                        mesh,
                                        dofVector,
-                                       auxiliaryDofVector ) )
+                                       meshDependentData ) )
       {
          cerr << endl << "Preiteration failed." << endl;
          return false;
       }
       if( verbose )
          cout << "                                                                  Assembling the linear system ... \r" << flush;
+      this->linearSystemAssemblerTimer.start();
       this->problem->assemblyLinearSystem( t,
                                            currentTau,
                                            mesh,
                                            dofVector,
-                                           auxiliaryDofVector,
                                            this->matrix,
-                                           this->rightHandSide );
+                                           this->rightHandSide,
+                                           meshDependentData );
+      this->linearSystemAssemblerTimer.stop();
       if( verbose )
          cout << "                                                                  Solving the linear system for time " << t << "             \r" << flush;
+      this->linearSystemSolverTimer.start();
       if( ! this->linearSystemSolver->template solve< DofVectorType, tnlLinearResidueGetter< MatrixType, DofVectorType > >( this->rightHandSide, dofVector ) )
       {
          cerr << endl << "The linear system solver did not converge." << endl;
          return false;
       }
+      this->linearSystemSolverTimer.stop();
       //if( verbose )
       //   cout << endl;
       if( ! this->problem->postIterate( t,
                                         currentTau,
                                         mesh,
                                         dofVector,
-                                        auxiliaryDofVector ) )
+                                        meshDependentData ) )
       {
          cerr << endl << "Postiteration failed." << endl;
          return false;
@@ -184,4 +190,15 @@ solve( const RealType& time,
    return true;
 }
 
+template< typename Problem,
+          typename LinearSystemSolver >
+bool
+tnlSemiImplicitTimeStepper< Problem, LinearSystemSolver >::
+writeEpilog( tnlLogger& logger )
+{
+   logger.writeParameter< double >( "Linear system assembler time:", this->linearSystemAssemblerTimer.getTime() );
+   logger.writeParameter< double >( "Linear system solver time:", this->linearSystemSolverTimer.getTime() );
+   return true;
+}
+
 #endif /* TNLSEMIIMPLICITTIMESTEPPER_IMPL_H_ */
diff --git a/src/solvers/tnlConfigTags.h b/src/solvers/tnlBuildConfigTags.h
similarity index 99%
rename from src/solvers/tnlConfigTags.h
rename to src/solvers/tnlBuildConfigTags.h
index 9f1794dcc0652a99c3340a931b1f592ed8687ecb..5b094846fbe5899d9f72cb184da920f2f3bd76bc 100644
--- a/src/solvers/tnlConfigTags.h
+++ b/src/solvers/tnlBuildConfigTags.h
@@ -20,7 +20,7 @@
 
 #include <mesh/tnlGrid.h>
 
-class tnlDefaultConfigTag{};
+class tnlDefaultBuildConfigTag{};
 
 /****
  * All devices are enabled by default. Those which are not available
diff --git a/src/solvers/tnlDummyProblem.h b/src/solvers/tnlDummyProblem.h
index 73ca61fa789b619018eb59e1737b9c9bbd5e5291..0d5f226d379a5d6224b15ad1939a8d4ffebf6702 100644
--- a/src/solvers/tnlDummyProblem.h
+++ b/src/solvers/tnlDummyProblem.h
@@ -34,6 +34,7 @@ class tnlDummyProblem
       typedef Index IndexType;
       typedef tnlVector< Real, Device, Index > DofVectorType;
       typedef tnlGrid< 1, Real, Device, Index > MeshType;
+      typedef DofVectorType MeshDependentDataType;
 };
 
 
diff --git a/src/solvers/tnlFastBuildConfig.h b/src/solvers/tnlFastBuildConfigTag.h
similarity index 98%
rename from src/solvers/tnlFastBuildConfig.h
rename to src/solvers/tnlFastBuildConfigTag.h
index b8bd739351d7637e7860f323598becede0f55850..23b603cf7a24e19262c6b9b0dba8dc02319b93b0 100644
--- a/src/solvers/tnlFastBuildConfig.h
+++ b/src/solvers/tnlFastBuildConfigTag.h
@@ -18,7 +18,7 @@
 #ifndef TNLFASTBUILDCONFIG_H_
 #define TNLFASTBUILDCONFIG_H_
 
-#include <solvers/tnlConfigTags.h>
+#include <solvers/tnlBuildConfigTags.h>
 
 class tnlFastBuildConfig
 {
diff --git a/src/solvers/tnlIterativeSolverMonitor.h b/src/solvers/tnlIterativeSolverMonitor.h
index 4ac48116dd684c88e81dcd60989b9fd9116245e3..b4c949ee78ffc3bd2db38b99e07075983f9738f8 100644
--- a/src/solvers/tnlIterativeSolverMonitor.h
+++ b/src/solvers/tnlIterativeSolverMonitor.h
@@ -41,6 +41,8 @@ class tnlIterativeSolverMonitor : public tnlSolverMonitor< Real, Index >
    const RealType& getResidue() const;
 
    void setVerbose( const Index& verbose );
+   
+   void setRefreshRate( const IndexType& refreshRate );
 
    virtual void refresh();
 
@@ -58,7 +60,7 @@ class tnlIterativeSolverMonitor : public tnlSolverMonitor< Real, Index >
 
    IndexType refreshing;
 
-   IndexType outputPeriod;
+   IndexType refreshRate;
 
    IndexType verbose;
 
diff --git a/src/solvers/tnlIterativeSolverMonitor_impl.h b/src/solvers/tnlIterativeSolverMonitor_impl.h
index 803fad0ed734cb50ae1ebc97bb07e8b2f0e31ad3..1db4c8048b0749e63588ea2e39a43ae22bc0d737 100644
--- a/src/solvers/tnlIterativeSolverMonitor_impl.h
+++ b/src/solvers/tnlIterativeSolverMonitor_impl.h
@@ -27,7 +27,7 @@ tnlIterativeSolverMonitor< Real, Index > :: tnlIterativeSolverMonitor()
 : iterations( 0 ),
   residue( 0 ),
   refreshing( 0 ),
-  outputPeriod( 1 ),
+  refreshRate( 1 ),
   verbose( 1 )
 {
 }
@@ -62,37 +62,43 @@ void tnlIterativeSolverMonitor< Real, Index > :: setVerbose( const Index& verbos
    this -> verbose = verbose;
 }
 
+template< typename Real, typename Index>
+void tnlIterativeSolverMonitor< Real, Index > :: setRefreshRate( const Index& refreshRate )
+{
+   this -> refreshRate = refreshRate;
+}
+
 template< typename Real, typename Index>
 void tnlIterativeSolverMonitor< Real, Index > :: refresh()
 {
-   if( this -> verbose > 0 && this -> refreshing % this -> outputPeriod == 0 )
+   if( this->verbose > 0 && this->getIterations() % this->refreshRate == 0 )
    {
-      cout << " ITER:" << setw( 8 ) << this -> getIterations()
+      cout << " ITER:" << setw( 8 ) << this->getIterations()
            << " RES:" << setprecision( 5 ) << setw( 12 ) << this -> getResidue()
            << " CPU: " << setw( 8 ) << this -> getCPUTime()
            << " ELA: " << setw( 8 ) << this -> getRealTime()
            << "   \r" << flush;
    }
-   this -> refreshing ++;
+   this->refreshing ++;
 }
 
 template< typename Real, typename Index>
 void tnlIterativeSolverMonitor< Real, Index > :: resetTimers()
 {
-   cpuTimer. Reset();
-   rtTimer. Reset();
+   cpuTimer.reset();
+   rtTimer.reset();
 }
 
 template< typename Real, typename Index>
 double tnlIterativeSolverMonitor< Real, Index > :: getCPUTime()
 {
-   return cpuTimer. GetTime();
+   return cpuTimer.getTime();
 }
 
 template< typename Real, typename Index>
 double tnlIterativeSolverMonitor< Real, Index > :: getRealTime()
 {
-   return rtTimer. GetTime();
+   return rtTimer.getTime();
 }
 
 
diff --git a/src/solvers/tnlIterativeSolver_impl.h b/src/solvers/tnlIterativeSolver_impl.h
index 0331a974ebd1ab31cc845f021ad79fc126d090c4..012a6da17141a62ba10514632a00daeab0bc6902 100644
--- a/src/solvers/tnlIterativeSolver_impl.h
+++ b/src/solvers/tnlIterativeSolver_impl.h
@@ -54,6 +54,7 @@ bool tnlIterativeSolver< Real, Index> :: setup( const tnlParameterContainer& par
    this->setConvergenceResidue( parameters.getParameter< double >( "convergence-residue" ) );
    this->setDivergenceResidue( parameters.getParameter< double >( "divergence-residue" ) );
    this->setRefreshRate( parameters.getParameter< int >( "refresh-rate" ) );
+   return true;
 }
 
 template< typename Real, typename Index >
@@ -100,27 +101,11 @@ bool tnlIterativeSolver< Real, Index> :: nextIteration()
          solverMonitor->refresh();
    }
 
-   if( std::isnan( this->getResidue() ) )
-   {
-      //cerr << endl << "RES is Nan" << endl;
-      return false;
-   }
-   if(( this->getResidue() > this->getDivergenceResidue() &&
-         this->getIterations() > this->minIterations ) )
-   {
-      ///cerr << endl << "RES is over the divergence residue." << endl;
+   if( std::isnan( this->getResidue() ) || 
+       this->getIterations() > this->getMaxIterations()  ||
+       ( this->getResidue() > this->getDivergenceResidue() && this->getIterations() > this->minIterations ) ||
+       ( this->getResidue() < this->getConvergenceResidue() && this->getIterations() > this->minIterations ) ) 
       return false;
-   }
-   if( this->getIterations() > this->getMaxIterations() )
-   {
-      //cerr << endl << "Max. iterations exceeded." << endl;
-      return false;
-   }
-   if( this->getResidue() < this->getConvergenceResidue() )
-   {
-      //cerr << endl << "The solver has. converged." <<  endl;
-      return false;
-   }
    return true;
 }
 
@@ -217,6 +202,7 @@ void tnlIterativeSolver< Real, Index> :: refreshSolverMonitor()
    {
       this -> solverMonitor -> setIterations( this -> getIterations() );
       this -> solverMonitor -> setResidue( this -> getResidue() );
+      this -> solverMonitor -> setRefreshRate( this-> refreshRate );
       this -> solverMonitor -> refresh();
    }
 }
diff --git a/src/solvers/tnlSolver.h b/src/solvers/tnlSolver.h
index c99bd3af92ab7621534801539b0d3656f27069ab..9e4040d87a48621acbe7750bb4f2164063c73c11 100644
--- a/src/solvers/tnlSolver.h
+++ b/src/solvers/tnlSolver.h
@@ -18,15 +18,15 @@
 #ifndef TNLSOLVER_H_
 #define TNLSOLVER_H_
 
-#include <solvers/tnlConfigTags.h>
+#include <solvers/tnlBuildConfigTags.h>
 
 template< template< typename Real, typename Device, typename Index, typename MeshType, typename ConfigTag, typename SolverStarter > class ProblemSetter,
           template< typename ConfTag > class ProblemConfig,
-          typename ConfigTag = tnlDefaultConfigTag >
+          typename ConfigTag = tnlDefaultBuildConfigTag >
 class tnlSolver
 {
    public:
-   bool run( int argc, char* argv[] );
+   static bool run( int argc, char* argv[] );
 
    protected:
 };
diff --git a/src/solvers/tnlSolverConfig_impl.h b/src/solvers/tnlSolverConfig_impl.h
index 0e814f49064494f32f784de79257db4665d036dd..329b0335918660845018b304f8191b543e7b302f 100644
--- a/src/solvers/tnlSolverConfig_impl.h
+++ b/src/solvers/tnlSolverConfig_impl.h
@@ -19,7 +19,7 @@
 #define TNLSOLVERCONFIG_IMPL_H_
 
 #include <tnlConfig.h>
-#include <solvers/tnlConfigTags.h>
+#include <solvers/tnlBuildConfigTags.h>
 #include <solvers/tnlDummyProblem.h>
 #include <solvers/pde/tnlExplicitTimeStepper.h>
 #include <solvers/pde/tnlPDESolver.h>
@@ -100,7 +100,7 @@ bool tnlSolverConfig< ConfigTag, ProblemConfig >::configSetup( tnlConfigDescript
    }
    config.addRequiredEntry< tnlString >( "discrete-solver", "The solver of the discretised problem:" );
    if( tnlConfigTagTimeDiscretisation< ConfigTag, tnlExplicitTimeDiscretisationTag >::enabled )
-   {
+   {      
       if( tnlConfigTagExplicitSolver< ConfigTag, tnlExplicitEulerSolverTag >::enabled )
          config.addEntryEnum( "euler" );
       if( tnlConfigTagExplicitSolver< ConfigTag, tnlExplicitMersonSolverTag >::enabled )
@@ -117,9 +117,16 @@ bool tnlSolverConfig< ConfigTag, ProblemConfig >::configSetup( tnlConfigDescript
       if( tnlConfigTagSemiImplicitSolver< ConfigTag, tnlSemiImplicitSORSolverTag >::enabled )
          config.addEntryEnum( "sor" );
    }
+   if( tnlConfigTagTimeDiscretisation< ConfigTag, tnlExplicitTimeDiscretisationTag >::enabled ||
+       tnlConfigTagTimeDiscretisation< ConfigTag, tnlSemiImplicitTimeDiscretisationTag >::enabled )
+   {
+      config.addDelimiter( " === Iterative solvers parameters === " );
+      tnlIterativeSolver< double, int >::configSetup( config );
+   }
    if( tnlConfigTagTimeDiscretisation< ConfigTag, tnlExplicitTimeDiscretisationTag >::enabled )
    {
       config.addDelimiter( " === Explicit solvers parameters === " );
+      tnlExplicitSolver< tnlDummyProblem< double, tnlHost, int > >::configSetup( config );
       if( tnlConfigTagExplicitSolver< ConfigTag, tnlExplicitEulerSolverTag >::enabled )
          tnlEulerSolver< tnlDummyProblem< double, tnlHost, int > >::configSetup( config );
 
@@ -128,7 +135,7 @@ bool tnlSolverConfig< ConfigTag, ProblemConfig >::configSetup( tnlConfigDescript
    }
    if( tnlConfigTagTimeDiscretisation< ConfigTag, tnlSemiImplicitTimeDiscretisationTag >::enabled )
    {
-      config.addDelimiter( " === Semi-implicit solvers parameters === " );
+      config.addDelimiter( " === Semi-implicit solvers parameters === " );      
       typedef tnlCSRMatrix< double, tnlHost, int > MatrixType;
       if( tnlConfigTagSemiImplicitSolver< ConfigTag, tnlSemiImplicitCGSolverTag >::enabled )
          tnlCGSolver< MatrixType >::configSetup( config );
diff --git a/src/solvers/tnlSolverInitiator.h b/src/solvers/tnlSolverInitiator.h
index 3a27cf53574aa383ee79f9036775e732d8139181..525d304c9d2bfea36396c2b0e35cbfae65903527 100644
--- a/src/solvers/tnlSolverInitiator.h
+++ b/src/solvers/tnlSolverInitiator.h
@@ -20,7 +20,7 @@
 
 #include <core/tnlObject.h>
 #include <config/tnlParameterContainer.h>
-#include <solvers/tnlConfigTags.h>
+#include <solvers/tnlBuildConfigTags.h>
 
 template< template< typename Real, typename Device, typename Index, typename MeshType, typename ConfigTag, typename SolverStarter > class ProblemSetter,
           typename ConfigTag >
diff --git a/src/solvers/tnlSolverInitiator_impl.h b/src/solvers/tnlSolverInitiator_impl.h
index 05e281e1bc72a7d1fd2433d91195c405da5a0026..0876bd27527879a2a9cf4faaa071fc5f5bfa90df 100644
--- a/src/solvers/tnlSolverInitiator_impl.h
+++ b/src/solvers/tnlSolverInitiator_impl.h
@@ -17,7 +17,7 @@
 
 #include <config/tnlParameterContainer.h>
 #include <solvers/tnlMeshTypeResolver.h>
-#include <solvers/tnlConfigTags.h>
+#include <solvers/tnlBuildConfigTags.h>
 #include <solvers/linear/stationary/tnlSORSolver.h>
 #include <solvers/linear/krylov/tnlCGSolver.h>
 #include <solvers/linear/krylov/tnlBICGStabSolver.h>
diff --git a/src/solvers/tnlSolverMonitor.h b/src/solvers/tnlSolverMonitor.h
index 08c0b562b44e1d2ba1574ee15007afc8b320071c..df946e3643cdba6110c841ae8caa253fc06909e5 100644
--- a/src/solvers/tnlSolverMonitor.h
+++ b/src/solvers/tnlSolverMonitor.h
@@ -26,6 +26,7 @@ class tnlSolverMonitor
    virtual void refresh() = 0;
 
    ~tnlSolverMonitor() {};
+      
 };
 
 
diff --git a/src/solvers/tnlSolverStarter.h b/src/solvers/tnlSolverStarter.h
index 150fd3d83bd75bc5142a1c625ebeae04d43dd9a9..a2f604bedc3b21d29e34b58d967b5708ee354abb 100644
--- a/src/solvers/tnlSolverStarter.h
+++ b/src/solvers/tnlSolverStarter.h
@@ -33,7 +33,8 @@ class tnlSolverStarter
    template< typename Problem >
    static bool run( const tnlParameterContainer& parameters );
 
-   bool writeEpilog( ostream& str );
+   template< typename Solver >
+   bool writeEpilog( ostream& str, const Solver& solver );
 
    template< typename Problem, typename TimeStepper >
    bool runPDESolver( Problem& problem,
diff --git a/src/solvers/tnlSolverStarter_impl.h b/src/solvers/tnlSolverStarter_impl.h
index 98135409449424796cfa927adbd3f91924b1d8fb..09e61c33e25d96b0d63cbe30891d21b274f7a296 100644
--- a/src/solvers/tnlSolverStarter_impl.h
+++ b/src/solvers/tnlSolverStarter_impl.h
@@ -516,17 +516,17 @@ bool tnlSolverStarter< ConfigTag > :: runPDESolver( Problem& problem,
                                                     const tnlParameterContainer& parameters,
                                                     TimeStepper& timeStepper )
 {
-   this->totalCpuTimer. Reset();
-   this->totalRtTimer. Reset();
+   this->totalCpuTimer.reset();
+   this->totalRtTimer.reset();
 
    /****
     * Set-up the PDE solver
     */
    tnlPDESolver< Problem, TimeStepper > solver;
    solver.setProblem( problem );
+   solver.setTimeStepper( timeStepper );
    if( ! solver.setup( parameters ) )
       return false;
-   solver.setTimeStepper( timeStepper );
 
    /****
     * Write a prolog
@@ -560,16 +560,16 @@ bool tnlSolverStarter< ConfigTag > :: runPDESolver( Problem& problem,
    /****
     * Set-up timers
     */
-   this->computeRtTimer. Reset();
-   this->computeCpuTimer. Reset();
-   this->ioRtTimer. Reset();
-   this->ioRtTimer. Stop();
-   this->ioCpuTimer. Reset();
-   this->ioCpuTimer. Stop();
-   solver.setComputeRtTimer( this -> computeRtTimer );
-   solver.setComputeCpuTimer( this -> computeCpuTimer );
-   solver.setIoRtTimer( this -> ioRtTimer );
-   solver.setIoCpuTimer( this -> ioCpuTimer );
+   this->computeRtTimer.reset();
+   this->computeCpuTimer.reset();
+   this->ioRtTimer.reset();
+   this->ioRtTimer.stop();
+   this->ioCpuTimer.reset();
+   this->ioCpuTimer.stop();
+   solver.setComputeRtTimer( this->computeRtTimer );
+   solver.setComputeCpuTimer( this->computeCpuTimer );
+   solver.setIoRtTimer( this->ioRtTimer );
+   solver.setIoCpuTimer( this->ioCpuTimer );
 
    /****
     * Start the solver
@@ -597,16 +597,16 @@ bool tnlSolverStarter< ConfigTag > :: runPDESolver( Problem& problem,
    /****
     * Stop timers
     */
-   this->computeRtTimer.Stop();
-   this->computeCpuTimer.Stop();
-   this->totalCpuTimer.Stop();
-   this->totalRtTimer.Stop();
+   this->computeRtTimer.stop();
+   this->computeCpuTimer.stop();
+   this->totalCpuTimer.stop();
+   this->totalRtTimer.stop();
 
    /****
     * Write an epilog
     */
    if( verbose )
-      writeEpilog( cout );
+      writeEpilog( cout, solver );
    if( haveLogFile )
    {
       fstream logFile;
@@ -618,7 +618,7 @@ bool tnlSolverStarter< ConfigTag > :: runPDESolver( Problem& problem,
       }
       else
       {
-         writeEpilog( logFile );
+         writeEpilog( logFile, solver );
          logFile.close();
       }
    }
@@ -626,18 +626,21 @@ bool tnlSolverStarter< ConfigTag > :: runPDESolver( Problem& problem,
 }
 
 template< typename ConfigTag >
-bool tnlSolverStarter< ConfigTag > :: writeEpilog( ostream& str )
+   template< typename Solver >
+bool tnlSolverStarter< ConfigTag > :: writeEpilog( ostream& str, const Solver& solver  )
 {
    tnlLogger logger( logWidth, str );
    logger.writeCurrentTime( "Finished at:" );
-   logger.writeParameter< double >( "IO Real Time:", this -> ioRtTimer. GetTime() );
-   logger.writeParameter< double >( "IO CPU Time:", this -> ioCpuTimer. GetTime() );
-   logger.writeParameter< double >( "Compute Real Time:", this -> computeRtTimer. GetTime() );
-   logger.writeParameter< double >( "Compute CPU Time:", this -> computeCpuTimer. GetTime() );
-   logger.writeParameter< double >( "Total Real Time:", this -> totalRtTimer. GetTime() );
-   logger.writeParameter< double >( "Total CPU Time:", this -> totalCpuTimer. GetTime() );
+   if( ! solver.writeEpilog( logger ) )
+      return false;
+   logger.writeParameter< double >( "IO Real Time:", this -> ioRtTimer. getTime() );
+   logger.writeParameter< double >( "IO CPU Time:", this -> ioCpuTimer. getTime() );
+   logger.writeParameter< double >( "Compute Real Time:", this -> computeRtTimer. getTime() );
+   logger.writeParameter< double >( "Compute CPU Time:", this -> computeCpuTimer. getTime() );
+   logger.writeParameter< double >( "Total Real Time:", this -> totalRtTimer. getTime() );
+   logger.writeParameter< double >( "Total CPU Time:", this -> totalCpuTimer. getTime() );
    char buf[ 256 ];
-   sprintf( buf, "%f %%", 100 * ( ( double ) this -> totalCpuTimer. GetTime() ) / this -> totalRtTimer. GetTime() );
+   sprintf( buf, "%f %%", 100 * ( ( double ) this -> totalCpuTimer. getTime() ) / this -> totalRtTimer. getTime() );
    logger.writeParameter< char* >( "CPU usage:", buf );
    logger.writeSeparator();
    return true;
diff --git a/src/tnl-benchmarks.h b/src/tnl-benchmarks.h
index 3eaa5152fa08d52f5280e35a0c499536e00ddf79..9bb1727a4d9227153676c0035999d87a11c8c9ea 100644
--- a/src/tnl-benchmarks.h
+++ b/src/tnl-benchmarks.h
@@ -45,7 +45,7 @@ bool transferBenchmark( const int size,
    for( int i = 0; i < cycles; i ++ )
       if( ! host_vector2. copyFrom( host_vector ) )
          return false;
-   double time = timer. GetTime();
+   double time = timer. getTime();
    double giga_byte = ( double ) ( 1 << 30 );
    host_to_host_band_width = bytes / giga_byte / time;
 
@@ -55,7 +55,7 @@ bool transferBenchmark( const int size,
    for( int i = 0; i < cycles; i ++ )
       if( ! device_vector. copyFrom( host_vector ) )
          return false;
-   time = timer. GetTime();
+   time = timer. getTime();
    host_to_device_band_width = bytes / giga_byte / time;
 
    cout << "Transfering " << bytes / mega_byte << " MB from HOST to DEVICE took " << time << " seconds. Bandwidth is " << host_to_device_band_width << " GB/s." << endl;
@@ -64,7 +64,7 @@ bool transferBenchmark( const int size,
    for( int i = 0; i < cycles; i ++ )
       if( ! host_vector2. copyFrom( device_vector ) )
          return false;
-   time = timer. GetTime();
+   time = timer. getTime();
    device_to_host_band_width = bytes / giga_byte / time;
 
    cout << "Transfering " << bytes / mega_byte << " MB from DEVICE to HOST took " << time << " seconds. Bandwidth is " << device_to_host_band_width << " GB/s." << endl;
@@ -74,7 +74,7 @@ bool transferBenchmark( const int size,
       if( ! device_vector2. copyFrom( device_vector ) )
          return false;
 
-   time = timer. GetTime();
+   time = timer. getTime();
 
    // Since we read and write tha data back we process twice as many bytes.
    bytes *= 2;
@@ -230,7 +230,7 @@ void reductionBenchmark( const int size,
 
       }
    }
-   const double time = timer. GetTime();
+   const double time = timer. getTime();
    double giga_byte = ( double ) ( 1 << 30 );
    long int mega_byte = 1 << 20;
    long int bytes_reduced = size * sizeof( T ) * reducing_cycles * 3;
diff --git a/tests/benchmarks/CMakeLists.txt b/tests/benchmarks/CMakeLists.txt
index 90d967d345e0c9b55c1202f31550b2d14f0cdc29..0dd2c05611c2abeb5492662eae789750f4066fae 100755
--- a/tests/benchmarks/CMakeLists.txt
+++ b/tests/benchmarks/CMakeLists.txt
@@ -1,24 +1,34 @@
 ADD_SUBDIRECTORY( share )
 
 IF( BUILD_CUDA )
+    CUDA_ADD_EXECUTABLE( tnl-cuda-benchmarks${debugExt} tnl-cuda-benchmarks.cu
+                         OPTIONS ${CUDA_ADD_EXECUTABLE_OPTIONS} )
+    SET_TARGET_PROPERTIES( tnl-cuda-benchmarks${debugExt} PROPERTIES CUDA_COMPILE_FLAGS "${CXX_OPTIMIZE_FLAGS}" )
+    TARGET_LINK_LIBRARIES( tnl-cuda-benchmarks${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} )                        
+    
     CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv${debugExt} tnl-benchmark-spmv.cu
                          OPTIONS ${CUDA_ADD_EXECUTABLE_OPTIONS} )
     SET_TARGET_PROPERTIES( tnl-benchmark-spmv${debugExt} PROPERTIES CUDA_COMPILE_FLAGS "${CXX_OPTIMIZE_FLAGS}" )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} )                        
+    
     CUDA_ADD_EXECUTABLE( tnl-benchmark-linear-solvers${debugExt} tnl-benchmark-linear-solvers.cu
                          OPTIONS ${CUDA_ADD_EXECUTABLE_OPTIONS} )
-    SET_TARGET_PROPERTIES( tnl-benchmark-spmv${debugExt} PROPERTIES CUDA_COMPILE_FLAGS "${CXX_OPTIMIZE_FLAGS}" )
     SET_TARGET_PROPERTIES( tnl-benchmark-linear-solvers${debugExt} PROPERTIES CUDA_COMPILE_FLAGS "${CXX_OPTIMIZE_FLAGS}" )
-    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} )                        
+    TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} )                        
 ELSE()
     ADD_EXECUTABLE( tnl-benchmark-spmv${debugExt} tnl-benchmark-spmv.cpp )
     ADD_EXECUTABLE( tnl-benchmark-linear-solvers${debugExt} tnl-benchmark-linear-solvers.cpp )
     SET_TARGET_PROPERTIES( tnl-benchmark-spmv${debugExt} PROPERTIES COMPILE_FLAGS "${CXX_OPTIMIZE_FLAGS}" )
     SET_TARGET_PROPERTIES( tnl-benchmark-linear-solvers${debugExt} PROPERTIES COMPILE_FLAGS "${CXX_OPTIMIZE_FLAGS}" )
     TARGET_LINK_LIBRARIES( tnl-benchmark-spmv${debugExt} tnl${debugExt}-${tnlVersion} )
+    TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers${debugExt} tnl${debugExt}-${tnlVersion} )
 ENDIF()
 
-TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers${debugExt} tnl${debugExt}-${tnlVersion} )
-                                                              
+if( BUILD_CUDA )                                                              
+   INSTALL( TARGETS tnl-cuda-benchmarks${debugExt}
+            RUNTIME DESTINATION bin )
+endif()
+
 INSTALL( TARGETS tnl-benchmark-spmv${debugExt}
                  tnl-benchmark-linear-solvers${debugExt}
          RUNTIME DESTINATION bin )
diff --git a/tests/benchmarks/share/tnl-matrix-solvers-benchmark.cfg.desc b/tests/benchmarks/share/tnl-matrix-solvers-benchmark.cfg.desc
deleted file mode 100644
index 8e8280574470d15a71513a8cd49958af2ffdcafa..0000000000000000000000000000000000000000
--- a/tests/benchmarks/share/tnl-matrix-solvers-benchmark.cfg.desc
+++ /dev/null
@@ -1,19 +0,0 @@
-group IO
-{
-   string input-file(!)         [Input binary file name.];
-   string input-mtx-file("")    [Input mtx file name.];
-   string log-file("")          [Log file name.];
-   string matrix-stats-file("") [File for matrix statistics like size number of non-zero elements.];
-   real stop-time(3.0)          [How many seconds shell we iterate SpMV.];
-   integer verbose(1)           [Verbose mode.];
-},[Arguments describing input and output data.];
-
-group solver
-{
-   string device("host")         [On what device the solver will run. Can be host or cuda.];
-   string solver-name(!)         [Set matrix solver for benchmarking. It can be sor, cg, bicgstab, tfqmr, gmres. ];
-   string solver-class("tnl")    [Choose other library of solvers. It can be tnl or petsc.];
-   real max-residue(1.0e-6)      [Set what residue we want to achieve.];
-   integer gmres-restarting(20)  [Set restarting for GMRES method.];   
-   real sor-omega(1.0)           [Omega parameter for the SOR method. Can be 0--2.];
-},[Arguments describing the solver.];
\ No newline at end of file
diff --git a/tests/benchmarks/share/tnl-sparse-matrix-benchmark.cfg.desc b/tests/benchmarks/share/tnl-sparse-matrix-benchmark.cfg.desc
deleted file mode 100644
index 9a3bde6ec250122dcb96fc681f8bd4f276bc5b9b..0000000000000000000000000000000000000000
--- a/tests/benchmarks/share/tnl-sparse-matrix-benchmark.cfg.desc
+++ /dev/null
@@ -1,12 +0,0 @@
-group IO
-{
-   string input-mtx-file(!)     [Input mtx file name.];
-   string input-file("")        [Input binary file name.];
-   string pdf-file("")          [PDF file with matrix pattern.];
-   string log-file("")          [Log file name.];
-   string precision("double")   [Precision of the arithmetics.];
-   real stop-time(3.0)          [How many seconds shell we iterate SpMV.];
-   integer max-iterations(100)  [Maximum number of SpMV repetitions.];
-   bool format-test( no )       [Turn on/off test of matrix formats.]; 
-   integer verbose(1)           [Verbose mode.];
-},[Arguments describing input and output data.];
\ No newline at end of file
diff --git a/tests/benchmarks/tnl-benchmark-spmv.h b/tests/benchmarks/tnl-benchmark-spmv.h
index aaf27b1b3e661df35f1191aec23118d83aeb8761..2f360605a64acfa4789ab6bfe1a05cd3722c9350 100644
--- a/tests/benchmarks/tnl-benchmark-spmv.h
+++ b/tests/benchmarks/tnl-benchmark-spmv.h
@@ -245,17 +245,17 @@ double benchmarkMatrix( const Matrix& matrix,
                         fstream& logFile )
 {
    tnlTimerRT timer;
-   timer.Reset();
+   timer.reset();
    double time( 0.0 );
    int iterations( 0 );
    while( time < stopTime )
    {
       matrix.vectorProduct( x, b );
 #ifdef HAVE_CUDA
-      if( Matrix::DeviceType::DeviceType == tnlCudaDevice )
+      if( ( tnlDeviceEnum ) Matrix::DeviceType::DeviceType == tnlCudaDevice )
          cudaThreadSynchronize();
 #endif
-      time = timer.GetTime();
+      time = timer.getTime();
       iterations++;
    }
    const double gflops = computeGflops( nonzeroElements, iterations, time );
@@ -393,7 +393,7 @@ bool setupBenchmark( const tnlParameterContainer& parameters )
          cusparseDestroy( cusparseHandle );
 
          cout << " done.   \r";
-         cudaCSRMatrix.setCudaKernelType( CSRMatrixCudaType::scalar );
+         /*cudaCSRMatrix.setCudaKernelType( CSRMatrixCudaType::scalar );
          benchmarkMatrix( cudaCSRMatrix,
                           cudaX,
                           cudaB,
@@ -524,7 +524,7 @@ bool setupBenchmark( const tnlParameterContainer& parameters )
                           stopTime,
                           baseline,
                           verbose,
-                          logFile );
+                          logFile );*/
       }
       cudaCSRMatrix.reset();
 #endif
diff --git a/tests/benchmarks/tnl-benchmarks.cpp b/tests/benchmarks/tnl-benchmarks.cpp
deleted file mode 100644
index 69af7724356147e84594aa3c4f8c35db6010b20b..0000000000000000000000000000000000000000
--- a/tests/benchmarks/tnl-benchmarks.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/***************************************************************************
-                          tnl-benchmarks.cpp  -  description
-                             -------------------
-    begin                : Nov 25, 2010
-    copyright            : (C) 2010 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-
-#include <core/vectors/tnlVectorHost.h>
-#include <core/vectors/tnlVectorCUDA.h>
-#include <tnl-benchmarks.h>
-
-
-int main( int argc, char* argv[] )
-{
-
-#ifdef HAVE_CUDA
-   cout << "Benchmarking memory bandwidth when transfering int ..." << endl;
-
-   const int size = 1 << 22;
-   double host_to_host_band_width;
-   double host_to_device_band_width;
-   double device_to_host_band_width;
-   double device_to_device_band_width;
-
-   transferBenchmark< int >( size,
-                             host_to_host_band_width,
-                             host_to_device_band_width,
-                             device_to_host_band_width,
-                             device_to_device_band_width );
-
-
-   cout << "Benchmarking reduction of int ..." << endl;
-   for( int i = 0; i <= 6; i ++ )
-      reductionBenchmark< int >( size, i );
-
-   cout << "Benchmarking reduction of float ..." << endl;
-   for( int i = 0; i <= 6; i ++ )
-      reductionBenchmark< float >( size, i );
-
-   cout << "Benchmarking reduction of double ..." << endl;
-   for( int i = 0; i <= 6; i ++ )
-      reductionBenchmark< double >( size / 2, i );
-
-#endif
-   return EXIT_SUCCESS;
-}
diff --git a/tests/benchmarks/tnl-benchmarks.h b/tests/benchmarks/tnl-benchmarks.h
deleted file mode 100644
index bb40f95948086bcd9cd5ff99cb36eb6a786c7024..0000000000000000000000000000000000000000
--- a/tests/benchmarks/tnl-benchmarks.h
+++ /dev/null
@@ -1,257 +0,0 @@
-/***************************************************************************
-                          tnl-benchmarks.h  -  description
-                             -------------------
-    begin                : Jan 27, 2010
-    copyright            : (C) 2010 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef TNLBENCHMARKS_H_
-#define TNLBENCHMARKS_H_
-
-#include <core/mfuncs.h>
-#include <core/tnlTimerCPU.h>
-#include <../tests/unit-tests/core/tnl-cuda-kernels.h>
-#include <core/low-level/cuda-long-vector-kernels.h>
-
-template< class T >
-bool transferBenchmark( const int size,
-                        double& host_to_host_band_width,
-                        double& host_to_device_band_width,
-                        double& device_to_host_band_width,
-                        double& device_to_device_band_width )
-{
-
-  tnlVector< T > host_vector( "transferBenchmark:host-vector", size );
-  tnlVector< T > host_vector2( "transferBenchmark:host-vector-2", size );
-  tnlVector< T, tnlCuda > device_vector( "transferBenchmark:device-vector", size );
-  tnlVector< T, tnlCuda > device_vector2( "transferBenchmark:device-vector-2", size );
-
-   for( int i = 0; i < size; i ++ )
-      host_vector[ i ] = i + 1;
-
-   const long int cycles = 100;
-   long int bytes = cycles * size * sizeof( int );
-   long int mega_byte = 1 << 20;
-
-   tnlTimerCPU timer;
-   timer. Reset();
-   for( int i = 0; i < cycles; i ++ )
-      host_vector2 = host_vector;
-
-   double time = timer. GetTime();
-   double giga_byte = ( double ) ( 1 << 30 );
-   host_to_host_band_width = bytes / giga_byte / time;
-
-   cout << "Transfering " << bytes / mega_byte << " MB from HOST to HOST took " << time << " seconds. Bandwidth is " << host_to_host_band_width << " GB/s." << endl;
-
-   timer. Reset();
-   for( int i = 0; i < cycles; i ++ )
-      device_vector = host_vector;
-
-   time = timer. GetTime();
-   host_to_device_band_width = bytes / giga_byte / time;
-
-   cout << "Transfering " << bytes / mega_byte << " MB from HOST to DEVICE took " << time << " seconds. Bandwidth is " << host_to_device_band_width << " GB/s." << endl;
-
-   timer. Reset();
-   for( int i = 0; i < cycles; i ++ )
-      host_vector2 = device_vector;
-
-   time = timer. GetTime();
-   device_to_host_band_width = bytes / giga_byte / time;
-
-   cout << "Transfering " << bytes / mega_byte << " MB from DEVICE to HOST took " << time << " seconds. Bandwidth is " << device_to_host_band_width << " GB/s." << endl;
-
-   timer. Reset();
-   for( int i = 0; i < cycles; i ++ )
-      device_vector2 = device_vector;
-
-
-   time = timer. GetTime();
-
-   // Since we read and write tha data back we process twice as many bytes.
-   bytes *= 2;
-   device_to_device_band_width = bytes / giga_byte / time;
-
-   cout << "Transfering " << bytes / mega_byte << " MB from DEVICE to DEVICE took " << time << " seconds. Bandwidth is " << device_to_device_band_width << " GB/s." << endl;
-}
-
-template< class T >
-void tnlCPUReductionSum( const tnlVector< T >& host_vector,
-                         T& sum )
-{
-   const T* data = host_vector. getData();
-   const int size = host_vector. getSize();
-   sum = 0.0;
-   for( int i = 0; i < size; i ++ )
-      sum += data[ i ];
-};
-
-template< class T >
-void tnlCPUReductionMin( const tnlVector< T >& host_vector,
-                         T& min )
-{
-   const T* data = host_vector. getData();
-   const int size = host_vector. getSize();
-   //tnlAssert( data );
-   min = data[ 0 ];
-   for( int i = 1; i < size; i ++ )
-      min = :: Min( min,  data[ i ] );
-};
-
-template< class T >
-void tnlCPUReductionMax( const tnlVector< T >& host_vector,
-                         T& max )
-{
-   const T* data = host_vector. getData();
-   const int size = host_vector. getSize();
-   //tnlAssert( data );
-   max = data[ 0 ];
-   for( int i = 1; i < size; i ++ )
-      max = :: Max( max,  data[ i ] );
-};
-
-template< class T >
-void reductionBenchmark( const int size,
-                         const int algorithm )
-{
-   tnlVector< T > host_vector( "reductionBenchmark:host-vector", size );
-   tnlVector< T, tnlCuda > device_vector( "reductionBenchmark:device-vector", size );
-   tnlVector< T, tnlCuda > device_aux( "reductionBenchmark:device-aux", size / 2 );
-
-   for( int i = 0; i < size; i ++ )
-      host_vector[ i ] = i + 1;
-
-   device_vector = host_vector;
-
-   T sum, min, max;
-   const long int reducing_cycles( 1 );
-
-   tnlTimerCPU timer;
-   timer. Reset();
-   for( int i = 0; i < reducing_cycles; i ++ )
-   {
-      switch( algorithm )
-      {
-         case 0:  // reduction on CPU
-            tnlCPUReductionSum( host_vector, sum );
-            tnlCPUReductionMin( host_vector, sum );
-            tnlCPUReductionMax( host_vector, sum );
-#ifdef HAVE_CUDA
-         case 1:
-            tnlCUDASimpleReduction1< T, tnlParallelReductionSum >( size,
-                                                                   device_vector. getData(),
-                                                                   sum,
-                                                                   device_aux. getData() );
-            tnlCUDASimpleReduction1< T, tnlParallelReductionMin >( size,
-                                                                   device_vector. getData(),
-                                                                   min,
-                                                                   device_aux. getData() );
-            tnlCUDASimpleReduction1< T, tnlParallelReductionMax >( size,
-                                                                   device_vector. getData(),
-                                                                   max,
-                                                                   device_aux. getData() );
-            break;
-         case 2:
-            tnlCUDASimpleReduction2< T, tnlParallelReductionSum >( size,
-                                                                   device_vector. getData(),
-                                                                   sum,
-                                                                   device_aux. getData() );
-            tnlCUDASimpleReduction2< T, tnlParallelReductionMin >( size,
-                                                                   device_vector. getData(),
-                                                                   min,
-                                                                   device_aux. getData() );
-            tnlCUDASimpleReduction2< T, tnlParallelReductionMax >( size,
-                                                                   device_vector. getData(),
-                                                                   max,
-                                                                   device_aux. getData() );
-            break;
-         case 3:
-            tnlCUDASimpleReduction3< T, tnlParallelReductionSum >( size,
-                                                                   device_vector. getData(),
-                                                                   sum,
-                                                                   device_aux. getData() );
-            tnlCUDASimpleReduction3< T, tnlParallelReductionMin >( size,
-                                                                   device_vector. getData(),
-                                                                   min,
-                                                                   device_aux. getData() );
-            tnlCUDASimpleReduction3< T, tnlParallelReductionMax >( size,
-                                                                   device_vector. getData(),
-                                                                   max,
-                                                                   device_aux. getData() );
-            break;
-         case 4:
-            tnlCUDASimpleReduction4< T, tnlParallelReductionSum >( size,
-                                                                   device_vector. getData(),
-                                                                   sum,
-                                                                   device_aux. getData() );
-            tnlCUDASimpleReduction4< T, tnlParallelReductionMin >( size,
-                                                                   device_vector. getData(),
-                                                                   min,
-                                                                   device_aux. getData() );
-            tnlCUDASimpleReduction4< T, tnlParallelReductionMax >( size,
-                                                                   device_vector. getData(),
-                                                                   max,
-                                                                   device_aux. getData() );
-            break;
-         case 5:
-            tnlCUDASimpleReduction5< T, tnlParallelReductionSum >( size,
-                                                                   device_vector. getData(),
-                                                                   sum,
-                                                                   device_aux. getData() );
-            tnlCUDASimpleReduction5< T, tnlParallelReductionMin >( size,
-                                                                   device_vector. getData(),
-                                                                   min,
-                                                                   device_aux. getData() );
-            tnlCUDASimpleReduction5< T, tnlParallelReductionMax >( size,
-                                                                   device_vector. getData(),
-                                                                   max,
-                                                                   device_aux. getData() );
-            break;
-         default:
-            reductionOnCudaDevice< T, T, int, tnlParallelReductionSum >( size,
-                                                                              device_vector. getData(),
-                                                                              NULL,
-                                                                              sum,
-                                                                              0.0,
-                                                                              device_aux. getData() );
-            reductionOnCudaDevice< T, T, int, tnlParallelReductionMin >( size,
-                                                                              device_vector. getData(),
-                                                                              NULL,
-                                                                              min,
-                                                                              0.0,
-                                                                              device_aux. getData() );
-            reductionOnCudaDevice< T, T, int, tnlParallelReductionMax >( size,
-                                                                              device_vector. getData(),
-                                                                              NULL,
-                                                                              max,
-                                                                              0.0,
-                                                                              device_aux. getData() );
-#endif
-
-      }
-   }
-   const double time = timer. GetTime();
-   double giga_byte = ( double ) ( 1 << 30 );
-   long int mega_byte = 1 << 20;
-   long int bytes_reduced = size * sizeof( T ) * reducing_cycles * 3;
-   const double reduction_band_width = bytes_reduced / giga_byte / time;
-
-   cout << "Reducing " << bytes_reduced / mega_byte
-        << " MB on DEVICE using algorithm " << algorithm
-        << " took " << time
-        << " seconds. Bandwidth is " << reduction_band_width
-        << " GB/s." << endl;
-}
-
-#endif /* TNLBENCHMARKS_H_ */
diff --git a/examples/simple-solver/simpleProblemSetter.h b/tests/benchmarks/tnl-cuda-benchmarks.cu
similarity index 55%
rename from examples/simple-solver/simpleProblemSetter.h
rename to tests/benchmarks/tnl-cuda-benchmarks.cu
index cd8f01c41d9279ef39e09b5b046ba220872ef6ad..e21eb98636b46afdf62abb576c6fdec4c36e3821 100644
--- a/examples/simple-solver/simpleProblemSetter.h
+++ b/tests/benchmarks/tnl-cuda-benchmarks.cu
@@ -1,8 +1,8 @@
 /***************************************************************************
-                          simpleProblemSetter.h  -  description
+                          tnl-cuda-benchmarks.cu  -  description
                              -------------------
-    begin                : Feb 23, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
+    begin                : May 28, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
@@ -15,26 +15,4 @@
  *                                                                         *
  ***************************************************************************/
 
-#ifndef SIMPLEPROBLEMTYPESSETTER_H_
-#define SIMPLEPROBLEMTYPESSETTER_H_
-
-#include <config/tnlParameterContainer.h>
-#include <mesh/tnlGrid.h>
-#include "simpleProblemSolver.h"
-
-template< typename RealType,
-          typename DeviceType,
-          typename IndexType,
-          typename MeshType,
-          typename ConfigTag,
-          typename SolverStarter >
-class simpleProblemSetter
-{
-   public:
-
-   static bool run( const tnlParameterContainer& parameters );
-};
-
-#include "simpleProblemSetter_impl.h"
-
-#endif /* SIMPLEPROBLEMSETTER_H_ */
+#include "tnl-cuda-benchmarks.h"
\ No newline at end of file
diff --git a/tests/benchmarks/tnl-cuda-benchmarks.h b/tests/benchmarks/tnl-cuda-benchmarks.h
new file mode 100644
index 0000000000000000000000000000000000000000..38c108b5d90aad5232d0428c8ffa1bb9878526e6
--- /dev/null
+++ b/tests/benchmarks/tnl-cuda-benchmarks.h
@@ -0,0 +1,174 @@
+/***************************************************************************
+                          tnl-cuda-benchmarks.h  -  description
+                             -------------------
+    begin                : Jan 27, 2010
+    copyright            : (C) 2010 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#ifndef TNLCUDABENCHMARKS_H_
+#define TNLCUDABENCHMARKS_H_
+
+#include <tnlConfig.h>
+#include <core/vectors/tnlVector.h>
+#include <core/tnlTimerRT.h>
+
+#ifdef HAVE_CUBLAS
+//#include <cublas.h>
+#endif    
+
+
+
+int main( int argc, char* argv[] )
+{
+#ifdef HAVE_CUDA
+   
+   typedef double Real;
+   typedef tnlVector< Real, tnlHost > HostVector;
+   typedef tnlVector< Real, tnlCuda > CudaVector;
+
+   
+   /****
+    * The first argument of this program is the size of the data set to be reduced.
+    * If no argument is given we use hardcoded default value.
+    */
+   int size = 1 << 22;
+   if( argc > 1 )
+      size = atoi( argv[ 1 ] );
+   int loops = 10;
+   if( argc > 2 )
+      loops = atoi( argv[ 2 ] );
+   
+   
+   
+   const double oneGB = 1024.0 * 1024.0 * 1024.0;
+   double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB;
+   
+   HostVector hostVector, hostVector2;
+   CudaVector deviceVector, deviceVector2;
+   hostVector.setSize( size );
+   if( ! deviceVector.setSize( size ) )
+      return EXIT_FAILURE;
+   hostVector2.setLike( hostVector );
+   if( ! deviceVector2.setLike( deviceVector ) )
+      return EXIT_FAILURE;
+
+   hostVector.setValue( 1.0 );
+   deviceVector.setValue( 1.0 );
+   hostVector2.setValue( 1.0 );
+   deviceVector2.setValue( 1.0 );
+
+   tnlTimerRT timer;
+   double bandwidth( 0.0 );
+
+   /*   
+   cout << "Benchmarking CPU-GPU memory bandwidth: ";
+   timer.reset();
+   timer.start();
+   for( int i = 0; i < loops; i++ )
+     deviceVector = hostVector;
+   timer.stop();    
+   bandwidth = datasetSize / timer.getTime();
+   cout << bandwidth << " GB/sec." << endl;
+    
+   cout << "Benchmarking vector addition on CPU: ";
+   timer.reset();
+   timer.start();
+   for( int i = 0; i < loops; i++ )
+     hostVector.addVector( hostVector2 );
+   timer.stop();
+   bandwidth = 2 * datasetSize / timer.getTime();
+   cout << bandwidth << " GB/sec." << endl;
+    
+    cout << "Benchmarking vector addition on GPU: ";
+    timer.reset();
+    timer.start();
+    for( int i = 0; i < loops; i++ )
+      deviceVector.addVector( deviceVector2 );
+    cudaThreadSynchronize();
+    timer.stop();
+    bandwidth = 3 * datasetSize / timer.getTime();
+    cout << bandwidth << " GB/sec." << endl;
+    */
+
+   Real resultHost, resultDevice;
+   cout << "Benchmarking scalar product on CPU: ";
+   timer.reset();
+   timer.start();
+   for( int i = 0; i < loops; i++ )
+     resultHost = hostVector.scalarProduct( hostVector2 );
+   timer.stop();
+   bandwidth = 2 * datasetSize / timer.getTime();
+   cout << bandwidth << " GB/sec." << endl;
+    
+   cout << "Benchmarking scalar product on GPU: " << endl;
+   timer.reset();
+   timer.start();
+   for( int i = 0; i < loops; i++ )
+      resultDevice = deviceVector.scalarProduct( deviceVector );
+   cout << "Time: " << timer.getTime() << endl;
+   timer.stop();
+   bandwidth = 2 * datasetSize / timer.getTime();
+   cout << "Time: " << timer.getTime() << " bandwidth: " << bandwidth << " GB/sec." << endl;
+   if( resultHost != resultDevice )
+   {
+      cerr << "Error. " << resultHost << " != " << resultDevice << endl;
+      //return EXIT_FAILURE;
+   }
+
+#ifdef HAVE_CUBLAS
+   cout << "Benchmarking scalar product on GPU with Cublas: " << endl;
+   cublasHandle_t handle;
+   cublasCreate( &handle );
+   timer.reset();
+   timer.start();   
+   for( int i = 0; i < loops; i++ )
+      cublasDdot( handle,
+                  size,
+                  deviceVector.getData(), 1,
+                  deviceVector.getData(), 1,
+                  &resultDevice );
+   cudaThreadSynchronize();
+   timer.stop();
+   bandwidth = 2 * datasetSize / timer.getTime();
+   cout << "Time: " << timer.getTime() << " bandwidth: " << bandwidth << " GB/sec." << endl;
+#endif    
+#endif
+   
+   cout << "Benchmarking prefix-sum on CPU ..." << endl;
+   timer.reset();
+   timer.start();
+   hostVector.computePrefixSum();
+   cout << "Time: " << timer.getTime() << endl;
+   timer.stop();
+   bandwidth = 2 * datasetSize / loops / timer.getTime();
+   cout << "Time: " << timer.getTime() << " bandwidth: " << bandwidth << " GB/sec." << endl;
+   
+   cout << "Benchmarking prefix-sum on GPU ..." << endl;
+   timer.reset();
+   timer.start();
+   deviceVector.computePrefixSum();
+   cout << "Time: " << timer.getTime() << endl;
+   timer.stop();
+   bandwidth = 2 * datasetSize / loops / timer.getTime();
+   cout << "Time: " << timer.getTime() << " bandwidth: " << bandwidth << " GB/sec." << endl;
+
+   for( int i = 0; i < size; i++ )
+      if( hostVector.getElement( i ) != deviceVector.getElement( i ) )
+      {
+         cerr << "Error in prefix sum at position " << i << ":  " << hostVector.getElement( i ) << " != " << deviceVector.getElement( i ) << endl;
+      }
+
+   return EXIT_SUCCESS;
+}
+
+#endif /* TNLCUDABENCHMARKS_H_ */
diff --git a/tests/long-time-unit-tests/CMakeLists.txt b/tests/long-time-unit-tests/CMakeLists.txt
index 82f67730c24e5a7f00f91b699b6c6914bcb04303..e526b0a7f9aa14b93b5845f54066328a988a0488 100755
--- a/tests/long-time-unit-tests/CMakeLists.txt
+++ b/tests/long-time-unit-tests/CMakeLists.txt
@@ -2,10 +2,17 @@ set( ENABLE_CODECOVERAGE )
 
 SET( headers matrix-formats-test.h )
 
-ADD_EXECUTABLE( tnl-test-matrix-formats${mpiExt}${debugExt} ${headers} matrix-formats-test.cpp )
+if( BUILD_CUDA )
+    CUDA_ADD_EXECUTABLE( tnl-test-matrix-formats${mpiExt}${debugExt} ${headers} matrix-formats-test.cu
+                         OPTIONS ${CUDA_ADD_EXECUTABLE_OPTIONS} )
+else()
+   ADD_EXECUTABLE( tnl-test-matrix-formats${mpiExt}${debugExt} ${headers} matrix-formats-test.cpp )                                                                   
+endif()
+
 TARGET_LINK_LIBRARIES( tnl-test-matrix-formats${mpiExt}${debugExt} ${CPPUNIT_LIBRARIES}
                                                                    tnl${mpiExt}${debugExt}-0.1 )
 
+
 INSTALL( TARGETS tnl-test-matrix-formats${debugExt}
          RUNTIME DESTINATION bin
          PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE )
diff --git a/examples/simple-solver/main.cpp b/tests/long-time-unit-tests/matrix-formats-test.cu
similarity index 62%
rename from examples/simple-solver/main.cpp
rename to tests/long-time-unit-tests/matrix-formats-test.cu
index 2ab0f85a74e0cdeeeb1f4c41f316b36b6022e472..ea8e69e2ed478934a623a27df666633dd4ecf3be 100644
--- a/examples/simple-solver/main.cpp
+++ b/tests/long-time-unit-tests/matrix-formats-test.cu
@@ -1,7 +1,7 @@
 /***************************************************************************
-                          main.cpp  -  description
+                          matrix-formats-test.cu  -  description
                              -------------------
-    begin                : Jan 12, 2013
+    begin                : Dec 14, 2013
     copyright            : (C) 2013 by Tomas Oberhuber
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
@@ -15,21 +15,4 @@
  *                                                                         *
  ***************************************************************************/
 
-#include "simpleProblemSetter.h"
-#include "simpleProblemConfig.h"
-#include <solvers/tnlSolver.h>
-#include <solvers/tnlFastBuildConfig.h>
-#include <solvers/tnlConfigTags.h>
-
-//typedef tnlDefaultConfigTag BuildConfig;
-typedef tnlFastBuildConfig BuildConfig;
-
-int main( int argc, char* argv[] )
-{
-   tnlSolver< simpleProblemSetter, simpleProblemConfig, BuildConfig > solver;
-   if( ! solver. run( argc, argv ) )
-      return EXIT_FAILURE;
-   return EXIT_SUCCESS;
-}
-
-
+#include "matrix-formats-test.h"
\ No newline at end of file
diff --git a/tests/unit-tests/core/vectors/tnlVectorOperationsTester.h b/tests/unit-tests/core/vectors/tnlVectorOperationsTester.h
index ee7b3d0184626ec645d09149036446d9e993fc99..2732fe86aadac54698169ec71df48066bb414b19 100644
--- a/tests/unit-tests/core/vectors/tnlVectorOperationsTester.h
+++ b/tests/unit-tests/core/vectors/tnlVectorOperationsTester.h
@@ -62,9 +62,9 @@ class tnlVectorOperationsTester : public CppUnit :: TestCase
       suiteOfTests -> addTest( new TestCallerType( "vectorScalarMultiplicationTest", &tnlVectorOperationsTester::vectorScalarMultiplicationTest ) );
       suiteOfTests -> addTest( new TestCallerType( "getSclaraProductTest", &tnlVectorOperationsTester::getVectorScalarProductTest ) );
       suiteOfTests -> addTest( new TestCallerType( "addVectorTest", &tnlVectorOperationsTester::addVectorTest ) );
-      suiteOfTests -> addTest( new TestCallerType( "alphaXPlusBetaYTest", &tnlVectorOperationsTester::alphaXPlusBetaYTest ) );
+      /*suiteOfTests -> addTest( new TestCallerType( "alphaXPlusBetaYTest", &tnlVectorOperationsTester::alphaXPlusBetaYTest ) );
       suiteOfTests -> addTest( new TestCallerType( "alphaXPlusBetaZTest", &tnlVectorOperationsTester::alphaXPlusBetaZTest ) );
-      suiteOfTests -> addTest( new TestCallerType( "alphaXPlusBetaZPlusYTest", &tnlVectorOperationsTester::alphaXPlusBetaZPlusYTest ) );
+      suiteOfTests -> addTest( new TestCallerType( "alphaXPlusBetaZPlusYTest", &tnlVectorOperationsTester::alphaXPlusBetaZPlusYTest ) );*/
       suiteOfTests -> addTest( new TestCallerType( "prefixSumTest", &tnlVectorOperationsTester::prefixSumTest ) );
       suiteOfTests -> addTest( new TestCallerType( "exclusivePrefixSumTest", &tnlVectorOperationsTester::exclusivePrefixSumTest ) );
       return suiteOfTests;
@@ -325,7 +325,7 @@ class tnlVectorOperationsTester : public CppUnit :: TestCase
          CPPUNIT_ASSERT( y.getElement( i ) == 1.0 + 3.0 * i );
    };
 
-   void alphaXPlusBetaYTest()
+   /*void alphaXPlusBetaYTest()
    {
       const int size( 10000 );
       tnlVector< Real, Device > x, y;
@@ -368,7 +368,7 @@ class tnlVectorOperationsTester : public CppUnit :: TestCase
 
       for( int i = 0; i < size; i ++ )
          CPPUNIT_ASSERT( y.getElement( i ) == -1.0 + 3.0 * i );
-   };
+   };*/
 
    void prefixSumTest()
    {
diff --git a/tests/unit-tests/matrices/tnlChunkedEllpackMatrixTester.h b/tests/unit-tests/matrices/tnlChunkedEllpackMatrixTester.h
index 258beecdc607c78d4083cc025380814faf43e23a..5f281816eb863183594d98bd3cd2bd4da6750f97 100644
--- a/tests/unit-tests/matrices/tnlChunkedEllpackMatrixTester.h
+++ b/tests/unit-tests/matrices/tnlChunkedEllpackMatrixTester.h
@@ -85,7 +85,7 @@ class tnlChunkedEllpackMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m1.getRows() );
       rowLengths.setValue( 5 );
-      m1.setRowLengths( rowLengths );
+      m1.setCompressedRowsLengths( rowLengths );
       m2.setLike( m1 );
       CPPUNIT_ASSERT( m1.getRows() == m2.getRows() );
    }
@@ -100,7 +100,7 @@ class tnlChunkedEllpackMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m.getRows() );
       rowLengths.setValue( 7 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
 
       for( int i = 0; i < 7; i++ )
          CPPUNIT_ASSERT( m.setElement( 0, i, i ) );
@@ -117,7 +117,7 @@ class tnlChunkedEllpackMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m.getRows() );
       rowLengths.setValue( 7 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
 
       for( int i = 0; i < 10; i++ )
          m.setElement( i, i, i );
@@ -144,7 +144,7 @@ class tnlChunkedEllpackMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m.getRows() );
       rowLengths.setValue( 10 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
 
       for( int i = 0; i < 10; i++ )
          m.setElement( i, i, i );
@@ -164,7 +164,7 @@ class tnlChunkedEllpackMatrixTester : public CppUnit :: TestCase
       m.setDimensions( 10, 10 );
       m.setNumberOfChunksInSlice( SliceSize );
       m.setDesiredChunkSize( ChunkSize );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
       for( int i = 9; i >= 0; i-- )
          for( int j = 9; j >= 0; j-- )
             m.setElement( i, j, i+j );
@@ -185,7 +185,7 @@ class tnlChunkedEllpackMatrixTester : public CppUnit :: TestCase
       rowLengths.setSize( m.getRows() );
       for( int i = 0; i < 10; i++ )
          rowLengths.setElement( i, i+1 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
 
       for( int i = 0; i < 10; i++ )
          for( int j = 0; j <= i; j++ )
@@ -200,7 +200,7 @@ class tnlChunkedEllpackMatrixTester : public CppUnit :: TestCase
 
       m.reset();
       m.setDimensions( 10, 10 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
       for( int i = 9; i >= 0; i-- )
          for( int j = i; j >= 0; j-- )
             m.setElement( i, j, i + j );
@@ -223,7 +223,7 @@ class tnlChunkedEllpackMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m.getRows() );
       rowLengths.setValue( 7 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
       for( int i = 0; i < 10; i++ )
          m.setElement( i, i, i );
       for( int i = 0; i < 10; i++ )
@@ -256,7 +256,7 @@ class tnlChunkedEllpackMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m.getRows() );
       rowLengths.setValue( 7 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
       for( int i = 0; i < size; i++ )
       {
          v.setElement( i, i );
diff --git a/tests/unit-tests/matrices/tnlEllpackMatrixTester.h b/tests/unit-tests/matrices/tnlEllpackMatrixTester.h
index 040eba9066c526984e131184e1d19713ccc7cffb..442da3df4d21417c4fde872774d970b81e04eee8 100644
--- a/tests/unit-tests/matrices/tnlEllpackMatrixTester.h
+++ b/tests/unit-tests/matrices/tnlEllpackMatrixTester.h
@@ -72,7 +72,7 @@ class tnlEllpackMatrixTester : public CppUnit :: TestCase
    {
       MatrixType m1, m2;
       m1.setDimensions( 10, 10 );
-      m1.setConstantRowLengths( 5 );
+      m1.setConstantCompressedRowsLengths( 5 );
       m2.setLike( m1 );
       CPPUNIT_ASSERT( m1.getRows() == m2.getRows() );
    }
@@ -81,7 +81,7 @@ class tnlEllpackMatrixTester : public CppUnit :: TestCase
    {
       MatrixType m;
       m.setDimensions( 10, 10 );
-      m.setConstantRowLengths( 7 );
+      m.setConstantCompressedRowsLengths( 7 );
 
       for( int i = 0; i < 7; i++ )
          CPPUNIT_ASSERT( m.setElement( 0, i, i ) );
@@ -92,7 +92,7 @@ class tnlEllpackMatrixTester : public CppUnit :: TestCase
    {
       MatrixType m;
       m.setDimensions( 10, 10 );
-      m.setConstantRowLengths( 7 );
+      m.setConstantCompressedRowsLengths( 7 );
 
       for( int i = 0; i < 10; i++ )
          m.setElement( i, i, i );
@@ -108,7 +108,7 @@ class tnlEllpackMatrixTester : public CppUnit :: TestCase
    {
       MatrixType m;
       m.setDimensions( 10, 10 );
-      m.setConstantRowLengths( 10 );
+      m.setConstantCompressedRowsLengths( 10 );
 
       for( int i = 0; i < 10; i++ )
          m.setElement( i, i, i );
@@ -125,7 +125,7 @@ class tnlEllpackMatrixTester : public CppUnit :: TestCase
 
       m.reset();
       m.setDimensions( 10, 10 );
-      m.setConstantRowLengths( 10 );
+      m.setConstantCompressedRowsLengths( 10 );
       for( int i = 9; i >= 0; i-- )
          for( int j = 9; j >= 0; j-- )
             m.setElement( i, j, i+j );
@@ -139,7 +139,7 @@ class tnlEllpackMatrixTester : public CppUnit :: TestCase
    {
       MatrixType m;
       m.setDimensions( 10, 10 );
-      m.setConstantRowLengths( 7 );
+      m.setConstantCompressedRowsLengths( 7 );
       for( int i = 0; i < 10; i++ )
          m.setElement( i, i, i );
       for( int i = 0; i < 10; i++ )
@@ -166,7 +166,7 @@ class tnlEllpackMatrixTester : public CppUnit :: TestCase
       w.setSize( size );
       MatrixType m;
       m.setDimensions( size, size );
-      m.setConstantRowLengths( 7 );
+      m.setConstantCompressedRowsLengths( 7 );
       for( int i = 0; i < size; i++ )
       {
          v.setElement( i, i );
diff --git a/tests/unit-tests/matrices/tnlSlicedEllpackMatrixTester.h b/tests/unit-tests/matrices/tnlSlicedEllpackMatrixTester.h
index 29931d56bded94df5f817ade0b6a246c3d42b2d9..452c8387cdff03feaeea8191562df6c458d775e1 100644
--- a/tests/unit-tests/matrices/tnlSlicedEllpackMatrixTester.h
+++ b/tests/unit-tests/matrices/tnlSlicedEllpackMatrixTester.h
@@ -77,7 +77,7 @@ class tnlSlicedEllpackMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m1.getRows() );
       rowLengths.setValue( 5 );
-      m1.setRowLengths( rowLengths );
+      m1.setCompressedRowsLengths( rowLengths );
       m2.setLike( m1 );
       CPPUNIT_ASSERT( m1.getRows() == m2.getRows() );
    }
@@ -89,7 +89,7 @@ class tnlSlicedEllpackMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m.getRows() );
       rowLengths.setValue( 7 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
 
       for( int i = 0; i < 7; i++ )
          CPPUNIT_ASSERT( m.setElement( 0, i, i ) );
@@ -103,7 +103,7 @@ class tnlSlicedEllpackMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m.getRows() );
       rowLengths.setValue( 7 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
 
       for( int i = 0; i < 10; i++ )
          m.setElement( i, i, i );
@@ -127,7 +127,7 @@ class tnlSlicedEllpackMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m.getRows() );
       rowLengths.setValue( 10 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
 
       for( int i = 0; i < 10; i++ )
          m.setElement( i, i, i );
@@ -144,7 +144,7 @@ class tnlSlicedEllpackMatrixTester : public CppUnit :: TestCase
 
       m.reset();
       m.setDimensions( 10, 10 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
       for( int i = 9; i >= 0; i-- )
          for( int j = 9; j >= 0; j-- )
             m.setElement( i, j, i+j );
@@ -162,7 +162,7 @@ class tnlSlicedEllpackMatrixTester : public CppUnit :: TestCase
       rowLengths.setSize( m.getRows() );
       for( int i = 0; i < 10; i++ )
          rowLengths.setElement( i, i+1 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
 
       for( int i = 0; i < 10; i++ )
          for( int j = 0; j <= i; j++ )
@@ -177,7 +177,7 @@ class tnlSlicedEllpackMatrixTester : public CppUnit :: TestCase
 
       m.reset();
       m.setDimensions( 10, 10 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
       for( int i = 9; i >= 0; i-- )
          for( int j = i; j >= 0; j-- )
             m.setElement( i, j, i + j );
@@ -197,7 +197,7 @@ class tnlSlicedEllpackMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m.getRows() );
       rowLengths.setValue( 7 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
       for( int i = 0; i < 10; i++ )
          m.setElement( i, i, i );
       for( int i = 0; i < 10; i++ )
@@ -227,7 +227,7 @@ class tnlSlicedEllpackMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m.getRows() );
       rowLengths.setValue( 7 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
       for( int i = 0; i < size; i++ )
       {
          v.setElement( i, i );
diff --git a/tests/unit-tests/matrices/tnlSparseMatrixTester.h b/tests/unit-tests/matrices/tnlSparseMatrixTester.h
index 5d12983eb8cf2bb4833a840c86fb3266ac29c67f..8bb3706fca93240b10f2d2def3f04174f9f96b97 100644
--- a/tests/unit-tests/matrices/tnlSparseMatrixTester.h
+++ b/tests/unit-tests/matrices/tnlSparseMatrixTester.h
@@ -156,7 +156,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m1.getRows() );
       rowLengths.setValue( 5 );
-      m1.setRowLengths( rowLengths );
+      m1.setCompressedRowsLengths( rowLengths );
       m2.setLike( m1 );
       CPPUNIT_ASSERT( m1.getRows() == m2.getRows() );
    }
@@ -172,7 +172,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m.getRows() );
       rowLengths.setValue( 7 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
 
       for( int i = 0; i < 7; i++ )
          CPPUNIT_ASSERT( m.setElement( 0, i, i ) );
@@ -192,7 +192,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m.getRows() );
       rowLengths.setValue( 7 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
 
       if( DeviceType::getDevice() == tnlHostDevice )
       {
@@ -232,7 +232,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m.getRows() );
       rowLengths.setValue( 7 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
 
       for( int i = 0; i < 10; i++ )
          m.setElement( i, i, i );
@@ -257,7 +257,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m.getRows() );
       rowLengths.setValue( 7 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
 
       if( DeviceType::DeviceType == ( int ) tnlHostDevice )
       {
@@ -303,7 +303,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m.getRows() );
       rowLengths.setValue( 10 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
 
       for( int i = 0; i < 10; i++ )
          m.setElement( i, i, i );
@@ -320,7 +320,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
 
       m.reset();
       m.setDimensions( 10, 10 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
       for( int i = 9; i >= 0; i-- )
          for( int j = 9; j >= 0; j-- )
             m.setElement( i, j, i+j );
@@ -338,7 +338,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m.getRows() );
       rowLengths.setValue( 10 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
 
       if( DeviceType::DeviceType == ( int ) tnlHostDevice )
       {
@@ -376,7 +376,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
 
       m.reset();
       m.setDimensions( 10, 10 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
       if( DeviceType::DeviceType == ( int ) tnlHostDevice )
       {
          for( int i = 9; i >= 0; i-- )
@@ -417,7 +417,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
       rowLengths.setSize( m.getRows() );
       for( int i = 0; i < 10; i++ )
          rowLengths.setElement( i, i+1 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
 
       for( int i = 0; i < 10; i++ )
          for( int j = 0; j <= i; j++ )
@@ -432,7 +432,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
 
       m.reset();
       m.setDimensions( 10, 10 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
       for( int i = 9; i >= 0; i-- )
          for( int j = i; j >= 0; j-- )
             m.setElement( i, j, i + j );
@@ -454,7 +454,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
       rowLengths.setSize( m.getRows() );
       for( int i = 0; i < 10; i++ )
          rowLengths.setElement( i, i+1 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
 
       if( DeviceType::DeviceType == ( int ) tnlHostDevice )
       {
@@ -490,7 +490,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
 
       m.reset();
       m.setDimensions( 10, 10 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
       if( DeviceType::DeviceType == ( int ) tnlHostDevice )
       {
          for( int i = 9; i >= 0; i-- )
@@ -533,7 +533,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m.getRows() );
       rowLengths.setValue( 7 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
       for( int i = 0; i < 10; i++ )
          m.setElement( i, i, i );
       for( int i = 0; i < 10; i++ )
@@ -563,7 +563,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m.getRows() );
       rowLengths.setValue( 7 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
       RealType values[ 1 ];
       IndexType columnIndexes[ 1 ];
 
@@ -594,7 +594,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m.getRows() );
       rowLengths.setValue( 7 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
 
 
       if( DeviceType::DeviceType == ( int ) tnlHostDevice )
@@ -648,7 +648,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m.getRows() );
       rowLengths.setValue( 10 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
       RealType values[ 10 ];
       IndexType columnIndexes[ 10 ];
 
@@ -674,7 +674,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
 
       m.reset();
       m.setDimensions( 10, 10 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
       for( int i = 9; i >= 0; i-- )
       {
          for( int j = 9; j >= 0; j-- )
@@ -695,7 +695,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m.getRows() );
       rowLengths.setValue( 10 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
 
       RealType values[ 10 ];
       IndexType columnIndexes[ 10 ];
@@ -744,7 +744,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
 
       m.reset();
       m.setDimensions( 10, 10 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
 
       if( DeviceType::DeviceType == ( int ) tnlHostDevice )
       {
@@ -789,7 +789,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
       rowLengths.setSize( m.getRows() );
       for( int i = 0; i < 10; i++ )
          rowLengths.setElement( i, i+1 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
 
 
       RealType values[ 10 ];
@@ -814,7 +814,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
 
       m.reset();
       m.setDimensions( 10, 10 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
       for( int i = 9; i >= 0; i-- )
       {
          for( int j = i; j >= 0; j-- )
@@ -839,7 +839,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
       rowLengths.setSize( m.getRows() );
       for( int i = 0; i < 10; i++ )
          rowLengths.setElement( i, i+1 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
 
 
       RealType values[ 10 ];
@@ -885,7 +885,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
 
       m.reset();
       m.setDimensions( 10, 10 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
 
       if( DeviceType::DeviceType == ( int ) tnlHostDevice )
       {
@@ -937,7 +937,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m.getRows() );
       rowLengths.setValue( 7 );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
       for( int i = 0; i < size; i++ )
       {
          v.setElement( i, i );
@@ -961,7 +961,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m.getRows() );
       rowLengths.setValue( size );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
       for( int i = 0; i < size; i++ )
       {
          for( int j = 0; j < size; j++ )
@@ -986,7 +986,7 @@ class tnlSparseMatrixTester : public CppUnit :: TestCase
       IndexVector rowLengths;
       rowLengths.setSize( m.getRows() );
       rowLengths.setValue( size );
-      m.setRowLengths( rowLengths );
+      m.setCompressedRowsLengths( rowLengths );
       for( int i = 0; i < size; i++ )
       {
          for( int j = 0; j <= i; j++ )
diff --git a/tests/unit-tests/mesh/CMakeLists.txt b/tests/unit-tests/mesh/CMakeLists.txt
index 66d04858d348c9cd46b44c1946d2e5b960d5cca6..8a806acdfe3ac0bb351a4cc3b80235d733a5b206 100755
--- a/tests/unit-tests/mesh/CMakeLists.txt
+++ b/tests/unit-tests/mesh/CMakeLists.txt
@@ -6,12 +6,12 @@ ADD_EXECUTABLE( tnlGridTest${mpiExt}${debugExt} ${headers} tnlGridTest.cpp )
 TARGET_LINK_LIBRARIES( tnlGridTest${mpiExt}${debugExt} ${CPPUNIT_LIBRARIES}
                                                        tnl${mpiExt}${debugExt}-0.1 )
 
-#ADD_EXECUTABLE( tnlMeshEntityTest${mpiExt}${debugExt} ${headers} tnlMeshEntityTest.cpp )
-#TARGET_LINK_LIBRARIES( tnlMeshEntityTest${mpiExt}${debugExt} ${CPPUNIT_LIBRARIES}
-#                                                              tnl${mpiExt}${debugExt}-0.1 )
+ADD_EXECUTABLE( tnlMeshEntityTest${mpiExt}${debugExt} ${headers} tnlMeshEntityTest.cpp )
+TARGET_LINK_LIBRARIES( tnlMeshEntityTest${mpiExt}${debugExt} ${CPPUNIT_LIBRARIES}
+                                                              tnl${mpiExt}${debugExt}-0.1 )
 
-#ADD_EXECUTABLE( tnlMeshTest${mpiExt}${debugExt} ${headers} tnlMeshTest.cpp )
-#TARGET_LINK_LIBRARIES( tnlMeshTest${mpiExt}${debugExt} ${CPPUNIT_LIBRARIES}
-#                                                              tnl${mpiExt}${debugExt}-0.1 )
+ADD_EXECUTABLE( tnlMeshTest${mpiExt}${debugExt} ${headers} tnlMeshTest.cpp )
+TARGET_LINK_LIBRARIES( tnlMeshTest${mpiExt}${debugExt} ${CPPUNIT_LIBRARIES}
+                                                              tnl${mpiExt}${debugExt}-0.1 )
 
                                                               
\ No newline at end of file
diff --git a/tests/unit-tests/mesh/tnlMeshEntityTester.h b/tests/unit-tests/mesh/tnlMeshEntityTester.h
index 415bec2180a904895b5aa35f47a3dab771286fbd..fdf5221e2a301731738fe1df59270a1676d9ca24 100644
--- a/tests/unit-tests/mesh/tnlMeshEntityTester.h
+++ b/tests/unit-tests/mesh/tnlMeshEntityTester.h
@@ -31,31 +31,21 @@
 #include <mesh/topologies/tnlMeshTriangleTag.h>
 #include <mesh/topologies/tnlMeshTetrahedronTag.h>
     
- typedef tnlMeshConfigBase< 2, double, int, int, void > MeshConfigBaseType;
- struct TestTriangleEntityTag : public MeshConfigBaseType
- {
-     typedef tnlMeshTriangleTag CellTag;
- };
- struct TestEdgeEntityTag : public MeshConfigBaseType
- {
-     typedef tnlMeshEdgeTag CellTag;
- };
- struct TestVertexEntityTag : public MeshConfigBaseType
- {
-     typedef tnlMeshVertexTag CellTag;
- };
-
- template< int Dimensions >
- struct tnlMeshSuperentityStorage< TestTriangleEntityTag, tnlMeshVertexTag, Dimensions >
- {
-    enum { enabled = true };
- };
-
- template< int Dimensions >
- struct tnlMeshSuperentityStorage< TestTriangleEntityTag, tnlMeshEdgeTag, Dimensions >
- {
-    enum { enabled = true };
- };
+typedef tnlMeshConfigBase< tnlMeshTriangleTag, 2, double, int, int, void > TestTriangleEntityTag;
+typedef tnlMeshConfigBase< tnlMeshEdgeTag, 2, double, int, int, void > TestEdgeEntityTag;
+typedef tnlMeshConfigBase< tnlMeshVertexTag, 2, double, int, int, void > TestVertexEntityTag;
+
+template< int Dimensions >
+struct tnlMeshSuperentityStorage< TestTriangleEntityTag, tnlMeshVertexTag, Dimensions >
+{
+   enum { enabled = true };
+};
+
+template< int Dimensions >
+struct tnlMeshSuperentityStorage< TestTriangleEntityTag, tnlMeshEdgeTag, Dimensions >
+{
+   enum { enabled = true };
+};
 
 template< typename RealType, typename Device, typename IndexType >
 class tnlMeshEntityTester : public CppUnit :: TestCase
@@ -86,11 +76,7 @@ class tnlMeshEntityTester : public CppUnit :: TestCase
 
    void vertexMeshEntityTest()
    {
-      typedef tnlMeshConfigBase< 2, RealType, IndexType, IndexType, void > MeshConfigBaseType;
-      struct TestEntityTag : public MeshConfigBaseType
-      {
-         typedef tnlMeshEdgeTag CellTag;
-      };
+      typedef tnlMeshConfigBase< tnlMeshEdgeTag, 2, RealType, IndexType, IndexType, void > TestEntityTag;
       typedef tnlMeshEntity< TestEntityTag, tnlMeshVertexTag > VertexMeshEntityType;
       typedef typename VertexMeshEntityType::PointType PointType;
 
@@ -227,23 +213,11 @@ class tnlMeshEntityTester : public CppUnit :: TestCase
 
    void tetrahedronMeshEntityTest()
    {
-      typedef tnlMeshConfigBase< 3, RealType, IndexType, IndexType, void > MeshConfigBaseType;
-      struct TestTetrahedronEntityTag : public MeshConfigBaseType
-      {
-          typedef tnlMeshTetrahedronTag CellTag;
-      };
-      struct TestTriangleEntityTag : public MeshConfigBaseType
-      {
-          typedef tnlMeshTriangleTag CellTag;
-      };
-      struct TestEdgeEntityTag : public MeshConfigBaseType
-      {
-          typedef tnlMeshEdgeTag CellTag;
-      };
-      struct TestVertexEntityTag : public MeshConfigBaseType
-      {
-          typedef tnlMeshVertexTag CellTag;
-      };
+      typedef tnlMeshConfigBase< tnlMeshTetrahedronTag, 3, RealType, IndexType, IndexType, void > TestTetrahedronEntityTag;
+      typedef tnlMeshConfigBase< tnlMeshTriangleTag, 3, RealType, IndexType, IndexType, void > TestTriangleEntityTag;
+      typedef tnlMeshConfigBase< tnlMeshEdgeTag, 3, RealType, IndexType, IndexType, void > TestEdgeEntityTag;
+      typedef tnlMeshConfigBase< tnlMeshVertexTag, 3, RealType, IndexType, IndexType, void > TestVertexEntityTag;
+
       typedef tnlMeshEntity< TestTetrahedronEntityTag, tnlMeshTetrahedronTag > TetrahedronMeshEntityType;
       typedef tnlMeshEntity< TestTriangleEntityTag, tnlMeshTriangleTag > TriangleMeshEntityType;
       typedef tnlMeshEntity< TestEdgeEntityTag, tnlMeshEdgeTag > EdgeMeshEntityType;
diff --git a/tests/unit-tests/mesh/tnlMeshTester.h b/tests/unit-tests/mesh/tnlMeshTester.h
index 7e9254e643eb3a01187f8e90da61c14a88e0349b..efc5858d7fcfa1a4c8dfd9e458ad4b33021e29f8 100644
--- a/tests/unit-tests/mesh/tnlMeshTester.h
+++ b/tests/unit-tests/mesh/tnlMeshTester.h
@@ -35,70 +35,56 @@
 #include <mesh/topologies/tnlMeshHexahedronTag.h>
 #include <mesh/tnlMeshInitializer.h>
 
- typedef tnlMeshConfigBase< 2, double, int, int, void > Mesh2dConfigBaseType;
- struct TestTriangleMeshConfig : public Mesh2dConfigBaseType
- {
-     typedef tnlMeshTriangleTag CellTag;
- };
-
- template< int Dimensions >
- struct tnlMeshSuperentityStorage< TestTriangleMeshConfig, tnlMeshVertexTag, Dimensions >
- {
-    enum { enabled = true };
- };
+typedef tnlMeshConfigBase< tnlMeshTriangleTag, 2, double, int, int, void > TestTriangleMeshConfig;
 
- template< int Dimensions >
- struct tnlMeshSuperentityStorage< TestTriangleMeshConfig, tnlMeshEdgeTag, Dimensions >
- {
-    enum { enabled = true };
- };
+template< int Dimensions >
+struct tnlMeshSuperentityStorage< TestTriangleMeshConfig, tnlMeshVertexTag, Dimensions >
+{
+   enum { enabled = true };
+};
+
+template< int Dimensions >
+struct tnlMeshSuperentityStorage< TestTriangleMeshConfig, tnlMeshEdgeTag, Dimensions >
+{
+   enum { enabled = true };
+};
  
- struct TestQuadrilateralMeshConfig : public Mesh2dConfigBaseType
- {
-     typedef tnlMeshQuadrilateralTag CellTag;
- };
-
- template< int Dimensions >
- struct tnlMeshSuperentityStorage< TestQuadrilateralMeshConfig, tnlMeshVertexTag, Dimensions >
- {
-    enum { enabled = true };
- };
+typedef tnlMeshConfigBase< tnlMeshQuadrilateralTag, 2, double, int, int, void > TestQuadrilateralMeshConfig;
 
- template< int Dimensions >
- struct tnlMeshSuperentityStorage< TestQuadrilateralMeshConfig, tnlMeshEdgeTag, Dimensions >
- {
-    enum { enabled = true };
- };
+template< int Dimensions >
+struct tnlMeshSuperentityStorage< TestQuadrilateralMeshConfig, tnlMeshVertexTag, Dimensions >
+{
+   enum { enabled = true };
+};
 
- typedef tnlMeshConfigBase< 3, double, int, int, void > Mesh3dConfigBaseType;
- struct TestTetrahedronMeshConfig : public Mesh3dConfigBaseType
- {
-     typedef tnlMeshTetrahedronTag CellTag;
- };
+template< int Dimensions >
+struct tnlMeshSuperentityStorage< TestQuadrilateralMeshConfig, tnlMeshEdgeTag, Dimensions >
+{
+   enum { enabled = true };
+};
 
- template< int Dimensions >
- struct tnlMeshSuperentityStorage< TestTetrahedronMeshConfig, tnlMeshVertexTag, Dimensions >
- {
-    enum { enabled = true };
- };
+typedef tnlMeshConfigBase< tnlMeshTetrahedronTag, 3, double, int, int, void > TestTetrahedronMeshConfig;
 
- template< int Dimensions >
- struct tnlMeshSuperentityStorage< TestTetrahedronMeshConfig, tnlMeshEdgeTag, Dimensions >
- {
-    enum { enabled = true };
- };
+template< int Dimensions >
+struct tnlMeshSuperentityStorage< TestTetrahedronMeshConfig, tnlMeshVertexTag, Dimensions >
+{
+   enum { enabled = true };
+};
 
- template< int Dimensions >
- struct tnlMeshSuperentityStorage< TestTetrahedronMeshConfig, tnlMeshTriangleTag, Dimensions >
- {
-     enum { enabled = true };
- };
+template< int Dimensions >
+struct tnlMeshSuperentityStorage< TestTetrahedronMeshConfig, tnlMeshEdgeTag, Dimensions >
+{
+   enum { enabled = true };
+};
 
-struct TestHexahedronMeshConfig : public Mesh3dConfigBaseType
+template< int Dimensions >
+struct tnlMeshSuperentityStorage< TestTetrahedronMeshConfig, tnlMeshTriangleTag, Dimensions >
 {
-   typedef tnlMeshHexahedronTag CellTag;
+    enum { enabled = true };
 };
 
+typedef tnlMeshConfigBase< tnlMeshHexahedronTag, 3, double, int, int, void > TestHexahedronMeshConfig;
+
 template< int Dimensions >
 struct tnlMeshSuperentityStorage< TestHexahedronMeshConfig, tnlMeshVertexTag, Dimensions >
 {
@@ -134,11 +120,11 @@ class tnlMeshTester : public CppUnit :: TestCase
       CppUnit :: TestSuite* suiteOfTests = new CppUnit :: TestSuite( "tnlMeshTester" );
       CppUnit :: TestResult result;
 
-      suiteOfTests -> addTest( new TestCallerType( "twoTrianglesTest", &TesterType::twoTrianglesTest ) );
-      suiteOfTests -> addTest( new TestCallerType( "tetrahedronsTest", &TesterType::tetrahedronsTest ) );
-      suiteOfTests -> addTest( new TestCallerType( "regularMeshOfTrianglesTest", &TesterType::regularMeshOfTrianglesTest ) );
-      suiteOfTests -> addTest( new TestCallerType( "regularMeshOfQuadrilateralsTest", &TesterType::regularMeshOfQuadrilateralsTest ) );
-      suiteOfTests -> addTest( new TestCallerType( "regularMeshOfHexahedronsTest", &TesterType::regularMeshOfHexahedronsTest ) );
+      suiteOfTests->addTest( new TestCallerType( "twoTrianglesTest", &TesterType::twoTrianglesTest ) );
+      suiteOfTests->addTest( new TestCallerType( "tetrahedronsTest", &TesterType::tetrahedronsTest ) );
+      suiteOfTests->addTest( new TestCallerType( "regularMeshOfTrianglesTest", &TesterType::regularMeshOfTrianglesTest ) );
+      suiteOfTests->addTest( new TestCallerType( "regularMeshOfQuadrilateralsTest", &TesterType::regularMeshOfQuadrilateralsTest ) );
+      suiteOfTests->addTest( new TestCallerType( "regularMeshOfHexahedronsTest", &TesterType::regularMeshOfHexahedronsTest ) );
       return suiteOfTests;
    }
 
diff --git a/tests/unit-tests/operators/diffusion/tnlLinearDiffusionTest.cpp b/tests/unit-tests/operators/diffusion/tnlLinearDiffusionTest.cpp
index 687e485c2a8da7798dae95f3f253229d4a5a18d3..e20eba5283b58569a24c19308968151359cb5158 100644
--- a/tests/unit-tests/operators/diffusion/tnlLinearDiffusionTest.cpp
+++ b/tests/unit-tests/operators/diffusion/tnlLinearDiffusionTest.cpp
@@ -25,7 +25,7 @@
 #include <operators/diffusion/tnlLinearDiffusion.h>
 #include <operators/diffusion/tnlExactLinearDiffusion.h>
 #include "../tnlPDEOperatorEocTestResult.h"
-#include <functions/tnlExpBumpFunction.h>
+#include <functors/tnlExpBumpFunction.h>
 
 template< int Dimensions,
           typename Real,
diff --git a/tests/unit-tests/operators/diffusion/tnlLinearDiffusionTest.cu b/tests/unit-tests/operators/diffusion/tnlLinearDiffusionTest.cu
index 912bd9214cddb49e63db55f8a16900751435a1b5..081b142f4b65bf1ee08593d678023a41fc8a6e6d 100644
--- a/tests/unit-tests/operators/diffusion/tnlLinearDiffusionTest.cu
+++ b/tests/unit-tests/operators/diffusion/tnlLinearDiffusionTest.cu
@@ -25,7 +25,7 @@
 #include <operators/diffusion/tnlLinearDiffusion.h>
 #include <operators/diffusion/tnlExactLinearDiffusion.h>
 #include "../tnlPDEOperatorEocTestResult.h"
-#include <functions/tnlExpBumpFunction.h>
+#include <functors/tnlExpBumpFunction.h>
 
 template< int Dimensions,
           typename Real,
diff --git a/tests/unit-tests/operators/tnlPDEOperatorEocTestSetter.h b/tests/unit-tests/operators/tnlPDEOperatorEocTestSetter.h
index 16542c201dd6c1bcc4348d91e762425fdc0b630c..69d1c31843e98eb34c7dfb34734b5a6ff12ffd58 100644
--- a/tests/unit-tests/operators/tnlPDEOperatorEocTestSetter.h
+++ b/tests/unit-tests/operators/tnlPDEOperatorEocTestSetter.h
@@ -19,7 +19,7 @@
 #define TNLPDEOPERATOREOCTESTSETTER_H_
 
 #include <mesh/tnlGrid.h>
-#include <functions/tnlExpBumpFunction.h>
+#include <functors/tnlExpBumpFunction.h>
 
 template< typename ApproximateOperator,
           typename ExactOperator,
diff --git a/tests/unit-tests/tnlApproximationError.h b/tests/unit-tests/tnlApproximationError.h
index b39741257d69173c08e21c58183b3d458eee3bc3..74c3fb2f7004f64b9f1f0ab174c21c10c4c383a9 100644
--- a/tests/unit-tests/tnlApproximationError.h
+++ b/tests/unit-tests/tnlApproximationError.h
@@ -19,7 +19,7 @@
 #define TNLAPPROXIMATIONERROR_H_
 
 #include <mesh/tnlGrid.h>
-#include <functions/tnlConstantFunction.h>
+#include <functors/tnlConstantFunction.h>
 #include <operators/tnlAnalyticDirichletBoundaryConditions.h>
 #include <solvers/pde/tnlExplicitUpdater.h>
 
diff --git a/tests/unit-tests/tnlApproximationError_impl.h b/tests/unit-tests/tnlApproximationError_impl.h
index 28b1936677892656276f74c5b1b3937e82b047ec..eff97c93b396b704636a99cc03b92e647fecc224 100644
--- a/tests/unit-tests/tnlApproximationError_impl.h
+++ b/tests/unit-tests/tnlApproximationError_impl.h
@@ -20,10 +20,11 @@
 
 #include <mesh/tnlTraverser.h>
 #include <core/vectors/tnlVector.h>
-#include <functions/tnlFunctionDiscretizer.h>
+#include <functors/tnlFunctionDiscretizer.h>
 #include <matrices/tnlCSRMatrix.h>
 #include <matrices/tnlMatrixSetter.h>
 #include <solvers/pde/tnlLinearSystemAssembler.h>
+#include <solvers/pde/tnlNoTimeDiscretisation.h>
 #include <operators/tnlExactOperatorEvaluator.h>
 
 template< typename Mesh,
@@ -95,10 +96,10 @@ getError( const Mesh& mesh,
 {
    typedef tnlVector< RealType, DeviceType, IndexType > Vector;
    typedef tnlCSRMatrix< RealType, DeviceType, IndexType > MatrixType;
-   typedef typename MatrixType::RowLengthsVector RowLengthsVectorType;
+   typedef typename MatrixType::CompressedRowsLengthsVector CompressedRowsLengthsVectorType;
    Vector functionData, exactData, approximateData;
    MatrixType matrix;
-   RowLengthsVectorType rowLengths;
+   CompressedRowsLengthsVectorType rowLengths;
    BoundaryConditionsType boundaryConditions;
    boundaryConditions.setFunction( function );
    ConstantFunctionType zeroFunction;
@@ -113,16 +114,16 @@ getError( const Mesh& mesh,
 
    tnlFunctionDiscretizer< Mesh, Function, Vector >::template discretize< 0, 0, 0 >( mesh, function, functionData );
 
-   tnlMatrixSetter< MeshType, ApproximateOperator, BoundaryConditionsType, RowLengthsVectorType > matrixSetter;
-   matrixSetter.template getRowLengths< Mesh::Dimensions >( mesh,
+   tnlMatrixSetter< MeshType, ApproximateOperator, BoundaryConditionsType, CompressedRowsLengthsVectorType > matrixSetter;
+   matrixSetter.template getCompressedRowsLengths< Mesh::Dimensions >( mesh,
                                                             approximateOperator,
                                                             boundaryConditions,
                                                             rowLengths );
    matrix.setDimensions( entities, entities );
-   if( ! matrix.setRowLengths( rowLengths ) )
+   if( ! matrix.setCompressedRowsLengths( rowLengths ) )
       return;
 
-   tnlLinearSystemAssembler< Mesh, Vector, ApproximateOperator, BoundaryConditionsType, ConstantFunctionType, MatrixType > systemAssembler;
+   tnlLinearSystemAssembler< Mesh, Vector, ApproximateOperator, BoundaryConditionsType, ConstantFunctionType, tnlNoTimeDiscretisation, MatrixType > systemAssembler;
    systemAssembler.template assembly< Mesh::Dimensions >( 0.0, // time
                                                           1.0, // tau
                                                           mesh,
diff --git a/tnlConfig.h.in b/tnlConfig.h.in
index aa39826c80236408ac0fb633acbed44b33fe0bfe..c6b0df075bd64965f8df6b880424928a283d8e4c 100644
--- a/tnlConfig.h.in
+++ b/tnlConfig.h.in
@@ -1,4 +1,4 @@
-@HAVE_LIBBZ2@
+@HAVE_CUBLAS@
 
 @HAVE_CUSP@
 
@@ -16,7 +16,7 @@
 
 #define TNL_TESTS_DIRECTORY @testsDirectory@
 
-#define TNL_CPP_COMPILER_NAME "@CMAKE_CXX_COMPILER@"
+#define TNL_CPP_COMPILER_NAME "@CMAKE_CXX_COMPILER_ID@ @CMAKE_CXX_COMPILER_VERSION@"
 
 #define maxCudaGridSize 65535
 
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 8bc1a507b458a254953bad38bb050d76c03d1c45..6575ba2f79b96d113ace39a78737ee986fe6dae4 100755
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -1,2 +1,13 @@
 add_subdirectory (src)
-add_subdirectory (share)
\ No newline at end of file
+add_subdirectory (share)
+add_subdirectory (tnl-quickstart)
+
+CONFIGURE_FILE( "tnl-compile.in" "${PROJECT_TOOLS_PATH}/tnl-compile" @ONLY )
+CONFIGURE_FILE( "tnl-link.in" "${PROJECT_TOOLS_PATH}/tnl-link" @ONLY )
+CONFIGURE_FILE( "tnl-bindir.in" "${PROJECT_TOOLS_PATH}/tnl-bindir" @ONLY )
+
+INSTALL( FILES ${PROJECT_TOOLS_PATH}/tnl-compile 
+               ${PROJECT_TOOLS_PATH}/tnl-link
+               ${PROJECT_TOOLS_PATH}/tnl-bindir
+         DESTINATION bin
+         PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE )
\ No newline at end of file
diff --git a/tools/src/CMakeLists.txt b/tools/src/CMakeLists.txt
index 44921b44f264a9fd9b6adbe4ed56199fc49c89a1..a14f6bae5dfacc943469fc814aa345f75fa8ed58 100755
--- a/tools/src/CMakeLists.txt
+++ b/tools/src/CMakeLists.txt
@@ -43,6 +43,11 @@ target_link_libraries( tnl-functions-benchmark${debugExt} tnl${debugExt}-${tnlVe
 ADD_EXECUTABLE(tnl-curve2gnuplot${debugExt} ${tnlcurve2gnuplotsources})
 target_link_libraries (tnl-curve2gnuplot${debugExt} tnl${debugExt}-${tnlVersion} )
 
+IF( BUILD_CUDA )
+    CUDA_ADD_EXECUTABLE( tnl-cuda-arch${debugExt} tnl-cuda-arch.cu
+                         OPTIONS ${CUDA_ADD_EXECUTABLE_OPTIONS} )
+    SET_TARGET_PROPERTIES( tnl-cuda-arch${debugExt} PROPERTIES CUDA_COMPILE_FLAGS "${CXX_OPTIMIZE_FLAGS}" )
+ENDIF()
 #ADD_EXECUTABLE( tnl-matrix-convert${debugExt} ${tnlmatrixconvertsources} )
 #target_link_libraries( tnl-matrix-convert${debugExt} tnl${debugExt}-${tnlVersion} )
 
@@ -56,6 +61,12 @@ INSTALL( TARGETS tnl-init${debugExt}
          RUNTIME DESTINATION bin
          PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE )
          
+IF( BUILD_CUDA )
+   INSTALL( TARGETS tnl-cuda-arch${debugExt}
+            RUNTIME DESTINATION bin
+            PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE )
+ENDIF()
+
 INSTALL( FILES tnl-err2eoc
                tnl-time-series2png
                tnl-eoc-test-log
diff --git a/tools/src/functions-benchmark.h b/tools/src/functions-benchmark.h
index 45e8d3a0ebbcaea887bb5639e5db94af2c20dddf..a9f79a0d1026a979cd42dec69154b057f096d654 100644
--- a/tools/src/functions-benchmark.h
+++ b/tools/src/functions-benchmark.h
@@ -37,7 +37,7 @@ template< typename REAL > void benchmarkAddition( long int loops )
       a4 += REAL( 0.1 );
    }
 
-   double cpu_time = cpu_timer. GetTime();
+   double cpu_time = cpu_timer. getTime();
    cout << " ( " << a1 + a2 + a3 + a4 << " ) " <<  cpu_time << "secs. " << 4.0 * ( ( double ) loops ) / cpu_time * 1.0e-9 << " GFLOPS." << endl;
 }
 
@@ -61,7 +61,7 @@ template< typename REAL > void benchmarkMultiplication( const long int loops )
       }
    }
 
-   double cpu_time = cpu_timer. GetTime();
+   double cpu_time = cpu_timer. getTime();
    cout << " ( " << a1 * a2 * a3 * a4 << " ) " <<  cpu_time << "secs. " << 4.0 * ( ( double ) loops ) / cpu_time * 1.0e-9 << " GFLOPS." << endl;
 }
 
@@ -83,7 +83,7 @@ template< typename REAL > void benchmarkDivision( long int loops )
       if( a1 < REAL( 0.01 ) ) a1 = a2 = a3 = a4 = REAL( 1.0e9 );
    }
 
-   double cpu_time = cpu_timer. GetTime();
+   double cpu_time = cpu_timer. getTime();
    cout << " ( " << a1 / a2 / a3 / a4 << " ) " << cpu_time << "secs. " << 4.0 * ( ( double ) loops / 2 ) / cpu_time * 1.0e-9 << " GFLOPS." << endl;
 }
 
@@ -105,7 +105,7 @@ template< typename REAL > void benchmarkSqrt( long int loops )
       if( a1 < REAL( 100.0 ) ) a1 = a2 = a3 = a4 = REAL( 1.0e9 );
    }
 
-   double cpu_time = cpu_timer. GetTime();
+   double cpu_time = cpu_timer. getTime();
    cout << " ( " << a1 + a2 + a3 + a4 << " ) " << cpu_time << "secs. " << 4.0 * ( ( double ) loops / 2 ) / cpu_time * 1.0e-9 << " GFLOPS." << endl;
 }
 
@@ -126,7 +126,7 @@ template< typename REAL > void benchmarkSin( long int loops )
       a4 = sin( a4 );
    }
 
-   double cpu_time = cpu_timer. GetTime();
+   double cpu_time = cpu_timer. getTime();
    cout << " ( " << a1 + a2 + a3 + a4 << " ) " << cpu_time << "secs. " << 4.0 * ( ( double ) loops ) / cpu_time * 1.0e-9 << " GFLOPS." << endl;
 }
 
@@ -148,7 +148,7 @@ template< typename REAL > void benchmarkExp( long int loops )
       if( a1 > REAL( 1.0e9 ) ) a1 = a2 = a3 = a4 = REAL( 1.1 );
    }
 
-   double cpu_time = cpu_timer. GetTime();
+   double cpu_time = cpu_timer. getTime();
    cout << " ( " << a1 + a2 + a3 + a4 << " ) " << cpu_time << "secs. " << 4.0 * ( ( double ) loops) / cpu_time * 1.0e-9 << " GFLOPS." << endl;
 }
 
@@ -170,7 +170,7 @@ template< typename REAL > void benchmarkPow( long int loops )
       if( a1 < REAL( 1.0 ) ) a1 = a2 = a3 = a4 = REAL( 1.0e9 );
    }
 
-   double cpu_time = cpu_timer. GetTime();
+   double cpu_time = cpu_timer. getTime();
    cout << " ( " << a1 + a2 + a3 + a4 << " ) " << cpu_time << "secs. " << 4.0 * ( ( double ) loops) / cpu_time * 1.0e-9 << " GFLOPS." << endl;
 }
 
diff --git a/tools/src/tnl-cuda-arch.cu b/tools/src/tnl-cuda-arch.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b6bbe8d4dc843cce931c8f7fcdb9d2233cda6407
--- /dev/null
+++ b/tools/src/tnl-cuda-arch.cu
@@ -0,0 +1,21 @@
+#include <stdio.h> 
+
+int main() {
+    int num_devices;
+    cudaGetDeviceCount( &num_devices );
+    for( int i = 0; i < num_devices; i++ ) {
+        cudaDeviceProp prop;
+        cudaGetDeviceProperties( &prop, i );
+
+        int compute_minor = prop.minor;
+        // sm_21 is the only 'real' architecture that does not have 'virtual' counterpart
+        if( prop.major == 2 )
+            compute_minor = 0;
+
+        if( i > 0 )
+            printf(" ");
+        printf( "-gencode arch=compute_%d%d,code=sm_%d%d",
+                prop.major, compute_minor, prop.major, prop.minor );
+    }
+    printf("\n");
+}
diff --git a/tools/src/tnl-err2eoc b/tools/src/tnl-err2eoc
index b262fae9e1df3c0e27a4193203ae28912d153f9f..4582ef2efbbd1aa60027406f69586e7529f1cfc3 100644
--- a/tools/src/tnl-err2eoc
+++ b/tools/src/tnl-err2eoc
@@ -12,7 +12,7 @@ refinement = 2
 i = 0
 while i < len( arguments ):
    if arguments[ i ] == "--refinement":
-      refinement = arguments[ i + 1 ]
+      refinement = float( arguments[ i + 1 ] )
       i = i + 2
       continue
    if arguments[ i ] == "--output-file":
diff --git a/tools/src/tnl-init.cpp b/tools/src/tnl-init.cpp
index 6cff727f12f7b04acc766720010dc44c8d828ea9..f71d0e41ccc996b0b298f6133ac1d7b62802e12a 100644
--- a/tools/src/tnl-init.cpp
+++ b/tools/src/tnl-init.cpp
@@ -21,7 +21,7 @@
 #include <debug/tnlDebug.h>
 #include <config/tnlConfigDescription.h>
 #include <config/tnlParameterContainer.h>
-#include <functions/tnlTestFunction.h>
+#include <functors/tnlTestFunction.h>
 #include <mesh/tnlDummyMesh.h>
 #include <mesh/tnlGrid.h>
 
@@ -35,6 +35,7 @@ void setupConfig( tnlConfigDescription& config )
       config.addEntryEnum< tnlString >( "float" );
       config.addEntryEnum< tnlString >( "double" );
       config.addEntryEnum< tnlString >( "long-double" );
+   config.addEntry< double >( "initial-time", "Initial time for a serie of snapshots of the time-dependent function.", 0.0 );      
    config.addEntry< double >( "final-time", "Final time for a serie of snapshots of the time-dependent function.", 0.0 );
    config.addEntry< double >( "snapshot-period", "Period between snapshots in a serie of the time-dependent function.", 0.0 );
    config.addEntry< int >( "x-derivative", "Order of the partial derivative w.r.t x.", 0 );
diff --git a/tools/src/tnl-init.h b/tools/src/tnl-init.h
index f9f4aa17d661156c199c6de917857b17f5683389..c3091f6f5c9f2ffac7fccd81f9ced9db53777ccb 100644
--- a/tools/src/tnl-init.h
+++ b/tools/src/tnl-init.h
@@ -21,8 +21,8 @@
 #include <config/tnlParameterContainer.h>
 #include <core/vectors/tnlVector.h>
 #include <mesh/tnlGrid.h>
-#include <functions/tnlFunctionDiscretizer.h>
-#include <functions/tnlTestFunction.h>
+#include <functors/tnlFunctionDiscretizer.h>
+#include <functors/tnlTestFunction.h>
 #include <operators/tnlFiniteDifferences.h>
 #include <core/mfilename.h>
 
@@ -49,13 +49,14 @@ bool renderFunction( const tnlParameterContainer& parameters )
    DiscreteFunctionType discreteFunction;
    if( ! discreteFunction.setSize( mesh.getNumberOfCells() ) )
       return false;
-
-   double time( 0.0 );
+   
    double finalTime = parameters.getParameter< double >( "final-time" );
+   double initialTime = parameters.getParameter< double >( "initial-time" );
    double tau = parameters.getParameter< double >( "snapshot-period" );
    bool numericalDifferentiation = parameters.getParameter< bool >( "numerical-differentiation" );
    int step( 0 );
-   const int steps = tau > 0 ? ceil( finalTime / tau ): 0;
+   double time( initialTime );
+   const int steps = tau > 0 ? ceil( ( finalTime - initialTime ) / tau ): 0;
 
    while( step <= steps )
    {
diff --git a/tools/src/tnl-mesh-convert.cpp b/tools/src/tnl-mesh-convert.cpp
index b3a233093840af0209253dd9a790de75021c4cb2..e13a0a6fb2957d639276c8f311a33ed234b81c8a 100644
--- a/tools/src/tnl-mesh-convert.cpp
+++ b/tools/src/tnl-mesh-convert.cpp
@@ -21,16 +21,13 @@
 #include "tnlConfig.h"
 #include <config/tnlParameterContainer.h>
 
-const char configFile[] = TNL_CONFIG_DIRECTORY "tnl-mesh-convert.cfg.desc";
-
 void configSetup( tnlConfigDescription& config )
 {
    config.addDelimiter                            ( "General settings:" );
-   config.addEntry< tnlString >( "output-file", "Output binary file in TNL format.", "mesh.tnl" );
-   config.addEntry< int >( "verbose", "Set the verbosity of the program.", 1 );
-   
-   config.addDelimiter                            ( "The mesh description:" );
-   config.addEntry< tnlString >( "input-mesh-file", "Input file with the mesh." );
+   config.addRequiredEntry< tnlString >( "input-file", "Input file with the mesh." );
+   config.addEntry< tnlString >( "output-file", "Output mesh file in TNL or VTK format.", "mesh.tnl" );
+   //config.addEntry< tnlString >( "output-format", "Output mesh file format.", "vtk" );
+   config.addEntry< int >( "verbose", "Set the verbosity of the program.", 1 );     
    config.addEntry< tnlString >( "mesh-name", "The mesh name.", "tnl-mesh" ); 
 }
 
diff --git a/tools/src/tnl-mesh-convert.h b/tools/src/tnl-mesh-convert.h
index 45df6bda6522358378e788e6c4202071345e1614..bd0b694b7ba0a76c38b6a91c655a846de5aa287d 100644
--- a/tools/src/tnl-mesh-convert.h
+++ b/tools/src/tnl-mesh-convert.h
@@ -20,34 +20,72 @@
 
 #include <config/tnlParameterContainer.h>
 #include <mesh/tnlMeshReaderNetgen.h>
+#include <mesh/tnlMeshWriterVTKLegacy.h>
 #include <mesh/config/tnlMeshConfigBase.h>
 #include <mesh/topologies/tnlMeshTriangleTag.h>
+#include <mesh/topologies/tnlMeshTetrahedronTag.h>
 #include <mesh/tnlMesh.h>
 #include <mesh/tnlMeshInitializer.h>
+#include <mesh/tnlMeshIntegrityChecker.h>
 #include <core/mfilename.h>
 
 template< int Dimensions >
 bool readMeshWithDimensions( const tnlParameterContainer& parameters )
 {
-   const tnlString& inputFileName = parameters.getParameter< tnlString >( "input-mesh-file" );
-   const tnlString fileExt = getFileExtension( inputFileName );
+   const tnlString& inputFileName = parameters.getParameter< tnlString >( "input-file" );
+   const tnlString& outputFileName = parameters.getParameter< tnlString >( "output-file" );
+   const tnlString inputFileExt = getFileExtension( inputFileName );
+   const tnlString outputFileExt = getFileExtension( outputFileName );
 
    if( Dimensions == 2 )
    {
-      struct MeshConfig : public tnlMeshConfigBase< 2 >
+      typedef tnlMesh< tnlMeshConfigBase< tnlMeshTriangleTag > > MeshType;
+      MeshType mesh;
+      if( inputFileExt == "ng" &&
+          ! tnlMeshReaderNetgen::readMesh<>( inputFileName, mesh, true ) )
+         return false;
+      tnlMeshInitializer< tnlMeshConfigBase< tnlMeshTriangleTag > > meshInitializer;
+      meshInitializer.setVerbose( true );
+      if( ! meshInitializer.initMesh( mesh ) )
+         return false;
+      if( ! tnlMeshIntegrityChecker< MeshType >::checkMesh( mesh ) )
+         return false;
+      tnlString outputFile;
+      cout << "Writing the 2D mesh to the file " << outputFile << "." << endl;
+      if( outputFileExt == "tnl" )
+      {         
+         if( ! mesh.save( outputFile ) )
+         {
+            cerr << "I am not able to write the mesh into the file " << outputFile << "." << endl;
+            return false;
+         }
+      }
+      if( outputFileExt == "vtk" )
       {
-         typedef tnlMeshTriangleTag CellTag;
-      };      
-      tnlMesh< MeshConfig > mesh;
-      if( fileExt == "ng" &&
+         if( ! tnlMeshWriterVTKLegacy::write( outputFileName, mesh, true ) )
+         {
+            cerr << "I am not able to write the mesh into the file " << outputFile << "." << endl;
+            return false;         
+         }
+      }
+   }
+   if( Dimensions == 3 )
+   {
+      typedef tnlMesh< tnlMeshConfigBase< tnlMeshTetrahedronTag > > MeshType;
+      MeshType mesh;
+      if( inputFileExt == "ng" &&
           ! tnlMeshReaderNetgen::readMesh<>( inputFileName, mesh, true ) )
          return false;
-      //if( ! tnlMeshInitializer< MeshConfig >::initMesh( mesh ) )
-      //   return false;
+      tnlMeshInitializer< tnlMeshConfigBase< tnlMeshTetrahedronTag > > meshInitializer;
+      meshInitializer.setVerbose( true );
+      if( ! meshInitializer.initMesh( mesh ) )
+         return false;
+      if( ! tnlMeshIntegrityChecker< MeshType >::checkMesh( mesh ) )
+         return false;
       tnlString outputFile;
       if( parameters.getParameter< tnlString >( "output-file", outputFile ) )
       {
-         cout << "Writing the mesh to the file " << outputFile << "." << endl;
+         cout << "Writing the 3D mesh to the file " << outputFile << "." << endl;
          if( ! mesh.save( outputFile ) )
          {
             cerr << "I am not able to safe the mesh into the file " << outputFile << "." << endl;
@@ -55,28 +93,28 @@ bool readMeshWithDimensions( const tnlParameterContainer& parameters )
          }
       }
    }
+
    return true;
 }
 
 bool convertMesh( const tnlParameterContainer& parameters )
 {
-   tnlString inputFileName;
-   if( parameters.getParameter( "input-mesh-file", inputFileName ) )
+   tnlString inputFileName = parameters.getParameter< tnlString >( "input-file" );
+
+   const tnlString fileExt = getFileExtension( inputFileName );
+   if( fileExt == "ng" )
    {
-      const tnlString fileExt = getFileExtension( inputFileName );
-      if( fileExt == "ng" )
-      {
-         int dimensions;
-         if( ! tnlMeshReaderNetgen::detectDimensions( inputFileName, dimensions ) )
-            return false;
-         if( dimensions == 2 &&
-             ! readMeshWithDimensions< 2 >( parameters ) )
-            return false;
-         if( dimensions == 3 &&
-             ! readMeshWithDimensions< 3 >( parameters ) )
-            return false;
-      }
+      int dimensions;
+      if( ! tnlMeshReaderNetgen::detectDimensions( inputFileName, dimensions ) )
+         return false;
+      if( dimensions == 2 &&
+          ! readMeshWithDimensions< 2 >( parameters ) )
+         return false;
+      if( dimensions == 3 &&
+          ! readMeshWithDimensions< 3 >( parameters ) )
+         return false;
    }
+
    return true;
 }
 
diff --git a/tools/tnl-bindir.in b/tools/tnl-bindir.in
new file mode 100644
index 0000000000000000000000000000000000000000..2ce6738087bc081246b24e7790426e652bf4ec36
--- /dev/null
+++ b/tools/tnl-bindir.in
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+
+echo @CMAKE_INSTALL_PREFIX@/bin
\ No newline at end of file
diff --git a/tools/tnl-compile.in b/tools/tnl-compile.in
new file mode 100644
index 0000000000000000000000000000000000000000..f471dc6c38583790225c87c60ba1fd30140a0572
--- /dev/null
+++ b/tools/tnl-compile.in
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+DEBUG_FLAGS="-DNDEBUG" # -march=native"
+CUDA_FLAGS=""
+CXX_STD_FLAGS="-std=c++11"
+
+for option in "$@"
+do
+    case $option in
+        --cuda                  ) CUDA_FLAGS="-DHAVE_CUDA -DHAVE_NOT_CXX11 `tnl-cuda-arch`"
+                                  CXX_STD_FLAGS="" ;;
+        --debug                 ) DEBUG_FLAGS="-g -O0"
+    esac
+done
+
+echo -I@CMAKE_INSTALL_PREFIX@/include/tnl-@tnlVersion@ ${CUDA_FLAGS} ${CXX_STD_FLAGS} ${DEBUG_FLAGS}
\ No newline at end of file
diff --git a/tools/tnl-link.in b/tools/tnl-link.in
new file mode 100644
index 0000000000000000000000000000000000000000..ad4bd83ec9949fdad888e5fdc32c71af720dfe3c
--- /dev/null
+++ b/tools/tnl-link.in
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+
+DEBUG=""
+
+for option in "$@"
+do
+    case $option in
+        --debug                  ) DEBUG="-dbg"
+    esac
+done
+
+echo -L@CMAKE_INSTALL_PREFIX@/lib -ltnl${DEBUG}-@tnlVersion@ @CUSPARSE_LIBRARY@
\ No newline at end of file
diff --git a/tools/tnl-quickstart/CMakeLists.txt b/tools/tnl-quickstart/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1a403b4bcfa9d1b644120456b40a799df138da77
--- /dev/null
+++ b/tools/tnl-quickstart/CMakeLists.txt
@@ -0,0 +1,4 @@
+INSTALL( FILES tnl-quickstart
+               tnl-quickstart.py
+         DESTINATION bin
+         PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE )
\ No newline at end of file
diff --git a/tools/tnl-quickstart/tnl-quickstart b/tools/tnl-quickstart/tnl-quickstart
new file mode 100644
index 0000000000000000000000000000000000000000..eefb6910f5f32d63f3fc2c6fb1342f21aa190c79
--- /dev/null
+++ b/tools/tnl-quickstart/tnl-quickstart
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash 
+
+PYTHON=`which python3`
+TNL_BINDIR=`tnl-bindir`
+
+if test x${PYTHON} = x;
+then 
+    echo "It seems that Python 3 is not installed on your system."
+    echo "You may install it as follows:"
+    echo ""
+    echo "In Ubuntu: sudo apt-get install python3"
+    echo "In OpenSuse:"
+    echo "In CentOS:"
+else
+${PYTHON} ${TNL_BINDIR}/tnl-quickstart.py
+fi
+
diff --git a/tools/tnl-quickstart/tnl-quickstart.py b/tools/tnl-quickstart/tnl-quickstart.py
new file mode 100644
index 0000000000000000000000000000000000000000..bed01df046ada91ed847b1d4294493f448da9c91
--- /dev/null
+++ b/tools/tnl-quickstart/tnl-quickstart.py
@@ -0,0 +1,767 @@
+#! /usr/bin/python
+
+# To change this license header, choose License Headers in Project Properties.
+# To change this template file, choose Tools | Templates
+# and open the template in the editor.
+
+__author__ = "Tomas Oberhuber"
+__date__ = "$May 6, 2015 8:40:59 PM$"
+
+
+def generateMakefile( problemBaseName ):
+    file = open( "Makefile", "w" )
+    file.write( "# Uncomment the following line to enable CUDA\n" )
+    file.write( "#WITH_CUDA = yes\n" )
+    file.write( "\n" ) 
+    file.write( "TARGET = " + problemBaseName + "\n")
+    file.write( "INSTALL_DIR = ${HOME}/local\n" )
+    file.write( "\n" )
+    file.write( "LDFLAGS = $(shell tnl-link )\n" )
+    file.write( "\n" )
+    file.write( "ifdef WITH_CUDA\n" )
+    file.write( "   CXX = nvcc\n" )     
+    file.write( "   CXX_FLAGS = $(shell tnl-compile --cuda)\n" )    
+    file.write( "else\n" )                                  
+    file.write( "   CXX = g++\n" ) 
+    file.write( "   CXX_FLAGS = $(shell tnl-compile)\n" )    
+    file.write( "endif\n" )                              
+    file.write( "\n" )
+    file.write( "SOURCES = " + problemBaseName + ".cpp\n" )
+    file.write( "HEADERS = " + problemBaseName + ".h\n" )
+    file.write( "OBJECTS = " + problemBaseName + ".o\n" ) 
+    file.write( "DIST = $(SOURCES) $(CUDA_SOURCES) $(HEADERS) Makefile\n" ) 
+    file.write( "\n" ) 
+    file.write( "ifdef WITH_CUDA\n" )
+    file.write( "   OBJECTS = " + problemBaseName + "-cuda.o\n" )     
+    file.write( "endif\n" )     
+    file.write( "\n" )     
+    file.write( "all: $(TARGET)\n" ) 
+    file.write( "\n" ) 
+    file.write( "clean:\n" ) 
+    file.write( "\t rm -f *.o" ) 
+    file.write( "\n" ) 
+    file.write( "dist: $(DIST)" ) 
+    file.write( "\t tar zcvf $(TARGET).tgz $(DIST)\n" ) 
+    file.write( "\n" ) 
+    file.write( "$(TARGET): $(OBJECTS)\n" ) 
+    file.write( "\t$(CXX) -o $@ $< $(LDFLAGS)\n" ) 
+    file.write( "\n" ) 
+    file.write( "%.o: %.cpp\n" ) 
+    file.write( "\t $(CXX) $(CPPFLAGS) $(CXX_FLAGS) -c -o $@ $<" ) 
+    file.write( "\n" ) 
+    file.write( "%.o: %.cu\n" ) 
+    file.write( "\t $(CXX) $(CPPFLAGS) $(CXX_FLAGS) -c -o $@ $<" )     
+    file.close()
+
+def generateMain( problemName, problemBaseName, operatorName ):
+    file = open( problemBaseName + ".h", "w" )
+    file.write( "#include <tnlConfig.h>\n" )
+    file.write( "#include <solvers/tnlSolver.h>\n" )
+    file.write( "#include <solvers/tnlConfigTags.h>\n" )
+    file.write( "#include <solvers/tnlFastBuildConfig.h>\n" )    
+    file.write( "#include <operators/tnlAnalyticDirichletBoundaryConditions.h>\n" )
+    file.write( "#include <operators/tnlDirichletBoundaryConditions.h>\n" )
+    file.write( "#include <operators/tnlAnalyticNeumannBoundaryConditions.h>\n" )
+    file.write( "#include <operators/tnlNeumannBoundaryConditions.h>\n" )
+    file.write( "#include <functors/tnlConstantFunction.h>\n" )
+    file.write( "#include \"" + problemBaseName + "Problem.h\"\n" )
+    file.write( "#include \"" + operatorName + ".h\"\n" )
+    file.write( "#include \"" + problemBaseName + "Rhs.h\"\n" )    
+    file.write( "\n" )
+    file.write( "typedef tnlFastBuildConfig BuildConfig;\n" )    
+    file.write( "\n" )    
+    file.write( "/****\n" )    
+    file.write( " * Uncomment the following (and comment the previous line) for the complete build.\n" )    
+    file.write( " * This will include support for all floating point precisions, all indexing types\n" )    
+    file.write( " * and more solvers. You may then choose between them from the command line.\n" )    
+    file.write( " * The compile time may, however, take tens of minutes or even several hours,\n" )    
+    file.write( " * especially if CUDA is enabled. Use this, if you want, only for the final build,\n" )        
+    file.write( " * not in the development phase.\n" )    
+    file.write( " */\n" )    
+    file.write( "//typedef tnlDefaultConfigTag BuildConfig;\n" )    
+    file.write( "\n" )
+    file.write( "template< typename ConfigTag >" )
+    file.write( "class " + problemBaseName + "Config\n" )
+    file.write( "{\n" )
+    file.write( "   public:\n" )
+    file.write( "      static void configSetup( tnlConfigDescription & config )\n" )
+    file.write( "      {\n" )
+    file.write( "         config.addDelimiter( \"" + problemName + " settings:\" );\n" )
+    file.write( "         config.addEntry< tnlString >( \"boundary-conditions-type\", \"Choose the boundary conditions type.\", \"dirichlet\");\n" )
+    file.write( "            config.addEntryEnum< tnlString >( \"dirichlet\" );\n" )
+    file.write( "            config.addEntryEnum< tnlString >( \"neumann\" );\n" )
+    file.write( "         config.addEntry< double >( \"boundary-conditions-constant\", \"This sets a value in case of the constant boundary conditions.\" );\n" )
+    file.write( "\n" )
+    file.write( "         /****\n" )
+    file.write( "          * Add definition of your solver command line arguments.\n" )
+    file.write( "          */\n" )
+    file.write( "\n" )
+    file.write( "      }\n" )
+    file.write( "};\n" )
+    file.write( "\n" )
+    file.write( "template< typename Real,\n" )
+    file.write( "          typename Device,\n" )
+    file.write( "          typename Index,\n" )
+    file.write( "          typename MeshType,\n" )
+    file.write( "          typename ConfigTag,\n" )
+    file.write( "          typename SolverStarter >\n" )
+    file.write( "class " + problemBaseName + "Setter\n" )
+    file.write( "{\n" )
+    file.write( "   public:\n" )
+    file.write( "\n" )
+    file.write( "      typedef Real RealType;\n" )
+    file.write( "      typedef Device DeviceType;\n" )
+    file.write( "      typedef Index IndexType;\n" )
+    file.write( "\n" )
+    file.write( "      static bool run( const tnlParameterContainer & parameters )\n" )
+    file.write( "      {\n" )
+    file.write( "          enum { Dimensions = MeshType::Dimensions };\n" )
+    file.write( "          typedef " + operatorName + "< MeshType, Real, Index > ApproximateOperator;\n" )
+    file.write( "          typedef " + problemBaseName + "Rhs RightHandSide;\n" )    
+    file.write( "          typedef tnlStaticVector < MeshType::Dimensions, Real > Vertex;\n" )
+    file.write( "\n" )
+    file.write( "         /****\n" )
+    file.write( "          * Resolve the template arguments of your solver here.\n" )
+    file.write( "          * The following code is for the Dirichlet and the Neumann boundary conditions.\n" )
+    file.write( "          * Both can be constant or defined as discrete values of tnlVector.\n" )    
+    file.write( "          */\n" )    
+    file.write( "          tnlString boundaryConditionsType = parameters.getParameter< tnlString >( \"boundary-conditions-type\" );\n" )
+    file.write( "          if( parameters.checkParameter( \"boundary-conditions-constant\" ) )\n" )
+    file.write( "          {\n" )
+    file.write( "             typedef tnlConstantFunction< Dimensions, Real > ConstantFunction;\n" )
+    file.write( "             if( boundaryConditionsType == \"dirichlet\" )\n" )
+    file.write( "             {\n" )
+    file.write( "                typedef tnlAnalyticDirichletBoundaryConditions< MeshType, ConstantFunction, Real, Index > BoundaryConditions;\n" )
+    file.write( "                typedef " + problemBaseName + "Problem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Problem;\n" )
+    file.write( "                SolverStarter solverStarter;\n" )
+    file.write( "                return solverStarter.template run< Problem >( parameters );\n" )
+    file.write( "             }\n" )
+    file.write( "             typedef tnlAnalyticNeumannBoundaryConditions< MeshType, ConstantFunction, Real, Index > BoundaryConditions;\n" )
+    file.write( "             typedef " + problemBaseName + "Problem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Problem;\n" )
+    file.write( "             SolverStarter solverStarter;\n" )
+    file.write( "             return solverStarter.template run< Problem >( parameters );\n" )
+    file.write( "          }\n" )
+    file.write( "          typedef tnlVector< Real, Device, Index > VectorType;\n" )
+    file.write( "          if( boundaryConditionsType == \"dirichlet\" )\n" )
+    file.write( "          {\n" )
+    file.write( "             typedef tnlDirichletBoundaryConditions< MeshType, VectorType, Real, Index > BoundaryConditions;\n" )
+    file.write( "             typedef " + problemBaseName + "Problem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Problem;\n" )
+    file.write( "             SolverStarter solverStarter;\n" )
+    file.write( "             return solverStarter.template run< Problem >( parameters );\n" )
+    file.write( "          }\n" )
+    file.write( "          typedef tnlNeumannBoundaryConditions< MeshType, VectorType, Real, Index > BoundaryConditions;\n" )
+    file.write( "          typedef " + problemBaseName + "Problem< MeshType, BoundaryConditions, RightHandSide, ApproximateOperator > Problem;\n" )
+    file.write( "          SolverStarter solverStarter;\n" )
+    file.write( "          return solverStarter.template run< Problem >( parameters );\n" )
+    file.write( "      }\n" )
+    file.write( "\n" )
+    file.write( "};\n" )
+    file.write( "\n" )
+    file.write( "int main( int argc, char* argv[] )\n" )
+    file.write( "{\n" )
+    file.write( "   tnlSolver< " + problemBaseName + "Setter, " + problemBaseName + "Config, BuildConfig > solver;\n" )
+    file.write( "   if( ! solver. run( argc, argv ) )\n" )
+    file.write( "      return EXIT_FAILURE;\n" )
+    file.write( "   return EXIT_SUCCESS;\n" )
+    file.write( "}\n" )
+    file.write( "\n" )
+    file.close()
+    file = open( problemBaseName + ".cpp", "w")
+    file.write( "#include \"" + problemBaseName + ".h\"\n")
+    file.close();
+    file = open( problemBaseName + "-cuda.cu", "w")
+    file.write( "#include \"" + problemBaseName + ".h\"\n")
+    file.close()
+    
+def generateProblem( problemName, problemBaseName ):
+    file = open( problemBaseName + "Problem.h", "w" )
+    file.write( "#ifndef " + problemBaseName + "PROBLEM_H_\n" )
+    file.write( "#define " + problemBaseName + "PROBLEM_H_\n" )
+    file.write( "\n" )
+    file.write( "#include <problems/tnlPDEProblem.h>\n")
+    file.write( "\n" )
+    file.write( "template< typename Mesh,\n" )
+    file.write( "          typename BoundaryCondition,\n" )
+    file.write( "          typename RightHandSide,\n" )
+    file.write( "           typename DifferentialOperator >\n" )
+    file.write( "class " + problemBaseName + "Problem:\n" )
+    file.write( "   public tnlPDEProblem< Mesh,\n" )
+    file.write( "                         typename DifferentialOperator::RealType,\n" )
+    file.write( "                         typename Mesh::DeviceType,\n" )
+    file.write( "                         typename DifferentialOperator::IndexType >\n" )
+    file.write( "{\n" )
+    file.write( "   public:\n" )
+    file.write( "\n" )
+    file.write( "      typedef typename DifferentialOperator::RealType RealType;\n" )
+    file.write( "      typedef typename Mesh::DeviceType DeviceType;\n" )
+    file.write( "      typedef typename DifferentialOperator::IndexType IndexType;\n" )
+    file.write( "      typedef tnlPDEProblem< Mesh, RealType, DeviceType, IndexType > BaseType;\n" )
+    file.write( "\n" )
+    file.write( "      using typename BaseType::MeshType;\n" )
+    file.write( "      using typename BaseType::DofVectorType;\n" )
+    file.write( "      using typename BaseType::MeshDependentDataType;\n" )
+    file.write( "\n" )
+    file.write( "      static tnlString getTypeStatic();\n" )
+    file.write( "\n" )
+    file.write( "      tnlString getPrologHeader() const;\n" )
+    file.write( "\n" )
+    file.write( "      void writeProlog( tnlLogger& logger,\n" )
+    file.write( "                        const tnlParameterContainer& parameters ) const;\n" )
+    file.write( "\n" )
+    file.write( "      bool setup( const tnlParameterContainer& parameters );\n" )
+    file.write( "\n" )
+    file.write( "      bool setInitialCondition( const tnlParameterContainer& parameters,\n" )
+    file.write( "                                const MeshType& mesh,\n" )
+    file.write( "                                DofVectorType& dofs,\n" )
+    file.write( "                                MeshDependentDataType& meshDependentData );\n" )
+    file.write( "\n" )
+    file.write( "      template< typename Matrix >\n" )
+    file.write( "      bool setupLinearSystem( const MeshType& mesh,\n" )
+    file.write( "                              Matrix& matrix );\n" )
+    file.write( "\n" )
+    file.write( "      bool makeSnapshot( const RealType& time,\n" )
+    file.write( "                         const IndexType& step,\n" )
+    file.write( "                         const MeshType& mesh,\n" )
+    file.write( "                         DofVectorType& dofs,\n" )
+    file.write( "                         MeshDependentDataType& meshDependentData );\n" )
+    file.write( "\n" )
+    file.write( "      IndexType getDofs( const MeshType& mesh ) const;\n" )
+    file.write( "\n" )
+    file.write( "      void bindDofs( const MeshType& mesh,\n" )
+    file.write( "                     DofVectorType& dofs );\n" )
+    file.write( "\n" )
+    file.write( "      void getExplicitRHS( const RealType& time,\n" )
+    file.write( "                           const RealType& tau,\n" )
+    file.write( "                           const MeshType& mesh,\n" )
+    file.write( "                           DofVectorType& _u,\n" )
+    file.write( "                           DofVectorType& _fu,\n" )
+    file.write( "                           MeshDependentDataType& meshDependentData );\n" )
+    file.write( "\n" )
+    file.write( "      template< typename Matrix >\n" )
+    file.write( "      void assemblyLinearSystem( const RealType& time,\n" )
+    file.write( "                                 const RealType& tau,\n" )
+    file.write( "                                 const MeshType& mesh,\n" )
+    file.write( "                                 DofVectorType& dofs,\n" )
+    file.write( "                                 Matrix& matrix,\n" )
+    file.write( "                                 DofVectorType& rightHandSide,\n" )
+    file.write( "                                 MeshDependentDataType& meshDependentData );\n" )
+    file.write( "\n" )
+    file.write( "   protected:\n" )
+    file.write( "\n" )    
+    file.write( "      DifferentialOperator differentialOperator;\n" )
+    file.write( "      BoundaryCondition boundaryCondition;\n" )
+    file.write( "      RightHandSide rightHandSide;\n" )
+    file.write( "};\n" )
+    file.write( "\n" )
+    file.write( "#include \"" + problemBaseName + "Problem_impl.h\"\n" )
+    file.write( "\n" )
+    file.write( "#endif /* " + problemBaseName + "PROBLEM_H_ */\n" )
+    file.close()
+                                 
+    file = open( problemBaseName + "Problem_impl.h", "w" )
+    file.write( "#ifndef " + problemBaseName + "PROBLEM_IMPL_H_\n" )
+    file.write( "#define " + problemBaseName + "PROBLEM_IMPL_H_\n" )
+    file.write( "\n" )
+    file.write( "#include <core/mfilename.h>\n" )
+    file.write( "#include <matrices/tnlMatrixSetter.h>\n" )
+    file.write( "#include <solvers/pde/tnlExplicitUpdater.h>\n" )
+    file.write( "#include <solvers/pde/tnlLinearSystemAssembler.h>\n" )
+    file.write( "#include <solvers/pde/tnlBackwardTimeDiscretisation.h>\n" )
+    file.write( "\n" )
+    file.write( "template< typename Mesh,\n" )
+    file.write( "          typename BoundaryCondition,\n" )
+    file.write( "          typename RightHandSide,\n" )
+    file.write( "          typename DifferentialOperator >\n" )
+    file.write( "tnlString\n" )
+    file.write( problemBaseName + "Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::\n" )
+    file.write( "getTypeStatic()\n" )
+    file.write( "{\n" )
+    file.write( "   return tnlString( \"" + problemBaseName + "Problem< \" ) + Mesh :: getTypeStatic() + \" >\";\n" )
+    file.write( "}\n" )
+    file.write( "\n" )
+    file.write( "template< typename Mesh,\n" )
+    file.write( "          typename BoundaryCondition,\n" )
+    file.write( "          typename RightHandSide,\n" )
+    file.write( "          typename DifferentialOperator >\n" )
+    file.write( "tnlString\n" )
+    file.write( problemBaseName + "Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::\n" )
+    file.write( "getPrologHeader() const\n" )
+    file.write( "{\n" )    
+    file.write( "   return tnlString( \"" + problemName + "\" );\n" )
+    file.write( "}\n" )
+    file.write( "\n" )
+    file.write( "template< typename Mesh,\n" )
+    file.write( "          typename BoundaryCondition,\n" )
+    file.write( "          typename RightHandSide,\n" )
+    file.write( "          typename DifferentialOperator >\n" )
+    file.write( "void\n" )
+    file.write( problemBaseName + "Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::\n" )    
+    file.write( "writeProlog( tnlLogger& logger, const tnlParameterContainer& parameters ) const\n" )
+    file.write( "{\n" )
+    file.write( "   /****\n" )
+    file.write( "    * Add data you want to have in the computation report (log) as follows:\n" )
+    file.write( "    * logger.writeParameter< double >( \"Parameter description\", parameter );\n" )
+    file.write( "    */\n" )
+    file.write( "}\n" )
+    file.write( "\n" )
+    file.write( "template< typename Mesh,\n" )
+    file.write( "          typename BoundaryCondition,\n" )
+    file.write( "          typename RightHandSide,\n" )
+    file.write( "          typename DifferentialOperator >\n" )
+    file.write( "bool\n" )
+    file.write( problemBaseName + "Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::\n" )        
+    file.write( "setup( const tnlParameterContainer& parameters )\n" )
+    file.write( "{\n" )
+    file.write( "   if( ! this->boundaryCondition.setup( parameters, \"boundary-conditions-\" ) ||\n" )
+    file.write( "       ! this->rightHandSide.setup( parameters, \"right-hand-side-\" ) )\n" )
+    file.write( "      return false;\n" )
+    file.write( "   return true;\n" )
+    file.write( "}\n" )
+    file.write( "\n" )
+    file.write( "template< typename Mesh,\n" )
+    file.write( "          typename BoundaryCondition,\n" )
+    file.write( "          typename RightHandSide,\n" )
+    file.write( "          typename DifferentialOperator >\n" )
+    file.write( "typename " + problemBaseName + "Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::IndexType\n" )
+    file.write( problemBaseName + "Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::\n" )            
+    file.write( "getDofs( const MeshType& mesh ) const\n" )
+    file.write( "{\n" )
+    file.write( "   /****\n" )
+    file.write( "    * Return number of  DOFs (degrees of freedom) i.e. number\n" )
+    file.write( "    * of unknowns to be resolved by the main solver.\n" )
+    file.write( "    */\n" )
+    file.write( "   return mesh.getNumberOfCells();\n" )
+    file.write( "}\n" )
+    file.write( "\n" )
+    file.write( "template< typename Mesh,\n" )
+    file.write( "          typename BoundaryCondition,\n" )
+    file.write( "          typename RightHandSide,\n" )
+    file.write( "          typename DifferentialOperator >\n" )
+    file.write( "void\n" )
+    file.write( problemBaseName + "Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::\n" )            
+    file.write( "bindDofs( const MeshType& mesh,\n" )
+    file.write( "          DofVectorType& dofVector )\n" )    
+    file.write( "{\n" )
+    file.write( "}\n" )
+    file.write( "\n" )
+    file.write( "template< typename Mesh,\n" )
+    file.write( "          typename BoundaryCondition,\n" )
+    file.write( "          typename RightHandSide,\n" )
+    file.write( "          typename DifferentialOperator >\n" )
+    file.write( "bool\n" )
+    file.write( problemBaseName + "Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::\n" )            
+    file.write( "setInitialCondition( const tnlParameterContainer& parameters,\n" )    
+    file.write( "                     const MeshType& mesh,\n" )
+    file.write( "                     DofVectorType& dofs,\n" )
+    file.write( "                     MeshDependentDataType& meshDependentData )\n" )
+    file.write( "{\n" )
+    file.write( "   const tnlString& initialConditionFile = parameters.getParameter< tnlString >( \"initial-condition\" );\n" )
+    file.write( "   if( ! dofs.load( initialConditionFile ) )\n" )
+    file.write( "   {\n" )
+    file.write( "      cerr << \"I am not able to load the initial condition from the file \" << initialConditionFile << \".\" << endl;\n" )
+    file.write( "      return false;\n" )
+    file.write( "   }\n" )
+    file.write( "   return true; \n" )
+    file.write( "}\n" )
+    file.write( "\n" )
+    file.write( "template< typename Mesh,\n" )
+    file.write( "          typename BoundaryCondition,\n" )
+    file.write( "          typename RightHandSide,\n" )
+    file.write( "          typename DifferentialOperator >\n" )
+    file.write( "   template< typename Matrix >\n" )
+    file.write( "bool\n" )
+    file.write( problemBaseName + "Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::\n" )                
+    file.write( "setupLinearSystem( const MeshType& mesh,\n" )
+    file.write( "                   Matrix& matrix )\n" )
+    file.write( "{\n" )
+    file.write( "   const IndexType dofs = this->getDofs( mesh );\n" )
+    file.write( "   typedef typename Matrix::RowLengthsVector RowLengthsVectorType;\n" )
+    file.write( "   RowLengthsVectorType rowLengths;\n" )
+    file.write( "   if( ! rowLengths.setSize( dofs ) )\n" )
+    file.write( "      return false;\n" )
+    file.write( "   tnlMatrixSetter< MeshType, DifferentialOperator, BoundaryCondition, RowLengthsVectorType > matrixSetter;\n" )
+    file.write( "   matrixSetter.template getRowLengths< Mesh::Dimensions >( mesh,\n" )
+    file.write( "                                                            differentialOperator,\n" )
+    file.write( "                                                            boundaryCondition,\n" )
+    file.write( "                                                            rowLengths );\n" )
+    file.write( "   matrix.setDimensions( dofs, dofs );\n" )
+    file.write( "   if( ! matrix.setRowLengths( rowLengths ) )\n" )
+    file.write( "      return false;\n" )
+    file.write( "   return true;\n" )
+    file.write( "}\n" )
+    file.write( "\n" )
+    file.write( "template< typename Mesh,\n" )
+    file.write( "          typename BoundaryCondition,\n" )
+    file.write( "          typename RightHandSide,\n" )
+    file.write( "          typename DifferentialOperator >\n" )
+    file.write( "bool\n" )
+    file.write( problemBaseName + "Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::\n" )                    
+    file.write( "makeSnapshot( const RealType& time,\n" )
+    file.write( "              const IndexType& step,\n" )
+    file.write( "              const MeshType& mesh,\n" )
+    file.write( "              DofVectorType& dofs,\n" )
+    file.write( "              MeshDependentDataType& meshDependentData )\n" )
+    file.write( "{\n" )
+    file.write( "   cout << endl << \"Writing output at time \" << time << \" step \" << step << \".\" << endl;\n" )
+    file.write( "   this->bindDofs( mesh, dofs );\n" )
+    file.write( "   tnlString fileName;\n" )
+    file.write( "   FileNameBaseNumberEnding( \"u-\", step, 5, \".tnl\", fileName );\n" )
+    file.write( "   if( ! dofs.save( fileName ) )\n" )
+    file.write( "      return false;\n" )
+    file.write( "   return true;\n" )
+    file.write( "}\n" )
+    file.write( "\n" )
+    file.write( "template< typename Mesh,\n" )
+    file.write( "          typename BoundaryCondition,\n" )
+    file.write( "          typename RightHandSide,\n" )
+    file.write( "          typename DifferentialOperator >\n" )
+    file.write( "void\n" )
+    file.write( problemBaseName + "Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::\n" )                    
+    file.write( "getExplicitRHS( const RealType& time,\n" )
+    file.write( "                const RealType& tau,\n" )
+    file.write( "                const MeshType& mesh,\n" )
+    file.write( "                DofVectorType& u,\n" )
+    file.write( "                DofVectorType& fu,\n" )
+    file.write( "                MeshDependentDataType& meshDependentData )\n" )
+    file.write( "{\n" )
+    file.write( "   /****\n" )
+    file.write( "    * If you use an explicit solver like tnlEulerSolver or tnlMersonSolver, you\n" )
+    file.write( "    * need to implement this method. Compute the right-hand side of\n" )
+    file.write( "    *\n" )
+    file.write( "    *   d/dt u(x) = fu( x, u )\n" )
+    file.write( "    *\n" )
+    file.write( "    * You may use supporting mesh dependent data if you need.\n" )
+    file.write( "    */\n" )
+    file.write( "\n" )
+    file.write( "   this->bindDofs( mesh, u );\n" )
+    file.write( "   tnlExplicitUpdater< Mesh, DofVectorType, DifferentialOperator, BoundaryCondition, RightHandSide > explicitUpdater;\n" )
+    file.write( "   explicitUpdater.template update< Mesh::Dimensions >( time,\n" )
+    file.write( "                                                        mesh,\n" )
+    file.write( "                                                        this->differentialOperator,\n" )
+    file.write( "                                                        this->boundaryCondition,\n" )
+    file.write( "                                                        this->rightHandSide,\n" )
+    file.write( "                                                        u,\n" )
+    file.write( "                                                        fu );\n" )
+    file.write( "}\n" )
+    file.write( "\n" )
+    file.write( "template< typename Mesh,\n" )
+    file.write( "          typename BoundaryCondition,\n" )
+    file.write( "          typename RightHandSide,\n" )
+    file.write( "          typename DifferentialOperator >\n" )
+    file.write( "   template< typename Matrix >\n" )
+    file.write( "void\n" )
+    file.write( problemBaseName + "Problem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator >::\n" )                
+    file.write( "assemblyLinearSystem( const RealType& time,\n" )
+    file.write( "                      const RealType& tau,\n" )
+    file.write( "                      const MeshType& mesh,\n" )
+    file.write( "                      DofVectorType& u,\n" )
+    file.write( "                      Matrix& matrix,\n" )
+    file.write( "                      DofVectorType& b,\n" )
+    file.write( "                      MeshDependentDataType& meshDependentData )\n" )
+    file.write( "{\n" )
+    file.write( "   tnlLinearSystemAssembler< Mesh,\n" )
+    file.write( "                             DofVectorType,\n" )
+    file.write( "                             DifferentialOperator,\n" )
+    file.write( "                             BoundaryCondition,\n" )
+    file.write( "                             RightHandSide,\n" )
+    file.write( "                             tnlBackwardTimeDiscretisation,\n" )
+    file.write( "                             Matrix > systemAssembler;\n" )
+    file.write( "   systemAssembler.template assembly< Mesh::Dimensions >( time,\n" )
+    file.write( "                                                          tau,\n" )
+    file.write( "                                                          mesh,\n" )
+    file.write( "                                                          this->differentialOperator,\n" )
+    file.write( "                                                          this->boundaryCondition,\n" )
+    file.write( "                                                          this->rightHandSide,\n" )
+    file.write( "                                                          u,\n" )
+    file.write( "                                                          matrix,\n" )
+    file.write( "                                                          b );\n" )
+    file.write( "}\n" )
+    file.write( "\n" )
+    file.write( "#endif /* " + problemBaseName + "PROBLEM_IMPL_H_ */\n" )
+    file.close()
+
def generateOperatorGridSpecializationHeader( file, operatorName, dimensions ):
    """
    Write the C++ class declaration of the tnlGrid specialization of the
    generated differential operator to `file`.

    file         -- writable text stream receiving the generated C++ code
    operatorName -- name of the generated operator class (valid C++ identifier)
    dimensions   -- grid dimension as a string: "1", "2" or "3"
    """
    file.write( "template< typename MeshReal,\n" )
    file.write( "          typename Device,\n" )
    file.write( "          typename MeshIndex,\n" )
    file.write( "          typename Real,\n" )
    file.write( "          typename Index >\n" )
    # Space after the comma keeps the generated type spelling consistent with
    # the MeshType typedef below and with the _impl.h specializations.
    file.write( "class " + operatorName + "< tnlGrid< " + dimensions + ", MeshReal, Device, MeshIndex >, Real, Index >\n" )
    file.write( "{\n" )
    file.write( "   public:\n" )
    file.write( "      typedef tnlGrid< " + dimensions + ", MeshReal, Device, MeshIndex > MeshType;\n" )
    file.write( "      typedef typename MeshType::CoordinatesType CoordinatesType;\n" )
    file.write( "      typedef Real RealType;\n" )
    file.write( "      typedef Device DeviceType;\n" )
    file.write( "      typedef Index IndexType;\n" )
    file.write( "      enum { Dimensions = MeshType::Dimensions };\n" )
    file.write( "\n" )
    file.write( "      static tnlString getType();\n" )
    file.write( "\n" )
    file.write( "      template< typename Vector >\n" )
    file.write( "      __cuda_callable__\n" )
    file.write( "      Real getValue( const MeshType& mesh,\n" )
    file.write( "                     const IndexType cellIndex,\n" )
    file.write( "                     const CoordinatesType& coordinates,\n" )
    file.write( "                     const Vector& u,\n" )
    file.write( "                     const RealType& time ) const;\n" )
    file.write( "\n" )
    file.write( "      __cuda_callable__\n" )
    file.write( "      Index getLinearSystemRowLength( const MeshType& mesh,\n" )
    file.write( "                                      const IndexType& index,\n" )
    file.write( "                                      const CoordinatesType& coordinates ) const;\n" )
    file.write( "\n" )
    file.write( "      template< typename Vector, typename MatrixRow >\n" )
    file.write( "      __cuda_callable__\n" )
    file.write( "      void updateLinearSystem( const RealType& time,\n" )
    file.write( "                               const RealType& tau,\n" )
    file.write( "                               const MeshType& mesh,\n" )
    file.write( "                               const IndexType& index,\n" )
    file.write( "                               const CoordinatesType& coordinates,\n" )
    file.write( "                               Vector& u,\n" )
    file.write( "                               Vector& b,\n" )
    file.write( "                               MatrixRow& matrixRow ) const;\n" )
    file.write( "};\n" )
    file.write( "\n" )
+    
def generateOperatorGridSpecializationImplementation( file, operatorName, dimensions ):
    """
    Write the out-of-class method definitions (getType, getValue,
    getLinearSystemRowLength, updateLinearSystem) of the tnlGrid
    specialization of the generated operator to `file`.

    file         -- writable text stream receiving the generated C++ code
    operatorName -- name of the generated operator class (valid C++ identifier)
    dimensions   -- grid dimension as a string: "1", "2" or "3"

    The sample bodies implement the Laplace operator discretized by finite
    differences in the requested dimension.
    """
    # --- getType() ---
    file.write( "template< typename MeshReal,\n" )
    file.write( "          typename Device,\n" )
    file.write( "          typename MeshIndex,\n" )
    file.write( "          typename Real,\n" )
    file.write( "          typename Index >\n" )
    file.write( "tnlString\n" )
    file.write( operatorName + "< tnlGrid< " + dimensions + ", MeshReal, Device, MeshIndex >, Real, Index >::\n" )
    file.write( "getType()\n" )
    file.write( "{\n" )
    file.write( "   return tnlString( \"" + operatorName + "< \" ) +\n" )
    file.write( "          MeshType::getType() + \", \" +\n" )
    file.write( "          ::getType< Real >() + \", \" +\n" )
    file.write( "          ::getType< Index >() + \" >\";\n" )
    file.write( "}\n" )
    file.write( "\n" )
    # --- getValue(): explicit form of the operator ---
    file.write( "template< typename MeshReal,\n" )
    file.write( "          typename Device,\n" )
    file.write( "          typename MeshIndex,\n" )
    file.write( "          typename Real,\n" )
    file.write( "          typename Index >\n" )
    file.write( "template< typename Vector >\n" )
    file.write( "__cuda_callable__\n" )
    file.write( "Real\n" )
    file.write( operatorName + "< tnlGrid< " + dimensions + ", MeshReal, Device, MeshIndex >, Real, Index >::\n" )
    file.write( "getValue( const MeshType& mesh,\n" )
    file.write( "          const IndexType cellIndex,\n" )
    file.write( "          const CoordinatesType& coordinates,\n" )
    file.write( "          const Vector& u,\n" )
    file.write( "          const Real& time ) const\n" )
    file.write( "{\n" )
    file.write( "   /****\n" )
    file.write( "    * Implement your explicit form of the differential operator here.\n" )
    file.write( "    * The following example is the Laplace operator approximated \n" )
    file.write( "    * by the Finite difference method.\n" )
    file.write( "    */\n" )
    file.write( "\n" )
    if dimensions == "1":
        file.write( "   return ( u[ mesh.template getCellNextToCell< -1 >( cellIndex ) ]\n" )
        file.write( "            - 2.0 * u[ cellIndex ]\n" )
        file.write( "            + u[ mesh.template getCellNextToCell< 1 >( cellIndex ) ] ) * mesh.getHxSquareInverse();\n" )
    if dimensions == "2":
        file.write( "   return ( u[ mesh.template getCellNextToCell< -1, 0 >( cellIndex ) ]\n" )
        file.write( "            - 2.0 * u[ cellIndex ]\n" )
        file.write( "            + u[ mesh.template getCellNextToCell< 1, 0 >( cellIndex ) ] ) * mesh.getHxSquareInverse() +\n" )
        file.write( "           ( u[ mesh.template getCellNextToCell< 0, -1 >( cellIndex ) ]\n" )
        file.write( "             - 2.0 * u[ cellIndex ]\n" )
        file.write( "             + u[ mesh.template getCellNextToCell< 0, 1 >( cellIndex ) ] ) * mesh.getHySquareInverse();\n" )
    if dimensions == "3":
        file.write( "   return ( u[ mesh.template getCellNextToCell< -1, 0, 0 >( cellIndex ) ]\n" )
        file.write( "            - 2.0 * u[ cellIndex ]\n" )
        file.write( "            + u[ mesh.template getCellNextToCell< 1, 0, 0 >( cellIndex ) ] ) * mesh.getHxSquareInverse() +\n" )
        file.write( "          ( u[ mesh.template getCellNextToCell< 0, -1, 0 >( cellIndex ) ]\n" )
        file.write( "            - 2.0 * u[ cellIndex ]\n" )
        file.write( "            + u[ mesh.template getCellNextToCell< 0, 1, 0 >( cellIndex ) ] ) * mesh.getHySquareInverse() +\n" )
        file.write( "          ( u[ mesh.template getCellNextToCell< 0, 0, -1 >( cellIndex ) ]\n" )
        file.write( "            - 2.0 * u[ cellIndex ]\n" )
        file.write( "            + u[ mesh.template getCellNextToCell< 0, 0, 1 >( cellIndex ) ] ) * mesh.getHzSquareInverse();\n" )
    file.write( "}\n" )
    file.write( "\n" )
    # --- getLinearSystemRowLength(): non-zeros per matrix row ---
    file.write( "template< typename MeshReal,\n" )
    file.write( "          typename Device,\n" )
    file.write( "          typename MeshIndex,\n" )
    file.write( "          typename Real,\n" )
    file.write( "          typename Index >\n" )
    file.write( "__cuda_callable__\n" )
    file.write( "Index\n" )
    file.write( operatorName + "< tnlGrid< " + dimensions + ", MeshReal, Device, MeshIndex >, Real, Index >::\n" )
    file.write( "getLinearSystemRowLength( const MeshType& mesh,\n" )
    file.write( "                          const IndexType& index,\n" )
    file.write( "                          const CoordinatesType& coordinates ) const\n" )
    file.write( "{\n" )
    file.write( "   /****\n" )
    file.write( "    * Return a number of non-zero elements in a line (associated with given grid element) of\n" )
    file.write( "    * the linear system.\n" )
    file.write( "    * The following example is the Laplace operator approximated \n" )
    file.write( "    * by the Finite difference method.\n" )
    file.write( "    */\n" )
    file.write( "\n" )
    file.write( "   return 2*Dimensions + 1;\n" )
    file.write( "}\n" )
    file.write( "\n" )
    # --- updateLinearSystem(): matrix-row stencil for implicit solvers ---
    file.write( "template< typename MeshReal,\n" )
    file.write( "          typename Device,\n" )
    file.write( "          typename MeshIndex,\n" )
    file.write( "          typename Real,\n" )
    file.write( "          typename Index >\n" )
    file.write( "   template< typename Vector, typename MatrixRow >\n" )
    file.write( "__cuda_callable__\n" )
    file.write( "void\n" )
    file.write( operatorName + "< tnlGrid< " + dimensions + ", MeshReal, Device, MeshIndex >, Real, Index >::\n" )
    file.write( "updateLinearSystem( const RealType& time,\n" )
    file.write( "                    const RealType& tau,\n" )
    file.write( "                    const MeshType& mesh,\n" )
    file.write( "                    const IndexType& index,\n" )
    file.write( "                    const CoordinatesType& coordinates,\n" )
    file.write( "                    Vector& u,\n" )
    file.write( "                    Vector& b,\n" )
    file.write( "                    MatrixRow& matrixRow ) const\n" )
    file.write( "{\n" )
    file.write( "   /****\n" )
    file.write( "    * Setup the non-zero elements of the linear system here.\n" )
    # Fixed typo in the emitted comment: "appriximated" -> "approximated",
    # matching the wording of the two sections above.
    file.write( "    * The following example is the Laplace operator approximated \n" )
    file.write( "    * by the Finite difference method.\n" )
    file.write( "    */\n" )
    file.write( "\n" )
    if dimensions == "1":
        file.write( "   const RealType lambdaX = tau * mesh.getHxSquareInverse();\n" )
        file.write( "   matrixRow.setElement( 0, mesh.template getCellNextToCell< -1 >( index ),     - lambdaX );\n" )
        file.write( "   matrixRow.setElement( 1, index,                             2.0 * lambdaX );\n" )
        file.write( "   matrixRow.setElement( 2, mesh.template getCellNextToCell< 1 >( index ),       - lambdaX );\n" )
    if dimensions == "2":
        file.write( "   const RealType lambdaX = tau * mesh.getHxSquareInverse();\n" )
        file.write( "   const RealType lambdaY = tau * mesh.getHySquareInverse();\n" )
        file.write( "   matrixRow.setElement( 0, mesh.template getCellNextToCell< 0, -1 >( index ), -lambdaY );\n" )
        file.write( "   matrixRow.setElement( 1, mesh.template getCellNextToCell< -1, 0 >( index ), -lambdaX );\n" )
        file.write( "   matrixRow.setElement( 2, index,                                             2.0 * ( lambdaX + lambdaY ) );\n" )
        file.write( "   matrixRow.setElement( 3, mesh.template getCellNextToCell< 1, 0 >( index ),   -lambdaX );\n" )
        file.write( "   matrixRow.setElement( 4, mesh.template getCellNextToCell< 0, 1 >( index ),   -lambdaY );\n" )
    if dimensions == "3":
        file.write( "   const RealType lambdaX = tau * mesh.getHxSquareInverse();\n" )
        file.write( "   const RealType lambdaY = tau * mesh.getHySquareInverse();\n" )
        file.write( "   const RealType lambdaZ = tau * mesh.getHzSquareInverse();\n" )
        file.write( "   matrixRow.setElement( 0, mesh.template getCellNextToCell< 0, 0, -1 >( index ), -lambdaZ );\n" )
        file.write( "   matrixRow.setElement( 1, mesh.template getCellNextToCell< 0, -1, 0 >( index ), -lambdaY );\n" )
        file.write( "   matrixRow.setElement( 2, mesh.template getCellNextToCell< -1, 0, 0 >( index ), -lambdaX );\n" )
        file.write( "   matrixRow.setElement( 3, index,                             2.0 * ( lambdaX + lambdaY + lambdaZ ) );\n" )
        file.write( "   matrixRow.setElement( 4, mesh.template getCellNextToCell< 1, 0, 0 >( index ),   -lambdaX );\n" )
        file.write( "   matrixRow.setElement( 5, mesh.template getCellNextToCell< 0, 1, 0 >( index ),   -lambdaY );\n" )
        file.write( "   matrixRow.setElement( 6, mesh.template getCellNextToCell< 0, 0, 1 >( index ),   -lambdaZ );\n" )
    file.write( "}\n" )
    file.write( "\n" )
+    
def generateOperator( operatorName ):
    """
    Generate the operator sources: <operatorName>.h declaring the operator
    template plus its 1D/2D/3D tnlGrid specializations, and the matching
    implementation file <operatorName>_impl.h.

    operatorName -- name of the generated operator class (valid C++ identifier)
    """
    # Header with the class declarations; `with` guarantees the handle is
    # closed even if a write fails.
    with open( operatorName + ".h", "w" ) as file:
        file.write( "#ifndef " + operatorName + "_H\n" )
        file.write( "#define " + operatorName + "_H\n" )
        file.write( "\n" )
        file.write( "#include <core/vectors/tnlVector.h>\n" )
        file.write( "#include <mesh/tnlGrid.h>\n" )
        file.write( "\n" )
        file.write( "template< typename Mesh,\n" )
        file.write( "          typename Real = typename Mesh::RealType,\n" )
        file.write( "          typename Index = typename Mesh::IndexType >\n" )
        file.write( "class " + operatorName + "\n" )
        file.write( "{\n" )
        file.write( "};\n" )
        file.write( "\n" )
        generateOperatorGridSpecializationHeader( file, operatorName, "1" )
        generateOperatorGridSpecializationHeader( file, operatorName, "2" )
        generateOperatorGridSpecializationHeader( file, operatorName, "3" )
        file.write( "\n" )
        file.write( "#include \"" + operatorName + "_impl.h\"\n" )
        file.write( "\n" )
        file.write( "#endif\t/* " + operatorName + "_H */\n" )
    # Implementation file with the method definitions.
    with open( operatorName + "_impl.h", "w" ) as file:
        file.write( "#ifndef " + operatorName + "_IMPL_H\n" )
        file.write( "#define " + operatorName + "_IMPL_H\n" )
        file.write( "\n" )
        file.write( "/****\n" )
        file.write( " * 1D problem\n" )
        file.write( " */\n" )
        generateOperatorGridSpecializationImplementation( file, operatorName, "1" )
        file.write( "/****\n" )
        file.write( " * 2D problem\n" )
        file.write( " */\n" )
        generateOperatorGridSpecializationImplementation( file, operatorName, "2" )
        file.write( "/****\n" )
        file.write( " * 3D problem\n" )
        file.write( " */\n" )
        generateOperatorGridSpecializationImplementation( file, operatorName, "3" )
        # Fixed: the closing guard comment was missing the underscore
        # ("IMPL_H" instead of "_IMPL_H" declared above).
        file.write( "#endif\t/* " + operatorName + "_IMPL_H */\n" )
        file.write( "\n" )
+    
def generateRhs( problemBaseName ):
    """
    Generate <problemBaseName>Rhs.h containing a right-hand-side functor that
    returns zero; the user fills in getValue() for a nontrivial source term.

    problemBaseName -- C++-safe base name used for the file and class names
    """
    # `with` guarantees the handle is closed even if a write fails.
    with open( problemBaseName + "Rhs.h", "w" ) as file:
        file.write( "#ifndef " + problemBaseName + "RHS_H_\n" )
        file.write( "#define " + problemBaseName + "RHS_H_\n" )
        file.write( "\n" )
        file.write( "class " + problemBaseName + "Rhs\n" )
        file.write( "{\n" )
        file.write( "   public:\n" )
        file.write( "      bool setup( const tnlParameterContainer& parameters,\n" )
        file.write( "                  const tnlString& prefix = \"\" )\n" )
        file.write( "      {\n" )
        file.write( "         return true;\n" )
        file.write( "      }\n" )
        file.write( "\n" )
        file.write( "      template< typename Mesh,\n" )
        file.write( "                typename Index,\n" )
        file.write( "                typename Real >\n" )
        file.write( "      __cuda_callable__\n" )
        file.write( "      Real getValue( const Mesh& mesh,\n" )
        file.write( "                     const Index& index,\n" )
        file.write( "                     const Real& time ) const\n" )
        file.write( "      {\n" )
        file.write( "         typedef typename Mesh::VertexType VertexType;\n" )
        file.write( "         VertexType v = mesh.template getCellCenter< VertexType >( index );\n" )
        file.write( "         return 0.0;\n" )
        file.write( "      };\n" )
        file.write( "};\n" )
        file.write( "\n" )
        file.write( "#endif /* " + problemBaseName + "RHS_H_ */\n" )
+
def generateRunScript( problemBaseName ):
    """
    Generate an example bash script 'run-<problemBaseName>' that sets up a
    2D grid, prepares a sin-wave initial condition, runs the solver with the
    explicit Merson scheme and converts the snapshots with tnl-view.

    problemBaseName -- C++-safe base name; also the solver binary name
    """
    # `with` guarantees the handle is closed even if a write fails.
    with open( "run-" + problemBaseName, "w" ) as file:
        file.write( "#!/usr/bin/env bash\n" )
        file.write( "\n" )
        file.write( "tnl-grid-setup --dimensions 2 \\\n" )
        file.write( "               --origin-x 0.0 \\\n" )
        file.write( "               --origin-y 0.0 \\\n" )
        file.write( "               --proportions-x 1.0 \\\n" )
        file.write( "               --proportions-y 1.0 \\\n" )
        file.write( "               --size-x 100 \\\n" )
        file.write( "               --size-y 100\n" )
        file.write( "\n" )
        file.write( "tnl-init --test-function sin-wave \\\n" )
        file.write( "         --output-file init.tnl\n" )
        file.write( "./" + problemBaseName + " --time-discretisation explicit \\\n" )
        file.write( "              --boundary-conditions-constant 0 \\\n" )
        file.write( "              --discrete-solver merson \\\n" )
        file.write( "              --snapshot-period 0.01 \\\n" )
        file.write( "              --final-time 1.0\n" )
        file.write( "\n" )
        file.write( "tnl-view --mesh mesh.tnl --input-files *tnl     \n" )
+    
# Script entry point: prompt for the problem/operator names and generate the
# full solver skeleton. Guarded so importing this module has no side effects.
if __name__ == "__main__":
    print( "TNL Quickstart -- solver generator")
    print( "----------------------------------")
    # Fixed prompt typo: "Problam" -> "Problem".
    problemName = input( "Problem name:" )
    problemBaseName = input( "Problem class base name (base name acceptable in C++ code):" )
    operatorName = input( "Operator name:")
    generateMakefile( problemBaseName )
    generateMain( problemName, problemBaseName, operatorName )
    generateProblem( problemName, problemBaseName )
    generateOperator( operatorName )
    generateRhs( problemBaseName )
    generateRunScript( problemBaseName )
\ No newline at end of file