diff --git a/buildAll b/buildAll
index d553862bd5fdf6878b16afb67f934c1424acfe75..3986a9ea1db990dc0960376c7c614a5f44b7ff45 100755
--- a/buildAll
+++ b/buildAll
@@ -2,7 +2,7 @@
 
 TARGET=TNL
 INSTALL_PREFIX=${HOME}/local
-WITH_CUDA=yes
+WITH_CUDA=no
 WITH_CUSPARSE=no
 CUDA_ARCHITECTURE=2.0
 VERBOSE=1
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 674a3e3ae508b060fc4e83cb01286e3d27dad44b..a2055fe17986df13e20f6b479bfb37589a976210 100755
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -10,13 +10,28 @@ ADD_SUBDIRECTORY( solvers )
 ADD_SUBDIRECTORY( legacy )
 ADD_SUBDIRECTORY( implementation )
 
-ADD_LIBRARY( tnl${debugExt}-${tnlVersion} SHARED 
-               ${tnl_config_SOURCES}
-               ${tnl_core_SOURCES}
-               ${tnl_implementation_SOURCES}
-               ${tnl_legacy_SOURCES}
-               ${tnl_debug_SOURCES}
-               ${tnl_matrix_SOURCES} )
+# Per-component source lists merged for the host-only and for the CUDA build.
+set( tnl_SOURCES
+     ${tnl_config_SOURCES}
+     ${tnl_core_SOURCES}
+     ${tnl_implementation_SOURCES}
+     ${tnl_legacy_SOURCES}
+     ${tnl_debug_SOURCES}
+     ${tnl_matrix_SOURCES} )
+
+set( tnl_CUDA__SOURCES
+     ${tnl_config_CUDA__SOURCES}
+     ${tnl_core_CUDA__SOURCES}
+     ${tnl_implementation_CUDA__SOURCES}
+     ${tnl_legacy_CUDA__SOURCES}
+     ${tnl_debug_CUDA__SOURCES}
+     ${tnl_matrix_CUDA__SOURCES} )
+# The shared library goes through CUDA_ADD_LIBRARY only when BUILD_CUDA is set;
+# otherwise it is a plain ADD_LIBRARY target built from tnl_SOURCES.
+if( NOT BUILD_CUDA )
+   add_library( tnl${debugExt}-${tnlVersion} SHARED ${tnl_SOURCES} )
+else()
+   cuda_add_library( tnl${debugExt}-${tnlVersion} SHARED ${tnl_CUDA__SOURCES} )
+endif()
                
 SET_TARGET_PROPERTIES( tnl${debugExt}-${tnlVersion} PROPERTIES 
                           SOVERSION 0 
@@ -26,19 +41,25 @@ TARGET_LINK_LIBRARIES( tnl${debugExt}-${tnlVersion}
 INSTALL( TARGETS tnl${debugExt}-${tnlVersion} DESTINATION lib )
 
 IF( BUILD_MPI )
-  ADD_LIBRARY( tnl-mpi${debugExt}-${tnlVersion} SHARED
-                 ${tnl_config_SOURCES}
-                 ${tnl_core_SOURCES}
-                 ${tnl_implementation_SOURCES}
-                 ${tnl_debug_SOURCES}
-                 ${tnl_matrix_SOURCES} )
-  SET_TARGET_PROPERTIES( tnl-mpi${debugExt}-${tnlVersion} PROPERTIES
+   # Same CUDA / host-only switch as for the tnl library above.
+   # NOTE(review): tnl_SOURCES contains tnl_legacy_SOURCES, which the former
+   # tnl-mpi source list did not include -- confirm this is intended.
+   if( NOT BUILD_CUDA )
+      add_library( tnl-mpi${debugExt}-${tnlVersion} SHARED ${tnl_SOURCES} )
+   else()
+      cuda_add_library( tnl-mpi${debugExt}-${tnlVersion} SHARED ${tnl_CUDA__SOURCES} )
+   endif()
+
+   SET_TARGET_PROPERTIES( tnl-mpi${debugExt}-${tnlVersion} PROPERTIES
                             SOVERSION 0 
                             VERSION ${tnlVersion} ) 
-  TARGET_LINK_LIBRARIES( tnl-mpi${debugExt}-${tnlVersion} 
+   
+   TARGET_LINK_LIBRARIES( tnl-mpi${debugExt}-${tnlVersion} 
                             ${MPI_LIBRARIES} 
                             ${BZIP2_LIBRARIES} )
-   INSTALL( TARGETS tnl-mpi${debugExt}-${tnlVersion} DESTINATION lib )                                                             
+   
+   INSTALL( TARGETS tnl-mpi${debugExt}-${tnlVersion} DESTINATION lib )
+                                                                
 ENDIF()
 
 
diff --git a/src/config/CMakeLists.txt b/src/config/CMakeLists.txt
index a1fdbf958285f77ce50fd511ae1e5b2be48ef0ec..6878b4b35db21fb81c50c9f207e1f29f7adc7666 100755
--- a/src/config/CMakeLists.txt
+++ b/src/config/CMakeLists.txt
@@ -13,12 +13,20 @@ SET( headers tnlConfigDescription.h
     )
 
 SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/config )
-SET( tnl_config_SOURCES 
+set( common_SOURCES
      ${CURRENT_DIR}/tnlConfigDescription.cpp 
      ${CURRENT_DIR}/tnlConfigDescriptionScanner.cpp 
      ${CURRENT_DIR}/tnlConfigDescriptionParser.cpp 
      ${CURRENT_DIR}/tnlParameterContainer.cpp 
-     ${CURRENT_DIR}/parse.cc
-    PARENT_SCOPE )
+     ${CURRENT_DIR}/parse.cc )
+# Export the list of config sources to the parent directory scope under the
+# name tnl_config_SOURCES.
+set( tnl_config_SOURCES ${common_SOURCES} PARENT_SCOPE )
+
+# The CUDA variant uses the very same files; it is exported only when
+# BUILD_CUDA is set.
+if( BUILD_CUDA )
+   set( tnl_config_CUDA__SOURCES ${common_SOURCES} PARENT_SCOPE )
+endif()
 
 INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/config )
diff --git a/src/core/cuda/CMakeLists.txt b/src/core/cuda/CMakeLists.txt
index 3290b863cf2973ba56b3ec6e98c45bb96646b109..64f7fb8c273bf0d843547a7ad7bd99034beb9505 100755
--- a/src/core/cuda/CMakeLists.txt
+++ b/src/core/cuda/CMakeLists.txt
@@ -1,5 +1,5 @@
 set( headers device-check.h
-             reduction.h
+             cuda-reduction.h
              reduction-operations.h )
 
 INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/core/cuda )
\ No newline at end of file
diff --git a/src/implementation/core/cuda/reduction-operations_impl.h b/src/core/cuda/cuda-reduction.h
similarity index 56%
rename from src/implementation/core/cuda/reduction-operations_impl.h
rename to src/core/cuda/cuda-reduction.h
index 113b837a5909e9f834ed08a8dbcd6d9c8027ab93..e097e3869c19c11211f83afb3f699cef09a40ce8 100644
--- a/src/implementation/core/cuda/reduction-operations_impl.h
+++ b/src/core/cuda/cuda-reduction.h
@@ -1,8 +1,8 @@
 /***************************************************************************
-                          reduction-operations_impl.h  -  description
+                          cuda-reduction.h  -  description
                              -------------------
-    begin                : Mar 22, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber
+    begin                : Oct 28, 2010
+    copyright            : (C) 2010 by Tomas Oberhuber
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
@@ -15,9 +15,16 @@
  *                                                                         *
  ***************************************************************************/
 
-#ifndef REDUCTION_OPERATIONS_IMPL_H_
-#define REDUCTION_OPERATIONS_IMPL_H_
+#ifndef CUDA_REDUCTION_H_
+#define CUDA_REDUCTION_H_
 
+template< typename Operation >
+bool reductionOnCudaDevice( const Operation& operation,
+                            const typename Operation :: IndexType size,
+                            const typename Operation :: RealType* deviceInput1,
+                            const typename Operation :: RealType* deviceInput2,
+                            typename Operation :: ResultType& result );
 
+#include <implementation/core/cuda/cuda-reduction_impl.h>
 
-#endif /* REDUCTION_OPERATIONS_IMPL_H_ */
+#endif /* CUDA_REDUCTION_H_ */
diff --git a/src/core/cuda/reduction-operations.h b/src/core/cuda/reduction-operations.h
index 9a6d8a0c4affd944bc1da8ee10d27bd9b76a215f..acb92a015fdec23b7997e5f84c4887d168444838 100644
--- a/src/core/cuda/reduction-operations.h
+++ b/src/core/cuda/reduction-operations.h
@@ -22,10 +22,6 @@
 #include <cuda.h>
 #include <core/mfuncs.h>
 
-enum tnlTupleOperation {  tnlParallelReductionLpNorm,
-                          tnlParallelReductionSdot };
-
-
 /***
  * This function returns minimum of two numbers stored on the device.
  */
@@ -35,20 +31,20 @@ template< class T > __device__ T tnlCudaMin( const T& a,
    return a < b ? a : b;
 }
 
-__device__ int tnlCudaMin( const int& a,
-                           const int& b )
+__device__ inline int tnlCudaMin( const int& a,
+                                  const int& b )
 {
    return min( a, b );
 }
 
-__device__ float tnlCudaMin( const float& a,
-                             const float& b )
+__device__ inline  float tnlCudaMin( const float& a,
+                                     const float& b )
 {
    return fminf( a, b );
 }
 
-__device__ double tnlCudaMin( const double& a,
-                              const double& b )
+__device__ inline  double tnlCudaMin( const double& a,
+                                      const double& b )
 {
    return fmin( a, b );
 }
@@ -62,20 +58,20 @@ template< class T > __device__ T tnlCudaMax( const T& a,
    return a > b ? a : b;
 }
 
-__device__ int tnlCudaMax( const int& a,
-                           const int& b )
+__device__  inline int tnlCudaMax( const int& a,
+                                   const int& b )
 {
    return max( a, b );
 }
 
-__device__ float tnlCudaMax( const float& a,
-                             const float& b )
+__device__  inline float tnlCudaMax( const float& a,
+                                     const float& b )
 {
    return fmaxf( a, b );
 }
 
-__device__ double tnlCudaMax( const double& a,
-                              const double& b )
+__device__  inline double tnlCudaMax( const double& a,
+                                      const double& b )
 {
    return fmax( a, b );
 }
@@ -83,20 +79,21 @@ __device__ double tnlCudaMax( const double& a,
 /***
  * This function returns absolute value of given number on the device.
  */
-__device__ int tnlCudaAbs( const int& a )
+__device__  inline int tnlCudaAbs( const int& a )
 {
    return abs( a );
 }
 
-__device__ float tnlCudaAbs( const float& a )
+__device__  inline float tnlCudaAbs( const float& a )
 {
    return fabs( a );
 }
 
-__device__ double tnlCudaAbs( const double& a )
+__device__  inline double tnlCudaAbs( const double& a )
 {
    return fabs( a );
 }
+#endif
 
 template< typename Real, typename Index >
 class tnlParallelReductionSum
@@ -106,6 +103,7 @@ class tnlParallelReductionSum
    typedef Real RealType;
    typedef Index IndexType;
    typedef Real ResultType;
+   typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
 
    ResultType initialValueOnHost( const IndexType idx,
                                   const RealType* data1,
@@ -121,7 +119,7 @@ class tnlParallelReductionSum
    {
       return current + data1[ idx ];
    };
-
+#ifdef HAVE_CUDA
    __device__ ResultType initialValueOnDevice( const IndexType idx1,
                                                const IndexType idx2,
                                                const RealType* data1,
@@ -140,7 +138,7 @@ class tnlParallelReductionSum
    __device__ ResultType firstReductionOnDevice( const IndexType idx1,
                                                  const IndexType idx2,
                                                  const IndexType idx3,
-                                                 const RealType* data1,
+                                                 const ResultType* data1,
                                                  const RealType* data2,
                                                  const RealType* data3 ) const
    {
@@ -149,7 +147,7 @@ class tnlParallelReductionSum
 
    __device__ ResultType firstReductionOnDevice( const IndexType idx1,
                                                  const IndexType idx2,
-                                                 const RealType* data1,
+                                                 const ResultType* data1,
                                                  const RealType* data2,
                                                  const RealType* data3 ) const
    {
@@ -162,6 +160,7 @@ class tnlParallelReductionSum
    {
       return data[ idx1 ] + data[ idx2 ];
    };
+#endif
 };
 
 template< typename Real, typename Index >
@@ -171,62 +170,66 @@ class tnlParallelReductionMin
 
    typedef Real RealType;
    typedef Index IndexType;
+   typedef Real ResultType;
+   typedef tnlParallelReductionMin< Real, Index > LaterReductionOperation;
 
-   RealType initialValueOnHost( const IndexType idx,
-                                const RealType* data1,
-                                const RealType* data2 ) const
+   ResultType initialValueOnHost( const IndexType idx,
+                                  const RealType* data1,
+                                  const RealType* data2 ) const
    {
       return data1[ idx ];
    };
 
-   RealType reduceOnHost( const IndexType idx,
-                          const RealType& current,
-                          const RealType* data1,
-                          const RealType* data2 ) const
+   ResultType reduceOnHost( const IndexType idx,
+                            const ResultType& current,
+                            const RealType* data1,
+                            const RealType* data2 ) const
    {
       return Min( current, data1[ idx ] );
    };
 
-   __device__ RealType initialValueOnDevice( const IndexType idx1,
-                                             const IndexType idx2,
-                                             const RealType* data1,
-                                             const RealType* data2 ) const
+#ifdef HAVE_CUDA
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
    {
       return tnlCudaMin( data1[ idx1 ], data1[ idx2 ] );
    }
 
-   __device__ RealType initialValueOnDevice( const IndexType idx1,
-                                             const RealType* data1,
-                                             const RealType* data2 ) const
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
    {
       return data1[ idx1 ];
    };
 
-   __device__ RealType firstReductionOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const IndexType idx3,
-                                               const RealType* data1,
-                                               const RealType* data2,
-                                               const RealType* data3 ) const
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const IndexType idx3,
+                                                 const ResultType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
    {
       return tnlCudaMin( data1[ idx1 ], tnlCudaMin(  data2[ idx2 ],  data2[ idx3 ] ) );
    };
 
-   __device__ RealType firstReductionOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2,
-                                               const RealType* data3 ) const
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const ResultType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
    {
       return tnlCudaMin( data1[ idx1 ], data2[ idx2 ] );
    };
 
-   __device__ RealType commonReductionOnDevice( const IndexType idx1,
-                                                const IndexType idx2,
-                                                const RealType* data ) const
+   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
+                                                  const IndexType idx2,
+                                                  const ResultType* data ) const
    {
       return tnlCudaMin( data[ idx1 ], data[ idx2 ] );
    };
+#endif
 };
 
 template< typename Real, typename Index >
@@ -236,62 +239,66 @@ class tnlParallelReductionMax
 
    typedef Real RealType;
    typedef Index IndexType;
+   typedef Real ResultType;
+   typedef tnlParallelReductionMax< Real, Index > LaterReductionOperation;
 
-   RealType initialValueOnHost( const IndexType idx,
-                                const RealType* data1,
-                                const RealType* data2 ) const
+   ResultType initialValueOnHost( const IndexType idx,
+                                  const RealType* data1,
+                                  const RealType* data2 ) const
    {
       return data1[ idx ];
    };
 
-   RealType reduceOnHost( const IndexType idx,
-                          const RealType& current,
-                          const RealType* data1,
-                          const RealType* data2 ) const
+   ResultType reduceOnHost( const IndexType idx,
+                            const ResultType& current,
+                            const RealType* data1,
+                            const RealType* data2 ) const
    {
       return Max( current, data1[ idx ] );
    };
 
-   __device__ RealType initialValueOnDevice( const IndexType idx1,
-                                             const IndexType idx2,
-                                             const RealType* data1,
-                                             const RealType* data2 ) const
+#ifdef HAVE_CUDA
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
    {
       return tnlCudaMax( data1[ idx1 ], data1[ idx2 ] );
    }
 
-   __device__ RealType initialValueOnDevice( const IndexType idx1,
-                                             const RealType* data1,
-                                             const RealType* data2 ) const
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
    {
       return data1[ idx1 ];
    };
 
-   __device__ RealType firstReductionOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const IndexType idx3,
-                                               const RealType* data1,
-                                               const RealType* data2,
-                                               const RealType* data3 ) const
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const IndexType idx3,
+                                                 const ResultType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
    {
       return tnlCudaMax( data1[ idx1 ], tnlCudaMax( data2[ idx2 ], data2[ idx3 ] ) );
    };
 
-   __device__ RealType firstReductionOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2,
-                                               const RealType* data3 ) const
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const ResultType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
    {
       return tnlCudaMax( data1[ idx1 ], data2[ idx2 ] );
    };
 
-   __device__ RealType commonReductionOnDevice( const IndexType idx1,
-                                                const IndexType idx2,
-                                                const RealType* data ) const
+   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
+                                                  const IndexType idx2,
+                                                  const ResultType* data ) const
    {
       return tnlCudaMax( data[ idx1 ], data[ idx2 ] );
    };
+#endif
 };
 
 template< typename Real, typename Index >
@@ -301,62 +308,66 @@ class tnlParallelReductionAbsSum
 
    typedef Real RealType;
    typedef Index IndexType;
+   typedef Real ResultType;
+   typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
 
-   RealType initialValueOnHost( const IndexType idx,
-                                const RealType* data1,
-                                const RealType* data2 ) const
+   ResultType initialValueOnHost( const IndexType idx,
+                                  const RealType* data1,
+                                  const RealType* data2 ) const
    {
       return tnlAbs( data1[ idx ] );
    };
 
-   RealType reduceOnHost( const IndexType idx,
-                          const RealType& current,
-                          const RealType* data1,
-                          const RealType* data2 ) const
+   ResultType reduceOnHost( const IndexType idx,
+                            const ResultType& current,
+                            const RealType* data1,
+                            const RealType* data2 ) const
    {
       return current + tnlAbs( data1[ idx ] );
    };
 
-   __device__ RealType initialValueOnDevice( const IndexType idx1,
-                                             const IndexType idx2,
-                                             const RealType* data1,
-                                             const RealType* data2 ) const
+#ifdef HAVE_CUDA
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
    {
       return tnlCudaAbs( data1[ idx1 ] ) + tnlCudaAbs( data1[ idx2 ] );
    };
 
-   __device__ RealType initialValueOnDevice( const IndexType idx1,
-                                             const RealType* data1,
-                                             const RealType* data2 ) const
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
    {
       return tnlCudaAbs( data1[ idx1 ] );
    };
 
-   __device__ RealType firstReductionOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const IndexType idx3,
-                                               const RealType* data1,
-                                               const RealType* data2,
-                                               const RealType* data3 ) const
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const IndexType idx3,
+                                                 const ResultType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
    {
       return data1[ idx1 ] + tnlCudaAbs( data2[ idx2 ] ) + tnlCudaAbs( data2[ idx3 ] );
    };
 
-   __device__ RealType firstReductionOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2,
-                                               const RealType* data3 ) const
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const ResultType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
    {
       return data1[ idx1 ] + tnlCudaAbs( data2[ idx2 ] );
    };
 
-   __device__ RealType commonReductionOnDevice( const IndexType idx1,
-                                                const IndexType idx2,
-                                                const RealType* data ) const
+   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
+                                                  const IndexType idx2,
+                                                  const ResultType* data ) const
    {
       return data[ idx1 ] + data[ idx2 ];
    };
+#endif
 };
 
 template< typename Real, typename Index >
@@ -366,62 +377,66 @@ class tnlParallelReductionAbsMin
 
    typedef Real RealType;
    typedef Index IndexType;
+   typedef Real ResultType;
+   typedef tnlParallelReductionMin< Real, Index > LaterReductionOperation;
 
-   RealType initialValueOnHost( const IndexType idx,
-                                const RealType* data1,
-                                const RealType* data2 ) const
+   ResultType initialValueOnHost( const IndexType idx,
+                                  const RealType* data1,
+                                  const RealType* data2 ) const
    {
       return tnlAbs( data1[ idx ] );
    };
 
-   RealType reduceOnHost( const IndexType idx,
-                          const RealType& current,
-                          const RealType* data1,
-                          const RealType* data2 ) const
+   ResultType reduceOnHost( const IndexType idx,
+                            const ResultType& current,
+                            const RealType* data1,
+                            const RealType* data2 ) const
    {
       return Min( current, tnlAbs( data1[ idx ] ) );
    };
 
-   __device__ RealType initialValueOnDevice( const IndexType idx1,
-                                             const IndexType idx2,
-                                             const RealType* data1,
-                                             const RealType* data2 ) const
+#ifdef HAVE_CUDA
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
    {
       return tnlCudaMin( tnlCudaAbs( data1[ idx1 ] ), tnlCudaAbs( data1[ idx2 ] ) );
    }
 
-   __device__ RealType initialValueOnDevice( const IndexType idx1,
-                                             const RealType* data1,
-                                             const RealType* data2 ) const
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
    {
       return tnlCudaAbs( data1[ idx1 ] );
    };
 
-   __device__ RealType firstReductionOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const IndexType idx3,
-                                               const RealType* data1,
-                                               const RealType* data2,
-                                               const RealType* data3 ) const
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const IndexType idx3,
+                                                 const ResultType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
    {
       return tnlCudaMin( data1[ idx1 ], tnlCudaMin(  tnlCudaAbs( data2[ idx2 ] ),  tnlCudaAbs( data2[ idx3 ] ) ) );
    };
 
-   __device__ RealType firstReductionOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2,
-                                               const RealType* data3 ) const
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const ResultType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
    {
       return tnlCudaMin( data1[ idx1 ], tnlCudaAbs( data2[ idx2 ] ) );
    };
 
-   __device__ RealType commonReductionOnDevice( const IndexType idx1,
-                                                const IndexType idx2,
-                                                const RealType* data ) const
+   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
+                                                  const IndexType idx2,
+                                                  const ResultType* data ) const
    {
       return tnlCudaMin( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
    };
+#endif
 };
 
 template< typename Real, typename Index >
@@ -431,67 +446,498 @@ class tnlParallelReductionAbsMax
 
    typedef Real RealType;
    typedef Index IndexType;
+   typedef Real ResultType;
+   typedef tnlParallelReductionMax< Real, Index > LaterReductionOperation;
 
-   RealType initialValueOnHost( const IndexType idx,
-                                const RealType* data1,
-                                const RealType* data2 ) const
+   ResultType initialValueOnHost( const IndexType idx,
+                                  const RealType* data1,
+                                  const RealType* data2 ) const
    {
       return tnlAbs( data1[ idx ] );
    };
 
-   RealType reduceOnHost( const IndexType idx,
-                          const RealType& current,
-                          const RealType* data1,
-                          const RealType* data2 ) const
+   ResultType reduceOnHost( const IndexType idx,
+                            const ResultType& current,
+                            const RealType* data1,
+                            const RealType* data2 ) const
    {
       return Max( current, tnlAbs( data1[ idx ] ) );
    };
 
-   __device__ RealType initialValueOnDevice( const IndexType idx1,
-                                             const IndexType idx2,
-                                             const RealType* data1,
-                                             const RealType* data2 ) const
+#ifdef HAVE_CUDA
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
    {
       return tnlCudaMax( tnlCudaAbs( data1[ idx1 ] ), tnlCudaAbs( data1[ idx2 ] ) );
    }
 
-   __device__ RealType initialValueOnDevice( const IndexType idx1,
-                                             const RealType* data1,
-                                             const RealType* data2 ) const
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
    {
       return tnlCudaAbs( data1[ idx1 ] );
    };
 
-   __device__ RealType firstReductionOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const IndexType idx3,
-                                               const RealType* data1,
-                                               const RealType* data2,
-                                               const RealType* data3 ) const
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const IndexType idx3,
+                                                 const ResultType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
    {
       return tnlCudaMax( data1[ idx1 ], tnlCudaMax( tnlCudaAbs( data2[ idx2 ] ), tnlCudaAbs( data2[ idx3 ] ) ) );
    };
 
-   __device__ RealType firstReductionOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2,
-                                               const RealType* data3 ) const
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const ResultType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
    {
       return tnlCudaMax( data1[ idx1 ], tnlCudaAbs( data2[ idx2 ] ) );
    };
 
-   __device__ RealType commonReductionOnDevice( const IndexType idx1,
-                                                const IndexType idx2,
-                                                const RealType* data ) const
+   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
+                                                  const IndexType idx2,
+                                                  const ResultType* data ) const
    {
       return tnlCudaMax( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
    };
+#endif
 };
 
+template< typename Real, typename Index >
+class tnlParallelReductionLogicalAnd
+{
+   public:
+
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef Real ResultType;
+   typedef tnlParallelReductionLogicalAnd< Real, Index > LaterReductionOperation;
+
+   ResultType initialValueOnHost( const IndexType idx,
+                                  const RealType* data1,
+                                  const RealType* data2 ) const
+   {
+      return data1[ idx ];
+   };
+
+   ResultType reduceOnHost( const IndexType idx,
+                            const ResultType& current,
+                            const RealType* data1,
+                            const RealType* data2 ) const
+   {
+      return current && data1[ idx ];
+   };
+
+#ifdef HAVE_CUDA
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
+   {
+      return data1[ idx1 ] && data1[ idx2 ];
+   }
+
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
+   {
+      return data1[ idx1 ];
+   };
+
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const IndexType idx3,
+                                                 const ResultType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
+   {
+      return data1[ idx1 ] && data2[ idx2 ] && data2[ idx3 ];
+   };
 
-#include <implementation/core/cuda/reduction-operations_impl.h>
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const ResultType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
+   {
+      return data1[ idx1 ] && data2[ idx2 ];
+   };
 
+   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
+                                                  const IndexType idx2,
+                                                  const ResultType* data ) const
+   {
+      return data[ idx1 ] && data[ idx2 ];
+   };
 #endif
+};
+
+
+template< typename Real, typename Index >
+class tnlParallelReductionLogicalOr
+{
+   public:
+
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef Real ResultType;
+   typedef tnlParallelReductionLogicalOr< Real, Index > LaterReductionOperation;
+
+   ResultType initialValueOnHost( const IndexType idx,
+                                  const RealType* data1,
+                                  const RealType* data2 ) const
+   {
+      return data1[ idx ];
+   };
+
+   ResultType reduceOnHost( const IndexType idx,
+                            const ResultType& current,
+                            const RealType* data1,
+                            const RealType* data2 ) const
+   {
+      return current || data1[ idx ];
+   };
+
+#ifdef HAVE_CUDA
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
+   {
+      return data1[ idx1 ] || data1[ idx2 ];
+   }
+
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
+   {
+      return data1[ idx1 ];
+   };
+
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const IndexType idx3,
+                                                 const ResultType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
+   {
+      return data1[ idx1 ] || data2[ idx2 ] || data2[ idx3 ];
+   };
+
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const ResultType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
+   {
+      return data1[ idx1 ] || data2[ idx2 ];
+   };
+
+   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
+                                                  const IndexType idx2,
+                                                  const ResultType* data ) const
+   {
+      return data[ idx1 ] || data[ idx2 ];
+   };
+#endif
+};
+
+template< typename Real, typename Index >
+class tnlParallelReductionLpNorm
+{
+   public:
+
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef Real ResultType;
+   typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
+
+   void setPower( const RealType& p )
+   { // Stores the exponent p that the pow() calls in this class read; no other code in this class assigns it.
+      this -> p = p;
+   };
+
+   ResultType initialValueOnHost( const IndexType idx,
+                                  const RealType* data1,
+                                  const RealType* data2 ) const
+   {
+      return pow( tnlAbs( data1[ idx ] ), p );
+   };
+
+   ResultType reduceOnHost( const IndexType idx,
+                            const ResultType& current,
+                            const RealType* data1,
+                            const RealType* data2 ) const
+   {
+      return current + pow( tnlAbs( data1[ idx ] ), p );
+   };
+
+#ifdef HAVE_CUDA
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
+   {
+      return pow( tnlCudaAbs( data1[ idx1 ] ), p ) + pow( tnlCudaAbs( data1[ idx2 ] ), p );
+   }
+
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
+   {
+      return pow( tnlCudaAbs( data1[ idx1 ] ), p );
+   };
+
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const IndexType idx3,
+                                                 const ResultType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
+   {
+      return data1[ idx1 ] +
+             pow( tnlCudaAbs( data2[ idx2 ] ), p ) +
+             pow( tnlCudaAbs( data2[ idx3 ] ), p );
+   };
+
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const ResultType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
+   {
+      return data1[ idx1 ] + pow( tnlCudaAbs( data2[ idx2 ] ), p );
+   };
+
+   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
+                                                  const IndexType idx2,
+                                                  const ResultType* data ) const
+   {
+      return data[ idx1 ] + data[ idx2 ];
+   };
+#endif
+
+   protected:
+
+   RealType p;
+};
+
+template< typename Real, typename Index >
+class tnlParallelReductionEqualities
+{
+   public:
+
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef bool ResultType;
+   typedef tnlParallelReductionLogicalAnd< bool, Index > LaterReductionOperation;
+
+   ResultType initialValueOnHost( const IndexType idx,
+                                  const RealType* data1,
+                                  const RealType* data2 ) const
+   {
+      return  ( data1[ idx ] == data2[ idx ] );
+   };
+
+   ResultType reduceOnHost( const IndexType idx,
+                            const ResultType& current,
+                            const RealType* data1,
+                            const RealType* data2 ) const
+   {
+      return current && ( data1[ idx ] == data2[ idx ] );
+   };
+
+#ifdef HAVE_CUDA
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
+   {
+      return ( data1[ idx1 ] == data2[ idx1 ] ) && ( data1[ idx2 ] == data2[ idx2] );
+   }
+
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
+   {
+      return ( data1[ idx1 ]== data2[ idx1 ] );
+   };
+
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const IndexType idx3,
+                                                 const ResultType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
+   { // ANDs data1[ idx1 ] with the comparisons of data2 against data3 at idx2 and at idx3.
+      return data1[ idx1 ] &&
+             ( data2[ idx2 ] == data3[ idx2 ] ) &&   // pair data2 with data3, as the two-index overload does
+             ( data2[ idx3 ] == data3[ idx3 ] );
+   };
+
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const ResultType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
+   {
+      return data1[ idx1 ] && ( data2[ idx2 ] == data3[ idx2 ] );
+   };
+
+   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
+                                                  const IndexType idx2,
+                                                  const ResultType* data ) const
+   {
+      return data[ idx1 ] && data[ idx2 ];
+   };
+#endif
+};
+
+template< typename Real, typename Index >
+class tnlParallelReductionInequalities
+{
+   public:
+
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef bool ResultType;
+   typedef tnlParallelReductionLogicalAnd< bool, Index > LaterReductionOperation;
+
+   ResultType initialValueOnHost( const IndexType idx,
+                                  const RealType* data1,
+                                  const RealType* data2 ) const
+   {
+      return  ( data1[ idx ] != data2[ idx ] );
+   };
+
+   ResultType reduceOnHost( const IndexType idx,
+                            const ResultType& current,
+                            const RealType* data1,
+                            const RealType* data2 ) const
+   { // Result stays true only while every pair seen so far differs (logical AND of the != tests).
+      return current && ( data1[ idx ] != data2[ idx ] );
+   };
+
+#ifdef HAVE_CUDA
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
+   {
+      return ( data1[ idx1 ] != data2[ idx1 ] ) && ( data1[ idx2 ] != data2[ idx2] );
+   }
+
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
+   {
+      return ( data1[ idx1 ] != data2[ idx1 ] );
+   };
+
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const IndexType idx3,
+                                                 const ResultType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
+   { // ANDs data1[ idx1 ] with the tests data2 != data3 evaluated at idx2 and at idx3.
+      return data1[ idx1 ] &&
+             ( data2[ idx2 ] != data3[ idx2 ] ) &&   // pair data2 with data3, as the two-index overload does
+             ( data2[ idx3 ] != data3[ idx3 ] );
+   };
+
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const ResultType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
+   {
+      return data1[ idx1 ] && ( data2[ idx2 ] != data3[ idx2 ] );
+   };
+
+   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
+                                                  const IndexType idx2,
+                                                  const ResultType* data ) const
+   {
+      return data[ idx1 ] && data[ idx2 ];
+   };
+#endif
+};
+
+template< typename Real, typename Index >
+class tnlParallelReductionSdot
+{
+   public:
+
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef Real ResultType;
+   typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
+
+   ResultType initialValueOnHost( const IndexType idx,
+                                  const RealType* data1,
+                                  const RealType* data2 ) const
+   {
+      return  data1[ idx ] * data2[ idx ];
+   };
+
+   ResultType reduceOnHost( const IndexType idx,
+                            const ResultType& current,
+                            const RealType* data1,
+                            const RealType* data2 ) const
+   {
+      return current + ( data1[ idx ] * data2[ idx ] );
+   };
+
+#ifdef HAVE_CUDA
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
+   {
+      return ( data1[ idx1 ] * data2[ idx1 ] ) + ( data1[ idx2 ] * data2[ idx2] );
+   }
+
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
+   {
+      return ( data1[ idx1 ] * data2[ idx1 ] );
+   };
+
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const IndexType idx3,
+                                                 const ResultType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
+   { // Adds to data1[ idx1 ] the products data2 * data3 taken at idx2 and at idx3.
+      return data1[ idx1 ] +
+             ( data2[ idx2 ] * data3[ idx2 ] ) +   // pair data2 with data3, as the two-index overload does
+             ( data2[ idx3 ] * data3[ idx3 ] );
+   };
+
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const ResultType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
+   {
+      return data1[ idx1 ] + ( data2[ idx2 ] * data3[ idx2 ] );
+   };
+
+   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
+                                                  const IndexType idx2,
+                                                  const ResultType* data ) const
+   {
+      return data[ idx1 ] + data[ idx2 ];
+   };
+#endif
+};
 
 #endif /* REDUCTION_OPERATIONS_H_ */
diff --git a/src/core/cuda/reduction.h b/src/core/cuda/reduction.h
deleted file mode 100644
index 13b850cf00f8bf597f01550dc2199dafd809b6c1..0000000000000000000000000000000000000000
--- a/src/core/cuda/reduction.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/***************************************************************************
-                          cuda-long-vector-kernels.h  -  description
-                             -------------------
-    begin                : Oct 28, 2010
-    copyright            : (C) 2010 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef CUDALONGVECTORKERNELS_H_
-#define CUDALONGVECTORKERNELS_H_
-
-#ifdef HAVE_CUDA
-#include <cuda.h>
-#endif
-#include <iostream>
-
-/***
- * The template calling the final CUDA kernel for the single vector reduction.
- * The template parameters are:
- * @param T is the type of data we want to reduce
- * @param operation is the operation reducing the data.
- *        It can be tnlParallelReductionSum, tnlParallelReductionMin or tnlParallelReductionMax.
- * The function parameters:
- * @param size tells number of elements in the data array.
- * @param deviceInput1 is the pointer to an array storing the data we want
- *        to reduce. This array must stay on the device!.
- * @param deviceInput2 is the pointer to an array storing the coupling data for example
- *        the second vector for the SDOT operation. This array must stay on the device!.
- * @param result will contain the result of the reduction if everything was ok
- *        and the return code is true.
- * @param parameter can be used for example for the passing the parameter p of Lp norm.
- * @param deviceAux is auxiliary array used to store temporary data during the reduction.
- *        If one calls this function more then once one might provide this array to avoid repetetive
- *        allocation of this array on the device inside of this function.
- *        The size of this array should be size / 128 * sizeof( T ).
- */
-template< typename Type, typename ParameterType, typename Index, tnlTupleOperation operation >
-bool tnlCUDALongVectorReduction( const Index size,
-                                 const Type* deviceInput1,
-                                 const Type* deviceInput2,
-                                 Type& result,
-                                 const ParameterType& parameter,
-                                 Type* deviceAux = 0 );
-#endif /* CUDALONGVECTORKERNELS_H_ */
diff --git a/src/implementation/CMakeLists.txt b/src/implementation/CMakeLists.txt
index c95a03737c8a40e33eb852bcf8e6a70e082c7fef..f22c95fa29d86b57e588a82339b72d71e5850194 100755
--- a/src/implementation/CMakeLists.txt
+++ b/src/implementation/CMakeLists.txt
@@ -5,6 +5,14 @@ ADD_SUBDIRECTORY( solvers )
 
 SET( headers  )
 
+if( BUILD_CUDA )  # CUDA builds only: pass the merged CUDA source list up to the parent directory
+   set( tnl_implementation_CUDA__SOURCES
+        ${tnl_implementation_core_CUDA__SOURCES}
+        ${tnl_implementation_mesh_CUDA__SOURCES}
+        ${tnl_implementation_solvers_CUDA__SOURCES}
+        PARENT_SCOPE )
+endif()
+
 set( tnl_implementation_SOURCES 
      ${tnl_implementation_core_SOURCES}
      ${tnl_implementation_mesh_SOURCES}
diff --git a/src/implementation/core/CMakeLists.txt b/src/implementation/core/CMakeLists.txt
index d137f42adec988c5bbd11f1b9b3dfacac6c17d9b..2ad45839048ae18618d506adb3a25d07e6572d70 100755
--- a/src/implementation/core/CMakeLists.txt
+++ b/src/implementation/core/CMakeLists.txt
@@ -1,7 +1,6 @@
 ADD_SUBDIRECTORY( cuda )
 
-SET( headers cuda-long-vector-kernels.h
-             vector-operations.h
+SET( headers vector-operations.h
              memory-operations.h
              tnlArray_impl.h
              tnlHost_impl.h
@@ -19,34 +18,39 @@ SET( headers cuda-long-vector-kernels.h
              tnlVector_impl.h )
 
 SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/implementation/core )
+set( common_SOURCES  # sources listed in both the CUDA and the non-CUDA source sets below
+     ${CURRENT_DIR}/tnlTimerRT.cpp
+     ${CURRENT_DIR}/tnlFile.cpp
+     ${CURRENT_DIR}/tnlFlopsCounter.cpp 
+     ${CURRENT_DIR}/tnlLogger.cpp 
+     ${CURRENT_DIR}/tnlObject.cpp 
+     ${CURRENT_DIR}/tnlStatistics.cpp
+     ${CURRENT_DIR}/tnlString.cpp 
+     ${CURRENT_DIR}/tnlTimerCPU.cpp      
+     ${CURRENT_DIR}/mfilename.cpp 
+     ${CURRENT_DIR}/mpi-supp.cpp
+     ${CURRENT_DIR}/tnlSharedArray_impl.cpp
+     ${CURRENT_DIR}/tnlMultiArray_impl.cpp
+     ${CURRENT_DIR}/tnlMultiVector_impl.cpp
+     ${CURRENT_DIR}/tnlSharedVector_impl.cpp
+     ${CURRENT_DIR}/tnlVector_impl.cpp )     # NOTE(review): the CUDA list also adds tnlVector_impl.cu - confirm the two files do not define the same symbols
+
 IF( BUILD_CUDA )
    set( tnl_implementation_core_CUDA__SOURCES
         ${tnl_implementation_core_cuda_CUDA__SOURCES}
+        ${common_SOURCES}
+        ${CURRENT_DIR}/memory-operations_impl.cu
         ${CURRENT_DIR}/tnlArray_impl.cu
         ${CURRENT_DIR}/tnlVector_impl.cu 
         PARENT_SCOPE )
 ENDIF()    
-set( tnl_implementation_core_SOURCES
-     ${tnl_implementation_core_CUDA__SOURCES}
+set( tnl_implementation_core_SOURCES     
      ${tnl_implementation_core_cuda_SOURCES}
+     ${common_SOURCES}
+     ${CURRENT_DIR}/memory-operations_impl.cpp
      ${CURRENT_DIR}/tnlArray_impl.cpp
      ${CURRENT_DIR}/tnlHost_impl.cpp
-     ${CURRENT_DIR}/tnlSharedArray_impl.cpp
-     ${CURRENT_DIR}/tnlMultiArray_impl.cpp
-     ${CURRENT_DIR}/tnlMultiVector_impl.cpp
-     ${CURRENT_DIR}/tnlSharedVector_impl.cpp
-     ${CURRENT_DIR}/tnlVector_impl.cpp
-     ${CURRENT_DIR}/tnlTimerRT.cpp
-     ${CURRENT_DIR}/tnlFile.cpp
-     ${CURRENT_DIR}/tnlFlopsCounter.cpp 
-     ${CURRENT_DIR}/tnlLogger.cpp 
-     ${CURRENT_DIR}/tnlObject.cpp 
-     ${CURRENT_DIR}/tnlStatistics.cpp
-     ${CURRENT_DIR}/tnlString.cpp 
-     ${CURRENT_DIR}/tnlTimerCPU.cpp      
-     ${CURRENT_DIR}/mfilename.cpp 
-     ${CURRENT_DIR}/mpi-supp.cpp     
-    PARENT_SCOPE )
+     PARENT_SCOPE )
     
     
         
diff --git a/src/implementation/core/cuda-long-vector-kernels.h b/src/implementation/core/cuda-long-vector-kernels.h
deleted file mode 100644
index 41c2cc80394db884509af6e83397aac38f25bc47..0000000000000000000000000000000000000000
--- a/src/implementation/core/cuda-long-vector-kernels.h
+++ /dev/null
@@ -1,587 +0,0 @@
-/***************************************************************************
-                          cuda-long-vector-kernels.h  -  description
-                             -------------------
-    begin                : Oct 28, 2010
-    copyright            : (C) 2010 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef CUDALONGVECTORKERNELS_H_
-#define CUDALONGVECTORKERNELS_H_
-
-#ifdef HAVE_CUDA
-#include <cuda.h>
-#endif
-#include <iostream>
-#include <core/tnlAssert.h>
-#include <core/cuda/reduction-operations.h>
-#include <implementation/core/memory-operations.h>
-
-using namespace std;
-
-
-/****
- * This constant says that arrays smaller than its value
- * are going to be reduced on CPU.
- */
-const int maxGPUReductionDataSize = 256;
-
-#ifdef HAVE_CUDA
-
-
-/***
- * For each thread in block with thread ID smaller then s this function reduces
- * data elements with indecis tid and tid + s. Here we assume that for each
- * tid the tid + s element also exists i.e. we have even number of elements.
- */
-template< typename Operation >
-__device__ void reduceAligned( const Operation& operation,
-                               typename Operation :: IndexType tid,
-                               typename Operation :: IndexType  s,
-                               typename Operation :: RealType* sdata )
-{
-   if( tid < s )
-   {
-      sdata[ tid ] = operation. commonReductionOnDevice( tid, tid + s, sdata );
-      /*if( operation == tnlParallelReductionAbsMin )
-         sdata[ tid ] = tnlCudaMin( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) );
-      if( operation == tnlParallelReductionAbsMax )
-         sdata[ tid ] = tnlCudaMax( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) );
-      if( operation == tnlParallelReductionLpNorm ||
-          operation == tnlParallelReductionSdot )
-         sdata[ tid ] = sdata[ tid ] + sdata[ tid + s ];*/
-   }
-}
-
-/***
- * For each thread in block with thread ID smaller then s this function reduces
- * data elements with indices tid and tid + s. This is a modified version of
- * the previous algorithm. Thid one works even for odd number of elements but
- * it is a bit slower.
- */
-template< typename Operation >
-__device__ void reduceNonAligned( const Operation& operation,
-                                  typename Operation :: IndexType tid,
-                                  typename Operation :: IndexType s,
-                                  typename Operation :: IndexType n,
-                                  typename Operation :: RealType* sdata )
-{
-   if( tid < s )
-   {
-      sdata[ tid ] = operation. commonReductionOnDevice( tid, tid + s, sdata );
-      /*if( operation == tnlParallelReductionAbsMin )
-         sdata[ tid ] = tnlCudaMin( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) );
-      if( operation == tnlParallelReductionAbsMax )
-         sdata[ tid ] = tnlCudaMax( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) );
-      if( operation == tnlParallelReductionLpNorm ||
-          operation == tnlParallelReductionSdot )
-         sdata[ tid ] = sdata[ tid ] + sdata[ tid + s ];*/
-   }
-   /* This is for the case when we have odd number of elements.
-    * The last one will be reduced using the thread with ID 0.
-    */
-   if( s > 32 )
-      __syncthreads();
-   if( 2 * s < n && tid == n - 1 )
-   {
-      sdata[ 0 ] = operation. commonReductionOnDevice( 0, tid, sdata );
-      /*if( operation == tnlParallelReductionAbsMin )
-         sdata[ 0 ] = tnlCudaMin( tnlCudaAbs( sdata[ 0] ), tnlCudaAbs( sdata[ tid + s ] ) );
-      if( operation == tnlParallelReductionAbsMax )
-         sdata[ 0 ] = tnlCudaMax( tnlCudaAbs( sdata[ 0 ] ), tnlCudaAbs( sdata[ tid + s ] ) );
-      if( operation == tnlParallelReductionLpNorm ||
-          operation == tnlParallelReductionSdot )
-         sdata[ 0 ] = sdata[ 0 ] + sdata[ tid + s ];*/
-
-   }
-}
-
-/***
- * The parallel reduction of one vector.
- *
- * WARNING: This kernel only reduce data in one block. Use rather tnlCUDASimpleReduction2
- *          to call this kernel then doing it by yourself.
- *          This kernel is very inefficient. It is here only for educative and testing reasons.
- *          Please use tnlCUDAReduction instead.
- *
- * The kernel parameters:
- * @param size is the number of all element to reduce - not just in one block.
- * @param deviceInput input data which we want to reduce
- * @param deviceOutput an array to which we write the result of reduction.
- *                     Each block of the grid writes one element in this array
- *                     (i.e. the size of this array equals the number of CUDA blocks).
- */
-template < typename Operation, int blockSize >
-__global__ void tnlCUDAReductionKernel( const Operation operation,
-                                        const typename Operation :: IndexType size,
-                                        const typename Operation :: RealType* deviceInput,
-                                        const typename Operation :: RealType* deviceInput2,
-                                        typename Operation :: RealType* deviceOutput )
-{
-   extern __shared__ __align__ ( 8 ) char __sdata[];
-   
-   typedef typename Operation :: IndexType IndexType;
-   typedef typename Operation :: RealType RealType;
-   RealType* sdata = reinterpret_cast< RealType* >( __sdata );
-
-   /***
-    * Get thread id (tid) and global thread id (gid).
-    * lastTId is the last relevant thread id in this block.
-    * gridSize is the number of element processed by all blocks at the
-    * same time.
-    */
-   IndexType tid = threadIdx. x;
-   IndexType gid = 2 * blockIdx. x * blockDim. x + threadIdx. x;
-   IndexType lastTId = size - 2 * blockIdx. x * blockDim. x;
-   IndexType gridSize = 2 * blockDim. x * gridDim.x;
-
-   /***
-    * Read data into the shared memory. We start with the
-    * sequential reduction.
-    */
-   if( gid + blockDim. x < size )
-   {
-      sdata[ tid ] = operation. initialValueOnDevice( gid, gid + blockDim. x, deviceInput, deviceInput2 );
-      /*if( operation == tnlParallelReductionMin )
-         sdata[ tid ] = tnlCudaMin( deviceInput[ gid ], deviceInput[ gid + blockDim. x ] );
-      if( operation == tnlParallelReductionMax )
-         sdata[ tid ] = tnlCudaMax( deviceInput[ gid ], deviceInput[ gid + blockDim. x ] );
-      if( operation == tnlParallelReductionAbsMin )
-         sdata[ tid ] = tnlCudaMin( tnlCudaAbs( deviceInput[ gid ] ), tnlCudaAbs( deviceInput[ gid + blockDim. x ] ) );
-      if( operation == tnlParallelReductionAbsMax )
-         sdata[ tid ] = tnlCudaMax( tnlCudaAbs( deviceInput[ gid ] ), tnlCudaAbs( deviceInput[ gid + blockDim. x ] ) );
-      if( operation == tnlParallelReductionSum )
-         sdata[ tid ] = deviceInput[ gid ] + deviceInput[ gid + blockDim. x ];
-      if( operation == tnlParallelReductionLpNorm )
-         sdata[ tid ] = powf( tnlCudaAbs( deviceInput[ gid ] ), parameter ) +
-                        powf( tnlCudaAbs( deviceInput[ gid + blockDim. x ] ), parameter );
-      if( operation == tnlParallelReductionSdot )
-         sdata[ tid ] = deviceInput[ gid ] * deviceInput2[ gid ] +
-                        deviceInput[ gid + blockDim. x ] * deviceInput2[ gid + blockDim. x ];*/
-   }
-   else if( gid < size )
-   {
-      sdata[ tid ] = operation. initialValueOnDevice( gid, deviceInput, deviceInput2 );
-      /*if( operation == tnlParallelReductionLpNorm )
-         sdata[ tid ] = powf( tnlCudaAbs( deviceInput[ gid ] ), parameter );
-      else
-         if( operation == tnlParallelReductionSdot )
-            sdata[ tid ] = deviceInput[ gid ] * deviceInput2[ gid ];
-         else
-            sdata[ tid ] = deviceInput[ gid ];*/
-   }
-   gid += gridSize;
-   while( gid + blockDim. x < size )
-   {
-      sdata[ tid ] = operation. firstReductionOnDevice( tid, gid, gid + blockDim. x, sdata, deviceInput, deviceInput2 );
-      /*if( operation == tnlParallelReductionMin )
-         sdata[ tid ] = :: tnlCudaMin( sdata[ tid ], :: tnlCudaMin( deviceInput[ gid ], deviceInput[ gid + blockDim. x ] ) );
-      if( operation == tnlParallelReductionMax )
-         sdata[ tid ] = :: tnlCudaMax( sdata[ tid ], :: tnlCudaMax( deviceInput[ gid ], deviceInput[ gid + blockDim. x ] ) );
-      if( operation == tnlParallelReductionAbsMin )
-         sdata[ tid ] = :: tnlCudaMin( tnlCudaAbs( sdata[ tid ] ), :: tnlCudaMin( tnlCudaAbs( deviceInput[ gid ] ), tnlCudaAbs( deviceInput[ gid + blockDim. x ] ) ) );
-      if( operation == tnlParallelReductionAbsMax )
-         sdata[ tid ] = :: tnlCudaMax( tnlCudaAbs( sdata[ tid ] ), :: tnlCudaMax( tnlCudaAbs( deviceInput[ gid ] ), tnlCudaAbs( deviceInput[ gid + blockDim. x ] ) ) );
-      if( operation == tnlParallelReductionSum )
-         sdata[ tid ] += deviceInput[gid] + deviceInput[ gid + blockDim. x ];
-      if( operation == tnlParallelReductionLpNorm )
-         sdata[ tid ] += powf( tnlCudaAbs( deviceInput[gid] ), parameter ) +
-                         powf( tnlCudaAbs( deviceInput[ gid + blockDim. x ] ), parameter );
-      if( operation == tnlParallelReductionSdot )
-         sdata[ tid ] += deviceInput[ gid ] * deviceInput2[ gid ] +
-                         deviceInput[ gid + blockDim. x] * deviceInput2[ gid + blockDim. x ];*/
-      gid += gridSize;
-   }
-   if( gid < size )
-   {
-      sdata[ tid ] = operation. firstReductionOnDevice( tid, gid, sdata, deviceInput, deviceInput2 );
-      /*if( operation == tnlParallelReductionMin )
-         sdata[ tid ] = :: tnlCudaMin( sdata[ tid ], deviceInput[ gid ] );
-      if( operation == tnlParallelReductionMax )
-         sdata[ tid ] = :: tnlCudaMax( sdata[ tid ], deviceInput[ gid ] );
-      if( operation == tnlParallelReductionAbsMin )
-         sdata[ tid ] = :: tnlCudaMin( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( deviceInput[ gid ] ) );
-      if( operation == tnlParallelReductionAbsMax )
-         sdata[ tid ] = :: tnlCudaMax( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( deviceInput[ gid ] ) );
-      if( operation == tnlParallelReductionSum )
-         sdata[ tid ] += deviceInput[gid];
-      if( operation == tnlParallelReductionLpNorm )
-         sdata[ tid ] += powf( tnlCudaAbs( deviceInput[gid] ), parameter );
-      if( operation == tnlParallelReductionSdot )
-         sdata[ tid ] += deviceInput[ gid ] * deviceInput2[ gid ];*/
-   }
-   __syncthreads();
-
-
-   /***
-    *  Process the parallel reduction.
-    *  We reduce the data with step s which is one half of the elements to reduce.
-    *  Each thread with ID < s reduce elements tid and tid + s. The result is stored
-    *  in shared memory in sdata 0 .. s. We set s = s / 2 ( i.e. s >>= 1) and repeat
-    *  the algorithm again until s = 1.
-    *  We also separate the case when the blockDim. x is power of 2 and the algorithm
-    *  can be written in more efficient way without some conditions.
-    */
-   unsigned int n = lastTId < blockDim. x ? lastTId : blockDim. x;
-   if( n == 128 || n ==  64 || n ==  32 || n ==  16 ||
-       n ==   8 || n ==   4 || n ==   2 || n == 256 ||
-       n == 512 )
-   {
-      if( blockSize >= 512 )
-      {
-         if( tid < 256 )
-            reduceAligned( operation, tid, 256, sdata );
-         __syncthreads();
-      }
-      if( blockSize >= 256 )
-      {
-         if( tid < 128 )
-            reduceAligned( operation, tid, 128, sdata );
-         __syncthreads();
-      }
-      if( blockSize >= 128 )
-      {
-         if( tid <  64 )
-            reduceAligned( operation, tid, 64, sdata );
-         __syncthreads();
-      }
-
-      /***
-       * This runs in one warp so it is synchronised implicitly.
-       */
-      if (tid < 32)
-      {
-         if( blockSize >= 64 )
-            reduceAligned( operation, tid, 32, sdata );
-         if( blockSize >= 32 )
-            reduceAligned( operation, tid, 16, sdata );
-         if( blockSize >= 16 )
-            reduceAligned( operation, tid,  8, sdata );
-         if( blockSize >=  8 )
-            reduceAligned( operation, tid,  4, sdata );
-         if( blockSize >=  4 )
-            reduceAligned( operation, tid,  2, sdata );
-         if( blockSize >=  2 )
-            reduceAligned( operation, tid,  1, sdata );
-      }
-   }
-   else
-   {
-      unsigned int s;
-      if( n >= 512 )
-      {
-         s = n / 2;
-         reduceNonAligned( operation, tid, s, n, sdata );
-         n = s;
-         __syncthreads();
-      }
-      if( n >= 256 )
-      {
-         s = n / 2;
-         reduceNonAligned( operation, tid, s, n, sdata );
-         n = s;
-         __syncthreads();
-      }
-      if( n >= 128 )
-      {
-         s = n / 2;
-         reduceNonAligned( operation, tid, s, n, sdata );
-         n = s;
-         __syncthreads();
-      }
-      if( n >= 64 )
-      {
-         s = n / 2;
-         reduceNonAligned( operation, tid, s, n, sdata );
-         n = s;
-         __syncthreads();
-      }
-      if( n >= 32 )
-      {
-         s = n / 2;
-         reduceNonAligned( operation, tid, s, n, sdata );
-         n = s;
-         __syncthreads();
-      }
-      /***
-       * This runs in one warp so it is synchronised implicitly.
-       */
-      if( n >= 16 )
-      {
-         s = n / 2;
-         reduceNonAligned( operation, tid, s, n, sdata );
-         n = s;
-      }
-      if( n >= 8 )
-      {
-         s = n / 2;
-         reduceNonAligned( operation, tid, s, n, sdata );
-         n = s;
-      }
-      if( n >= 4 )
-      {
-         s = n / 2;
-         reduceNonAligned( operation, tid, s, n, sdata );
-         n = s;
-      }
-      if( n >= 2 )
-      {
-         s = n / 2;
-         reduceNonAligned( operation, tid, s, n, sdata );
-         n = s;
-      }
-   }
-
-   /***
-    * Store the result back in the global memory.
-    */
-   if( tid == 0 )
-      deviceOutput[ blockIdx. x ] = sdata[ 0 ];
-}
-
-template< typename Operation >
-typename Operation :: IndexType reduceOnCudaDevice( const Operation& operation,
-                                                    const typename Operation :: IndexType size,
-                                                    const typename Operation :: RealType* input1,
-                                                    const typename Operation :: RealType* input2,
-                                                    typename Operation :: RealType*& output)
-{
-   typedef typename Operation :: IndexType IndexType;
-   typedef typename Operation :: RealType RealType;
-
-   const int desBlockSize = 512;
-   const int desGridSize = 2048;
-   dim3 blockSize( 0 ), gridSize( 0 );
-
-   /***
-    * Compute the CUDA block size aligned to the power of two.
-    */
-   blockSize. x = :: Min( size, desBlockSize );
-   IndexType alignedBlockSize = 1;
-   while( alignedBlockSize < blockSize. x ) alignedBlockSize <<= 1;
-   blockSize. x = alignedBlockSize;
-
-   gridSize. x = Min( ( IndexType ) ( size / blockSize. x + 1 ) / 2, desGridSize );
-
-   if( ! output &&
-       ! allocateMemoryCuda( output, :: Max( 1, size / desBlockSize ) ) )
-         return false;
-
-   IndexType shmem = blockSize. x * sizeof( RealType );
-   /***
-    * Depending on the blockSize we generate appropriate template instance.
-    */
-      switch( blockSize. x )
-      {
-         case 512:
-            tnlCUDAReductionKernel< Operation, 512 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 256:
-            tnlCUDAReductionKernel< Operation, 256 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 128:
-            tnlCUDAReductionKernel< Operation, 128 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case  64:
-            tnlCUDAReductionKernel< Operation,  64 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case  32:
-            tnlCUDAReductionKernel< Operation,  32 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case  16:
-            tnlCUDAReductionKernel< Operation,  16 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case   8:
-            tnlCUDAReductionKernel< Operation,   8 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case   4:
-            tnlCUDAReductionKernel< Operation,   4 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case   2:
-            tnlCUDAReductionKernel< Operation,   2 >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case   1:
-            tnlAssert( false, cerr << "blockSize should not be 1." << endl );
-            break;
-         default:
-            tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
-            break;
-      }
-   return gridSize. x;
-}
-#endif
-
-template< typename Operation >
-bool tnlCUDALongVectorReduction( const Operation& operation,
-                                 const typename Operation :: IndexType size,
-                                 const typename Operation :: RealType* deviceInput1,
-                                 const typename Operation :: RealType* deviceInput2,
-                                 typename Operation :: RealType& result )
-{
-#ifdef HAVE_CUDA
-
-   typedef typename Operation :: IndexType IndexType;
-   typedef typename Operation :: RealType RealType;
-
-   /****
-    * First check if the input array(s) is/are large enough for the reduction on GPU.
-    * Otherwise copy it/them to host and reduce on CPU.
-    */
-   RealType hostArray1[ maxGPUReductionDataSize ];
-   RealType hostArray2[ maxGPUReductionDataSize ];
-   if( size <= maxGPUReductionDataSize )
-   {
-      if( ! copyMemoryCudaToHost( hostArray1, deviceInput1, size ) )
-         return false;
-      if( deviceInput2 && ! copyMemoryCudaToHost( hostArray2, deviceInput2, size ) )
-         return false;
-      result = operation. initialValueOnHost( 0, hostArray1, hostArray2 );
-      for( IndexType i = 1; i < size; i ++ )
-         result = operation. reduceOnHost( i, result, hostArray1, hostArray2 );
-      return true;
-   }
-
-   /****
-    * Reduce the data on the CUDA device.
-    */
-   RealType* deviceAux1( 0 ), *deviceAux2( 0 );
-   IndexType reducedSize = reduceOnCudaDevice( operation,
-                                               size,
-                                               deviceInput1,
-                                               deviceInput2,
-                                               deviceAux1 );
-
-   while( reducedSize > maxGPUReductionDataSize )
-   {
-      reducedSize = reduceOnCudaDevice( operation,
-                                        reducedSize,
-                                        deviceAux1,
-                                        ( RealType* ) 0,
-                                        deviceAux2 );
-      Swap( deviceAux1, deviceAux2 );
-   }
-
-   /***
-    * Transfer the reduced data from device to host.
-    */
-   RealType resultArray[ maxGPUReductionDataSize ];
-   if( ! copyMemoryCudaToHost( resultArray, deviceAux1, reducedSize ) )
-      return false;
-
-   /***
-    * Reduce the data on the host system.
-    */
-   result = operation. initialValueOnHost( 0, resultArray, ( RealType* ) 0 );
-   for( IndexType i = 1; i < reducedSize; i ++ )
-      result = operation. reduceOnHost( i, result, resultArray, ( RealType*) 0 );
-
-   /****
-    * Free the memory allocated on the device.
-    */
-   if( deviceAux1 && ! freeMemoryCuda( deviceAux1 ) )
-      return false;
-   if( deviceAux2 && ! freeMemoryCuda( deviceAux2 ) )
-      return false;
-
-
-   return true;
-#else
-   cerr << "I am sorry but CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." << endl;
-   return false;
-#endif
-};
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-// TODO: result of comparison should not be returned!!!
-template< typename Type, typename Index >
-bool tnlCUDALongVectorComparison( const Index size,
-                                  const Type* deviceInput1,
-                                  const Type* deviceInput2,
-                                  bool* deviceBoolAux = 0,
-                                  Type* deviceAux = 0 )
-{
-#ifdef HAVE_CUDA
-   tnlAssert( size > 0,
-              cerr << "You try to compare two CUDA long vectors with non-positive size." << endl
-                   << "The size is " << size );
-   //tnlVector< bool, tnlCuda, Index > boolArray( "tnlCUDALongVectorComparison:bool_array" );
-   bool* myDeviceBoolAux( 0 );
-   if( ! deviceBoolAux )
-   {
-      //if( ! boolArray. setSize( size ) )
-      if( ! allocateMemoryCuda( myDeviceBoolAux, size ) )
-         return false;
-      deviceBoolAux = myDeviceBoolAux;
-   }
-   dim3 blockSize( 0 ), gridSize( 0 );
-   blockSize. x = 256;
-   gridSize. x = size / blockSize. x + 1;
-
-   //compareTwoVectorsElementwise<<< gridSize, blockSize >>>( size,
-   //                                                         deviceInput1,
-   //                                                         deviceInput2,
-   //                                                         deviceBoolAux );
-   if( ! checkCudaDevice )
-      return false;
-   bool result;
-   if( ! tnlCUDALongVectorReduction< bool, bool, Index, tnlParallelReductionMin >( size,
-                                                                                   deviceBoolAux,
-                                                                                   ( bool* ) NULL,
-                                                                                   result,
-                                                                                   0 ) )
-
-
-      return false;
-   return result;
-#else
-   cerr << "I am sorry but CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." << endl;
-   return;
-#endif
-}
-
-#endif /* CUDALONGVECTORKERNELS_H_ */
diff --git a/src/implementation/core/cuda/CMakeLists.txt b/src/implementation/core/cuda/CMakeLists.txt
index 6d119a124fbc6d614a5d5856fa1166fd1783e68c..54f83098f4c7bb940391702a4b68a08c87bbffc6 100755
--- a/src/implementation/core/cuda/CMakeLists.txt
+++ b/src/implementation/core/cuda/CMakeLists.txt
@@ -1,15 +1,19 @@
-SET( headers reduction_impl.h
-             reduction-operations_impl.h )
+SET( headers cuda-reduction_impl.h )
 
 SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/implementation/core/cuda )
 
 IF( BUILD_CUDA )
    set( tnl_implementation_core_cuda_CUDA__SOURCES
-        ${CURRENT_DIR}/reduction-operations_impl.cu
-        PARENT_SCOPE )
+        ${CURRENT_DIR}/cuda-reduction_impl.cu
+        PARENT_SCOPE )        
+else()
+   # Set locally (no PARENT_SCOPE) so the final set() below can expand and export it.
+   set( tnl_implementation_core_cuda_SOURCES
+        ${CURRENT_DIR}/cuda-reduction_impl.cpp )
 endif()        
 
 set( tnl_implementation_core_cuda_SOURCES 
+     ${tnl_implementation_core_cuda_SOURCES}
      ${CURRENT_DIR}/device-check.cpp
      PARENT_SCOPE )
 
diff --git a/src/implementation/core/cuda/cuda-reduction_impl.cpp b/src/implementation/core/cuda/cuda-reduction_impl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e0818522af3a9ecb132a27cb459bf2e895659004
--- /dev/null
+++ b/src/implementation/core/cuda/cuda-reduction_impl.cpp
@@ -0,0 +1,895 @@
+/***************************************************************************
+                          cuda-reduction_impl.cpp  -  explicit instantiation of reductionOnCudaDevice
+                             -------------------
+    begin                : Mar 24, 2013
+    copyright            : (C) 2013 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#include <core/cuda/reduction-operations.h>
+#include <core/cuda/cuda-reduction.h>
+
+#ifdef TEMPLATE_EXPLICIT_INSTANTIATION
+/****
+ * Sum -- explicit instantiation definitions (an 'extern template' declaration emits no code) for Real in {char,int,float,double} and Index in {int,long int}; long double stays commented out.
+ */
+
+template bool reductionOnCudaDevice< tnlParallelReductionSum< char, int > >
+                                   ( const tnlParallelReductionSum< char, int >& operation,
+                                     const typename tnlParallelReductionSum< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< char, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionSum< int, int > >
+                                   ( const tnlParallelReductionSum< int, int >& operation,
+                                     const typename tnlParallelReductionSum< int, int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< int, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< int, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< int, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionSum< float, int > >
+                                   ( const tnlParallelReductionSum< float, int >& operation,
+                                     const typename tnlParallelReductionSum< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< float, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionSum< double, int > >
+                                   ( const tnlParallelReductionSum< double, int >& operation,
+                                     const typename tnlParallelReductionSum< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< double, int > :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, int > >
+                                   ( const tnlParallelReductionSum< long double, int >& operation,
+                                     const typename tnlParallelReductionSum< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< long double, int > :: ResultType& result );*/
+
+template bool reductionOnCudaDevice< tnlParallelReductionSum< char, long int > >
+                                   ( const tnlParallelReductionSum< char, long int >& operation,
+                                     const typename tnlParallelReductionSum< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< char, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionSum< int, long int > >
+                                   ( const tnlParallelReductionSum< int, long int >& operation,
+                                     const typename tnlParallelReductionSum< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< int, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionSum< float, long int > >
+                                   ( const tnlParallelReductionSum< float, long int >& operation,
+                                     const typename tnlParallelReductionSum< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< float, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionSum< double, long int > >
+                                   ( const tnlParallelReductionSum< double, long int >& operation,
+                                     const typename tnlParallelReductionSum< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< double, long int > :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, long int > >
+                                   ( const tnlParallelReductionSum< long double, long int >& operation,
+                                     const typename tnlParallelReductionSum< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< long double, long int > :: ResultType& result );*/
+
+/****
+ * Min -- explicit instantiation definitions (an 'extern template' declaration emits no code) for Real in {char,int,float,double} and Index in {int,long int}; long double stays commented out.
+ */
+
+template bool reductionOnCudaDevice< tnlParallelReductionMin< char, int > >
+                                   ( const tnlParallelReductionMin< char, int >& operation,
+                                     const typename tnlParallelReductionMin< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< char, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionMin< int, int > >
+                                   ( const tnlParallelReductionMin< int, int >& operation,
+                                     const typename tnlParallelReductionMin< int, int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< int, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< int, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< int, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionMin< float, int > >
+                                   ( const tnlParallelReductionMin< float, int >& operation,
+                                     const typename tnlParallelReductionMin< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< float, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionMin< double, int > >
+                                   ( const tnlParallelReductionMin< double, int >& operation,
+                                     const typename tnlParallelReductionMin< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< double, int > :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, int > >
+                                   ( const tnlParallelReductionMin< long double, int >& operation,
+                                     const typename tnlParallelReductionMin< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< long double, int > :: ResultType& result );*/
+
+template bool reductionOnCudaDevice< tnlParallelReductionMin< char, long int > >
+                                   ( const tnlParallelReductionMin< char, long int >& operation,
+                                     const typename tnlParallelReductionMin< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< char, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionMin< int, long int > >
+                                   ( const tnlParallelReductionMin< int, long int >& operation,
+                                     const typename tnlParallelReductionMin< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< int, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionMin< float, long int > >
+                                   ( const tnlParallelReductionMin< float, long int >& operation,
+                                     const typename tnlParallelReductionMin< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< float, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionMin< double, long int > >
+                                   ( const tnlParallelReductionMin< double, long int >& operation,
+                                     const typename tnlParallelReductionMin< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< double, long int > :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, long int > >
+                                   ( const tnlParallelReductionMin< long double, long int >& operation,
+                                     const typename tnlParallelReductionMin< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< long double, long int > :: ResultType& result );*/
+
+/****
+ * Max -- explicit instantiation definitions (an 'extern template' declaration emits no code) for Real in {char,int,float,double} and Index in {int,long int}; long double stays commented out.
+ */
+
+template bool reductionOnCudaDevice< tnlParallelReductionMax< char, int > >
+                                   ( const tnlParallelReductionMax< char, int >& operation,
+                                     const typename tnlParallelReductionMax< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionMax< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMax< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMax< char, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionMax< int, int > >
+                                   ( const tnlParallelReductionMax< int, int >& operation,
+                                     const typename tnlParallelReductionMax< int, int > :: IndexType size,
+                                     const typename tnlParallelReductionMax< int, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMax< int, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMax< int, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionMax< float, int > >
+                                   ( const tnlParallelReductionMax< float, int >& operation,
+                                     const typename tnlParallelReductionMax< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionMax< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMax< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMax< float, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionMax< double, int > >
+                                   ( const tnlParallelReductionMax< double, int >& operation,
+                                     const typename tnlParallelReductionMax< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionMax< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMax< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMax< double, int > :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, int > >
+                                   ( const tnlParallelReductionMax< long double, int >& operation,
+                                     const typename tnlParallelReductionMax< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionMax< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMax< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMax< long double, int > :: ResultType& result );*/
+
+template bool reductionOnCudaDevice< tnlParallelReductionMax< char, long int > >
+                                   ( const tnlParallelReductionMax< char, long int >& operation,
+                                     const typename tnlParallelReductionMax< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMax< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMax< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMax< char, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionMax< int, long int > >
+                                   ( const tnlParallelReductionMax< int, long int >& operation,
+                                     const typename tnlParallelReductionMax< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMax< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMax< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMax< int, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionMax< float, long int > >
+                                   ( const tnlParallelReductionMax< float, long int >& operation,
+                                     const typename tnlParallelReductionMax< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMax< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMax< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMax< float, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionMax< double, long int > >
+                                   ( const tnlParallelReductionMax< double, long int >& operation,
+                                     const typename tnlParallelReductionMax< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMax< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMax< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMax< double, long int > :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, long int > >
+                                   ( const tnlParallelReductionMax< long double, long int >& operation,
+                                     const typename tnlParallelReductionMax< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMax< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMax< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMax< long double, long int > :: ResultType& result );*/
+
+/****
+ * Abs sum -- explicit instantiation definitions (an 'extern template' declaration emits no code) for Real in {char,int,float,double} and Index in {int,long int}; long double stays commented out.
+ */
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, int > >
+                                   ( const tnlParallelReductionAbsSum< char, int >& operation,
+                                     const typename tnlParallelReductionAbsSum< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsSum< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsSum< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsSum< char, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< int, int > >
+                                   ( const tnlParallelReductionAbsSum< int, int >& operation,
+                                     const typename tnlParallelReductionAbsSum< int, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsSum< int, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsSum< int, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsSum< int, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< float, int > >
+                                   ( const tnlParallelReductionAbsSum< float, int >& operation,
+                                     const typename tnlParallelReductionAbsSum< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsSum< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsSum< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsSum< float, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, int > >
+                                   ( const tnlParallelReductionAbsSum< double, int >& operation,
+                                     const typename tnlParallelReductionAbsSum< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsSum< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsSum< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsSum< double, int > :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, int > >
+                                   ( const tnlParallelReductionAbsSum< long double, int >& operation,
+                                     const typename tnlParallelReductionAbsSum< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsSum< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsSum< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsSum< long double, int > :: ResultType& result );*/
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, long int > >
+                                   ( const tnlParallelReductionAbsSum< char, long int >& operation,
+                                     const typename tnlParallelReductionAbsSum< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsSum< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsSum< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsSum< char, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< int, long int > >
+                                   ( const tnlParallelReductionAbsSum< int, long int >& operation,
+                                     const typename tnlParallelReductionAbsSum< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsSum< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsSum< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsSum< int, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< float, long int > >
+                                   ( const tnlParallelReductionAbsSum< float, long int >& operation,
+                                     const typename tnlParallelReductionAbsSum< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsSum< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsSum< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsSum< float, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, long int > >
+                                   ( const tnlParallelReductionAbsSum< double, long int >& operation,
+                                     const typename tnlParallelReductionAbsSum< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsSum< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsSum< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsSum< double, long int > :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, long int > >
+                                   ( const tnlParallelReductionAbsSum< long double, long int >& operation,
+                                     const typename tnlParallelReductionAbsSum< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsSum< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsSum< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsSum< long double, long int > :: ResultType& result );*/
+
+/****
+ * Abs min
+ */
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, int > >
+                                   ( const tnlParallelReductionAbsMin< char, int >& operation,
+                                     const typename tnlParallelReductionAbsMin< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMin< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMin< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMin< char, int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< int, int > >
+                                   ( const tnlParallelReductionAbsMin< int, int >& operation,
+                                     const typename tnlParallelReductionAbsMin< int, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMin< int, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMin< int, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMin< int, int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< float, int > >
+                                   ( const tnlParallelReductionAbsMin< float, int >& operation,
+                                     const typename tnlParallelReductionAbsMin< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMin< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMin< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMin< float, int> :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, int > >
+                                   ( const tnlParallelReductionAbsMin< double, int>& operation,
+                                     const typename tnlParallelReductionAbsMin< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMin< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMin< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMin< double, int> :: ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, int > >
+                                   ( const tnlParallelReductionAbsMin< long double, int>& operation,
+                                     const typename tnlParallelReductionAbsMin< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMin< long double, int> :: ResultType& result );*/
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, long int > >
+                                   ( const tnlParallelReductionAbsMin< char, long int >& operation,
+                                     const typename tnlParallelReductionAbsMin< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMin< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMin< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMin< char, long int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< int, long int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionAbsMin< int, long int >& operation,
+     const typename tnlParallelReductionAbsMin< int, long int >::IndexType size,
+     const typename tnlParallelReductionAbsMin< int, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMin< int, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMin< int, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< float, long int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionAbsMin< float, long int >& operation,
+     const typename tnlParallelReductionAbsMin< float, long int >::IndexType size,
+     const typename tnlParallelReductionAbsMin< float, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMin< float, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMin< float, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, long int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionAbsMin< double, long int >& operation,
+     const typename tnlParallelReductionAbsMin< double, long int >::IndexType size,
+     const typename tnlParallelReductionAbsMin< double, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMin< double, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMin< double, long int >::ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, long int > >
+                                   ( const tnlParallelReductionAbsMin< long double, long int>& operation,
+                                     const typename tnlParallelReductionAbsMin< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMin< long double, long int> :: ResultType& result );*/
+/****
+ * Abs max
+ */
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionAbsMax< char, int >& operation,
+     const typename tnlParallelReductionAbsMax< char, int >::IndexType size,
+     const typename tnlParallelReductionAbsMax< char, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMax< char, int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMax< char, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< int, int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionAbsMax< int, int >& operation,
+     const typename tnlParallelReductionAbsMax< int, int >::IndexType size,
+     const typename tnlParallelReductionAbsMax< int, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMax< int, int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMax< int, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< float, int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionAbsMax< float, int >& operation,
+     const typename tnlParallelReductionAbsMax< float, int >::IndexType size,
+     const typename tnlParallelReductionAbsMax< float, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMax< float, int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMax< float, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionAbsMax< double, int >& operation,
+     const typename tnlParallelReductionAbsMax< double, int >::IndexType size,
+     const typename tnlParallelReductionAbsMax< double, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMax< double, int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMax< double, int >::ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, int > >
+                                   ( const tnlParallelReductionAbsMax< long double, int>& operation,
+                                     const typename tnlParallelReductionAbsMax< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMax< long double, int> :: ResultType& result );*/
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, long int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionAbsMax< char, long int >& operation,
+     const typename tnlParallelReductionAbsMax< char, long int >::IndexType size,
+     const typename tnlParallelReductionAbsMax< char, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMax< char, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMax< char, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< int, long int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionAbsMax< int, long int >& operation,
+     const typename tnlParallelReductionAbsMax< int, long int >::IndexType size,
+     const typename tnlParallelReductionAbsMax< int, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMax< int, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMax< int, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< float, long int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionAbsMax< float, long int >& operation,
+     const typename tnlParallelReductionAbsMax< float, long int >::IndexType size,
+     const typename tnlParallelReductionAbsMax< float, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMax< float, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMax< float, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, long int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionAbsMax< double, long int >& operation,
+     const typename tnlParallelReductionAbsMax< double, long int >::IndexType size,
+     const typename tnlParallelReductionAbsMax< double, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMax< double, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMax< double, long int >::ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, long int > >
+                                   ( const tnlParallelReductionAbsMax< long double, long int>& operation,
+                                     const typename tnlParallelReductionAbsMax< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMax< long double, long int> :: ResultType& result );*/
+
+/****
+ * Logical AND
+ */
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< char, int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionLogicalAnd< char, int >& operation,
+     const typename tnlParallelReductionLogicalAnd< char, int >::IndexType size,
+     const typename tnlParallelReductionLogicalAnd< char, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalAnd< char, int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalAnd< char, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< int, int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionLogicalAnd< int, int >& operation,
+     const typename tnlParallelReductionLogicalAnd< int, int >::IndexType size,
+     const typename tnlParallelReductionLogicalAnd< int, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalAnd< int, int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalAnd< int, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< float, int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionLogicalAnd< float, int >& operation,
+     const typename tnlParallelReductionLogicalAnd< float, int >::IndexType size,
+     const typename tnlParallelReductionLogicalAnd< float, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalAnd< float, int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalAnd< float, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionLogicalAnd< double, int >& operation,
+     const typename tnlParallelReductionLogicalAnd< double, int >::IndexType size,
+     const typename tnlParallelReductionLogicalAnd< double, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalAnd< double, int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalAnd< double, int >::ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, int > >
+                                   ( const tnlParallelReductionLogicalAnd< long double, int>& operation,
+                                     const typename tnlParallelReductionLogicalAnd< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalAnd< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalAnd< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalAnd< long double, int> :: ResultType& result );*/
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< char, long int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionLogicalAnd< char, long int >& operation,
+     const typename tnlParallelReductionLogicalAnd< char, long int >::IndexType size,
+     const typename tnlParallelReductionLogicalAnd< char, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalAnd< char, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalAnd< char, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< int, long int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionLogicalAnd< int, long int >& operation,
+     const typename tnlParallelReductionLogicalAnd< int, long int >::IndexType size,
+     const typename tnlParallelReductionLogicalAnd< int, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalAnd< int, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalAnd< int, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< float, long int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionLogicalAnd< float, long int >& operation,
+     const typename tnlParallelReductionLogicalAnd< float, long int >::IndexType size,
+     const typename tnlParallelReductionLogicalAnd< float, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalAnd< float, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalAnd< float, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, long int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionLogicalAnd< double, long int >& operation,
+     const typename tnlParallelReductionLogicalAnd< double, long int >::IndexType size,
+     const typename tnlParallelReductionLogicalAnd< double, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalAnd< double, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalAnd< double, long int >::ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, long int > >
+                                   ( const tnlParallelReductionLogicalAnd< long double, long int>& operation,
+                                     const typename tnlParallelReductionLogicalAnd< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalAnd< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalAnd< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalAnd< long double, long int> :: ResultType& result );*/
+
+/****
+ * Logical OR
+ */
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< char, int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionLogicalOr< char, int >& operation,
+     const typename tnlParallelReductionLogicalOr< char, int >::IndexType size,
+     const typename tnlParallelReductionLogicalOr< char, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalOr< char, int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalOr< char, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< int, int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionLogicalOr< int, int >& operation,
+     const typename tnlParallelReductionLogicalOr< int, int >::IndexType size,
+     const typename tnlParallelReductionLogicalOr< int, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalOr< int, int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalOr< int, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< float, int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionLogicalOr< float, int >& operation,
+     const typename tnlParallelReductionLogicalOr< float, int >::IndexType size,
+     const typename tnlParallelReductionLogicalOr< float, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalOr< float, int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalOr< float, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionLogicalOr< double, int >& operation,
+     const typename tnlParallelReductionLogicalOr< double, int >::IndexType size,
+     const typename tnlParallelReductionLogicalOr< double, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalOr< double, int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalOr< double, int >::ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, int > >
+                                   ( const tnlParallelReductionLogicalOr< long double, int>& operation,
+                                     const typename tnlParallelReductionLogicalOr< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalOr< long double, int> :: ResultType& result );*/
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< char, long int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionLogicalOr< char, long int >& operation,
+     const typename tnlParallelReductionLogicalOr< char, long int >::IndexType size,
+     const typename tnlParallelReductionLogicalOr< char, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalOr< char, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalOr< char, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< int, long int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionLogicalOr< int, long int >& operation,
+     const typename tnlParallelReductionLogicalOr< int, long int >::IndexType size,
+     const typename tnlParallelReductionLogicalOr< int, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalOr< int, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalOr< int, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< float, long int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionLogicalOr< float, long int >& operation,
+     const typename tnlParallelReductionLogicalOr< float, long int >::IndexType size,
+     const typename tnlParallelReductionLogicalOr< float, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalOr< float, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalOr< float, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, long int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionLogicalOr< double, long int >& operation,
+     const typename tnlParallelReductionLogicalOr< double, long int >::IndexType size,
+     const typename tnlParallelReductionLogicalOr< double, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalOr< double, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalOr< double, long int >::ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, long int > >
+                                   ( const tnlParallelReductionLogicalOr< long double, long int>& operation,
+                                     const typename tnlParallelReductionLogicalOr< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalOr< long double, long int> :: ResultType& result );*/
+
+
+/****
+ * Lp Norm
+ */
+extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< float, int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionLpNorm< float, int >& operation,
+     const typename tnlParallelReductionLpNorm< float, int >::IndexType size,
+     const typename tnlParallelReductionLpNorm< float, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLpNorm< float, int >::RealType* deviceInput2,
+     typename tnlParallelReductionLpNorm< float, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionLpNorm< double, int >& operation,
+     const typename tnlParallelReductionLpNorm< double, int >::IndexType size,
+     const typename tnlParallelReductionLpNorm< double, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLpNorm< double, int >::RealType* deviceInput2,
+     typename tnlParallelReductionLpNorm< double, int >::ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, int > >
+                                   ( const tnlParallelReductionLpNorm< long double, int>& operation,
+                                     const typename tnlParallelReductionLpNorm< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLpNorm< long double, int> :: ResultType& result );*/
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< char, long int > >  // NOTE(review): the Lp Norm section declares no < char, int > counterpart - confirm this declaration is intended
+   ( const tnlParallelReductionLpNorm< char, long int >& operation,
+     const typename tnlParallelReductionLpNorm< char, long int >::IndexType size,
+     const typename tnlParallelReductionLpNorm< char, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLpNorm< char, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionLpNorm< char, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< int, long int > >  // NOTE(review): the Lp Norm section declares no < int, int > counterpart - confirm this declaration is intended
+   ( const tnlParallelReductionLpNorm< int, long int >& operation,
+     const typename tnlParallelReductionLpNorm< int, long int >::IndexType size,
+     const typename tnlParallelReductionLpNorm< int, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLpNorm< int, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionLpNorm< int, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< float, long int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionLpNorm< float, long int >& operation,
+     const typename tnlParallelReductionLpNorm< float, long int >::IndexType size,
+     const typename tnlParallelReductionLpNorm< float, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLpNorm< float, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionLpNorm< float, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, long int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionLpNorm< double, long int >& operation,
+     const typename tnlParallelReductionLpNorm< double, long int >::IndexType size,
+     const typename tnlParallelReductionLpNorm< double, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLpNorm< double, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionLpNorm< double, long int >::ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, long int > >
+                                   ( const tnlParallelReductionLpNorm< long double, long int>& operation,
+                                     const typename tnlParallelReductionLpNorm< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLpNorm< long double, long int> :: ResultType& result );*/
+
+
+/****
+ * Equalities
+ */
+extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< char, int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionEqualities< char, int >& operation,
+     const typename tnlParallelReductionEqualities< char, int >::IndexType size,
+     const typename tnlParallelReductionEqualities< char, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionEqualities< char, int >::RealType* deviceInput2,
+     typename tnlParallelReductionEqualities< char, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< int, int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionEqualities< int, int >& operation,
+     const typename tnlParallelReductionEqualities< int, int >::IndexType size,
+     const typename tnlParallelReductionEqualities< int, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionEqualities< int, int >::RealType* deviceInput2,
+     typename tnlParallelReductionEqualities< int, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< float, int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionEqualities< float, int >& operation,
+     const typename tnlParallelReductionEqualities< float, int >::IndexType size,
+     const typename tnlParallelReductionEqualities< float, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionEqualities< float, int >::RealType* deviceInput2,
+     typename tnlParallelReductionEqualities< float, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, int > >  // declaration only: suppresses implicit instantiation in this translation unit
+   ( const tnlParallelReductionEqualities< double, int >& operation,
+     const typename tnlParallelReductionEqualities< double, int >::IndexType size,
+     const typename tnlParallelReductionEqualities< double, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionEqualities< double, int >::RealType* deviceInput2,
+     typename tnlParallelReductionEqualities< double, int >::ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, int > >
+                                   ( const tnlParallelReductionEqualities< long double, int>& operation,
+                                     const typename tnlParallelReductionEqualities< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< long double, int> :: ResultType& result );*/
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< char, long int > >
+                                   ( const tnlParallelReductionEqualities< char, long int >& operation,
+                                     const typename tnlParallelReductionEqualities< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< char, long int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< int, long int > >
+                                   ( const tnlParallelReductionEqualities< int, long int >& operation,
+                                     const typename tnlParallelReductionEqualities< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< int, long int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< float, long int > >
+                                   ( const tnlParallelReductionEqualities< float, long int >& operation,
+                                     const typename tnlParallelReductionEqualities< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< float, long int> :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, long int > >
+                                   ( const tnlParallelReductionEqualities< double, long int>& operation,
+                                     const typename tnlParallelReductionEqualities< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< double, long int> :: ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, long int > >
+                                   ( const tnlParallelReductionEqualities< long double, long int>& operation,
+                                     const typename tnlParallelReductionEqualities< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< long double, long int> :: ResultType& result );*/
+
+
+/****
+ * Inequalities - explicit instantiation declarations (extern template) of reductionOnCudaDevice for
+ * tnlParallelReductionInequalities< char|int|float|double, int|long int >; long double variants are commented out. */
+extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< char, int > >
+                                   ( const tnlParallelReductionInequalities< char, int >& operation,
+                                     const typename tnlParallelReductionInequalities< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< char, int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< int, int > >
+                                   ( const tnlParallelReductionInequalities< int, int >& operation,
+                                     const typename tnlParallelReductionInequalities< int, int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< int, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< int, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< int, int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< float, int > >
+                                   ( const tnlParallelReductionInequalities< float, int >& operation,
+                                     const typename tnlParallelReductionInequalities< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< float, int> :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, int > >
+                                   ( const tnlParallelReductionInequalities< double, int>& operation,
+                                     const typename tnlParallelReductionInequalities< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< double, int> :: ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, int > >
+                                   ( const tnlParallelReductionInequalities< long double, int>& operation,
+                                     const typename tnlParallelReductionInequalities< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< long double, int> :: ResultType& result );*/
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< char, long int > >
+                                   ( const tnlParallelReductionInequalities< char, long int >& operation,
+                                     const typename tnlParallelReductionInequalities< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< char, long int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< int, long int > >
+                                   ( const tnlParallelReductionInequalities< int, long int >& operation,
+                                     const typename tnlParallelReductionInequalities< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< int, long int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< float, long int > >
+                                   ( const tnlParallelReductionInequalities< float, long int >& operation,
+                                     const typename tnlParallelReductionInequalities< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< float, long int> :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, long int > >
+                                   ( const tnlParallelReductionInequalities< double, long int>& operation,
+                                     const typename tnlParallelReductionInequalities< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< double, long int> :: ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, long int > >
+                                   ( const tnlParallelReductionInequalities< long double, long int>& operation,
+                                     const typename tnlParallelReductionInequalities< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< long double, long int> :: ResultType& result );*/
+
+
+/****
+ * Sdot - explicit instantiation declarations (extern template) of reductionOnCudaDevice for
+ * tnlParallelReductionSdot< char|int|float|double, int|long int >; long double variants are commented out. */
+extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< char, int > >
+                                   ( const tnlParallelReductionSdot< char, int >& operation,
+                                     const typename tnlParallelReductionSdot< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< char, int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< int, int > >
+                                   ( const tnlParallelReductionSdot< int, int >& operation,
+                                     const typename tnlParallelReductionSdot< int, int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< int, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< int, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< int, int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< float, int > >
+                                   ( const tnlParallelReductionSdot< float, int >& operation,
+                                     const typename tnlParallelReductionSdot< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< float, int> :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< double, int > >
+                                   ( const tnlParallelReductionSdot< double, int>& operation,
+                                     const typename tnlParallelReductionSdot< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< double, int> :: ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< long double, int > >
+                                   ( const tnlParallelReductionSdot< long double, int>& operation,
+                                     const typename tnlParallelReductionSdot< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< long double, int> :: ResultType& result );*/
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< char, long int > >
+                                   ( const tnlParallelReductionSdot< char, long int >& operation,
+                                     const typename tnlParallelReductionSdot< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< char, long int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< int, long int > >
+                                   ( const tnlParallelReductionSdot< int, long int >& operation,
+                                     const typename tnlParallelReductionSdot< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< int, long int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< float, long int > >
+                                   ( const tnlParallelReductionSdot< float, long int >& operation,
+                                     const typename tnlParallelReductionSdot< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< float, long int> :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< double, long int > >
+                                   ( const tnlParallelReductionSdot< double, long int>& operation,
+                                     const typename tnlParallelReductionSdot< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< double, long int> :: ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< long double, long int > >
+                                   ( const tnlParallelReductionSdot< long double, long int>& operation,
+                                     const typename tnlParallelReductionSdot< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< long double, long int> :: ResultType& result );*/
+
+#endif /* TEMPLATE_EXPLICIT_INSTANTIATION */
+
+
diff --git a/src/implementation/core/cuda/cuda-reduction_impl.cu b/src/implementation/core/cuda/cuda-reduction_impl.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ac420c07fbe87e8f78b383881c90f0fa4a8aaccc
--- /dev/null
+++ b/src/implementation/core/cuda/cuda-reduction_impl.cu
@@ -0,0 +1,882 @@
+/***************************************************************************
+                          cuda-reduction_impl.cu  -  explicit template instantiations of reductionOnCudaDevice
+                             -------------------
+    begin                : Mar 24, 2013
+    copyright            : (C) 2013 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+ 
+#include <core/cuda/reduction-operations.h>
+#include <core/cuda/cuda-reduction.h>
+ 
+#ifdef TEMPLATE_EXPLICIT_INSTANTIATION
+
+/****
+ * Sum - explicit instantiation definitions of reductionOnCudaDevice for
+ * tnlParallelReductionSum< char|int|float|double, int|long int >; long double variants are commented out. */
+
+template bool reductionOnCudaDevice< tnlParallelReductionSum< char, int > >
+                                   ( const tnlParallelReductionSum< char, int >& operation,
+                                     const typename tnlParallelReductionSum< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< char, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionSum< int, int > >
+                                   ( const tnlParallelReductionSum< int, int >& operation,
+                                     const typename tnlParallelReductionSum< int, int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< int, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< int, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< int, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionSum< float, int > >
+                                   ( const tnlParallelReductionSum< float, int >& operation,
+                                     const typename tnlParallelReductionSum< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< float, int> :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionSum< double, int > >
+                                   ( const tnlParallelReductionSum< double, int>& operation,
+                                     const typename tnlParallelReductionSum< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< double, int> :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, int > >
+                                   ( const tnlParallelReductionSum< long double, int>& operation,
+                                     const typename tnlParallelReductionSum< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< long double, int> :: ResultType& result );*/
+
+template bool reductionOnCudaDevice< tnlParallelReductionSum< char, long int > >
+                                   ( const tnlParallelReductionSum< char, long int >& operation,
+                                     const typename tnlParallelReductionSum< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< char, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionSum< int, long int > >
+                                   ( const tnlParallelReductionSum< int, long int >& operation,
+                                     const typename tnlParallelReductionSum< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< int, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionSum< float, long int > >
+                                   ( const tnlParallelReductionSum< float, long int >& operation,
+                                     const typename tnlParallelReductionSum< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< float, long int> :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionSum< double, long int > >
+                                   ( const tnlParallelReductionSum< double, long int>& operation,
+                                     const typename tnlParallelReductionSum< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< double, long int> :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, long int > >
+                                   ( const tnlParallelReductionSum< long double, long int>& operation,
+                                     const typename tnlParallelReductionSum< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< long double, long int> :: ResultType& result );*/
+
+/****
+ * Min - explicit instantiation definitions of reductionOnCudaDevice for
+ * tnlParallelReductionMin< char|int|float|double, int|long int >; long double variants are commented out. */
+
+template bool reductionOnCudaDevice< tnlParallelReductionMin< char, int > >
+                                   ( const tnlParallelReductionMin< char, int >& operation,
+                                     const typename tnlParallelReductionMin< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< char, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionMin< int, int > >
+                                   ( const tnlParallelReductionMin< int, int >& operation,
+                                     const typename tnlParallelReductionMin< int, int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< int, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< int, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< int, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionMin< float, int > >
+                                   ( const tnlParallelReductionMin< float, int >& operation,
+                                     const typename tnlParallelReductionMin< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< float, int> :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionMin< double, int > >
+                                   ( const tnlParallelReductionMin< double, int>& operation,
+                                     const typename tnlParallelReductionMin< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< double, int> :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, int > >
+                                   ( const tnlParallelReductionMin< long double, int>& operation,
+                                     const typename tnlParallelReductionMin< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< long double, int> :: ResultType& result );*/
+
+template bool reductionOnCudaDevice< tnlParallelReductionMin< char, long int > >
+                                   ( const tnlParallelReductionMin< char, long int >& operation,
+                                     const typename tnlParallelReductionMin< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< char, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionMin< int, long int > >
+                                   ( const tnlParallelReductionMin< int, long int >& operation,
+                                     const typename tnlParallelReductionMin< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< int, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionMin< float, long int > >
+                                   ( const tnlParallelReductionMin< float, long int >& operation,
+                                     const typename tnlParallelReductionMin< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< float, long int> :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionMin< double, long int > >
+                                   ( const tnlParallelReductionMin< double, long int>& operation,
+                                     const typename tnlParallelReductionMin< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< double, long int> :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, long int > >
+                                   ( const tnlParallelReductionMin< long double, long int>& operation,
+                                     const typename tnlParallelReductionMin< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< long double, long int> :: ResultType& result );*/
+
+/****
+ * Max -- explicit instantiations of reductionOnCudaDevice for tnlParallelReductionMax< T, I > with T in { char, int, float, double } and I in { int, long int }.
+ * NOTE(review): the long double variants are commented out, presumably because CUDA device code lacks long double support -- confirm. */
+
+template bool reductionOnCudaDevice< tnlParallelReductionMax< char, int > >
+                                   ( const tnlParallelReductionMax< char, int >& operation,
+                                     const typename tnlParallelReductionMax< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionMax< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMax< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMax< char, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionMax< int, int > >
+                                   ( const tnlParallelReductionMax< int, int >& operation,
+                                     const typename tnlParallelReductionMax< int, int > :: IndexType size,
+                                     const typename tnlParallelReductionMax< int, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMax< int, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMax< int, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionMax< float, int > >
+                                   ( const tnlParallelReductionMax< float, int >& operation,
+                                     const typename tnlParallelReductionMax< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionMax< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMax< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMax< float, int> :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionMax< double, int > >
+                                   ( const tnlParallelReductionMax< double, int>& operation,
+                                     const typename tnlParallelReductionMax< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionMax< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMax< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMax< double, int> :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, int > >
+                                   ( const tnlParallelReductionMax< long double, int>& operation,
+                                     const typename tnlParallelReductionMax< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionMax< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMax< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMax< long double, int> :: ResultType& result );*/
+
+template bool reductionOnCudaDevice< tnlParallelReductionMax< char, long int > >
+                                   ( const tnlParallelReductionMax< char, long int >& operation,
+                                     const typename tnlParallelReductionMax< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMax< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMax< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMax< char, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionMax< int, long int > >
+                                   ( const tnlParallelReductionMax< int, long int >& operation,
+                                     const typename tnlParallelReductionMax< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMax< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMax< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMax< int, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionMax< float, long int > >
+                                   ( const tnlParallelReductionMax< float, long int >& operation,
+                                     const typename tnlParallelReductionMax< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMax< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMax< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMax< float, long int> :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionMax< double, long int > >
+                                   ( const tnlParallelReductionMax< double, long int>& operation,
+                                     const typename tnlParallelReductionMax< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMax< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMax< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMax< double, long int> :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, long int > >
+                                   ( const tnlParallelReductionMax< long double, long int>& operation,
+                                     const typename tnlParallelReductionMax< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMax< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMax< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMax< long double, long int> :: ResultType& result );*/
+
+/****
+ * Abs sum -- explicit instantiations of reductionOnCudaDevice for tnlParallelReductionAbsSum< T, I > with T in { char, int, float, double } and I in { int, long int }.
+ * NOTE(review): the long double variants are commented out, presumably because CUDA device code lacks long double support -- confirm. */
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, int > >
+                                   ( const tnlParallelReductionAbsSum< char, int >& operation,
+                                     const typename tnlParallelReductionAbsSum< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsSum< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsSum< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsSum< char, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< int, int > >
+                                   ( const tnlParallelReductionAbsSum< int, int >& operation,
+                                     const typename tnlParallelReductionAbsSum< int, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsSum< int, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsSum< int, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsSum< int, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< float, int > >
+                                   ( const tnlParallelReductionAbsSum< float, int >& operation,
+                                     const typename tnlParallelReductionAbsSum< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsSum< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsSum< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsSum< float, int> :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, int > >
+                                   ( const tnlParallelReductionAbsSum< double, int>& operation,
+                                     const typename tnlParallelReductionAbsSum< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsSum< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsSum< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsSum< double, int> :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, int > >
+                                   ( const tnlParallelReductionAbsSum< long double, int>& operation,
+                                     const typename tnlParallelReductionAbsSum< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsSum< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsSum< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsSum< long double, int> :: ResultType& result );*/
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, long int > >
+                                   ( const tnlParallelReductionAbsSum< char, long int >& operation,
+                                     const typename tnlParallelReductionAbsSum< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsSum< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsSum< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsSum< char, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< int, long int > >
+                                   ( const tnlParallelReductionAbsSum< int, long int >& operation,
+                                     const typename tnlParallelReductionAbsSum< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsSum< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsSum< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsSum< int, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< float, long int > >
+                                   ( const tnlParallelReductionAbsSum< float, long int >& operation,
+                                     const typename tnlParallelReductionAbsSum< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsSum< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsSum< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsSum< float, long int> :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, long int > >
+                                   ( const tnlParallelReductionAbsSum< double, long int>& operation,
+                                     const typename tnlParallelReductionAbsSum< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsSum< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsSum< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsSum< double, long int> :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, long int > >
+                                   ( const tnlParallelReductionAbsSum< long double, long int>& operation,
+                                     const typename tnlParallelReductionAbsSum< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsSum< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsSum< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsSum< long double, long int> :: ResultType& result );*/
+
+/****
+ * Abs min -- explicit instantiations of reductionOnCudaDevice for tnlParallelReductionAbsMin< T, I > with T in { char, int, float, double } and I in { int, long int }.
+ * NOTE(review): the long double variants are commented out, presumably because CUDA device code lacks long double support -- confirm. */
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, int > >
+                                   ( const tnlParallelReductionAbsMin< char, int >& operation,
+                                     const typename tnlParallelReductionAbsMin< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMin< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMin< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMin< char, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< int, int > >
+                                   ( const tnlParallelReductionAbsMin< int, int >& operation,
+                                     const typename tnlParallelReductionAbsMin< int, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMin< int, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMin< int, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMin< int, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< float, int > >
+                                   ( const tnlParallelReductionAbsMin< float, int >& operation,
+                                     const typename tnlParallelReductionAbsMin< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMin< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMin< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMin< float, int> :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, int > >
+                                   ( const tnlParallelReductionAbsMin< double, int>& operation,
+                                     const typename tnlParallelReductionAbsMin< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMin< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMin< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMin< double, int> :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, int > >
+                                   ( const tnlParallelReductionAbsMin< long double, int>& operation,
+                                     const typename tnlParallelReductionAbsMin< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMin< long double, int> :: ResultType& result );*/
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, long int > >
+                                   ( const tnlParallelReductionAbsMin< char, long int >& operation,
+                                     const typename tnlParallelReductionAbsMin< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMin< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMin< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMin< char, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< int, long int > >
+                                   ( const tnlParallelReductionAbsMin< int, long int >& operation,
+                                     const typename tnlParallelReductionAbsMin< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMin< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMin< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMin< int, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< float, long int > >
+                                   ( const tnlParallelReductionAbsMin< float, long int >& operation,
+                                     const typename tnlParallelReductionAbsMin< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMin< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMin< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMin< float, long int> :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, long int > >
+                                   ( const tnlParallelReductionAbsMin< double, long int>& operation,
+                                     const typename tnlParallelReductionAbsMin< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMin< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMin< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMin< double, long int> :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, long int > >
+                                   ( const tnlParallelReductionAbsMin< long double, long int>& operation,
+                                     const typename tnlParallelReductionAbsMin< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMin< long double, long int> :: ResultType& result );*/
+/****
+ * Abs max -- explicit instantiations of reductionOnCudaDevice for tnlParallelReductionAbsMax< T, I > with T in { char, int, float, double } and I in { int, long int }.
+ * NOTE(review): the long double variants are commented out, presumably because CUDA device code lacks long double support -- confirm. */
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, int > >
+                                   ( const tnlParallelReductionAbsMax< char, int >& operation,
+                                     const typename tnlParallelReductionAbsMax< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMax< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMax< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMax< char, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< int, int > >
+                                   ( const tnlParallelReductionAbsMax< int, int >& operation,
+                                     const typename tnlParallelReductionAbsMax< int, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMax< int, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMax< int, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMax< int, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< float, int > >
+                                   ( const tnlParallelReductionAbsMax< float, int >& operation,
+                                     const typename tnlParallelReductionAbsMax< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMax< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMax< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMax< float, int> :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, int > >
+                                   ( const tnlParallelReductionAbsMax< double, int>& operation,
+                                     const typename tnlParallelReductionAbsMax< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMax< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMax< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMax< double, int> :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, int > >
+                                   ( const tnlParallelReductionAbsMax< long double, int>& operation,
+                                     const typename tnlParallelReductionAbsMax< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMax< long double, int> :: ResultType& result );*/
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, long int > >
+                                   ( const tnlParallelReductionAbsMax< char, long int >& operation,
+                                     const typename tnlParallelReductionAbsMax< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMax< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMax< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMax< char, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< int, long int > >
+                                   ( const tnlParallelReductionAbsMax< int, long int >& operation,
+                                     const typename tnlParallelReductionAbsMax< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMax< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMax< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMax< int, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< float, long int > >
+                                   ( const tnlParallelReductionAbsMax< float, long int >& operation,
+                                     const typename tnlParallelReductionAbsMax< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMax< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMax< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMax< float, long int> :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, long int > >
+                                   ( const tnlParallelReductionAbsMax< double, long int>& operation,
+                                     const typename tnlParallelReductionAbsMax< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMax< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMax< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMax< double, long int> :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, long int > >
+                                   ( const tnlParallelReductionAbsMax< long double, long int>& operation,
+                                     const typename tnlParallelReductionAbsMax< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionAbsMax< long double, long int> :: ResultType& result );*/
+
+/****
+ * Logical AND -- explicit instantiations of reductionOnCudaDevice for tnlParallelReductionLogicalAnd< T, I > with T in { char, int, float, double } and I in { int, long int }.
+ * NOTE(review): the long double variants are commented out, presumably because CUDA device code lacks long double support -- confirm. */
+template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< char, int > >
+                                   ( const tnlParallelReductionLogicalAnd< char, int >& operation,
+                                     const typename tnlParallelReductionLogicalAnd< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalAnd< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalAnd< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalAnd< char, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< int, int > >
+                                   ( const tnlParallelReductionLogicalAnd< int, int >& operation,
+                                     const typename tnlParallelReductionLogicalAnd< int, int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalAnd< int, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalAnd< int, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalAnd< int, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< float, int > >
+                                   ( const tnlParallelReductionLogicalAnd< float, int >& operation,
+                                     const typename tnlParallelReductionLogicalAnd< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalAnd< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalAnd< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalAnd< float, int> :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, int > >
+                                   ( const tnlParallelReductionLogicalAnd< double, int>& operation,
+                                     const typename tnlParallelReductionLogicalAnd< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalAnd< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalAnd< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalAnd< double, int> :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, int > >
+                                   ( const tnlParallelReductionLogicalAnd< long double, int>& operation,
+                                     const typename tnlParallelReductionLogicalAnd< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalAnd< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalAnd< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalAnd< long double, int> :: ResultType& result );*/
+
+template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< char, long int > >
+                                   ( const tnlParallelReductionLogicalAnd< char, long int >& operation,
+                                     const typename tnlParallelReductionLogicalAnd< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalAnd< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalAnd< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalAnd< char, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< int, long int > >
+                                   ( const tnlParallelReductionLogicalAnd< int, long int >& operation,
+                                     const typename tnlParallelReductionLogicalAnd< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalAnd< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalAnd< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalAnd< int, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< float, long int > >
+                                   ( const tnlParallelReductionLogicalAnd< float, long int >& operation,
+                                     const typename tnlParallelReductionLogicalAnd< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalAnd< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalAnd< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalAnd< float, long int> :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, long int > >
+                                   ( const tnlParallelReductionLogicalAnd< double, long int>& operation,
+                                     const typename tnlParallelReductionLogicalAnd< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalAnd< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalAnd< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalAnd< double, long int> :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, long int > >
+                                   ( const tnlParallelReductionLogicalAnd< long double, long int>& operation,
+                                     const typename tnlParallelReductionLogicalAnd< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalAnd< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalAnd< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalAnd< long double, long int> :: ResultType& result );*/
+
+/****
+ * Logical OR: explicit instantiations for < char | int | float | double, int | long int >; the long double variants are commented out.
+ */
+template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< char, int > >
+                                   ( const tnlParallelReductionLogicalOr< char, int >& operation,
+                                     const typename tnlParallelReductionLogicalOr< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalOr< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalOr< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalOr< char, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< int, int > >
+                                   ( const tnlParallelReductionLogicalOr< int, int >& operation,
+                                     const typename tnlParallelReductionLogicalOr< int, int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalOr< int, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalOr< int, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalOr< int, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< float, int > >
+                                   ( const tnlParallelReductionLogicalOr< float, int >& operation,
+                                     const typename tnlParallelReductionLogicalOr< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalOr< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalOr< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalOr< float, int> :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, int > >
+                                   ( const tnlParallelReductionLogicalOr< double, int>& operation,
+                                     const typename tnlParallelReductionLogicalOr< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalOr< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalOr< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalOr< double, int> :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, int > >
+                                   ( const tnlParallelReductionLogicalOr< long double, int>& operation,
+                                     const typename tnlParallelReductionLogicalOr< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalOr< long double, int> :: ResultType& result );*/
+
+template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< char, long int > >
+                                   ( const tnlParallelReductionLogicalOr< char, long int >& operation,
+                                     const typename tnlParallelReductionLogicalOr< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalOr< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalOr< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalOr< char, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< int, long int > >
+                                   ( const tnlParallelReductionLogicalOr< int, long int >& operation,
+                                     const typename tnlParallelReductionLogicalOr< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalOr< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalOr< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalOr< int, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< float, long int > >
+                                   ( const tnlParallelReductionLogicalOr< float, long int >& operation,
+                                     const typename tnlParallelReductionLogicalOr< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalOr< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalOr< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalOr< float, long int> :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, long int > >
+                                   ( const tnlParallelReductionLogicalOr< double, long int>& operation,
+                                     const typename tnlParallelReductionLogicalOr< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalOr< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalOr< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalOr< double, long int> :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, long int > >
+                                   ( const tnlParallelReductionLogicalOr< long double, long int>& operation,
+                                     const typename tnlParallelReductionLogicalOr< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalOr< long double, long int> :: ResultType& result );*/
+
+
+/****
+ * Lp Norm: explicit instantiations for < float | double, int | long int >; the long double variants are commented out.
+ */
+template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< float, int > >
+                                   ( const tnlParallelReductionLpNorm< float, int >& operation,
+                                     const typename tnlParallelReductionLpNorm< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionLpNorm< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLpNorm< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLpNorm< float, int> :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, int > >
+                                   ( const tnlParallelReductionLpNorm< double, int>& operation,
+                                     const typename tnlParallelReductionLpNorm< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionLpNorm< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLpNorm< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLpNorm< double, int> :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, int > >
+                                   ( const tnlParallelReductionLpNorm< long double, int>& operation,
+                                     const typename tnlParallelReductionLpNorm< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLpNorm< long double, int> :: ResultType& result );*/
+
+
+
+template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< float, long int > >
+                                   ( const tnlParallelReductionLpNorm< float, long int >& operation,
+                                     const typename tnlParallelReductionLpNorm< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLpNorm< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLpNorm< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLpNorm< float, long int> :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, long int > >
+                                   ( const tnlParallelReductionLpNorm< double, long int>& operation,
+                                     const typename tnlParallelReductionLpNorm< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLpNorm< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLpNorm< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLpNorm< double, long int> :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, long int > >
+                                   ( const tnlParallelReductionLpNorm< long double, long int>& operation,
+                                     const typename tnlParallelReductionLpNorm< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLpNorm< long double, long int> :: ResultType& result );*/
+
+
+/****
+ * Equalities: explicit instantiations for < char | int | float | double, int | long int >; the long double variants are commented out.
+ */
+template bool reductionOnCudaDevice< tnlParallelReductionEqualities< char, int > >
+                                   ( const tnlParallelReductionEqualities< char, int >& operation,
+                                     const typename tnlParallelReductionEqualities< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< char, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionEqualities< int, int > >
+                                   ( const tnlParallelReductionEqualities< int, int >& operation,
+                                     const typename tnlParallelReductionEqualities< int, int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< int, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< int, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< int, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionEqualities< float, int > >
+                                   ( const tnlParallelReductionEqualities< float, int >& operation,
+                                     const typename tnlParallelReductionEqualities< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< float, int> :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, int > >
+                                   ( const tnlParallelReductionEqualities< double, int>& operation,
+                                     const typename tnlParallelReductionEqualities< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< double, int> :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, int > >
+                                   ( const tnlParallelReductionEqualities< long double, int>& operation,
+                                     const typename tnlParallelReductionEqualities< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< long double, int> :: ResultType& result );*/
+
+template bool reductionOnCudaDevice< tnlParallelReductionEqualities< char, long int > >
+                                   ( const tnlParallelReductionEqualities< char, long int >& operation,
+                                     const typename tnlParallelReductionEqualities< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< char, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionEqualities< int, long int > >
+                                   ( const tnlParallelReductionEqualities< int, long int >& operation,
+                                     const typename tnlParallelReductionEqualities< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< int, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionEqualities< float, long int > >
+                                   ( const tnlParallelReductionEqualities< float, long int >& operation,
+                                     const typename tnlParallelReductionEqualities< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< float, long int> :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, long int > >
+                                   ( const tnlParallelReductionEqualities< double, long int>& operation,
+                                     const typename tnlParallelReductionEqualities< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< double, long int> :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, long int > >
+                                   ( const tnlParallelReductionEqualities< long double, long int>& operation,
+                                     const typename tnlParallelReductionEqualities< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< long double, long int> :: ResultType& result );*/
+
+
+/****
+ * Inequalities: explicit instantiations for < char | int | float | double, int | long int >; the long double variants are commented out.
+ */
+template bool reductionOnCudaDevice< tnlParallelReductionInequalities< char, int > >
+                                   ( const tnlParallelReductionInequalities< char, int >& operation,
+                                     const typename tnlParallelReductionInequalities< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< char, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionInequalities< int, int > >
+                                   ( const tnlParallelReductionInequalities< int, int >& operation,
+                                     const typename tnlParallelReductionInequalities< int, int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< int, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< int, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< int, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionInequalities< float, int > >
+                                   ( const tnlParallelReductionInequalities< float, int >& operation,
+                                     const typename tnlParallelReductionInequalities< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< float, int> :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, int > >
+                                   ( const tnlParallelReductionInequalities< double, int>& operation,
+                                     const typename tnlParallelReductionInequalities< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< double, int> :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, int > >
+                                   ( const tnlParallelReductionInequalities< long double, int>& operation,
+                                     const typename tnlParallelReductionInequalities< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< long double, int> :: ResultType& result );*/
+
+template bool reductionOnCudaDevice< tnlParallelReductionInequalities< char, long int > >
+                                   ( const tnlParallelReductionInequalities< char, long int >& operation,
+                                     const typename tnlParallelReductionInequalities< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< char, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionInequalities< int, long int > >
+                                   ( const tnlParallelReductionInequalities< int, long int >& operation,
+                                     const typename tnlParallelReductionInequalities< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< int, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionInequalities< float, long int > >
+                                   ( const tnlParallelReductionInequalities< float, long int >& operation,
+                                     const typename tnlParallelReductionInequalities< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< float, long int> :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, long int > >
+                                   ( const tnlParallelReductionInequalities< double, long int>& operation,
+                                     const typename tnlParallelReductionInequalities< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< double, long int> :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, long int > >
+                                   ( const tnlParallelReductionInequalities< long double, long int>& operation,
+                                     const typename tnlParallelReductionInequalities< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< long double, long int> :: ResultType& result );*/
+
+
+/****
+ * Sdot: explicit instantiations for < char | int | float | double, int | long int >; the long double variants are commented out.
+ */
+template bool reductionOnCudaDevice< tnlParallelReductionSdot< char, int > >
+                                   ( const tnlParallelReductionSdot< char, int >& operation,
+                                     const typename tnlParallelReductionSdot< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< char, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionSdot< int, int > >
+                                   ( const tnlParallelReductionSdot< int, int >& operation,
+                                     const typename tnlParallelReductionSdot< int, int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< int, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< int, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< int, int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionSdot< float, int > >
+                                   ( const tnlParallelReductionSdot< float, int >& operation,
+                                     const typename tnlParallelReductionSdot< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< float, int> :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionSdot< double, int > >
+                                   ( const tnlParallelReductionSdot< double, int>& operation,
+                                     const typename tnlParallelReductionSdot< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< double, int> :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionSdot< long double, int > >
+                                   ( const tnlParallelReductionSdot< long double, int>& operation,
+                                     const typename tnlParallelReductionSdot< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< long double, int> :: ResultType& result );*/
+
+template bool reductionOnCudaDevice< tnlParallelReductionSdot< char, long int > >
+                                   ( const tnlParallelReductionSdot< char, long int >& operation,
+                                     const typename tnlParallelReductionSdot< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< char, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionSdot< int, long int > >
+                                   ( const tnlParallelReductionSdot< int, long int >& operation,
+                                     const typename tnlParallelReductionSdot< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< int, long int > :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionSdot< float, long int > >
+                                   ( const tnlParallelReductionSdot< float, long int >& operation,
+                                     const typename tnlParallelReductionSdot< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< float, long int> :: ResultType& result );
+
+template bool reductionOnCudaDevice< tnlParallelReductionSdot< double, long int > >
+                                   ( const tnlParallelReductionSdot< double, long int>& operation,
+                                     const typename tnlParallelReductionSdot< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< double, long int> :: ResultType& result );
+
+/*template bool reductionOnCudaDevice< tnlParallelReductionSdot< long double, long int > >
+                                   ( const tnlParallelReductionSdot< long double, long int>& operation,
+                                     const typename tnlParallelReductionSdot< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< long double, long int> :: ResultType& result );*/
+
+#endif
diff --git a/src/implementation/core/cuda/cuda-reduction_impl.h b/src/implementation/core/cuda/cuda-reduction_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..6eaf2f674e5d0232fa068c2f9cbdda98ffc3e3db
--- /dev/null
+++ b/src/implementation/core/cuda/cuda-reduction_impl.h
@@ -0,0 +1,1313 @@
+/***************************************************************************
+                          cuda-reduction_impl.h  -  description
+                             -------------------
+    begin                : Mar 24, 2013
+    copyright            : (C) 2013 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#ifndef CUDA_REDUCTION_IMPL_H_
+#define CUDA_REDUCTION_IMPL_H_
+
+#ifdef HAVE_CUDA
+#include <cuda.h>
+#endif
+#include <iostream>
+#include <core/tnlAssert.h>
+#include <core/cuda/reduction-operations.h>
+#include <implementation/core/memory-operations.h>
+
+using namespace std;
+
+
+/****
+ * Arrays with at most this many elements are copied to the host and reduced
+ * on the CPU; it is also the capacity of the host-side staging buffers.
+ */
+const int maxGPUReductionDataSize = 256;
+
+#ifdef HAVE_CUDA
+
+
+/***
+ * One step of the in-block reduction for an even number of elements.
+ * Every thread with tid < s passes the indices tid and tid + s to
+ * operation. commonReductionOnDevice and stores the result in sdata[ tid ];
+ * the element tid + s is assumed to exist. Threads with tid >= s do nothing.
+ */
+template< typename Operation >
+__device__ void reduceAligned( const Operation& operation,
+                               typename Operation :: IndexType tid,
+                               typename Operation :: IndexType  s,
+                               typename Operation :: ResultType* sdata )
+{
+   if( tid >= s )
+      return;
+   sdata[ tid ] = operation. commonReductionOnDevice( tid, tid + s, sdata );
+}
+
+/***
+ * One step of the in-block reduction that also works for an odd number n of
+ * elements. Threads with tid < s reduce the elements tid and tid + s; if
+ * 2 * s < n (n is odd when s = n / 2), thread n - 1 then folds the left-over
+ * element into sdata[ 0 ]. All threads of the block must call it with equal s, n.
+ */
+template< typename Operation >
+__device__ void reduceNonAligned( const Operation& operation,
+                                  typename Operation :: IndexType tid,
+                                  typename Operation :: IndexType s,
+                                  typename Operation :: IndexType n,
+                                  typename Operation :: ResultType* sdata )
+{
+   if( tid < s )
+      sdata[ tid ] = operation. commonReductionOnDevice( tid, tid + s, sdata );
+   /* Thread n - 1 updates sdata[ 0 ] after thread 0 has written it above.
+    * For n > 32 (the warp size assumed throughout this file) the two threads
+    * can belong to different warps, so a block-wide barrier is needed. The
+    * earlier test ( s > 32 ) skipped it e.g. for n = 37, s = 18, where
+    * threads 0 and 36 raced on sdata[ 0 ]. For n <= 32 all active threads
+    * are in the first warp and the implicit warp synchronisation is relied on.
+    */
+   if( n > 32 )
+      __syncthreads();
+   if( 2 * s < n && tid == n - 1 )
+      sdata[ 0 ] = operation. commonReductionOnDevice( 0, tid, sdata );
+}
+
+/***
+ * The parallel reduction kernel. Each CUDA block reduces its share of the
+ * input in shared memory and writes a single partial result.
+ * The dynamic shared memory must hold blockDim.x elements of ResultType and
+ * the template parameter blockSize is expected to equal blockDim.x.
+ * NOTE(review): the earlier header called this kernel inefficient and pointed
+ * to tnlCUDASimpleReduction2 / tnlCUDAReduction - confirm if that still holds.
+ *
+ * @param operation defines how the elements are read and combined
+ * @param size is the number of all elements to reduce - not just in one block.
+ * @param deviceInput input data which we want to reduce
+ * @param deviceInput2 second input array, only forwarded to the operation
+ * @param deviceOutput an array to which we write the result of reduction;
+ *                     each block writes the element deviceOutput[ blockIdx.x ].
+ */
+template < typename Operation, int blockSize >
+__global__ void tnlCUDAReductionKernel( const Operation operation,
+                                        const typename Operation :: IndexType size,
+                                        const typename Operation :: RealType* deviceInput,
+                                        const typename Operation :: RealType* deviceInput2,
+                                        typename Operation :: ResultType* deviceOutput )
+{
+   extern __shared__ __align__ ( 8 ) char __sdata[];
+   
+   typedef typename Operation :: IndexType IndexType;
+   typedef typename Operation :: RealType RealType;
+   typedef typename Operation :: ResultType ResultType;
+
+   ResultType* sdata = reinterpret_cast< ResultType* >( __sdata );
+
+   /***
+    * tid is the thread index within the block, gid the index of the first
+    * input element of this thread (each block covers 2 * blockDim.x elements).
+    * lastTId is the number of input elements from the start of this block's
+    * range to the end; gridSize is the number processed by the grid per pass.
+    */
+   IndexType tid = threadIdx. x;
+   IndexType gid = 2 * blockIdx. x * blockDim. x + threadIdx. x;
+   IndexType lastTId = size - 2 * blockIdx. x * blockDim. x;
+   IndexType gridSize = 2 * blockDim. x * gridDim.x;
+
+   /***
+    * Read data into the shared memory. Each thread sequentially folds the
+    * elements gid and gid + blockDim.x of every pass into sdata[ tid ].
+    */
+   if( gid + blockDim. x < size )
+      sdata[ tid ] = operation. initialValueOnDevice( gid, gid + blockDim. x, deviceInput, deviceInput2 );
+   else if( gid < size )
+      sdata[ tid ] = operation. initialValueOnDevice( gid, deviceInput, deviceInput2 );
+   // NOTE(review): a thread whose first gid is >= size never writes sdata[ tid ].
+   gid += gridSize;
+   while( gid + blockDim. x < size )
+   {
+      sdata[ tid ] = operation. firstReductionOnDevice( tid, gid, gid + blockDim. x, sdata, deviceInput, deviceInput2 );
+      gid += gridSize;
+   }
+   if( gid < size )
+      sdata[ tid ] = operation. firstReductionOnDevice( tid, gid, sdata, deviceInput, deviceInput2 );
+   __syncthreads();
+
+
+   /***
+    *  Perform the parallel reduction.
+    *  We reduce the data with step s which is one half of the elements to reduce.
+    *  Each thread with ID < s reduces elements tid and tid + s. The result is stored
+    *  in shared memory in sdata 0 .. s - 1. We halve s and repeat until s = 1.
+    *  n is the number of elements of sdata this block reduces. When n is a power
+    *  of 2 (2 .. 512) the steps are selected by the template parameter blockSize
+    *  and need no odd-element handling; otherwise reduceNonAligned is used.
+    */
+   unsigned int n = lastTId < blockDim. x ? lastTId : blockDim. x;
+   if( n == 128 || n ==  64 || n ==  32 || n ==  16 ||
+       n ==   8 || n ==   4 || n ==   2 || n == 256 ||
+       n == 512 )
+   {
+      if( blockSize >= 512 )
+      {
+         if( tid < 256 )
+            reduceAligned( operation, tid, 256, sdata );
+         __syncthreads();
+      }
+      if( blockSize >= 256 )
+      {
+         if( tid < 128 )
+            reduceAligned( operation, tid, 128, sdata );
+         __syncthreads();
+      }
+      if( blockSize >= 128 )
+      {
+         if( tid <  64 )
+            reduceAligned( operation, tid, 64, sdata );
+         __syncthreads();
+      }
+      // NOTE(review): the steps follow blockSize, not n. If n < blockSize they
+      // read sdata entries that no thread has written - confirm n == blockSize.
+      // The remaining steps run in one warp and rely on implicit warp
+      // synchronisation (no barrier, sdata is not volatile) - TODO confirm.
+      if (tid < 32)
+      {
+         if( blockSize >= 64 )
+            reduceAligned( operation, tid, 32, sdata );
+         if( blockSize >= 32 )
+            reduceAligned( operation, tid, 16, sdata );
+         if( blockSize >= 16 )
+            reduceAligned( operation, tid,  8, sdata );
+         if( blockSize >=  8 )
+            reduceAligned( operation, tid,  4, sdata );
+         if( blockSize >=  4 )
+            reduceAligned( operation, tid,  2, sdata );
+         if( blockSize >=  2 )
+            reduceAligned( operation, tid,  1, sdata );
+      }
+   }
+   else
+   {
+      unsigned int s;
+      if( n >= 512 )
+      {
+         s = n / 2;
+         reduceNonAligned( operation, tid, s, n, sdata );
+         n = s;
+         __syncthreads();
+      }
+      if( n >= 256 )
+      {
+         s = n / 2;
+         reduceNonAligned( operation, tid, s, n, sdata );
+         n = s;
+         __syncthreads();
+      }
+      if( n >= 128 )
+      {
+         s = n / 2;
+         reduceNonAligned( operation, tid, s, n, sdata );
+         n = s;
+         __syncthreads();
+      }
+      if( n >= 64 )
+      {
+         s = n / 2;
+         reduceNonAligned( operation, tid, s, n, sdata );
+         n = s;
+         __syncthreads();
+      }
+      if( n >= 32 )
+      {
+         s = n / 2;
+         reduceNonAligned( operation, tid, s, n, sdata );
+         n = s;
+         __syncthreads();
+      }
+      /***
+       * The remaining steps run in one warp, so no barrier is placed between them.
+       */
+      if( n >= 16 )
+      {
+         s = n / 2;
+         reduceNonAligned( operation, tid, s, n, sdata );
+         n = s;
+      }
+      if( n >= 8 )
+      {
+         s = n / 2;
+         reduceNonAligned( operation, tid, s, n, sdata );
+         n = s;
+      }
+      if( n >= 4 )
+      {
+         s = n / 2;
+         reduceNonAligned( operation, tid, s, n, sdata );
+         n = s;
+      }
+      if( n >= 2 )
+      {
+         s = n / 2;
+         reduceNonAligned( operation, tid, s, n, sdata );
+         n = s;
+      }
+   }
+
+   /***
+    * Store the result of this block back in the global memory.
+    */
+   if( tid == 0 )
+      deviceOutput[ blockIdx. x ] = sdata[ 0 ];
+}
+
+/***
+ * Launches one pass of the reduction kernel over input1 / input2.
+ * If output is null, it is allocated here and the caller has to free it.
+ *
+ * @return the number of partial results written to output (one per CUDA
+ *         block), or 0 if the allocation failed or no kernel was launched.
+ */
+template< typename Operation >
+typename Operation :: IndexType reduceOnCudaDevice( const Operation& operation,
+                                                    const typename Operation :: IndexType size,
+                                                    const typename Operation :: RealType* input1,
+                                                    const typename Operation :: RealType* input2,
+                                                    typename Operation :: ResultType*& output)
+{
+   typedef typename Operation :: IndexType IndexType;
+   typedef typename Operation :: ResultType ResultType;
+
+   const IndexType desBlockSize( 512 );
+   const IndexType desGridSize( 2048 );
+   dim3 blockSize( 0 ), gridSize( 0 );
+
+   // Compute the CUDA block size aligned to the power of two.
+   blockSize. x = :: Min( size, desBlockSize );
+   IndexType alignedBlockSize = 1;
+   while( alignedBlockSize < blockSize. x ) alignedBlockSize <<= 1;
+   blockSize. x = alignedBlockSize;
+
+   /***
+    * Each block covers 2 * blockSize.x elements per pass and the kernel loops
+    * over the rest. The formula alone yields 0 blocks when size < blockSize.x
+    * (e.g. size = 300), which is an invalid launch, so use at least one block.
+    */
+   const IndexType blocks = ( IndexType ) ( size / blockSize. x + 1 ) / 2;
+   gridSize. x = :: Max( ( IndexType ) 1, :: Min( blocks, desGridSize ) );
+
+   if( ! output &&
+       ! allocateMemoryCuda( output, :: Max( ( IndexType ) 1, size / desBlockSize ) ) )
+      return 0;   // the return type is IndexType, so failure is reported as 0
+
+   // Depending on the blockSize we generate appropriate template instance;
+   // the kernel needs one ResultType of shared memory per thread.
+   IndexType shmem = blockSize. x * sizeof( ResultType );
+   switch( blockSize. x )
+   {
+      case 512:
+         tnlCUDAReductionKernel< Operation, 512 ><<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
+         break;
+      case 256:
+         tnlCUDAReductionKernel< Operation, 256 ><<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
+         break;
+      case 128:
+         tnlCUDAReductionKernel< Operation, 128 ><<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
+         break;
+      case  64:
+         tnlCUDAReductionKernel< Operation,  64 ><<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
+         break;
+      case  32:
+         tnlCUDAReductionKernel< Operation,  32 ><<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
+         break;
+      case  16:
+         tnlCUDAReductionKernel< Operation,  16 ><<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
+         break;
+      case   8:
+         tnlCUDAReductionKernel< Operation,   8 ><<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
+         break;
+      case   4:
+         tnlCUDAReductionKernel< Operation,   4 ><<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
+         break;
+      case   2:
+         tnlCUDAReductionKernel< Operation,   2 ><<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
+         break;
+      case   1:
+         tnlAssert( false, cerr << "blockSize should not be 1." << endl );
+         return 0;
+      default:
+         tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
+         return 0;
+   }
+   return gridSize. x;
+}
+#endif
+
+/***
+ * Reduces the array(s) stored in the CUDA device memory with the given operation.
+ * Inputs with at most maxGPUReductionDataSize elements are copied to the host
+ * and reduced there. Larger inputs are reduced on the device pass by pass until
+ * at most maxGPUReductionDataSize partial results remain; these are finished
+ * on the host using Operation :: LaterReductionOperation.
+ *
+ * @param deviceInput2 second input array; it may be null, then it is not copied
+ * @param result receives the reduced value when the function returns true
+ * @return false if a CUDA memory operation fails or CUDA support is missing
+ */
+template< typename Operation >
+bool reductionOnCudaDevice( const Operation& operation,
+                            const typename Operation :: IndexType size,
+                            const typename Operation :: RealType* deviceInput1,
+                            const typename Operation :: RealType* deviceInput2,
+                            typename Operation :: ResultType& result )
+{
+#ifdef HAVE_CUDA
+   typedef typename Operation :: IndexType IndexType;
+   typedef typename Operation :: RealType RealType;
+   typedef typename Operation :: ResultType ResultType;
+   typedef typename Operation :: LaterReductionOperation LaterReductionOperation;
+
+   /***
+    * Small inputs: copy the array(s) to the host and reduce them on the CPU.
+    */
+   RealType hostArray1[ maxGPUReductionDataSize ];
+   RealType hostArray2[ maxGPUReductionDataSize ];
+   if( size <= maxGPUReductionDataSize )
+   {
+      if( ! copyMemoryCudaToHost( hostArray1, deviceInput1, size ) )
+         return false;
+      if( deviceInput2 && ! copyMemoryCudaToHost( hostArray2, deviceInput2, size ) )
+         return false;
+      result = operation. initialValueOnHost( 0, hostArray1, hostArray2 );
+      for( IndexType i = 1; i < size; i ++ )
+         result = operation. reduceOnHost( i, result, hostArray1, hostArray2 );
+      return true;
+   }
+
+   /****
+    * Reduce the data on the CUDA device. A failed pass reports 0, which also
+    * terminates the loop below.
+    */
+   ResultType* deviceAux1( 0 ), *deviceAux2( 0 );
+   IndexType reducedSize = reduceOnCudaDevice( operation, size, deviceInput1, deviceInput2, deviceAux1 );
+   LaterReductionOperation laterReductionOperation;
+   while( reducedSize > maxGPUReductionDataSize )
+   {
+      reducedSize = reduceOnCudaDevice( laterReductionOperation, reducedSize,
+                                        deviceAux1, ( ResultType* ) 0, deviceAux2 );
+      Swap( deviceAux1, deviceAux2 );
+   }
+
+   /***
+    * Transfer the partial results to the host and finish the reduction there.
+    * Failures are only recorded in status so that the auxiliary device buffers
+    * are released below on every path.
+    */
+   bool status = ( reducedSize > 0 );
+   ResultType resultArray[ maxGPUReductionDataSize ];
+   if( status && ! copyMemoryCudaToHost( resultArray, deviceAux1, reducedSize ) )
+      status = false;
+   if( status )
+   {
+      result = laterReductionOperation. initialValueOnHost( 0, resultArray, ( ResultType* ) 0 );
+      for( IndexType i = 1; i < reducedSize; i ++ )
+         result = laterReductionOperation. reduceOnHost( i, result, resultArray, ( ResultType*) 0 );
+   }
+
+   // Free the memory allocated on the device, even after a failure.
+   if( deviceAux1 && ! freeMemoryCuda( deviceAux1 ) )
+      status = false;
+   if( deviceAux2 && ! freeMemoryCuda( deviceAux2 ) )
+      status = false;
+   return status;
+#else
+   cerr << "I am sorry but CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." << endl;
+   return false;
+#endif
+}
+
+#ifdef TEMPLATE_EXPLICIT_INSTANTIATION
+
+/****
+ * Sum
+ */
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionSum< char, int > >
+                                   ( const tnlParallelReductionSum< char, int >& operation,
+                                     const typename tnlParallelReductionSum< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< char, int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionSum< int, int > >
+                                   ( const tnlParallelReductionSum< int, int >& operation,
+                                     const typename tnlParallelReductionSum< int, int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< int, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< int, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< int, int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionSum< float, int > >
+                                   ( const tnlParallelReductionSum< float, int >& operation,
+                                     const typename tnlParallelReductionSum< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< float, int> :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionSum< double, int > >
+                                   ( const tnlParallelReductionSum< double, int>& operation,
+                                     const typename tnlParallelReductionSum< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< double, int> :: ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, int > >
+                                   ( const tnlParallelReductionSum< long double, int>& operation,
+                                     const typename tnlParallelReductionSum< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< long double, int> :: ResultType& result );*/
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionSum< char, long int > >
+                                   ( const tnlParallelReductionSum< char, long int >& operation,
+                                     const typename tnlParallelReductionSum< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< char, long int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionSum< int, long int > >
+                                   ( const tnlParallelReductionSum< int, long int >& operation,
+                                     const typename tnlParallelReductionSum< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< int, long int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionSum< float, long int > >
+                                   ( const tnlParallelReductionSum< float, long int >& operation,
+                                     const typename tnlParallelReductionSum< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< float, long int> :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionSum< double, long int > >
+                                   ( const tnlParallelReductionSum< double, long int>& operation,
+                                     const typename tnlParallelReductionSum< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< double, long int> :: ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, long int > >
+                                   ( const tnlParallelReductionSum< long double, long int>& operation,
+                                     const typename tnlParallelReductionSum< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSum< long double, long int> :: ResultType& result );*/
+
+/****
+ * Min
+ */
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionMin< char, int > >
+                                   ( const tnlParallelReductionMin< char, int >& operation,
+                                     const typename tnlParallelReductionMin< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< char, int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionMin< int, int > >
+                                   ( const tnlParallelReductionMin< int, int >& operation,
+                                     const typename tnlParallelReductionMin< int, int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< int, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< int, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< int, int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionMin< float, int > >
+                                   ( const tnlParallelReductionMin< float, int >& operation,
+                                     const typename tnlParallelReductionMin< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< float, int> :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionMin< double, int > >
+                                   ( const tnlParallelReductionMin< double, int>& operation,
+                                     const typename tnlParallelReductionMin< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< double, int> :: ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, int > >
+                                   ( const tnlParallelReductionMin< long double, int>& operation,
+                                     const typename tnlParallelReductionMin< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< long double, int> :: ResultType& result );*/
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionMin< char, long int > >
+                                   ( const tnlParallelReductionMin< char, long int >& operation,
+                                     const typename tnlParallelReductionMin< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< char, long int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionMin< int, long int > >
+                                   ( const tnlParallelReductionMin< int, long int >& operation,
+                                     const typename tnlParallelReductionMin< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< int, long int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionMin< float, long int > >
+                                   ( const tnlParallelReductionMin< float, long int >& operation,
+                                     const typename tnlParallelReductionMin< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< float, long int> :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionMin< double, long int > >
+                                   ( const tnlParallelReductionMin< double, long int>& operation,
+                                     const typename tnlParallelReductionMin< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< double, long int> :: ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, long int > >
+                                   ( const tnlParallelReductionMin< long double, long int>& operation,
+                                     const typename tnlParallelReductionMin< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMin< long double, long int> :: ResultType& result );*/
+
+/****
+ * Max
+ */
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionMax< char, int > >
+                                   ( const tnlParallelReductionMax< char, int >& operation,
+                                     const typename tnlParallelReductionMax< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionMax< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionMax< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionMax< char, int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionMax< int, int > >
+   ( const tnlParallelReductionMax< int, int >& operation,
+     const typename tnlParallelReductionMax< int, int >::IndexType size,
+     const typename tnlParallelReductionMax< int, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionMax< int, int >::RealType* deviceInput2,
+     typename tnlParallelReductionMax< int, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionMax< float, int > >
+   ( const tnlParallelReductionMax< float, int >& operation,
+     const typename tnlParallelReductionMax< float, int >::IndexType size,
+     const typename tnlParallelReductionMax< float, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionMax< float, int >::RealType* deviceInput2,
+     typename tnlParallelReductionMax< float, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionMax< double, int > >
+   ( const tnlParallelReductionMax< double, int >& operation,
+     const typename tnlParallelReductionMax< double, int >::IndexType size,
+     const typename tnlParallelReductionMax< double, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionMax< double, int >::RealType* deviceInput2,
+     typename tnlParallelReductionMax< double, int >::ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, int > >
+   ( const tnlParallelReductionMax< long double, int >& operation,
+     const typename tnlParallelReductionMax< long double, int >::IndexType size,
+     const typename tnlParallelReductionMax< long double, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionMax< long double, int >::RealType* deviceInput2,
+     typename tnlParallelReductionMax< long double, int >::ResultType& result );*/
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionMax< char, long int > >
+   ( const tnlParallelReductionMax< char, long int >& operation,
+     const typename tnlParallelReductionMax< char, long int >::IndexType size,
+     const typename tnlParallelReductionMax< char, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionMax< char, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionMax< char, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionMax< int, long int > >
+   ( const tnlParallelReductionMax< int, long int >& operation,
+     const typename tnlParallelReductionMax< int, long int >::IndexType size,
+     const typename tnlParallelReductionMax< int, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionMax< int, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionMax< int, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionMax< float, long int > >
+   ( const tnlParallelReductionMax< float, long int >& operation,
+     const typename tnlParallelReductionMax< float, long int >::IndexType size,
+     const typename tnlParallelReductionMax< float, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionMax< float, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionMax< float, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionMax< double, long int > >
+   ( const tnlParallelReductionMax< double, long int >& operation,
+     const typename tnlParallelReductionMax< double, long int >::IndexType size,
+     const typename tnlParallelReductionMax< double, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionMax< double, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionMax< double, long int >::ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, long int > >
+   ( const tnlParallelReductionMax< long double, long int >& operation,
+     const typename tnlParallelReductionMax< long double, long int >::IndexType size,
+     const typename tnlParallelReductionMax< long double, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionMax< long double, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionMax< long double, long int >::ResultType& result );*/
+
+/****
+ * Abs sum: explicit instantiation declarations for tnlParallelReductionAbsSum (long double disabled)
+ */
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, int > >
+   ( const tnlParallelReductionAbsSum< char, int >& operation,
+     const typename tnlParallelReductionAbsSum< char, int >::IndexType size,
+     const typename tnlParallelReductionAbsSum< char, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsSum< char, int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsSum< char, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< int, int > >
+   ( const tnlParallelReductionAbsSum< int, int >& operation,
+     const typename tnlParallelReductionAbsSum< int, int >::IndexType size,
+     const typename tnlParallelReductionAbsSum< int, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsSum< int, int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsSum< int, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< float, int > >
+   ( const tnlParallelReductionAbsSum< float, int >& operation,
+     const typename tnlParallelReductionAbsSum< float, int >::IndexType size,
+     const typename tnlParallelReductionAbsSum< float, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsSum< float, int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsSum< float, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, int > >
+   ( const tnlParallelReductionAbsSum< double, int >& operation,
+     const typename tnlParallelReductionAbsSum< double, int >::IndexType size,
+     const typename tnlParallelReductionAbsSum< double, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsSum< double, int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsSum< double, int >::ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, int > >
+   ( const tnlParallelReductionAbsSum< long double, int >& operation,
+     const typename tnlParallelReductionAbsSum< long double, int >::IndexType size,
+     const typename tnlParallelReductionAbsSum< long double, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsSum< long double, int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsSum< long double, int >::ResultType& result );*/
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, long int > >
+   ( const tnlParallelReductionAbsSum< char, long int >& operation,
+     const typename tnlParallelReductionAbsSum< char, long int >::IndexType size,
+     const typename tnlParallelReductionAbsSum< char, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsSum< char, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsSum< char, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< int, long int > >
+   ( const tnlParallelReductionAbsSum< int, long int >& operation,
+     const typename tnlParallelReductionAbsSum< int, long int >::IndexType size,
+     const typename tnlParallelReductionAbsSum< int, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsSum< int, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsSum< int, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< float, long int > >
+   ( const tnlParallelReductionAbsSum< float, long int >& operation,
+     const typename tnlParallelReductionAbsSum< float, long int >::IndexType size,
+     const typename tnlParallelReductionAbsSum< float, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsSum< float, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsSum< float, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, long int > >
+   ( const tnlParallelReductionAbsSum< double, long int >& operation,
+     const typename tnlParallelReductionAbsSum< double, long int >::IndexType size,
+     const typename tnlParallelReductionAbsSum< double, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsSum< double, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsSum< double, long int >::ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, long int > >
+   ( const tnlParallelReductionAbsSum< long double, long int >& operation,
+     const typename tnlParallelReductionAbsSum< long double, long int >::IndexType size,
+     const typename tnlParallelReductionAbsSum< long double, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsSum< long double, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsSum< long double, long int >::ResultType& result );*/
+
+/****
+ * Abs min: explicit instantiation declarations for tnlParallelReductionAbsMin (long double disabled)
+ */
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, int > >
+   ( const tnlParallelReductionAbsMin< char, int >& operation,
+     const typename tnlParallelReductionAbsMin< char, int >::IndexType size,
+     const typename tnlParallelReductionAbsMin< char, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMin< char, int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMin< char, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< int, int > >
+   ( const tnlParallelReductionAbsMin< int, int >& operation,
+     const typename tnlParallelReductionAbsMin< int, int >::IndexType size,
+     const typename tnlParallelReductionAbsMin< int, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMin< int, int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMin< int, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< float, int > >
+   ( const tnlParallelReductionAbsMin< float, int >& operation,
+     const typename tnlParallelReductionAbsMin< float, int >::IndexType size,
+     const typename tnlParallelReductionAbsMin< float, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMin< float, int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMin< float, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, int > >
+   ( const tnlParallelReductionAbsMin< double, int >& operation,
+     const typename tnlParallelReductionAbsMin< double, int >::IndexType size,
+     const typename tnlParallelReductionAbsMin< double, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMin< double, int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMin< double, int >::ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, int > >
+   ( const tnlParallelReductionAbsMin< long double, int >& operation,
+     const typename tnlParallelReductionAbsMin< long double, int >::IndexType size,
+     const typename tnlParallelReductionAbsMin< long double, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMin< long double, int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMin< long double, int >::ResultType& result );*/
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, long int > >
+   ( const tnlParallelReductionAbsMin< char, long int >& operation,
+     const typename tnlParallelReductionAbsMin< char, long int >::IndexType size,
+     const typename tnlParallelReductionAbsMin< char, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMin< char, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMin< char, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< int, long int > >
+   ( const tnlParallelReductionAbsMin< int, long int >& operation,
+     const typename tnlParallelReductionAbsMin< int, long int >::IndexType size,
+     const typename tnlParallelReductionAbsMin< int, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMin< int, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMin< int, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< float, long int > >
+   ( const tnlParallelReductionAbsMin< float, long int >& operation,
+     const typename tnlParallelReductionAbsMin< float, long int >::IndexType size,
+     const typename tnlParallelReductionAbsMin< float, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMin< float, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMin< float, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, long int > >
+   ( const tnlParallelReductionAbsMin< double, long int >& operation,
+     const typename tnlParallelReductionAbsMin< double, long int >::IndexType size,
+     const typename tnlParallelReductionAbsMin< double, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMin< double, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMin< double, long int >::ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, long int > >
+   ( const tnlParallelReductionAbsMin< long double, long int >& operation,
+     const typename tnlParallelReductionAbsMin< long double, long int >::IndexType size,
+     const typename tnlParallelReductionAbsMin< long double, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMin< long double, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMin< long double, long int >::ResultType& result );*/
+/****
+ * Abs max: explicit instantiation declarations for tnlParallelReductionAbsMax (long double disabled)
+ */
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, int > >
+   ( const tnlParallelReductionAbsMax< char, int >& operation,
+     const typename tnlParallelReductionAbsMax< char, int >::IndexType size,
+     const typename tnlParallelReductionAbsMax< char, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMax< char, int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMax< char, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< int, int > >
+   ( const tnlParallelReductionAbsMax< int, int >& operation,
+     const typename tnlParallelReductionAbsMax< int, int >::IndexType size,
+     const typename tnlParallelReductionAbsMax< int, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMax< int, int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMax< int, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< float, int > >
+   ( const tnlParallelReductionAbsMax< float, int >& operation,
+     const typename tnlParallelReductionAbsMax< float, int >::IndexType size,
+     const typename tnlParallelReductionAbsMax< float, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMax< float, int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMax< float, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, int > >
+   ( const tnlParallelReductionAbsMax< double, int >& operation,
+     const typename tnlParallelReductionAbsMax< double, int >::IndexType size,
+     const typename tnlParallelReductionAbsMax< double, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMax< double, int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMax< double, int >::ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, int > >
+   ( const tnlParallelReductionAbsMax< long double, int >& operation,
+     const typename tnlParallelReductionAbsMax< long double, int >::IndexType size,
+     const typename tnlParallelReductionAbsMax< long double, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMax< long double, int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMax< long double, int >::ResultType& result );*/
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, long int > >
+   ( const tnlParallelReductionAbsMax< char, long int >& operation,
+     const typename tnlParallelReductionAbsMax< char, long int >::IndexType size,
+     const typename tnlParallelReductionAbsMax< char, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMax< char, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMax< char, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< int, long int > >
+   ( const tnlParallelReductionAbsMax< int, long int >& operation,
+     const typename tnlParallelReductionAbsMax< int, long int >::IndexType size,
+     const typename tnlParallelReductionAbsMax< int, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMax< int, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMax< int, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< float, long int > >
+   ( const tnlParallelReductionAbsMax< float, long int >& operation,
+     const typename tnlParallelReductionAbsMax< float, long int >::IndexType size,
+     const typename tnlParallelReductionAbsMax< float, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMax< float, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMax< float, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, long int > >
+   ( const tnlParallelReductionAbsMax< double, long int >& operation,
+     const typename tnlParallelReductionAbsMax< double, long int >::IndexType size,
+     const typename tnlParallelReductionAbsMax< double, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMax< double, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMax< double, long int >::ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, long int > >
+   ( const tnlParallelReductionAbsMax< long double, long int >& operation,
+     const typename tnlParallelReductionAbsMax< long double, long int >::IndexType size,
+     const typename tnlParallelReductionAbsMax< long double, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionAbsMax< long double, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionAbsMax< long double, long int >::ResultType& result );*/
+
+/****
+ * Logical AND: explicit instantiation declarations for tnlParallelReductionLogicalAnd (long double disabled)
+ */
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< char, int > >
+   ( const tnlParallelReductionLogicalAnd< char, int >& operation,
+     const typename tnlParallelReductionLogicalAnd< char, int >::IndexType size,
+     const typename tnlParallelReductionLogicalAnd< char, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalAnd< char, int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalAnd< char, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< int, int > >
+   ( const tnlParallelReductionLogicalAnd< int, int >& operation,
+     const typename tnlParallelReductionLogicalAnd< int, int >::IndexType size,
+     const typename tnlParallelReductionLogicalAnd< int, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalAnd< int, int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalAnd< int, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< float, int > >
+   ( const tnlParallelReductionLogicalAnd< float, int >& operation,
+     const typename tnlParallelReductionLogicalAnd< float, int >::IndexType size,
+     const typename tnlParallelReductionLogicalAnd< float, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalAnd< float, int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalAnd< float, int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, int > >
+   ( const tnlParallelReductionLogicalAnd< double, int >& operation,
+     const typename tnlParallelReductionLogicalAnd< double, int >::IndexType size,
+     const typename tnlParallelReductionLogicalAnd< double, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalAnd< double, int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalAnd< double, int >::ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, int > >
+   ( const tnlParallelReductionLogicalAnd< long double, int >& operation,
+     const typename tnlParallelReductionLogicalAnd< long double, int >::IndexType size,
+     const typename tnlParallelReductionLogicalAnd< long double, int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalAnd< long double, int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalAnd< long double, int >::ResultType& result );*/
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< char, long int > >
+   ( const tnlParallelReductionLogicalAnd< char, long int >& operation,
+     const typename tnlParallelReductionLogicalAnd< char, long int >::IndexType size,
+     const typename tnlParallelReductionLogicalAnd< char, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalAnd< char, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalAnd< char, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< int, long int > >
+   ( const tnlParallelReductionLogicalAnd< int, long int >& operation,
+     const typename tnlParallelReductionLogicalAnd< int, long int >::IndexType size,
+     const typename tnlParallelReductionLogicalAnd< int, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalAnd< int, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalAnd< int, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< float, long int > >
+   ( const tnlParallelReductionLogicalAnd< float, long int >& operation,
+     const typename tnlParallelReductionLogicalAnd< float, long int >::IndexType size,
+     const typename tnlParallelReductionLogicalAnd< float, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalAnd< float, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalAnd< float, long int >::ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, long int > >
+   ( const tnlParallelReductionLogicalAnd< double, long int >& operation,
+     const typename tnlParallelReductionLogicalAnd< double, long int >::IndexType size,
+     const typename tnlParallelReductionLogicalAnd< double, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalAnd< double, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalAnd< double, long int >::ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, long int > >
+   ( const tnlParallelReductionLogicalAnd< long double, long int >& operation,
+     const typename tnlParallelReductionLogicalAnd< long double, long int >::IndexType size,
+     const typename tnlParallelReductionLogicalAnd< long double, long int >::RealType* deviceInput1,
+     const typename tnlParallelReductionLogicalAnd< long double, long int >::RealType* deviceInput2,
+     typename tnlParallelReductionLogicalAnd< long double, long int >::ResultType& result );*/
+
+/****
+ * Logical OR -- extern template declarations of reductionOnCudaDevice for < char | int | float | double, int | long int >; long double is commented out.
+ */
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< char, int > >
+                                   ( const tnlParallelReductionLogicalOr< char, int >& operation,
+                                     const typename tnlParallelReductionLogicalOr< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalOr< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalOr< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalOr< char, int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< int, int > >
+                                   ( const tnlParallelReductionLogicalOr< int, int >& operation,
+                                     const typename tnlParallelReductionLogicalOr< int, int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalOr< int, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalOr< int, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalOr< int, int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< float, int > >
+                                   ( const tnlParallelReductionLogicalOr< float, int >& operation,
+                                     const typename tnlParallelReductionLogicalOr< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalOr< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalOr< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalOr< float, int> :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, int > >
+                                   ( const tnlParallelReductionLogicalOr< double, int>& operation,
+                                     const typename tnlParallelReductionLogicalOr< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalOr< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalOr< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalOr< double, int> :: ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, int > >
+                                   ( const tnlParallelReductionLogicalOr< long double, int>& operation,
+                                     const typename tnlParallelReductionLogicalOr< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalOr< long double, int> :: ResultType& result );*/
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< char, long int > >
+                                   ( const tnlParallelReductionLogicalOr< char, long int >& operation,
+                                     const typename tnlParallelReductionLogicalOr< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalOr< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalOr< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalOr< char, long int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< int, long int > >
+                                   ( const tnlParallelReductionLogicalOr< int, long int >& operation,
+                                     const typename tnlParallelReductionLogicalOr< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalOr< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalOr< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalOr< int, long int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< float, long int > >
+                                   ( const tnlParallelReductionLogicalOr< float, long int >& operation,
+                                     const typename tnlParallelReductionLogicalOr< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalOr< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalOr< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalOr< float, long int> :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, long int > >
+                                   ( const tnlParallelReductionLogicalOr< double, long int>& operation,
+                                     const typename tnlParallelReductionLogicalOr< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalOr< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalOr< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalOr< double, long int> :: ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, long int > >
+                                   ( const tnlParallelReductionLogicalOr< long double, long int>& operation,
+                                     const typename tnlParallelReductionLogicalOr< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLogicalOr< long double, long int> :: ResultType& result );*/
+
+
+/****
+ * Lp Norm -- extern template declarations of reductionOnCudaDevice for tnlParallelReductionLpNorm; long double is commented out.
+ * NOTE(review): with int as the second argument only float and double are declared, with long int also char and int -- confirm this is intended. */
+extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< float, int > >
+                                   ( const tnlParallelReductionLpNorm< float, int >& operation,
+                                     const typename tnlParallelReductionLpNorm< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionLpNorm< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLpNorm< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLpNorm< float, int> :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, int > >
+                                   ( const tnlParallelReductionLpNorm< double, int>& operation,
+                                     const typename tnlParallelReductionLpNorm< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionLpNorm< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLpNorm< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLpNorm< double, int> :: ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, int > >
+                                   ( const tnlParallelReductionLpNorm< long double, int>& operation,
+                                     const typename tnlParallelReductionLpNorm< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLpNorm< long double, int> :: ResultType& result );*/
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< char, long int > >
+                                   ( const tnlParallelReductionLpNorm< char, long int >& operation,
+                                     const typename tnlParallelReductionLpNorm< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLpNorm< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLpNorm< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLpNorm< char, long int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< int, long int > >
+                                   ( const tnlParallelReductionLpNorm< int, long int >& operation,
+                                     const typename tnlParallelReductionLpNorm< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLpNorm< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLpNorm< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLpNorm< int, long int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< float, long int > >
+                                   ( const tnlParallelReductionLpNorm< float, long int >& operation,
+                                     const typename tnlParallelReductionLpNorm< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLpNorm< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLpNorm< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLpNorm< float, long int> :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, long int > >
+                                   ( const tnlParallelReductionLpNorm< double, long int>& operation,
+                                     const typename tnlParallelReductionLpNorm< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLpNorm< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLpNorm< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLpNorm< double, long int> :: ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, long int > >
+                                   ( const tnlParallelReductionLpNorm< long double, long int>& operation,
+                                     const typename tnlParallelReductionLpNorm< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionLpNorm< long double, long int> :: ResultType& result );*/
+
+
+/****
+ * Equalities -- extern template declarations of reductionOnCudaDevice for < char | int | float | double, int | long int >; long double is commented out.
+ */
+extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< char, int > >
+                                   ( const tnlParallelReductionEqualities< char, int >& operation,
+                                     const typename tnlParallelReductionEqualities< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< char, int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< int, int > >
+                                   ( const tnlParallelReductionEqualities< int, int >& operation,
+                                     const typename tnlParallelReductionEqualities< int, int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< int, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< int, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< int, int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< float, int > >
+                                   ( const tnlParallelReductionEqualities< float, int >& operation,
+                                     const typename tnlParallelReductionEqualities< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< float, int> :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, int > >
+                                   ( const tnlParallelReductionEqualities< double, int>& operation,
+                                     const typename tnlParallelReductionEqualities< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< double, int> :: ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, int > >
+                                   ( const tnlParallelReductionEqualities< long double, int>& operation,
+                                     const typename tnlParallelReductionEqualities< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< long double, int> :: ResultType& result );*/
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< char, long int > >
+                                   ( const tnlParallelReductionEqualities< char, long int >& operation,
+                                     const typename tnlParallelReductionEqualities< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< char, long int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< int, long int > >
+                                   ( const tnlParallelReductionEqualities< int, long int >& operation,
+                                     const typename tnlParallelReductionEqualities< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< int, long int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< float, long int > >
+                                   ( const tnlParallelReductionEqualities< float, long int >& operation,
+                                     const typename tnlParallelReductionEqualities< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< float, long int> :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, long int > >
+                                   ( const tnlParallelReductionEqualities< double, long int>& operation,
+                                     const typename tnlParallelReductionEqualities< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< double, long int> :: ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, long int > >
+                                   ( const tnlParallelReductionEqualities< long double, long int>& operation,
+                                     const typename tnlParallelReductionEqualities< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionEqualities< long double, long int> :: ResultType& result );*/
+
+
+/****
+ * Inequalities -- extern template declarations of reductionOnCudaDevice for < char | int | float | double, int | long int >; long double is commented out.
+ */
+extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< char, int > >
+                                   ( const tnlParallelReductionInequalities< char, int >& operation,
+                                     const typename tnlParallelReductionInequalities< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< char, int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< int, int > >
+                                   ( const tnlParallelReductionInequalities< int, int >& operation,
+                                     const typename tnlParallelReductionInequalities< int, int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< int, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< int, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< int, int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< float, int > >
+                                   ( const tnlParallelReductionInequalities< float, int >& operation,
+                                     const typename tnlParallelReductionInequalities< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< float, int> :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, int > >
+                                   ( const tnlParallelReductionInequalities< double, int>& operation,
+                                     const typename tnlParallelReductionInequalities< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< double, int> :: ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, int > >
+                                   ( const tnlParallelReductionInequalities< long double, int>& operation,
+                                     const typename tnlParallelReductionInequalities< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< long double, int> :: ResultType& result );*/
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< char, long int > >
+                                   ( const tnlParallelReductionInequalities< char, long int >& operation,
+                                     const typename tnlParallelReductionInequalities< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< char, long int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< int, long int > >
+                                   ( const tnlParallelReductionInequalities< int, long int >& operation,
+                                     const typename tnlParallelReductionInequalities< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< int, long int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< float, long int > >
+                                   ( const tnlParallelReductionInequalities< float, long int >& operation,
+                                     const typename tnlParallelReductionInequalities< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< float, long int> :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, long int > >
+                                   ( const tnlParallelReductionInequalities< double, long int>& operation,
+                                     const typename tnlParallelReductionInequalities< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< double, long int> :: ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, long int > >
+                                   ( const tnlParallelReductionInequalities< long double, long int>& operation,
+                                     const typename tnlParallelReductionInequalities< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionInequalities< long double, long int> :: ResultType& result );*/
+
+
+/****
+ * Sdot -- extern template declarations of reductionOnCudaDevice for < char | int | float | double, int | long int >; long double is commented out.
+ */
+extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< char, int > >
+                                   ( const tnlParallelReductionSdot< char, int >& operation,
+                                     const typename tnlParallelReductionSdot< char, int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< char, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< char, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< char, int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< int, int > >
+                                   ( const tnlParallelReductionSdot< int, int >& operation,
+                                     const typename tnlParallelReductionSdot< int, int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< int, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< int, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< int, int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< float, int > >
+                                   ( const tnlParallelReductionSdot< float, int >& operation,
+                                     const typename tnlParallelReductionSdot< float, int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< float, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< float, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< float, int> :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< double, int > >
+                                   ( const tnlParallelReductionSdot< double, int>& operation,
+                                     const typename tnlParallelReductionSdot< double, int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< double, int> :: ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< long double, int > >
+                                   ( const tnlParallelReductionSdot< long double, int>& operation,
+                                     const typename tnlParallelReductionSdot< long double, int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< long double, int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< long double, int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< long double, int> :: ResultType& result );*/
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< char, long int > >
+                                   ( const tnlParallelReductionSdot< char, long int >& operation,
+                                     const typename tnlParallelReductionSdot< char, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< char, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< char, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< char, long int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< int, long int > >
+                                   ( const tnlParallelReductionSdot< int, long int >& operation,
+                                     const typename tnlParallelReductionSdot< int, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< int, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< int, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< int, long int > :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< float, long int > >
+                                   ( const tnlParallelReductionSdot< float, long int >& operation,
+                                     const typename tnlParallelReductionSdot< float, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< float, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< float, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< float, long int> :: ResultType& result );
+
+extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< double, long int > >
+                                   ( const tnlParallelReductionSdot< double, long int>& operation,
+                                     const typename tnlParallelReductionSdot< double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< double, long int> :: ResultType& result );
+
+/*extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< long double, long int > >
+                                   ( const tnlParallelReductionSdot< long double, long int>& operation,
+                                     const typename tnlParallelReductionSdot< long double, long int > :: IndexType size,
+                                     const typename tnlParallelReductionSdot< long double, long int > :: RealType* deviceInput1,
+                                     const typename tnlParallelReductionSdot< long double, long int > :: RealType* deviceInput2,
+                                     typename tnlParallelReductionSdot< long double, long int> :: ResultType& result );*/
+
+#endif /* TEMPLATE_EXPLICIT_INSTANTIATION */
+
+#endif /* CUDA_REDUCTION_IMPL_H_ */
diff --git a/src/implementation/core/cuda/reduction_impl.h b/src/implementation/core/cuda/reduction_impl.h
deleted file mode 100644
index 388afec7010a0e44defbd82785d69b5337011b06..0000000000000000000000000000000000000000
--- a/src/implementation/core/cuda/reduction_impl.h
+++ /dev/null
@@ -1,805 +0,0 @@
-/***************************************************************************
-                          cuda-long-vector-kernels.h  -  description
-                             -------------------
-    begin                : Oct 28, 2010
-    copyright            : (C) 2010 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/***************************************************************************
- *                                                                         *
- *   This program is free software; you can redistribute it and/or modify  *
- *   it under the terms of the GNU General Public License as published by  *
- *   the Free Software Foundation; either version 2 of the License, or     *
- *   (at your option) any later version.                                   *
- *                                                                         *
- ***************************************************************************/
-
-#ifndef CUDALONGVECTORKERNELS_H_
-#define CUDALONGVECTORKERNELS_H_
-
-#ifdef HAVE_CUDA
-#include <cuda.h>
-#endif
-#include <iostream>
-#include <core/tnlVector.h>
-
-using namespace std;
-
-enum tnlTupleOperation { tnlParallelReductionMin = 1,
-                          tnlParallelReductionMax,
-                          tnlParallelReductionSum,
-                          tnlParallelReductionAbsMin,
-                          tnlParallelReductionAbsMax,
-                          tnlParallelReductionAbsSum,
-                          tnlParallelReductionLpNorm,
-                          tnlParallelReductionSdot };
-
-/****
- * This constant says that arrays smaller than its value
- * are going to be reduced on CPU.
- */
-const int maxGPUReductionDataSize = 256;
-
-/****
- * The following kernels and functions have been adopted from
- *
- * M. Harris, “Optimizing parallel reduction in cuda,” NVIDIA CUDA SDK, 2007.
- *
- * The code was extended even for data arrays with size different from
- * a power of 2.
- *
- * For the educative and also testing/debugging reasons we have 6 version of this algorithm.
- * The slower version can be found as a part o ftesting code. See directory tests.
- * Version 1 is the slowest and version 6 is the fastest - teste on CUDA architecture 1.0 - 1.3.
- * Another improvements are possible for the future devices.
- *
- */
-
-#ifdef HAVE_CUDA
-/***
- * This function returns minimum of two numbers stored on the device.
- */
-template< class T > __device__ T tnlCudaMin( const T& a,
-                                             const T& b )
-{
-   return a < b ? a : b;
-}
-
-__device__ int tnlCudaMin( const int& a,
-                           const int& b )
-{
-   return min( a, b );
-}
-
-__device__ float tnlCudaMin( const float& a,
-                             const float& b )
-{
-   return fminf( a, b );
-}
-
-__device__ double tnlCudaMin( const double& a,
-                              const double& b )
-{
-   return fmin( a, b );
-}
-
-/***
- * This function returns maximum of two numbers stored on the device.
- */
-template< class T > __device__ T tnlCudaMax( const T& a,
-                                             const T& b )
-{
-   return a > b ? a : b;
-}
-
-__device__ int tnlCudaMax( const int& a,
-                           const int& b )
-{
-   return max( a, b );
-}
-
-__device__ float tnlCudaMax( const float& a,
-                             const float& b )
-{
-   return fmaxf( a, b );
-}
-
-__device__ double tnlCudaMax( const double& a,
-                              const double& b )
-{
-   return fmax( a, b );
-}
-
-/***
- * This function returns absolute value of given number on the device.
- */
-__device__ int tnlCudaAbs( const int& a )
-{
-   return abs( a );
-}
-
-__device__ float tnlCudaAbs( const float& a )
-{
-   return fabs( a );
-}
-
-__device__ double tnlCudaAbs( const double& a )
-{
-   return fabs( a );
-}
-
-/***
- * For each thread in block with thread ID smaller then s this function reduces
- * data elements with indecis tid and tid + s. Here we assume that for each
- * tid the tid + s element also exists i.e. we have even number of elements.
- */
-template< class T, tnlTupleOperation operation >
-__device__ void reduceAligned( unsigned int tid,
-                               unsigned int s,
-                               T* sdata )
-{
-   if( tid < s )
-   {
-      if( operation == tnlParallelReductionMin )
-         sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
-      if( operation == tnlParallelReductionMax )
-         sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
-      if( operation == tnlParallelReductionSum )
-         sdata[ tid ] += sdata[ tid + s ];
-      if( operation == tnlParallelReductionAbsMin )
-         sdata[ tid ] = tnlCudaMin( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) );
-      if( operation == tnlParallelReductionAbsMax )
-         sdata[ tid ] = tnlCudaMax( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) );
-      if( operation == tnlParallelReductionLpNorm ||
-          operation == tnlParallelReductionSdot )
-         sdata[ tid ] = sdata[ tid ] + sdata[ tid + s ];
-   }
-}
-
-/***
- * For each thread in block with thread ID smaller then s this function reduces
- * data elements with indices tid and tid + s. This is a modified version of
- * the previous algorithm. Thid one works even for odd number of elements but
- * it is a bit slower.
- */
-template< class T, tnlTupleOperation operation >
-__device__ void reduceNonAligned( unsigned int tid,
-                                  unsigned int s,
-                                  unsigned int n,
-                                  T* sdata )
-{
-   if( tid < s )
-   {
-      if( operation == tnlParallelReductionMin )
-         sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
-      if( operation == tnlParallelReductionMax )
-         sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
-      if( operation == tnlParallelReductionSum )
-         sdata[ tid ] += sdata[ tid + s ];
-      if( operation == tnlParallelReductionAbsMin )
-         sdata[ tid ] = tnlCudaMin( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) );
-      if( operation == tnlParallelReductionAbsMax )
-         sdata[ tid ] = tnlCudaMax( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) );
-      if( operation == tnlParallelReductionLpNorm ||
-          operation == tnlParallelReductionSdot )
-         sdata[ tid ] = sdata[ tid ] + sdata[ tid + s ];
-   }
-   /* This is for the case when we have odd number of elements.
-    * The last one will be reduced using the thread with ID 0.
-    */
-   if( s > 32 )
-      __syncthreads();
-   if( 2 * s < n && tid == n - 1 )
-   {
-      if( operation == tnlParallelReductionMin )
-         sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] );
-      if( operation == tnlParallelReductionMax )
-         sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
-      if( operation == tnlParallelReductionSum )
-         sdata[ 0 ] += sdata[ tid ];
-      if( operation == tnlParallelReductionAbsMin )
-         sdata[ 0 ] = tnlCudaMin( tnlCudaAbs( sdata[ 0] ), tnlCudaAbs( sdata[ tid + s ] ) );
-      if( operation == tnlParallelReductionAbsMax )
-         sdata[ 0 ] = tnlCudaMax( tnlCudaAbs( sdata[ 0 ] ), tnlCudaAbs( sdata[ tid + s ] ) );
-      if( operation == tnlParallelReductionLpNorm ||
-          operation == tnlParallelReductionSdot )
-         sdata[ 0 ] = sdata[ 0 ] + sdata[ tid + s ];
-
-   }
-}
-
-/***
- * The parallel reduction of one vector.
- *
- * WARNING: This kernel only reduce data in one block. Use rather tnlCUDASimpleReduction2
- *          to call this kernel then doing it by yourself.
- *          This kernel is very inefficient. It is here only for educative and testing reasons.
- *          Please use tnlCUDAReduction instead.
- *
- * The kernel parameters:
- * @param size is the number of all element to reduce - not just in one block.
- * @param deviceInput input data which we want to reduce
- * @param deviceOutput an array to which we write the result of reduction.
- *                     Each block of the grid writes one element in this array
- *                     (i.e. the size of this array equals the number of CUDA blocks).
- */
-template < typename Type, typename ParameterType, typename Index, tnlTupleOperation operation, int blockSize >
-__global__ void tnlCUDAReductionKernel( const Index size,
-                                        const Type* deviceInput,
-                                        const Type* deviceInput2,
-                                        Type* deviceOutput,
-                                        const ParameterType parameter,
-                                        Type* dbg_array1 = 0 )
-{
-   extern __shared__ __align__ ( 8 ) char __sdata[];
-   Type* sdata = reinterpret_cast< Type* >( __sdata );
-
-   /***
-    * Get thread id (tid) and global thread id (gid).
-    * lastTId is the last relevant thread id in this block.
-    * gridSize is the number of element processed by all blocks at the
-    * same time.
-    */
-   unsigned int tid = threadIdx. x;
-   unsigned int gid = 2 * blockIdx. x * blockDim. x + threadIdx. x;
-   unsigned int lastTId = size - 2 * blockIdx. x * blockDim. x;
-   unsigned int gridSize = 2 * blockDim. x * gridDim.x;
-
-   /***
-    * Read data into the shared memory. We start with the
-    * sequential reduction.
-    */
-   if( gid + blockDim. x < size )
-   {
-      if( operation == tnlParallelReductionMin )
-         sdata[ tid ] = tnlCudaMin( deviceInput[ gid ], deviceInput[ gid + blockDim. x ] );
-      if( operation == tnlParallelReductionMax )
-         sdata[ tid ] = tnlCudaMax( deviceInput[ gid ], deviceInput[ gid + blockDim. x ] );
-      if( operation == tnlParallelReductionAbsMin )
-         sdata[ tid ] = tnlCudaMin( tnlCudaAbs( deviceInput[ gid ] ), tnlCudaAbs( deviceInput[ gid + blockDim. x ] ) );
-      if( operation == tnlParallelReductionAbsMax )
-         sdata[ tid ] = tnlCudaMax( tnlCudaAbs( deviceInput[ gid ] ), tnlCudaAbs( deviceInput[ gid + blockDim. x ] ) );
-      if( operation == tnlParallelReductionSum )
-         sdata[ tid ] = deviceInput[ gid ] + deviceInput[ gid + blockDim. x ];
-      if( operation == tnlParallelReductionLpNorm )
-         sdata[ tid ] = powf( tnlCudaAbs( deviceInput[ gid ] ), parameter ) +
-                        powf( tnlCudaAbs( deviceInput[ gid + blockDim. x ] ), parameter );
-      if( operation == tnlParallelReductionSdot )
-         sdata[ tid ] = deviceInput[ gid ] * deviceInput2[ gid ] +
-                        deviceInput[ gid + blockDim. x ] * deviceInput2[ gid + blockDim. x ];
-   }
-   else if( gid < size )
-   {
-      if( operation == tnlParallelReductionLpNorm )
-         sdata[ tid ] = powf( tnlCudaAbs( deviceInput[ gid ] ), parameter );
-      else
-         if( operation == tnlParallelReductionSdot )
-            sdata[ tid ] = deviceInput[ gid ] * deviceInput2[ gid ];
-         else
-            sdata[ tid ] = deviceInput[ gid ];
-   }
-   gid += gridSize;
-   while( gid + blockDim. x < size )
-   {
-      if( operation == tnlParallelReductionMin )
-         sdata[ tid ] = :: tnlCudaMin( sdata[ tid ], :: tnlCudaMin( deviceInput[ gid ], deviceInput[ gid + blockDim. x ] ) );
-      if( operation == tnlParallelReductionMax )
-         sdata[ tid ] = :: tnlCudaMax( sdata[ tid ], :: tnlCudaMax( deviceInput[ gid ], deviceInput[ gid + blockDim. x ] ) );
-      if( operation == tnlParallelReductionAbsMin )
-         sdata[ tid ] = :: tnlCudaMin( tnlCudaAbs( sdata[ tid ] ), :: tnlCudaMin( tnlCudaAbs( deviceInput[ gid ] ), tnlCudaAbs( deviceInput[ gid + blockDim. x ] ) ) );
-      if( operation == tnlParallelReductionAbsMax )
-         sdata[ tid ] = :: tnlCudaMax( tnlCudaAbs( sdata[ tid ] ), :: tnlCudaMax( tnlCudaAbs( deviceInput[ gid ] ), tnlCudaAbs( deviceInput[ gid + blockDim. x ] ) ) );
-      if( operation == tnlParallelReductionSum )
-         sdata[ tid ] += deviceInput[gid] + deviceInput[ gid + blockDim. x ];
-      if( operation == tnlParallelReductionLpNorm )
-         sdata[ tid ] += powf( tnlCudaAbs( deviceInput[gid] ), parameter ) +
-                         powf( tnlCudaAbs( deviceInput[ gid + blockDim. x ] ), parameter );
-      if( operation == tnlParallelReductionSdot )
-         sdata[ tid ] += deviceInput[ gid ] * deviceInput2[ gid ] +
-                         deviceInput[ gid + blockDim. x] * deviceInput2[ gid + blockDim. x ];
-      gid += gridSize;
-   }
-   if( gid < size )
-   {
-      if( operation == tnlParallelReductionMin )
-         sdata[ tid ] = :: tnlCudaMin( sdata[ tid ], deviceInput[ gid ] );
-      if( operation == tnlParallelReductionMax )
-         sdata[ tid ] = :: tnlCudaMax( sdata[ tid ], deviceInput[ gid ] );
-      if( operation == tnlParallelReductionAbsMin )
-         sdata[ tid ] = :: tnlCudaMin( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( deviceInput[ gid ] ) );
-      if( operation == tnlParallelReductionAbsMax )
-         sdata[ tid ] = :: tnlCudaMax( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( deviceInput[ gid ] ) );
-      if( operation == tnlParallelReductionSum )
-         sdata[ tid ] += deviceInput[gid];
-      if( operation == tnlParallelReductionLpNorm )
-         sdata[ tid ] += powf( tnlCudaAbs( deviceInput[gid] ), parameter );
-      if( operation == tnlParallelReductionSdot )
-         sdata[ tid ] += deviceInput[ gid ] * deviceInput2[ gid ];
-   }
-   __syncthreads();
-
-
-   /***
-    *  Process the parallel reduction.
-    *  We reduce the data with step s which is one half of the elements to reduce.
-    *  Each thread with ID < s reduce elements tid and tid + s. The result is stored
-    *  in shared memory in sdata 0 .. s. We set s = s / 2 ( i.e. s >>= 1) and repeat
-    *  the algorithm again until s = 1.
-    *  We also separate the case when the blockDim. x is power of 2 and the algorithm
-    *  can be written in more efficient way without some conditions.
-    */
-   unsigned int n = lastTId < blockDim. x ? lastTId : blockDim. x;
-   if( n == 128 || n ==  64 || n ==  32 || n ==  16 ||
-       n ==   8 || n ==   4 || n ==   2 || n == 256 ||
-       n == 512 )
-   {
-      if( blockSize >= 512 )
-      {
-         if( tid < 256 )
-            reduceAligned< Type, operation >( tid, 256, sdata );
-         __syncthreads();
-      }
-      if( blockSize >= 256 )
-      {
-         if( tid < 128 )
-            reduceAligned< Type, operation >( tid, 128, sdata );
-         __syncthreads();
-      }
-      if( blockSize >= 128 )
-      {
-         if( tid <  64 )
-            reduceAligned< Type, operation >( tid, 64, sdata );
-         __syncthreads();
-      }
-
-      /***
-       * This runs in one warp so it is synchronised implicitly.
-       */
-      if (tid < 32)
-      {
-         if( blockSize >= 64 )
-            reduceAligned< Type, operation >( tid, 32, sdata );
-         if( blockSize >= 32 )
-            reduceAligned< Type, operation >( tid, 16, sdata );
-         if( blockSize >= 16 )
-            reduceAligned< Type, operation >( tid,  8, sdata );
-         if( blockSize >=  8 )
-            reduceAligned< Type, operation >( tid,  4, sdata );
-         if( blockSize >=  4 )
-            reduceAligned< Type, operation >( tid,  2, sdata );
-         if( blockSize >=  2 )
-            reduceAligned< Type, operation >( tid,  1, sdata );
-      }
-   }
-   else
-   {
-      unsigned int s;
-      if( n >= 512 )
-      {
-         s = n / 2;
-         reduceNonAligned< Type, operation >( tid, s, n, sdata );
-         n = s;
-         __syncthreads();
-      }
-      if( n >= 256 )
-      {
-         s = n / 2;
-         reduceNonAligned< Type, operation >( tid, s, n, sdata );
-         n = s;
-         __syncthreads();
-      }
-      if( n >= 128 )
-      {
-         s = n / 2;
-         reduceNonAligned< Type, operation >( tid, s, n, sdata );
-         n = s;
-         __syncthreads();
-      }
-      if( n >= 64 )
-      {
-         s = n / 2;
-         reduceNonAligned< Type, operation >( tid, s, n, sdata );
-         n = s;
-         __syncthreads();
-      }
-      if( n >= 32 )
-      {
-         s = n / 2;
-         reduceNonAligned< Type, operation >( tid, s, n, sdata );
-         n = s;
-         __syncthreads();
-      }
-      /***
-       * This runs in one warp so it is synchronised implicitly.
-       */
-      if( n >= 16 )
-      {
-         s = n / 2;
-         reduceNonAligned< Type, operation >( tid, s, n, sdata );
-         n = s;
-      }
-      if( n >= 8 )
-      {
-         s = n / 2;
-         reduceNonAligned< Type, operation >( tid, s, n, sdata );
-         n = s;
-      }
-      if( n >= 4 )
-      {
-         s = n / 2;
-         reduceNonAligned< Type, operation >( tid, s, n, sdata );
-         n = s;
-      }
-      if( n >= 2 )
-      {
-         s = n / 2;
-         reduceNonAligned< Type, operation >( tid, s, n, sdata );
-         n = s;
-      }
-
-   }
-
-   /***
-    * Store the result back in the global memory.
-    */
-   if( tid == 0 )
-      deviceOutput[ blockIdx. x ] = sdata[ 0 ];
-}
-
-#endif
-/***
- * The template calling the final CUDA kernel for the single vector reduction.
- * The template parameters are:
- * @param T is the type of data we want to reduce
- * @param operation is the operation reducing the data.
- *        It can be tnlParallelReductionSum, tnlParallelReductionMin or tnlParallelReductionMax.
- * The function parameters:
- * @param size tells number of elements in the data array.
- * @param deviceInput1 is the pointer to an array storing the data we want
- *        to reduce. This array must stay on the device!.
- * @param deviceInput2 is the pointer to an array storing the coupling data for example
- *        the second vector for the SDOT operation. This array must stay on the device!.
- * @param result will contain the result of the reduction if everything was ok
- *        and the return code is true.
- * @param parameter can be used for example for the passing the parameter p of Lp norm.
- * @param deviceAux is auxiliary array used to store temporary data during the reduction.
- *        If one calls this function more then once one might provide this array to avoid repetetive
- *        allocation of this array on the device inside of this function.
- *        The size of this array should be size / 128 * sizeof( T ).
- */
-template< typename Type, typename ParameterType, typename Index, tnlTupleOperation operation >
-bool tnlCUDALongVectorReduction( const Index size,
-                                 const Type* deviceInput1,
-                                 const Type* deviceInput2,
-                                 Type& result,
-                                 const ParameterType& parameter,
-                                 Type* deviceAux = 0 )
-{
-#ifdef HAVE_CUDA
-   /***
-    * Set parameters:
-    * @param desBlockSize is desired block size with which we get the best performance (on CUDA rach 1.0 to 1.3)
-    * @param desGridSize is desired grid size
-    */
-   const int desBlockSize = 512;
-   const int desGridSize = 2048;
-
-   Type* dbg_array1; // debuging array
-
-   /***
-    * Allocating auxiliary device memory to store temporary reduced arrays.
-    * For example in the first iteration we reduce the number of elements
-    * from size to size / 2. We store this new data in deviceAux array.
-    * If one calls the CUDA reduction more then once then one can provide
-    * auxiliary array by passing it via the parameter deviceAux.
-    */
-   tnlVector< Type, tnlCuda > deviceAuxVct( "tnlCUDAOneVectorReduction:deviceAuxVct" );
-   if( ! deviceAux )
-   {
-      int sizeAlloc = :: Max( 1, size / desBlockSize );
-      if( ! deviceAuxVct. setSize( sizeAlloc ) )
-         return false;
-      deviceAux = deviceAuxVct. getData();
-   }
-
-   /***
-    * Setup parameters of the kernel:
-    * @param sizeReduced is the size of reduced data after each step of parallel reduction
-    * @param reductionInput tells what data we shell reduce. We start with the input if this fuction
-    *                       and after the 1st reduction step we switch this pointer to deviceAux.
-    */
-   int sizeReduced = size;
-   const Type* reductionInput1 = deviceInput1;
-   const Type* reductionInput2 = deviceInput2;
-   int reductionSteps( 0 );
-   while( sizeReduced > maxGPUReductionDataSize )
-   {
-      dim3 blockSize( 0 ), gridSize( 0 );
-      blockSize. x = :: Min( sizeReduced, desBlockSize );
-      gridSize. x = :: Min( ( int ) ( sizeReduced / blockSize. x + 1 ) / 2, desGridSize );
-
-      /***
-       * We align the blockSize to the power of 2.
-       */
-      Index alignedBlockSize = 1;
-      while( alignedBlockSize < blockSize. x ) alignedBlockSize <<= 1;
-      blockSize. x = alignedBlockSize;
-      Index shmem = blockSize. x * sizeof( Type );
-      /***
-       * Depending on the blockSize we generate appropriate template instance.
-       */
-      if( reductionSteps > 0 &&
-          ( operation == tnlParallelReductionSdot ||
-            operation == tnlParallelReductionLpNorm ) )
-      {
-         /***
-          * For operations like SDOT or LpNorm we need to switch to tnlParallelReductionSum after the
-          * first reduction step.
-          */
-         switch( blockSize. x )
-         {
-            case 512:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum, 512 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case 256:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum, 256 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case 128:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum, 128 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case  64:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum,  64 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case  32:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum,  32 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case  16:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum,  16 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case   8:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum,   8 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case   4:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum,   4 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case   2:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum,   2 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case   1:
-               tnlAssert( false, cerr << "blockSize should not be 1." << endl );
-               break;
-            default:
-               tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
-               break;
-         }
-      }
-      else
-         switch( blockSize. x )
-         {
-            case 512:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, operation, 512 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case 256:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, operation, 256 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case 128:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, operation, 128 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case  64:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, operation,  64 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case  32:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, operation,  32 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case  16:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, operation,  16 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case   8:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, operation,   8 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case   4:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, operation,   4 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case   2:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, operation,   2 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case   1:
-               tnlAssert( false, cerr << "blockSize should not be 1." << endl );
-               break;
-            default:
-               tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
-               break;
-         }
-      sizeReduced = gridSize. x;
-      reductionInput1 = deviceAux;
-      reductionSteps ++;
-   }
-
-   /***
-    * We transfer reduced data from device to host.
-    * If sizeReduced equals size the previous loop was not processed and we read
-    * data directly from the input.
-    */
-   Type result_array[ maxGPUReductionDataSize ];
-   Type result_array2[ maxGPUReductionDataSize ];
-   if( sizeReduced == size )
-   {
-      if( cudaMemcpy( result_array, deviceInput1, sizeReduced * sizeof( Type ), cudaMemcpyDeviceToHost ) != cudaSuccess )
-      {
-         CHECK_CUDA_ERROR;
-         return false;
-      }
-      switch( operation )
-      {
-         case tnlParallelReductionLpNorm:
-            result = pow( tnlAbs( result_array[ 0 ] ), parameter );
-            for( Index i = 1; i < sizeReduced; i ++ )
-               result += pow( tnlAbs( result_array[ i ] ), parameter );
-            result = pow( result, 1.0/ parameter );
-            return true;
-         case tnlParallelReductionSdot:
-            if( cudaMemcpy( result_array2, deviceInput2, sizeReduced * sizeof( Type ), cudaMemcpyDeviceToHost ) != cudaSuccess )
-            {
-               CHECK_CUDA_ERROR;
-            }
-            else
-            {
-               result = 0;
-               for( Index i = 0; i < sizeReduced; i ++ )
-                  result += result_array[ i ] * result_array2[ i ] ;
-               return true;
-            }
-      }
-   }
-   else
-      if( cudaMemcpy( result_array, deviceAux, sizeReduced * sizeof( Type ), cudaMemcpyDeviceToHost ) != cudaSuccess )
-      {
-         CHECK_CUDA_ERROR;
-         return false;
-      }
-   switch( operation )
-   {
-      case tnlParallelReductionMax:
-         result = result_array[ 0 ];
-         for( Index i = 1; i < sizeReduced; i ++ )
-            result = Max( result, result_array[ i ] );
-         break;
-      case tnlParallelReductionMin:
-         result = result_array[ 0 ];
-         for( Index i = 1; i < sizeReduced; i ++ )
-            result = Min( result, result_array[ i ] );
-         break;
-      case tnlParallelReductionSum:
-      case tnlParallelReductionLpNorm:
-      case tnlParallelReductionSdot:
-         result = result_array[ 0 ];
-         for( Index i = 1; i < sizeReduced; i ++ )
-            result += result_array[ i ];
-         break;
-      case tnlParallelReductionAbsMax:
-         result = tnlAbs( result_array[ 0 ] );
-         for( Index i = 1; i < sizeReduced; i ++ )
-            result = Max( result, tnlAbs( result_array[ i ] ) );
-         break;
-      case tnlParallelReductionAbsMin:
-         result = tnlAbs( result_array[ 0 ] );
-         for( Index i = 1; i < sizeReduced; i ++ )
-            result = Min( result, tnlAbs( result_array[ i ] ) );
-         break;
-   }
-   return true;
-#else
-   cerr << "I am sorry but CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." << endl;
-   return false;
-#endif
-};
-
-#ifdef HAVE_CUDA
-/***
- * This kernel just compares two vectors element by element. It writes
- * the result of the comparison into array result. This array must be
- * then reduced.
- */
-template< typename Real, typename Index >
-__global__ void compareTwoVectorsElementwise( const Index size,
-                                              const Real* vector1,
-                                              const Real* vector2,
-                                              bool* result )
-{
-   Index gid = blockDim. x * blockIdx. x + threadIdx. x;
-   if( gid < size )
-   {
-      if( vector1[ gid ] == vector2[ gid ] )
-         result[ gid ] = true;
-      else
-         result[ gid ] = false;
-   }
-}
-#endif
-
-/***
- * The template for comparison of two long vectors on the CUDA device.
- * The template parameters are:
- * @param T is the type of data we want to reduce
- * @param operation is the operation reducing the data.
- *        It can be tnlParallelReductionSum, tnlParallelReductionMin or tnlParallelReductionMax.
- * The function parameters:
- * @param size tells number of elements in the data array.
- * @param deviceInput1 is the pointer to an array storing the data we want
- *        to reduce. This array must stay on the device!.
- * @param deviceInput2 is the pointer to an array storing the coupling data for example
- *        the second vector for the SDOT operation. This array most stay on the device!.
- * @param result will contain the result of the reduction if everything was ok
- *        and the return code is true.
- * @param deviceAux is auxiliary array used to store temporary data during the reduction.
- *        If one calls this function more then once one might provide this array to avoid repetetive
- *        allocation of this array on the device inside of this function.
- *        The size of this array should be size / 128 * sizeof( T ).
- *
- * This function first calls kernel which compares each couples of elements from both vectors.
- * Result is written into a bool array. The minimum value then says if both vectors equal.
- *
- */
-template< typename Type, typename Index >
-bool tnlCUDALongVectorComparison( const Index size,
-                                  const Type* deviceInput1,
-                                  const Type* deviceInput2,
-                                  bool* deviceBoolAux = 0,
-                                  Type* deviceAux = 0 )
-{
-#ifdef HAVE_CUDA
-   tnlAssert( size > 0,
-              cerr << "You try to compare two CUDA long vectors with non-positive size." << endl
-                   << "The size is " << size );
-   tnlVector< bool, tnlCuda, Index > boolArray( "tnlCUDALongVectorComparison:bool_array" );
-   if( ! deviceBoolAux )
-   {
-      if( ! boolArray. setSize( size ) )
-         return false;
-      deviceBoolAux = boolArray. getData();
-   }
-   dim3 blockSize( 0 ), gridSize( 0 );
-   blockSize. x = 256;
-   gridSize. x = size / blockSize. x + 1;
-
-   compareTwoVectorsElementwise<<< gridSize, blockSize >>>( size,
-                                                            deviceInput1,
-                                                            deviceInput2,
-                                                            deviceBoolAux );
-   CHECK_CUDA_ERROR;
-   bool result;
-   if( ! tnlCUDALongVectorReduction< bool, bool, Index, tnlParallelReductionMin >( size,
-                                                                                   deviceBoolAux,
-                                                                                   ( bool* ) NULL,
-                                                                                   result,
-                                                                                   0 ) )
-
-
-      return false;
-   return result;
-#else
-   cerr << "I am sorry but CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." << endl;
-   return;
-#endif
-}
-
-#endif /* CUDALONGVECTORKERNELS_H_ */
diff --git a/src/implementation/core/memory-operations.h b/src/implementation/core/memory-operations.h
index 759ca067c4e059a456362f5a4fdf64d417b03a28..09a721996b0ae099bc98d6c0fb7f974baa5a60fc 100644
--- a/src/implementation/core/memory-operations.h
+++ b/src/implementation/core/memory-operations.h
@@ -42,6 +42,8 @@ bool allocateMemoryCuda( Element*& data,
                    ( size_t ) size * sizeof( Element ) ) != cudaSuccess )
       data = 0;
    return checkCudaDevice;
+#else
+   return false;
 #endif
 }
 
@@ -58,8 +60,10 @@ bool freeMemoryCuda( Element* data )
 #ifdef HAVE_CUDA
       cudaFree( data );
       return checkCudaDevice;
-#endif
+#else
+   cerr << "I am sorry but CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." << endl;
    return true;
+#endif
 }
 
 template< typename Element, typename Index >
@@ -101,7 +105,7 @@ bool setMemoryCuda( Element* data,
       blockSize. x = 256;
       Index blocksNumber = ceil( ( double ) size / ( double ) blockSize. x );
       Index elementsPerThread = ceil( ( double ) blocksNumber / ( double ) maxCudaGridSize );
-      gridSize. x = Min( blocksNumber, maxCudaGridSize );
+      gridSize. x = Min( blocksNumber, ( Index ) maxCudaGridSize );
       //cout << "blocksNumber = " << blocksNumber << "Grid size = " << gridSize. x << " elementsPerThread = " << elementsPerThread << endl;
       setVectorValueCudaKernel<<< blockSize, gridSize >>>( data, size, value, elementsPerThread );
 
@@ -140,7 +144,7 @@ bool copyMemoryHostToCuda( Element* destination,
    }
    return true;
 #else
-   cerr << "CUDA support is missing in this system." << endl;
+   cerr << "CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." << endl;
    return false;
 #endif
 }
@@ -163,7 +167,7 @@ bool copyMemoryCudaToHost( Element* destination,
    }
    return true;
 #else
-   cerr << "CUDA support is missing in this system." << endl;
+   cerr << "CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." << endl;
    return false;
 #endif
 }
@@ -180,7 +184,7 @@ bool copyMemoryCudaToCuda( Element* destination,
                    cudaMemcpyDeviceToDevice ) != cudaSuccess )
    return checkCudaDevice;
 #else
-   cerr << "CUDA support is missing in this system." << endl;
+   cerr << "CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." << endl;
    return false;
 #endif
 }
diff --git a/src/implementation/core/cuda/reduction-operations_impl.cu b/src/implementation/core/memory-operations_impl.cpp
similarity index 87%
rename from src/implementation/core/cuda/reduction-operations_impl.cu
rename to src/implementation/core/memory-operations_impl.cpp
index b86aa68fcb6b246962d6c8a2c6a24f2fe861b0ec..c6f387e5f0b31de151f23d6c355e9c29d7bd3f3c 100644
--- a/src/implementation/core/cuda/reduction-operations_impl.cu
+++ b/src/implementation/core/memory-operations_impl.cpp
@@ -1,7 +1,7 @@
 /***************************************************************************
-                          reduction-operations.cu  -  description
+                          memory-operations_impl.cpp  -  description
                              -------------------
-    begin                : Mar 22, 2013
+    begin                : Mar 24, 2013
     copyright            : (C) 2013 by Tomas Oberhuber
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
@@ -13,4 +13,8 @@
  *   the Free Software Foundation; either version 2 of the License, or     *
  *   (at your option) any later version.                                   *
  *                                                                         *
- ***************************************************************************/
\ No newline at end of file
+ ***************************************************************************/
+
+
+
+
diff --git a/src/implementation/core/memory-operations_impl.cu b/src/implementation/core/memory-operations_impl.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/implementation/core/vector-operations.h b/src/implementation/core/vector-operations.h
index b584cf245b9aaffad20f8cf17fe2297e2576667f..d06c1cb5c35f41a5ca8f36d491a813abfc2ea6ad 100644
--- a/src/implementation/core/vector-operations.h
+++ b/src/implementation/core/vector-operations.h
@@ -18,7 +18,7 @@
 #ifndef VECTOROPERATIONS_H_
 #define VECTOROPERATIONS_H_
 
-#include <implementation/core/cuda-long-vector-kernels.h>
+#include <core/cuda/cuda-reduction.h>
 
 template< typename Vector >
 typename Vector :: RealType getHostVectorMax( const Vector& v )
@@ -38,7 +38,7 @@ typename Vector :: RealType getCudaVectorMax( const Vector& v )
    typedef typename Vector :: RealType Real;
    typedef typename Vector :: IndexType Index;
    Real result( 0 );
-   /*tnlCUDALongVectorReduction< Real,
+   /*reductionOnCudaDevice< Real,
                                Real,
                                Index,
                                tnlParallelReductionMax >
@@ -85,7 +85,7 @@ typename Vector :: RealType getCudaVectorMin( const Vector& v )
    typedef typename Vector :: RealType Real;
    typedef typename Vector :: IndexType Index;
    Real result( 0 );
-   /*tnlCUDALongVectorReduction< Real,
+   /*reductionOnCudaDevice< Real,
                                Real,
                                Index,
                                tnlParallelReductionMin >
@@ -132,7 +132,7 @@ typename Vector :: RealType getCudaVectorAbsMax( const Vector& v )
    typedef typename Vector :: RealType Real;
    typedef typename Vector :: IndexType Index;
    Real result( 0 );
-   /*tnlCUDALongVectorReduction< Real,
+   /*reductionOnCudaDevice< Real,
                                Real,
                                Index,
                                tnlParallelReductionAbsMax >
@@ -179,7 +179,7 @@ typename Vector :: RealType getCudaVectorAbsMin( const Vector& v )
    typedef typename Vector :: RealType Real;
    typedef typename Vector :: IndexType Index;
    Real result( 0 );
-   /*tnlCUDALongVectorReduction< Real,
+   /*reductionOnCudaDevice< Real,
                                Real,
                                Index,
                                tnlParallelReductionAbsMin >
@@ -249,7 +249,7 @@ typename Vector :: RealType getCudaVectorLpNorm( const Vector& v,
    typedef typename Vector :: IndexType Index;
 
    Real result( 0 );
-   /*tnlCUDALongVectorReduction< Real,
+   /*reductionOnCudaDevice< Real,
                                Real,
                                Index,
                                tnlParallelReductionLpNorm >
@@ -300,7 +300,7 @@ typename Vector :: RealType getCudaVectorSum( const Vector& v )
    typedef typename Vector :: IndexType Index;
 
    Real result( 0 );
-   /*tnlCUDALongVectorReduction< Real,
+   /*reductionOnCudaDevice< Real,
                                Real,
                                Index,
                                tnlParallelReductionSum >
@@ -713,7 +713,7 @@ typename Vector1 :: RealType getCudaVectorSdot( const Vector1& v1,
    typedef typename Vector1 :: IndexType Index;
 
    Real result( 0 );
-   /*tnlCUDALongVectorReduction< Real,
+   /*reductionOnCudaDevice< Real,
                                Real,
                                Index,
                                tnlParallelReductionSdot >
diff --git a/src/solvers/ode/tnlMersonSolver.h b/src/solvers/ode/tnlMersonSolver.h
index 34b96dba1de2fe9350d394072981f6693d40ca3c..38f6329815df192b51162bd099d0ff2a079bd4b7 100644
--- a/src/solvers/ode/tnlMersonSolver.h
+++ b/src/solvers/ode/tnlMersonSolver.h
@@ -19,7 +19,6 @@
 #define tnlMersonSolverH
 
 #include <math.h>
-#include <implementation/core/cuda-long-vector-kernels.h>
 #include <solvers/ode/tnlExplicitSolver.h>
 
 /****
diff --git a/tests/benchmarks/tnl-benchmarks.h b/tests/benchmarks/tnl-benchmarks.h
index 465f28c0deccc68e3631e407f4678e724d2919c2..bb40f95948086bcd9cd5ff99cb36eb6a786c7024 100644
--- a/tests/benchmarks/tnl-benchmarks.h
+++ b/tests/benchmarks/tnl-benchmarks.h
@@ -219,19 +219,19 @@ void reductionBenchmark( const int size,
                                                                    device_aux. getData() );
             break;
          default:
-            tnlCUDALongVectorReduction< T, T, int, tnlParallelReductionSum >( size,
+            reductionOnCudaDevice< T, T, int, tnlParallelReductionSum >( size,
                                                                               device_vector. getData(),
                                                                               NULL,
                                                                               sum,
                                                                               0.0,
                                                                               device_aux. getData() );
-            tnlCUDALongVectorReduction< T, T, int, tnlParallelReductionMin >( size,
+            reductionOnCudaDevice< T, T, int, tnlParallelReductionMin >( size,
                                                                               device_vector. getData(),
                                                                               NULL,
                                                                               min,
                                                                               0.0,
                                                                               device_aux. getData() );
-            tnlCUDALongVectorReduction< T, T, int, tnlParallelReductionMax >( size,
+            reductionOnCudaDevice< T, T, int, tnlParallelReductionMax >( size,
                                                                               device_vector. getData(),
                                                                               NULL,
                                                                               max,
diff --git a/tests/unit-tests/core/cuda/tnlCudaReductionTester.h b/tests/unit-tests/core/cuda/tnlCudaReductionTester.h
index ee8dbd83a5cd23df63808d27a423c8ebc7d10668..b6d21aa09369775a8887d8e1943a82cb9b4cf2f6 100644
--- a/tests/unit-tests/core/cuda/tnlCudaReductionTester.h
+++ b/tests/unit-tests/core/cuda/tnlCudaReductionTester.h
@@ -25,7 +25,7 @@
 #include <cppunit/TestCase.h>
 #include <cppunit/Message.h>
 #include <core/cuda/device-check.h>
-#include <implementation/core/cuda-long-vector-kernels.h>
+#include <core/cuda/cuda-reduction.h>
 
 class tnlCudaReductionTester : public CppUnit :: TestCase
 {
@@ -42,21 +42,40 @@ class tnlCudaReductionTester : public CppUnit :: TestCase
 
       suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >(
                                 "shortConstantSequenceTest",
-                                &tnlCudaReductionTester :: shortConstantSequenceTest< float > )
+                                &tnlCudaReductionTester :: shortConstantSequenceTest< double > )
                                );
-
       suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >(
                                 "longConstantSequenceTest",
-                                &tnlCudaReductionTester :: longConstantSequenceTest< float > )
+                                &tnlCudaReductionTester :: longConstantSequenceTest< double > )
                                );
-
-
       suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >(
                                 "linearSequenceTest",
-                                &tnlCudaReductionTester :: linearSequenceTest< float > )
+                                &tnlCudaReductionTester :: linearSequenceTest< double > )
+                               );
+      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >(
+                                "shortLogicalOperationsTest",
+                                &tnlCudaReductionTester :: shortLogicalOperationsTest< int > )
+                               );
+      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >(
+                                "longLogicalOperationsTest",
+                                &tnlCudaReductionTester :: longLogicalOperationsTest< int > )
+                               );
+      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >(
+                                "shortComparisonTest",
+                                &tnlCudaReductionTester :: shortComparisonTest< int > )
+                               );
+      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >(
+                                "longComparisonTest",
+                                &tnlCudaReductionTester :: longComparisonTest< int > )
+                               );
+      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >(
+                                "shortSdotTest",
+                                &tnlCudaReductionTester :: shortSdotTest< double > )
+                               );
+      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >(
+                                "longSdotTest",
+                                &tnlCudaReductionTester :: longSdotTest< double > )
                                );
-
-
       return suiteOfTests;
    }
 
@@ -76,45 +95,51 @@ class tnlCudaReductionTester : public CppUnit :: TestCase
    void shortConstantSequenceTest()
    {
       const int shortSequence( 128 );
-      const int longSequence( 8192 );
       RealType *hostData, *deviceData;
       allocateMemoryHost( hostData, shortSequence );
       allocateMemoryCuda( deviceData, shortSequence );
       CPPUNIT_ASSERT( checkCudaDevice );
 
-      setConstantSequence( shortSequence, ( RealType ) -1, hostData, deviceData );
       RealType result;
 
+      setConstantSequence( shortSequence, ( RealType ) -1, hostData, deviceData );
       tnlParallelReductionSum< RealType, int > sumOperation;
       CPPUNIT_ASSERT(
-         ( tnlCUDALongVectorReduction( sumOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
+         ( reductionOnCudaDevice( sumOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
       CPPUNIT_ASSERT( result == -shortSequence );
 
       tnlParallelReductionMin< RealType, int > minOperation;
       CPPUNIT_ASSERT(
-         ( tnlCUDALongVectorReduction( minOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
+         ( reductionOnCudaDevice( minOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
       CPPUNIT_ASSERT( result == -1 );
 
       tnlParallelReductionMax< RealType, int > maxOperation;
       CPPUNIT_ASSERT(
-         ( tnlCUDALongVectorReduction( maxOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
+         ( reductionOnCudaDevice( maxOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
       CPPUNIT_ASSERT( result == -1 );
 
       tnlParallelReductionAbsSum< RealType, int > absSumOperation;
       CPPUNIT_ASSERT(
-         ( tnlCUDALongVectorReduction( absSumOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
+         ( reductionOnCudaDevice( absSumOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
       CPPUNIT_ASSERT( result == shortSequence );
 
       tnlParallelReductionAbsMin< RealType, int > absMinOperation;
       CPPUNIT_ASSERT(
-         ( tnlCUDALongVectorReduction( absMinOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
+         ( reductionOnCudaDevice( absMinOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
       CPPUNIT_ASSERT( result == 1 );
 
       tnlParallelReductionAbsMax< RealType, int > absMaxOperation;
       CPPUNIT_ASSERT(
-         ( tnlCUDALongVectorReduction( absMaxOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
+         ( reductionOnCudaDevice( absMaxOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
       CPPUNIT_ASSERT( result == 1 );
 
+      tnlParallelReductionLpNorm< RealType, int > lpNormOperation;
+      lpNormOperation. setPower( 2.0 );
+      CPPUNIT_ASSERT(
+         ( reductionOnCudaDevice( lpNormOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == shortSequence );
+
+
       freeMemoryHost( hostData );
       freeMemoryCuda( deviceData );
       CPPUNIT_ASSERT( checkCudaDevice );
@@ -123,45 +148,86 @@ class tnlCudaReductionTester : public CppUnit :: TestCase
    template< typename RealType >
    void longConstantSequenceTest()
    {
-      const int longSequence( 8192 );
+      const int longSequence( 172892 );
       RealType *hostData, *deviceData;
       allocateMemoryHost( hostData, longSequence );
       allocateMemoryCuda( deviceData, longSequence );
       CPPUNIT_ASSERT( checkCudaDevice );
 
-      setConstantSequence( longSequence, ( RealType ) -1, hostData, deviceData );
       RealType result;
 
+      setConstantSequence( longSequence, ( RealType ) -1, hostData, deviceData );
       tnlParallelReductionSum< RealType, int > sumOperation;
       CPPUNIT_ASSERT( 
-         ( tnlCUDALongVectorReduction( sumOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+         ( reductionOnCudaDevice( sumOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
       CPPUNIT_ASSERT( result == -longSequence );
 
       tnlParallelReductionMin< RealType, int > minOperation;
       CPPUNIT_ASSERT(
-         ( tnlCUDALongVectorReduction( minOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+         ( reductionOnCudaDevice( minOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
       CPPUNIT_ASSERT( result == -1 );
 
       tnlParallelReductionMax< RealType, int > maxOperation;
       CPPUNIT_ASSERT(
-         ( tnlCUDALongVectorReduction( maxOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+         ( reductionOnCudaDevice( maxOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
       CPPUNIT_ASSERT( result == -1 );
 
       tnlParallelReductionAbsSum< RealType, int > absSumOperation;
       CPPUNIT_ASSERT(
-         ( tnlCUDALongVectorReduction( absSumOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+         ( reductionOnCudaDevice( absSumOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
       CPPUNIT_ASSERT( result == longSequence );
 
       tnlParallelReductionAbsMin< RealType, int > absMinOperation;
       CPPUNIT_ASSERT(
-         ( tnlCUDALongVectorReduction( absMinOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+         ( reductionOnCudaDevice( absMinOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
       CPPUNIT_ASSERT( result == 1 );
 
       tnlParallelReductionAbsMax< RealType, int > absMaxOperation;
       CPPUNIT_ASSERT(
-         ( tnlCUDALongVectorReduction( absMaxOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+         ( reductionOnCudaDevice( absMaxOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
       CPPUNIT_ASSERT( result == 1 );
 
+      tnlParallelReductionLpNorm< RealType, int > lpNormOperation;
+      lpNormOperation. setPower( 2.0 );
+      CPPUNIT_ASSERT(
+         ( reductionOnCudaDevice( lpNormOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == longSequence );
+
+      setConstantSequence( longSequence, ( RealType ) 2, hostData, deviceData );
+      CPPUNIT_ASSERT(
+         ( reductionOnCudaDevice( sumOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 2 * longSequence );
+
+      CPPUNIT_ASSERT(
+         ( reductionOnCudaDevice( minOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 2 );
+
+      CPPUNIT_ASSERT(
+         ( reductionOnCudaDevice( maxOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 2 );
+
+      CPPUNIT_ASSERT(
+         ( reductionOnCudaDevice( absSumOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 2 * longSequence );
+
+      CPPUNIT_ASSERT(
+         ( reductionOnCudaDevice( absMinOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 2 );
+
+      CPPUNIT_ASSERT(
+         ( reductionOnCudaDevice( absMaxOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 2 );
+
+      lpNormOperation. setPower( 2.0 );
+      CPPUNIT_ASSERT(
+         ( reductionOnCudaDevice( lpNormOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 4 * longSequence );
+      lpNormOperation. setPower( 3.0 );
+      CPPUNIT_ASSERT(
+         ( reductionOnCudaDevice( lpNormOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 8 * longSequence );
+
+
       freeMemoryHost( hostData );
       freeMemoryCuda( deviceData );
       CPPUNIT_ASSERT( checkCudaDevice );
@@ -170,7 +236,7 @@ class tnlCudaReductionTester : public CppUnit :: TestCase
    template< typename RealType >
    void linearSequenceTest()
    {
-      const int size( 1024 );
+      const int size( 10245 );
       RealType *hostData, *deviceData;
       allocateMemoryHost( hostData, size );
       allocateMemoryCuda( deviceData, size );
@@ -187,31 +253,31 @@ class tnlCudaReductionTester : public CppUnit :: TestCase
       tnlParallelReductionSum< RealType, int > sumOperation;
       RealType result;
       CPPUNIT_ASSERT(
-         ( tnlCUDALongVectorReduction( sumOperation, size, deviceData, ( RealType* ) 0, result ) ) );
+         ( reductionOnCudaDevice( sumOperation, size, deviceData, ( RealType* ) 0, result ) ) );
       CPPUNIT_ASSERT( result == sum );
       tnlParallelReductionMin< RealType, int > minOperation;
       CPPUNIT_ASSERT(
-         ( tnlCUDALongVectorReduction( minOperation, size, deviceData, ( RealType* ) 0, result ) ) );
+         ( reductionOnCudaDevice( minOperation, size, deviceData, ( RealType* ) 0, result ) ) );
       CPPUNIT_ASSERT( result == -size );
 
       tnlParallelReductionMax< RealType, int > maxOperation;
       CPPUNIT_ASSERT(
-         ( tnlCUDALongVectorReduction( maxOperation, size, deviceData, ( RealType* ) 0, result ) ) );
+         ( reductionOnCudaDevice( maxOperation, size, deviceData, ( RealType* ) 0, result ) ) );
       CPPUNIT_ASSERT( result == -1 );
 
       tnlParallelReductionAbsSum< RealType, int > absSumOperation;
       CPPUNIT_ASSERT(
-         ( tnlCUDALongVectorReduction( absSumOperation, size, deviceData, ( RealType* ) 0, result ) ) );
+         ( reductionOnCudaDevice( absSumOperation, size, deviceData, ( RealType* ) 0, result ) ) );
       CPPUNIT_ASSERT( result == tnlAbs( sum ) );
 
       tnlParallelReductionAbsMin< RealType, int > absMinOperation;
       CPPUNIT_ASSERT(
-         ( tnlCUDALongVectorReduction( absMinOperation, size, deviceData, ( RealType* ) 0, result ) ) );
+         ( reductionOnCudaDevice( absMinOperation, size, deviceData, ( RealType* ) 0, result ) ) );
       CPPUNIT_ASSERT( result == 1 );
 
       tnlParallelReductionAbsMax< RealType, int > absMaxOperation;
       CPPUNIT_ASSERT(
-         ( tnlCUDALongVectorReduction( absMaxOperation, size, deviceData, ( RealType* ) 0, result ) ) );
+         ( reductionOnCudaDevice( absMaxOperation, size, deviceData, ( RealType* ) 0, result ) ) );
       CPPUNIT_ASSERT( result == size );
 
       freeMemoryHost( hostData );
@@ -219,6 +285,280 @@ class tnlCudaReductionTester : public CppUnit :: TestCase
       CPPUNIT_ASSERT( checkCudaDevice );
    };
 
+   // Checks logical AND / OR reductions on a short vector: all ones, one zero, all zeros.
+   template< typename Type >
+   void shortLogicalOperationsTest()
+   {
+      const int size( 125 );
+      Type *hostData, *deviceData;
+      allocateMemoryHost( hostData, size );
+      allocateMemoryCuda( deviceData, size );
+      CPPUNIT_ASSERT( checkCudaDevice );
+      tnlParallelReductionLogicalAnd< Type, int > andOperation;
+      tnlParallelReductionLogicalOr< Type, int > orOperation;
+      Type result;
+
+      // All ones: both AND and OR are expected to give 1.
+      for( int i = 0; i < size; i ++ )
+         hostData[ i ] = 1;
+      copyMemoryHostToCuda( deviceData, hostData, size );
+      CPPUNIT_ASSERT( checkCudaDevice );
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( andOperation, size, deviceData, ( Type* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 1 );
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( orOperation, size, deviceData, ( Type* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 1 );
+
+      // A single zero among ones: AND is expected to drop to 0, OR to stay 1.
+      hostData[ 0 ] = 0;
+      copyMemoryHostToCuda( deviceData, hostData, size );
+      CPPUNIT_ASSERT( checkCudaDevice );
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( andOperation, size, deviceData, ( Type* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 0 );
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( orOperation, size, deviceData, ( Type* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 1 );
+
+      // All zeros: both AND and OR are expected to give 0.
+      for( int i = 0; i < size; i ++ )
+         hostData[ i ] = 0;
+      copyMemoryHostToCuda( deviceData, hostData, size );
+      CPPUNIT_ASSERT( checkCudaDevice );
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( andOperation, size, deviceData, ( Type* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 0 );
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( orOperation, size, deviceData, ( Type* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 0 );
+
+      // Release both buffers so the test does not leak host or device memory.
+      freeMemoryHost( hostData );
+      freeMemoryCuda( deviceData );
+      CPPUNIT_ASSERT( checkCudaDevice );
+   }
+
+   // Checks logical AND / OR reductions on a long vector: all ones, one zero, all zeros.
+   template< typename Type >
+   void longLogicalOperationsTest()
+   {
+      const int size( 7628198 );
+      Type *hostData, *deviceData;
+      allocateMemoryHost( hostData, size );
+      allocateMemoryCuda( deviceData, size );
+      CPPUNIT_ASSERT( checkCudaDevice );
+      tnlParallelReductionLogicalAnd< Type, int > andOperation;
+      tnlParallelReductionLogicalOr< Type, int > orOperation;
+      Type result;
+
+      // All ones: both AND and OR are expected to give 1.
+      for( int i = 0; i < size; i ++ )
+         hostData[ i ] = 1;
+      copyMemoryHostToCuda( deviceData, hostData, size );
+      CPPUNIT_ASSERT( checkCudaDevice );
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( andOperation, size, deviceData, ( Type* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 1 );
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( orOperation, size, deviceData, ( Type* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 1 );
+
+      // A single zero among ones: AND is expected to drop to 0, OR to stay 1.
+      hostData[ 0 ] = 0;
+      copyMemoryHostToCuda( deviceData, hostData, size );
+      CPPUNIT_ASSERT( checkCudaDevice );
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( andOperation, size, deviceData, ( Type* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 0 );
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( orOperation, size, deviceData, ( Type* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 1 );
+
+      // All zeros: both AND and OR are expected to give 0.
+      for( int i = 0; i < size; i ++ )
+         hostData[ i ] = 0;
+      copyMemoryHostToCuda( deviceData, hostData, size );
+      CPPUNIT_ASSERT( checkCudaDevice );
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( andOperation, size, deviceData, ( Type* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 0 );
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( orOperation, size, deviceData, ( Type* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 0 );
+
+      // Release both buffers so the test does not leak host or device memory.
+      freeMemoryHost( hostData );
+      freeMemoryCuda( deviceData );
+      CPPUNIT_ASSERT( checkCudaDevice );
+   }
+
+   // Checks the equality / inequality reductions on two short vectors that are
+   // identical, then differ in a single element, then differ in every element.
+   template< typename Type >
+   void shortComparisonTest()
+   {
+      const int size( 125 );
+      Type *hostData1, *hostData2,
+           *deviceData1, *deviceData2;
+      allocateMemoryHost( hostData1, size );
+      allocateMemoryHost( hostData2, size );
+      allocateMemoryCuda( deviceData1, size );
+      allocateMemoryCuda( deviceData2, size );
+      CPPUNIT_ASSERT( checkCudaDevice );
+
+      bool result( false );
+      tnlParallelReductionEqualities< Type, int > equalityOperation;
+      tnlParallelReductionInequalities< Type, int > inequalityOperation;
+
+      // Identical vectors: equality is expected to hold, inequality not.
+      for( int i = 0; i < size; i ++ )
+         hostData1[ i ] = hostData2[ i ] = 1;
+      copyMemoryHostToCuda( deviceData1, hostData1, size );
+      copyMemoryHostToCuda( deviceData2, hostData2, size );
+      CPPUNIT_ASSERT( checkCudaDevice );
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( equalityOperation, size, deviceData1, deviceData2, result ) ) );
+      CPPUNIT_ASSERT( result == true );
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( inequalityOperation, size, deviceData1, deviceData2, result ) ) );
+      CPPUNIT_ASSERT( result == false );
+
+      // One differing element: neither reduction is expected to hold.
+      hostData1[ 0 ] = 0;
+      copyMemoryHostToCuda( deviceData1, hostData1, size );
+      CPPUNIT_ASSERT( checkCudaDevice );
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( equalityOperation, size, deviceData1, deviceData2, result ) ) );
+      CPPUNIT_ASSERT( result == false );
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( inequalityOperation, size, deviceData1, deviceData2, result ) ) );
+      CPPUNIT_ASSERT( result == false );
+
+      // Every element differs: only inequality is expected to hold.
+      for( int i = 0; i < size; i ++ )
+         hostData1[ i ] = 0;
+      copyMemoryHostToCuda( deviceData1, hostData1, size );
+      CPPUNIT_ASSERT( checkCudaDevice );
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( equalityOperation, size, deviceData1, deviceData2, result ) ) );
+      CPPUNIT_ASSERT( result == false );
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( inequalityOperation, size, deviceData1, deviceData2, result ) ) );
+      CPPUNIT_ASSERT( result == true );
+
+      // Release all four buffers so the test does not leak host or device memory.
+      freeMemoryHost( hostData1 );
+      freeMemoryHost( hostData2 );
+      freeMemoryCuda( deviceData1 );
+      freeMemoryCuda( deviceData2 );
+      CPPUNIT_ASSERT( checkCudaDevice );
+   }
+
+   // Checks the equality / inequality reductions on two long vectors that are
+   // identical, then differ in a single element, then differ in every element.
+   template< typename Type >
+   void longComparisonTest()
+   {
+      const int size( 1258976 );
+      Type *hostData1, *hostData2,
+           *deviceData1, *deviceData2;
+      allocateMemoryHost( hostData1, size );
+      allocateMemoryHost( hostData2, size );
+      allocateMemoryCuda( deviceData1, size );
+      allocateMemoryCuda( deviceData2, size );
+      CPPUNIT_ASSERT( checkCudaDevice );
+
+      bool result( false );
+      tnlParallelReductionEqualities< Type, int > equalityOperation;
+      tnlParallelReductionInequalities< Type, int > inequalityOperation;
+
+      // Identical vectors: equality is expected to hold, inequality not.
+      for( int i = 0; i < size; i ++ )
+         hostData1[ i ] = hostData2[ i ] = 1;
+      copyMemoryHostToCuda( deviceData1, hostData1, size );
+      copyMemoryHostToCuda( deviceData2, hostData2, size );
+      CPPUNIT_ASSERT( checkCudaDevice );
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( equalityOperation, size, deviceData1, deviceData2, result ) ) );
+      CPPUNIT_ASSERT( result == true );
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( inequalityOperation, size, deviceData1, deviceData2, result ) ) );
+      CPPUNIT_ASSERT( result == false );
+
+      // One differing element: neither reduction is expected to hold.
+      hostData1[ 0 ] = 0;
+      copyMemoryHostToCuda( deviceData1, hostData1, size );
+      CPPUNIT_ASSERT( checkCudaDevice );
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( equalityOperation, size, deviceData1, deviceData2, result ) ) );
+      CPPUNIT_ASSERT( result == false );
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( inequalityOperation, size, deviceData1, deviceData2, result ) ) );
+      CPPUNIT_ASSERT( result == false );
+
+      // Every element differs: only inequality is expected to hold.
+      for( int i = 0; i < size; i ++ )
+         hostData1[ i ] = 0;
+      copyMemoryHostToCuda( deviceData1, hostData1, size );
+      CPPUNIT_ASSERT( checkCudaDevice );
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( equalityOperation, size, deviceData1, deviceData2, result ) ) );
+      CPPUNIT_ASSERT( result == false );
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( inequalityOperation, size, deviceData1, deviceData2, result ) ) );
+      CPPUNIT_ASSERT( result == true );
+
+      // Release all four buffers so the test does not leak host or device memory.
+      freeMemoryHost( hostData1 );
+      freeMemoryHost( hostData2 );
+      freeMemoryCuda( deviceData1 );
+      freeMemoryCuda( deviceData2 );
+      CPPUNIT_ASSERT( checkCudaDevice );
+   };
+
+   template< typename Type >
+   void shortSdotTest()
+   {
+      const int size( 125 );
+      Type *hostData1, *hostData2, *deviceData1, *deviceData2;
+      allocateMemoryHost( hostData1, size );
+      allocateMemoryHost( hostData2, size );
+      allocateMemoryCuda( deviceData1, size );
+      allocateMemoryCuda( deviceData2, size );
+      CPPUNIT_ASSERT( checkCudaDevice );
+      // hostData1[ i ] = i and hostData2 alternates +1, -1, ...; sdot is the host-side reference.
+      Type sdot( 0.0 );
+      for( int i = 0; i < size; i ++ )
+      {
+         hostData1[ i ] = i;
+         hostData2[ i ] = ( i % 2 ) ? -1 : 1;
+         sdot += hostData1[ i ] * hostData2[ i ];
+      }
+      copyMemoryHostToCuda( deviceData1, hostData1, size );
+      copyMemoryHostToCuda( deviceData2, hostData2, size );
+      CPPUNIT_ASSERT( checkCudaDevice );
+
+      Type result( 0.0 );
+      tnlParallelReductionSdot< Type, int > sdotOperation;
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( sdotOperation, size, deviceData1, deviceData2, result ) ) );
+      CPPUNIT_ASSERT( result == sdot );
+
+      freeMemoryHost( hostData1 );
+      freeMemoryHost( hostData2 );
+      freeMemoryCuda( deviceData1 );
+      freeMemoryCuda( deviceData2 );
+      CPPUNIT_ASSERT( checkCudaDevice );
+   };
+
+   template< typename Type >
+   void longSdotTest()
+   {
+      const int size( 125789 );
+      Type *hostData1, *hostData2, *deviceData1, *deviceData2;
+      allocateMemoryHost( hostData1, size );
+      allocateMemoryHost( hostData2, size );
+      allocateMemoryCuda( deviceData1, size );
+      allocateMemoryCuda( deviceData2, size );
+      CPPUNIT_ASSERT( checkCudaDevice );
+      // hostData1[ i ] = i and hostData2 alternates +1, -1, ...; sdot is the host-side reference.
+      Type sdot( 0.0 );
+      for( int i = 0; i < size; i ++ )
+      {
+         hostData1[ i ] = i;
+         hostData2[ i ] = ( i % 2 ) ? -1 : 1;
+         sdot += hostData1[ i ] * hostData2[ i ];
+      }
+      copyMemoryHostToCuda( deviceData1, hostData1, size );
+      copyMemoryHostToCuda( deviceData2, hostData2, size );
+      CPPUNIT_ASSERT( checkCudaDevice );
+
+      Type result( 0.0 );
+      tnlParallelReductionSdot< Type, int > sdotOperation;
+      CPPUNIT_ASSERT( ( reductionOnCudaDevice( sdotOperation, size, deviceData1, deviceData2, result ) ) );
+      CPPUNIT_ASSERT( result == sdot );
+      freeMemoryHost( hostData1 );
+      freeMemoryHost( hostData2 );
+      freeMemoryCuda( deviceData1 );
+      freeMemoryCuda( deviceData2 );
+      CPPUNIT_ASSERT( checkCudaDevice );
+   };
 };