From 15dcbd438ec93bd6dd9679f5cab9b36395f5bfed Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Thu, 10 Dec 2015 23:00:19 +0100
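Subject: [PATCH] Optimizing CUDA L2 norm.

Drop the const qualifier from the `operation` parameter of the CUDA
prefix-sum functions/kernels and of the reductionOnCudaDevice explicit
instantiations, so the reduction operation object is passed as a
mutable reference (or by non-const value for the kernels).

Also rename tnlVectorOperations_impl.cu/.cpp to
tnlVectorOperationsCuda_impl.cu and tnlVectorOperationsHost_impl.cpp,
add tnlVectorOperationsCuda_impl.cpp, and update the CUDA benchmarks.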

---
 src/core/cuda/CMakeLists.txt                  |   2 +-
 src/core/cuda/cuda-prefix-sum_impl.cu         |  16 +-
 src/core/cuda/cuda-prefix-sum_impl.h          |  26 +-
 src/core/cuda/cuda-reduction-abs-max_impl.cu  |  20 +-
 src/core/cuda/cuda-reduction-abs-min_impl.cu  |  20 +-
 src/core/cuda/cuda-reduction-abs-sum_impl.cu  |  20 +-
 src/core/cuda/cuda-reduction-and_impl.cu      |  20 +-
 .../cuda/cuda-reduction-diff-abs-max_impl.cu  |  20 +-
 .../cuda/cuda-reduction-diff-abs-min_impl.cu  |  20 +-
 .../cuda/cuda-reduction-diff-abs-sum_impl.cu  |  20 +-
 .../cuda/cuda-reduction-diff-l2-norm_impl.cu  |  16 +-
 .../cuda/cuda-reduction-diff-lp-norm_impl.cu  |  16 +-
 src/core/cuda/cuda-reduction-diff-max_impl.cu |  20 +-
 src/core/cuda/cuda-reduction-diff-min_impl.cu |  20 +-
 src/core/cuda/cuda-reduction-diff-sum_impl.cu |  20 +-
 .../cuda/cuda-reduction-equalities_impl.cu    |  20 +-
 .../cuda/cuda-reduction-inequalities_impl.cu  |  20 +-
 src/core/cuda/cuda-reduction-l2-norm_impl.cu  |  16 +-
 src/core/cuda/cuda-reduction-lp-norm_impl.cu  |  14 +-
 src/core/cuda/cuda-reduction-max_impl.cu      |  20 +-
 src/core/cuda/cuda-reduction-min_impl.cu      |  20 +-
 src/core/cuda/cuda-reduction-or_impl.cu       |  20 +-
 .../cuda-reduction-scalar-product_impl.cu     |  20 +-
 src/core/cuda/cuda-reduction-sum_impl.cu      |  22 +-
 src/core/cuda/cuda-reduction_impl.h           |  10 +-
 src/core/cuda/reduction-operations.h          | 162 ++++-----
 src/core/cuda/tnlCudaReduction.h              |   4 +-
 src/core/cuda/tnlCudaReduction_impl.h         |   8 +-
 src/core/vectors/CMakeLists.txt               |  15 +-
 .../vectors/tnlVectorOperationsCuda_impl.cpp  | 328 ++++++++++++++++++
 ...mpl.cu => tnlVectorOperationsCuda_impl.cu} |  53 ++-
 ...l.cpp => tnlVectorOperationsHost_impl.cpp} | 307 +++-------------
 tests/benchmarks/tnl-cuda-benchmarks.h        |  61 +++-
 33 files changed, 816 insertions(+), 580 deletions(-)
 create mode 100644 src/core/vectors/tnlVectorOperationsCuda_impl.cpp
 rename src/core/vectors/{tnlVectorOperations_impl.cu => tnlVectorOperationsCuda_impl.cu} (85%)
 rename src/core/vectors/{tnlVectorOperations_impl.cpp => tnlVectorOperationsHost_impl.cpp} (52%)

diff --git a/src/core/cuda/CMakeLists.txt b/src/core/cuda/CMakeLists.txt
index b1e0aababd..815e594637 100755
--- a/src/core/cuda/CMakeLists.txt
+++ b/src/core/cuda/CMakeLists.txt
@@ -34,7 +34,7 @@ IF( BUILD_CUDA )
         ${CURRENT_DIR}/cuda-reduction-diff-l2-norm_impl.cu        
         ${CURRENT_DIR}/cuda-reduction-diff-lp-norm_impl.cu        
         ${CURRENT_DIR}/cuda-prefix-sum_impl.cu
-        PARENT_SCOPE )        
+        PARENT_SCOPE ) 
 endif() 
 
 set( tnl_core_cuda_SOURCES
diff --git a/src/core/cuda/cuda-prefix-sum_impl.cu b/src/core/cuda/cuda-prefix-sum_impl.cu
index 74f3e85fb7..58393298d2 100644
--- a/src/core/cuda/cuda-prefix-sum_impl.cu
+++ b/src/core/cuda/cuda-prefix-sum_impl.cu
@@ -23,7 +23,7 @@ template bool cudaPrefixSum( const int size,
                              const int blockSize,
                              const int *deviceInput,
                              int* deviceOutput,
-                             const tnlParallelReductionSum< int, int >& operation,
+                             tnlParallelReductionSum< int, int >& operation,
                              const enumPrefixSumType prefixSumType );
 
 
@@ -32,7 +32,7 @@ template bool cudaPrefixSum( const int size,
                              const int blockSize,
                              const float *deviceInput,
                              float* deviceOutput,
-                             const tnlParallelReductionSum< float, int >& operation,
+                             tnlParallelReductionSum< float, int >& operation,
                              const enumPrefixSumType prefixSumType );
 #endif
 
@@ -40,7 +40,7 @@ template bool cudaPrefixSum( const int size,
                              const int blockSize,
                              const double *deviceInput,
                              double* deviceOutput,
-                             const tnlParallelReductionSum< double, int >& operation,
+                             tnlParallelReductionSum< double, int >& operation,
                              const enumPrefixSumType prefixSumType );
 
 #ifdef INSTANTIATE_LONG_DOUBLE
@@ -48,7 +48,7 @@ template bool cudaPrefixSum( const int size,
                              const int blockSize,
                              const long double *deviceInput,
                              long double* deviceOutput,
-                             const tnlParallelReductionSum< long double, int >& operation,
+                             tnlParallelReductionSum< long double, int >& operation,
                              const enumPrefixSumType prefixSumType );
 #endif
 
@@ -57,7 +57,7 @@ template bool cudaPrefixSum( const long int size,
                              const long int blockSize,
                              const int *deviceInput,
                              int* deviceOutput,
-                             const tnlParallelReductionSum< int, long int >& operation,
+                             tnlParallelReductionSum< int, long int >& operation,
                              const enumPrefixSumType prefixSumType );
 
 
@@ -66,7 +66,7 @@ template bool cudaPrefixSum( const long int size,
                              const long int blockSize,
                              const float *deviceInput,
                              float* deviceOutput,
-                             const tnlParallelReductionSum< float, long int >& operation,
+                             tnlParallelReductionSum< float, long int >& operation,
                              const enumPrefixSumType prefixSumType );
 #endif
 
@@ -74,7 +74,7 @@ template bool cudaPrefixSum( const long int size,
                              const long int blockSize,
                              const double *deviceInput,
                              double* deviceOutput,
-                             const tnlParallelReductionSum< double, long int >& operation,
+                             tnlParallelReductionSum< double, long int >& operation,
                              const enumPrefixSumType prefixSumType );
 
 #ifdef INSTANTIATE_LONG_DOUBLE
@@ -82,7 +82,7 @@ template bool cudaPrefixSum( const long int size,
                              const long int blockSize,
                              const long double *deviceInput,
                              long double* deviceOutput,
-                             const tnlParallelReductionSum< long double, long int >& operation,
+                             tnlParallelReductionSum< long double, long int >& operation,
                              const enumPrefixSumType prefixSumType );
 #endif
 #endif 
diff --git a/src/core/cuda/cuda-prefix-sum_impl.h b/src/core/cuda/cuda-prefix-sum_impl.h
index ad2d02aa33..2c5644cc20 100644
--- a/src/core/cuda/cuda-prefix-sum_impl.h
+++ b/src/core/cuda/cuda-prefix-sum_impl.h
@@ -30,7 +30,7 @@ template< typename DataType,
           typename Operation,
           typename Index >
 __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumType,
-                                              const Operation operation,
+                                              Operation operation,
                                               const Index size,
                                               const Index elementsInBlock,
                                               const DataType* input,
@@ -159,7 +159,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
 template< typename DataType,
           typename Operation,
           typename Index >
-__global__ void cudaSecondPhaseBlockPrefixSum( const Operation operation,
+__global__ void cudaSecondPhaseBlockPrefixSum( Operation operation,
                                                const Index size,
                                                const Index elementsInBlock,
                                                const Index gridShift,
@@ -186,7 +186,7 @@ template< typename DataType,
           typename Operation,
           typename Index >
 bool cudaRecursivePrefixSum( const enumPrefixSumType prefixSumType,
-                             const Operation& operation,
+                             Operation& operation,
                              const Index size,
                              const Index blockSize,
                              const Index elementsInBlock,
@@ -276,7 +276,7 @@ template< typename DataType,
           typename Operation,
           typename Index >
 bool cudaGridPrefixSum( enumPrefixSumType prefixSumType,
-                        const Operation& operation,
+                        Operation& operation,
                         const Index size,
                         const Index blockSize,
                         const Index elementsInBlock,
@@ -313,7 +313,7 @@ bool cudaPrefixSum( const Index size,
                     const Index blockSize,
                     const DataType *deviceInput,
                     DataType* deviceOutput,
-                    const Operation& operation,
+                    Operation& operation,
                     const enumPrefixSumType prefixSumType )
 {
    /****
@@ -361,7 +361,7 @@ extern template bool cudaPrefixSum( const int size,
                                     const int blockSize,
                                     const int *deviceInput,
                                     int* deviceOutput,
-                                    const tnlParallelReductionSum< int, int >& operation,
+                                    tnlParallelReductionSum< int, int >& operation,
                                     const enumPrefixSumType prefixSumType );
 
 
@@ -369,14 +369,14 @@ extern template bool cudaPrefixSum( const int size,
                                     const int blockSize,
                                     const float *deviceInput,
                                     float* deviceOutput,
-                                    const tnlParallelReductionSum< float, int >& operation,
+                                    tnlParallelReductionSum< float, int >& operation,
                                     const enumPrefixSumType prefixSumType );
 
 extern template bool cudaPrefixSum( const int size,
                                     const int blockSize,
                                     const double *deviceInput,
                                     double* deviceOutput,
-                                    const tnlParallelReductionSum< double, int >& operation,
+                                    tnlParallelReductionSum< double, int >& operation,
                                     const enumPrefixSumType prefixSumType );
 
 #ifdef INSTANTIATE_LONG_DOUBLE
@@ -384,7 +384,7 @@ extern template bool cudaPrefixSum( const int size,
                                     const int blockSize,
                                     const long double *deviceInput,
                                     long double* deviceOutput,
-                                    const tnlParallelReductionSum< long double, int >& operation,
+                                    tnlParallelReductionSum< long double, int >& operation,
                                     const enumPrefixSumType prefixSumType );
 #endif
 
@@ -393,7 +393,7 @@ extern template bool cudaPrefixSum( const long int size,
                                     const long int blockSize,
                                     const int *deviceInput,
                                     int* deviceOutput,
-                                    const tnlParallelReductionSum< int, long int >& operation,
+                                    tnlParallelReductionSum< int, long int >& operation,
                                     const enumPrefixSumType prefixSumType );
 
 
@@ -401,14 +401,14 @@ extern template bool cudaPrefixSum( const long int size,
                                     const long int blockSize,
                                     const float *deviceInput,
                                     float* deviceOutput,
-                                    const tnlParallelReductionSum< float, long int >& operation,
+                                    tnlParallelReductionSum< float, long int >& operation,
                                     const enumPrefixSumType prefixSumType );
 
 extern template bool cudaPrefixSum( const long int size,
                                     const long int blockSize,
                                     const double *deviceInput,
                                     double* deviceOutput,
-                                    const tnlParallelReductionSum< double, long int >& operation,
+                                    tnlParallelReductionSum< double, long int >& operation,
                                     const enumPrefixSumType prefixSumType );
 
 #ifdef INSTANTIATE_LONG_DOUBLE
@@ -416,7 +416,7 @@ extern template bool cudaPrefixSum( const long int size,
                                     const long int blockSize,
                                     const long double *deviceInput,
                                     long double* deviceOutput,
-                                    const tnlParallelReductionSum< long double, long int >& operation,
+                                    tnlParallelReductionSum< long double, long int >& operation,
                                     const enumPrefixSumType prefixSumType );
 #endif
 #endif
diff --git a/src/core/cuda/cuda-reduction-abs-max_impl.cu b/src/core/cuda/cuda-reduction-abs-max_impl.cu
index 8540fc71a3..e69a225ada 100644
--- a/src/core/cuda/cuda-reduction-abs-max_impl.cu
+++ b/src/core/cuda/cuda-reduction-abs-max_impl.cu
@@ -25,28 +25,28 @@
  */
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, int > >
-                                   ( const tnlParallelReductionAbsMax< char, int >& operation,
+                                   ( tnlParallelReductionAbsMax< char, int >& operation,
                                      const typename tnlParallelReductionAbsMax< char, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMax< char, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMax< char, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< char, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< int, int > >
-                                   ( const tnlParallelReductionAbsMax< int, int >& operation,
+                                   ( tnlParallelReductionAbsMax< int, int >& operation,
                                      const typename tnlParallelReductionAbsMax< int, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMax< int, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMax< int, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< int, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< float, int > >
-                                   ( const tnlParallelReductionAbsMax< float, int >& operation,
+                                   ( tnlParallelReductionAbsMax< float, int >& operation,
                                      const typename tnlParallelReductionAbsMax< float, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMax< float, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMax< float, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< float, int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, int > >
-                                   ( const tnlParallelReductionAbsMax< double, int>& operation,
+                                   ( tnlParallelReductionAbsMax< double, int>& operation,
                                      const typename tnlParallelReductionAbsMax< double, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMax< double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMax< double, int > :: RealType* deviceInput2,
@@ -54,7 +54,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, int > >
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, int > >
-                                   ( const tnlParallelReductionAbsMax< long double, int>& operation,
+                                   ( tnlParallelReductionAbsMax< long double, int>& operation,
                                      const typename tnlParallelReductionAbsMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput2,
@@ -62,7 +62,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, in
 #endif                                     
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, long int > >
-                                   ( const tnlParallelReductionAbsMax< char, long int >& operation,
+                                   ( tnlParallelReductionAbsMax< char, long int >& operation,
                                      const typename tnlParallelReductionAbsMax< char, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMax< char, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMax< char, long int > :: RealType* deviceInput2,
@@ -70,21 +70,21 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, long int
 
 #ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< int, long int > >
-                                   ( const tnlParallelReductionAbsMax< int, long int >& operation,
+                                   ( tnlParallelReductionAbsMax< int, long int >& operation,
                                      const typename tnlParallelReductionAbsMax< int, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMax< int, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMax< int, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< int, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< float, long int > >
-                                   ( const tnlParallelReductionAbsMax< float, long int >& operation,
+                                   ( tnlParallelReductionAbsMax< float, long int >& operation,
                                      const typename tnlParallelReductionAbsMax< float, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMax< float, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMax< float, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMax< float, long int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, long int > >
-                                   ( const tnlParallelReductionAbsMax< double, long int>& operation,
+                                   ( tnlParallelReductionAbsMax< double, long int>& operation,
                                      const typename tnlParallelReductionAbsMax< double, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMax< double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMax< double, long int > :: RealType* deviceInput2,
@@ -92,7 +92,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, long in
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, long int > >
-                                   ( const tnlParallelReductionAbsMax< long double, long int>& operation,
+                                   ( tnlParallelReductionAbsMax< long double, long int>& operation,
                                      const typename tnlParallelReductionAbsMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput2,
diff --git a/src/core/cuda/cuda-reduction-abs-min_impl.cu b/src/core/cuda/cuda-reduction-abs-min_impl.cu
index 629fa37ddc..21da19efa4 100644
--- a/src/core/cuda/cuda-reduction-abs-min_impl.cu
+++ b/src/core/cuda/cuda-reduction-abs-min_impl.cu
@@ -25,28 +25,28 @@
  */
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, int > >
-                                   ( const tnlParallelReductionAbsMin< char, int >& operation,
+                                   ( tnlParallelReductionAbsMin< char, int >& operation,
                                      const typename tnlParallelReductionAbsMin< char, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMin< char, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMin< char, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< char, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< int, int > >
-                                   ( const tnlParallelReductionAbsMin< int, int >& operation,
+                                   ( tnlParallelReductionAbsMin< int, int >& operation,
                                      const typename tnlParallelReductionAbsMin< int, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMin< int, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMin< int, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< int, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< float, int > >
-                                   ( const tnlParallelReductionAbsMin< float, int >& operation,
+                                   ( tnlParallelReductionAbsMin< float, int >& operation,
                                      const typename tnlParallelReductionAbsMin< float, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMin< float, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMin< float, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< float, int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, int > >
-                                   ( const tnlParallelReductionAbsMin< double, int>& operation,
+                                   ( tnlParallelReductionAbsMin< double, int>& operation,
                                      const typename tnlParallelReductionAbsMin< double, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMin< double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMin< double, int > :: RealType* deviceInput2,
@@ -54,7 +54,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, int > >
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, int > >
-                                   ( const tnlParallelReductionAbsMin< long double, int>& operation,
+                                   ( tnlParallelReductionAbsMin< long double, int>& operation,
                                      const typename tnlParallelReductionAbsMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput2,
@@ -62,7 +62,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, in
 #endif
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, long int > >
-                                   ( const tnlParallelReductionAbsMin< char, long int >& operation,
+                                   ( tnlParallelReductionAbsMin< char, long int >& operation,
                                      const typename tnlParallelReductionAbsMin< char, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMin< char, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMin< char, long int > :: RealType* deviceInput2,
@@ -70,21 +70,21 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, long int
 
 #ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< int, long int > >
-                                   ( const tnlParallelReductionAbsMin< int, long int >& operation,
+                                   ( tnlParallelReductionAbsMin< int, long int >& operation,
                                      const typename tnlParallelReductionAbsMin< int, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMin< int, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMin< int, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< int, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< float, long int > >
-                                   ( const tnlParallelReductionAbsMin< float, long int >& operation,
+                                   ( tnlParallelReductionAbsMin< float, long int >& operation,
                                      const typename tnlParallelReductionAbsMin< float, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMin< float, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMin< float, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsMin< float, long int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, long int > >
-                                   ( const tnlParallelReductionAbsMin< double, long int>& operation,
+                                   ( tnlParallelReductionAbsMin< double, long int>& operation,
                                      const typename tnlParallelReductionAbsMin< double, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMin< double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMin< double, long int > :: RealType* deviceInput2,
@@ -92,7 +92,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, long in
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, long int > >
-                                   ( const tnlParallelReductionAbsMin< long double, long int>& operation,
+                                   ( tnlParallelReductionAbsMin< long double, long int>& operation,
                                      const typename tnlParallelReductionAbsMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput2,
diff --git a/src/core/cuda/cuda-reduction-abs-sum_impl.cu b/src/core/cuda/cuda-reduction-abs-sum_impl.cu
index a023631a03..96c8ab7a9c 100644
--- a/src/core/cuda/cuda-reduction-abs-sum_impl.cu
+++ b/src/core/cuda/cuda-reduction-abs-sum_impl.cu
@@ -25,28 +25,28 @@
  */
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, int > >
-                                   ( const tnlParallelReductionAbsSum< char, int >& operation,
+                                   ( tnlParallelReductionAbsSum< char, int >& operation,
                                      const typename tnlParallelReductionAbsSum< char, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsSum< char, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsSum< char, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< char, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< int, int > >
-                                   ( const tnlParallelReductionAbsSum< int, int >& operation,
+                                   ( tnlParallelReductionAbsSum< int, int >& operation,
                                      const typename tnlParallelReductionAbsSum< int, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsSum< int, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsSum< int, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< int, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< float, int > >
-                                   ( const tnlParallelReductionAbsSum< float, int >& operation,
+                                   ( tnlParallelReductionAbsSum< float, int >& operation,
                                      const typename tnlParallelReductionAbsSum< float, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsSum< float, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsSum< float, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< float, int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, int > >
-                                   ( const tnlParallelReductionAbsSum< double, int>& operation,
+                                   ( tnlParallelReductionAbsSum< double, int>& operation,
                                      const typename tnlParallelReductionAbsSum< double, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsSum< double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsSum< double, int > :: RealType* deviceInput2,
@@ -54,7 +54,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, int > >
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, int > >
-                                   ( const tnlParallelReductionAbsSum< long double, int>& operation,
+                                   ( tnlParallelReductionAbsSum< long double, int>& operation,
                                      const typename tnlParallelReductionAbsSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionAbsSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsSum< long double, int > :: RealType* deviceInput2,
@@ -62,7 +62,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, in
 #endif
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, long int > >
-                                   ( const tnlParallelReductionAbsSum< char, long int >& operation,
+                                   ( tnlParallelReductionAbsSum< char, long int >& operation,
                                      const typename tnlParallelReductionAbsSum< char, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsSum< char, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsSum< char, long int > :: RealType* deviceInput2,
@@ -70,21 +70,21 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, long int
 
 #ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< int, long int > >
-                                   ( const tnlParallelReductionAbsSum< int, long int >& operation,
+                                   ( tnlParallelReductionAbsSum< int, long int >& operation,
                                      const typename tnlParallelReductionAbsSum< int, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsSum< int, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsSum< int, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< int, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< float, long int > >
-                                   ( const tnlParallelReductionAbsSum< float, long int >& operation,
+                                   ( tnlParallelReductionAbsSum< float, long int >& operation,
                                      const typename tnlParallelReductionAbsSum< float, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsSum< float, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsSum< float, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionAbsSum< float, long int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, long int > >
-                                   ( const tnlParallelReductionAbsSum< double, long int>& operation,
+                                   ( tnlParallelReductionAbsSum< double, long int>& operation,
                                      const typename tnlParallelReductionAbsSum< double, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsSum< double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsSum< double, long int > :: RealType* deviceInput2,
@@ -92,7 +92,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, long in
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, long int > >
-                                   ( const tnlParallelReductionAbsSum< long double, long int>& operation,
+                                   ( tnlParallelReductionAbsSum< long double, long int>& operation,
                                      const typename tnlParallelReductionAbsSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionAbsSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionAbsSum< long double, long int > :: RealType* deviceInput2,
diff --git a/src/core/cuda/cuda-reduction-and_impl.cu b/src/core/cuda/cuda-reduction-and_impl.cu
index ac71e46e19..1baa1699f1 100644
--- a/src/core/cuda/cuda-reduction-and_impl.cu
+++ b/src/core/cuda/cuda-reduction-and_impl.cu
@@ -24,28 +24,28 @@
  * Logical AND
  */
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< char, int > >
-                                   ( const tnlParallelReductionLogicalAnd< char, int >& operation,
+                                   ( tnlParallelReductionLogicalAnd< char, int >& operation,
                                      const typename tnlParallelReductionLogicalAnd< char, int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalAnd< char, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalAnd< char, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalAnd< char, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< int, int > >
-                                   ( const tnlParallelReductionLogicalAnd< int, int >& operation,
+                                   ( tnlParallelReductionLogicalAnd< int, int >& operation,
                                      const typename tnlParallelReductionLogicalAnd< int, int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalAnd< int, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalAnd< int, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalAnd< int, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< float, int > >
-                                   ( const tnlParallelReductionLogicalAnd< float, int >& operation,
+                                   ( tnlParallelReductionLogicalAnd< float, int >& operation,
                                      const typename tnlParallelReductionLogicalAnd< float, int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalAnd< float, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalAnd< float, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalAnd< float, int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, int > >
-                                   ( const tnlParallelReductionLogicalAnd< double, int>& operation,
+                                   ( tnlParallelReductionLogicalAnd< double, int>& operation,
                                      const typename tnlParallelReductionLogicalAnd< double, int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalAnd< double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalAnd< double, int > :: RealType* deviceInput2,
@@ -53,7 +53,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, int
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, int > >
-                                   ( const tnlParallelReductionLogicalAnd< long double, int>& operation,
+                                   ( tnlParallelReductionLogicalAnd< long double, int>& operation,
                                      const typename tnlParallelReductionLogicalAnd< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalAnd< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalAnd< long double, int > :: RealType* deviceInput2,
@@ -62,28 +62,28 @@ template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double
 
 #ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< char, long int > >
-                                   ( const tnlParallelReductionLogicalAnd< char, long int >& operation,
+                                   ( tnlParallelReductionLogicalAnd< char, long int >& operation,
                                      const typename tnlParallelReductionLogicalAnd< char, long int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalAnd< char, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalAnd< char, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalAnd< char, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< int, long int > >
-                                   ( const tnlParallelReductionLogicalAnd< int, long int >& operation,
+                                   ( tnlParallelReductionLogicalAnd< int, long int >& operation,
                                      const typename tnlParallelReductionLogicalAnd< int, long int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalAnd< int, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalAnd< int, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalAnd< int, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< float, long int > >
-                                   ( const tnlParallelReductionLogicalAnd< float, long int >& operation,
+                                   ( tnlParallelReductionLogicalAnd< float, long int >& operation,
                                      const typename tnlParallelReductionLogicalAnd< float, long int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalAnd< float, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalAnd< float, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalAnd< float, long int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, long int > >
-                                   ( const tnlParallelReductionLogicalAnd< double, long int>& operation,
+                                   ( tnlParallelReductionLogicalAnd< double, long int>& operation,
                                      const typename tnlParallelReductionLogicalAnd< double, long int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalAnd< double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalAnd< double, long int > :: RealType* deviceInput2,
@@ -91,7 +91,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, lon
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, long int > >
-                                   ( const tnlParallelReductionLogicalAnd< long double, long int>& operation,
+                                   ( tnlParallelReductionLogicalAnd< long double, long int>& operation,
                                      const typename tnlParallelReductionLogicalAnd< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalAnd< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalAnd< long double, long int > :: RealType* deviceInput2,
diff --git a/src/core/cuda/cuda-reduction-diff-abs-max_impl.cu b/src/core/cuda/cuda-reduction-diff-abs-max_impl.cu
index 291810ef74..a65ec09dee 100644
--- a/src/core/cuda/cuda-reduction-diff-abs-max_impl.cu
+++ b/src/core/cuda/cuda-reduction-diff-abs-max_impl.cu
@@ -25,28 +25,28 @@
  */
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< char, int > >
-                                   ( const tnlParallelReductionDiffAbsMax< char, int >& operation,
+                                   ( tnlParallelReductionDiffAbsMax< char, int >& operation,
                                      const typename tnlParallelReductionDiffAbsMax< char, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMax< char, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMax< char, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMax< char, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< int, int > >
-                                   ( const tnlParallelReductionDiffAbsMax< int, int >& operation,
+                                   ( tnlParallelReductionDiffAbsMax< int, int >& operation,
                                      const typename tnlParallelReductionDiffAbsMax< int, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMax< int, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMax< int, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMax< int, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< float, int > >
-                                   ( const tnlParallelReductionDiffAbsMax< float, int >& operation,
+                                   ( tnlParallelReductionDiffAbsMax< float, int >& operation,
                                      const typename tnlParallelReductionDiffAbsMax< float, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMax< float, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMax< float, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMax< float, int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< double, int > >
-                                   ( const tnlParallelReductionDiffAbsMax< double, int>& operation,
+                                   ( tnlParallelReductionDiffAbsMax< double, int>& operation,
                                      const typename tnlParallelReductionDiffAbsMax< double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMax< double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMax< double, int > :: RealType* deviceInput2,
@@ -54,7 +54,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< double, int
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< long double, int > >
-                                   ( const tnlParallelReductionDiffAbsMax< long double, int>& operation,
+                                   ( tnlParallelReductionDiffAbsMax< long double, int>& operation,
                                      const typename tnlParallelReductionDiffAbsMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMax< long double, int > :: RealType* deviceInput2,
@@ -63,28 +63,28 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< long double
 
 #ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< char, long int > >
-                                   ( const tnlParallelReductionDiffAbsMax< char, long int >& operation,
+                                   ( tnlParallelReductionDiffAbsMax< char, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsMax< char, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMax< char, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMax< char, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMax< char, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< int, long int > >
-                                   ( const tnlParallelReductionDiffAbsMax< int, long int >& operation,
+                                   ( tnlParallelReductionDiffAbsMax< int, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsMax< int, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMax< int, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMax< int, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMax< int, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< float, long int > >
-                                   ( const tnlParallelReductionDiffAbsMax< float, long int >& operation,
+                                   ( tnlParallelReductionDiffAbsMax< float, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsMax< float, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMax< float, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMax< float, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMax< float, long int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< double, long int > >
-                                   ( const tnlParallelReductionDiffAbsMax< double, long int>& operation,
+                                   ( tnlParallelReductionDiffAbsMax< double, long int>& operation,
                                      const typename tnlParallelReductionDiffAbsMax< double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMax< double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMax< double, long int > :: RealType* deviceInput2,
@@ -92,7 +92,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< double, lon
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMax< long double, long int > >
-                                   ( const tnlParallelReductionDiffAbsMax< long double, long int>& operation,
+                                   ( tnlParallelReductionDiffAbsMax< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffAbsMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMax< long double, long int > :: RealType* deviceInput2,
diff --git a/src/core/cuda/cuda-reduction-diff-abs-min_impl.cu b/src/core/cuda/cuda-reduction-diff-abs-min_impl.cu
index d9ce714abd..9610ee2675 100644
--- a/src/core/cuda/cuda-reduction-diff-abs-min_impl.cu
+++ b/src/core/cuda/cuda-reduction-diff-abs-min_impl.cu
@@ -26,28 +26,28 @@
  */
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< char, int > >
-                                   ( const tnlParallelReductionDiffAbsMin< char, int >& operation,
+                                   ( tnlParallelReductionDiffAbsMin< char, int >& operation,
                                      const typename tnlParallelReductionDiffAbsMin< char, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMin< char, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMin< char, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMin< char, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< int, int > >
-                                   ( const tnlParallelReductionDiffAbsMin< int, int >& operation,
+                                   ( tnlParallelReductionDiffAbsMin< int, int >& operation,
                                      const typename tnlParallelReductionDiffAbsMin< int, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMin< int, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMin< int, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMin< int, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< float, int > >
-                                   ( const tnlParallelReductionDiffAbsMin< float, int >& operation,
+                                   ( tnlParallelReductionDiffAbsMin< float, int >& operation,
                                      const typename tnlParallelReductionDiffAbsMin< float, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMin< float, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMin< float, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMin< float, int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< double, int > >
-                                   ( const tnlParallelReductionDiffAbsMin< double, int>& operation,
+                                   ( tnlParallelReductionDiffAbsMin< double, int>& operation,
                                      const typename tnlParallelReductionDiffAbsMin< double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMin< double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMin< double, int > :: RealType* deviceInput2,
@@ -55,7 +55,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< double, int
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< long double, int > >
-                                   ( const tnlParallelReductionDiffAbsMin< long double, int>& operation,
+                                   ( tnlParallelReductionDiffAbsMin< long double, int>& operation,
                                      const typename tnlParallelReductionDiffAbsMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMin< long double, int > :: RealType* deviceInput2,
@@ -64,28 +64,28 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< long double
 
 #ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< char, long int > >
-                                   ( const tnlParallelReductionDiffAbsMin< char, long int >& operation,
+                                   ( tnlParallelReductionDiffAbsMin< char, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsMin< char, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMin< char, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMin< char, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMin< char, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< int, long int > >
-                                   ( const tnlParallelReductionDiffAbsMin< int, long int >& operation,
+                                   ( tnlParallelReductionDiffAbsMin< int, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsMin< int, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMin< int, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMin< int, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMin< int, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< float, long int > >
-                                   ( const tnlParallelReductionDiffAbsMin< float, long int >& operation,
+                                   ( tnlParallelReductionDiffAbsMin< float, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsMin< float, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMin< float, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMin< float, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsMin< float, long int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< double, long int > >
-                                   ( const tnlParallelReductionDiffAbsMin< double, long int>& operation,
+                                   ( tnlParallelReductionDiffAbsMin< double, long int>& operation,
                                      const typename tnlParallelReductionDiffAbsMin< double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMin< double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMin< double, long int > :: RealType* deviceInput2,
@@ -93,7 +93,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< double, lon
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsMin< long double, long int > >
-                                   ( const tnlParallelReductionDiffAbsMin< long double, long int>& operation,
+                                   ( tnlParallelReductionDiffAbsMin< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffAbsMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsMin< long double, long int > :: RealType* deviceInput2,
diff --git a/src/core/cuda/cuda-reduction-diff-abs-sum_impl.cu b/src/core/cuda/cuda-reduction-diff-abs-sum_impl.cu
index 5298d03349..b70f953add 100644
--- a/src/core/cuda/cuda-reduction-diff-abs-sum_impl.cu
+++ b/src/core/cuda/cuda-reduction-diff-abs-sum_impl.cu
@@ -25,28 +25,28 @@
  */
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< char, int > >
-                                   ( const tnlParallelReductionDiffAbsSum< char, int >& operation,
+                                   ( tnlParallelReductionDiffAbsSum< char, int >& operation,
                                      const typename tnlParallelReductionDiffAbsSum< char, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsSum< char, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsSum< char, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsSum< char, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< int, int > >
-                                   ( const tnlParallelReductionDiffAbsSum< int, int >& operation,
+                                   ( tnlParallelReductionDiffAbsSum< int, int >& operation,
                                      const typename tnlParallelReductionDiffAbsSum< int, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsSum< int, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsSum< int, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsSum< int, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< float, int > >
-                                   ( const tnlParallelReductionDiffAbsSum< float, int >& operation,
+                                   ( tnlParallelReductionDiffAbsSum< float, int >& operation,
                                      const typename tnlParallelReductionDiffAbsSum< float, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsSum< float, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsSum< float, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsSum< float, int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< double, int > >
-                                   ( const tnlParallelReductionDiffAbsSum< double, int>& operation,
+                                   ( tnlParallelReductionDiffAbsSum< double, int>& operation,
                                      const typename tnlParallelReductionDiffAbsSum< double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsSum< double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsSum< double, int > :: RealType* deviceInput2,
@@ -54,7 +54,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< double, int
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< long double, int > >
-                                   ( const tnlParallelReductionDiffAbsSum< long double, int>& operation,
+                                   ( tnlParallelReductionDiffAbsSum< long double, int>& operation,
                                      const typename tnlParallelReductionDiffAbsSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsSum< long double, int > :: RealType* deviceInput2,
@@ -63,28 +63,28 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< long double
 
 #ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< char, long int > >
-                                   ( const tnlParallelReductionDiffAbsSum< char, long int >& operation,
+                                   ( tnlParallelReductionDiffAbsSum< char, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsSum< char, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsSum< char, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsSum< char, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsSum< char, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< int, long int > >
-                                   ( const tnlParallelReductionDiffAbsSum< int, long int >& operation,
+                                   ( tnlParallelReductionDiffAbsSum< int, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsSum< int, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsSum< int, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsSum< int, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsSum< int, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< float, long int > >
-                                   ( const tnlParallelReductionDiffAbsSum< float, long int >& operation,
+                                   ( tnlParallelReductionDiffAbsSum< float, long int >& operation,
                                      const typename tnlParallelReductionDiffAbsSum< float, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsSum< float, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsSum< float, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffAbsSum< float, long int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< double, long int > >
-                                   ( const tnlParallelReductionDiffAbsSum< double, long int>& operation,
+                                   ( tnlParallelReductionDiffAbsSum< double, long int>& operation,
                                      const typename tnlParallelReductionDiffAbsSum< double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsSum< double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsSum< double, long int > :: RealType* deviceInput2,
@@ -92,7 +92,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< double, lon
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffAbsSum< long double, long int > >
-                                   ( const tnlParallelReductionDiffAbsSum< long double, long int>& operation,
+                                   ( tnlParallelReductionDiffAbsSum< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffAbsSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffAbsSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffAbsSum< long double, long int > :: RealType* deviceInput2,
diff --git a/src/core/cuda/cuda-reduction-diff-l2-norm_impl.cu b/src/core/cuda/cuda-reduction-diff-l2-norm_impl.cu
index 4f3e95b749..a6bfae5b94 100644
--- a/src/core/cuda/cuda-reduction-diff-l2-norm_impl.cu
+++ b/src/core/cuda/cuda-reduction-diff-l2-norm_impl.cu
@@ -24,14 +24,14 @@
  * Diff L2 Norm
  */
 template bool reductionOnCudaDevice< tnlParallelReductionDiffL2Norm< float, int > >
-                                   ( const tnlParallelReductionDiffL2Norm< float, int >& operation,
+                                   ( tnlParallelReductionDiffL2Norm< float, int >& operation,
                                      const typename tnlParallelReductionDiffL2Norm< float, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffL2Norm< float, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffL2Norm< float, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffL2Norm< float, int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffL2Norm< double, int > >
-                                   ( const tnlParallelReductionDiffL2Norm< double, int>& operation,
+                                   ( tnlParallelReductionDiffL2Norm< double, int>& operation,
                                      const typename tnlParallelReductionDiffL2Norm< double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffL2Norm< double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffL2Norm< double, int > :: RealType* deviceInput2,
@@ -39,7 +39,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffL2Norm< double, int
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffL2Norm< long double, int > >
-                                   ( const tnlParallelReductionDiffL2Norm< long double, int>& operation,
+                                   ( tnlParallelReductionDiffL2Norm< long double, int>& operation,
                                      const typename tnlParallelReductionDiffL2Norm< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffL2Norm< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffL2Norm< long double, int > :: RealType* deviceInput2,
@@ -48,28 +48,28 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffL2Norm< long double
 
 #ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffL2Norm< char, long int > >
-                                   ( const tnlParallelReductionDiffL2Norm< char, long int >& operation,
+                                   ( tnlParallelReductionDiffL2Norm< char, long int >& operation,
                                      const typename tnlParallelReductionDiffL2Norm< char, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffL2Norm< char, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffL2Norm< char, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffL2Norm< char, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffL2Norm< int, long int > >
-                                   ( const tnlParallelReductionDiffL2Norm< int, long int >& operation,
+                                   ( tnlParallelReductionDiffL2Norm< int, long int >& operation,
                                      const typename tnlParallelReductionDiffL2Norm< int, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffL2Norm< int, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffL2Norm< int, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffL2Norm< int, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffL2Norm< float, long int > >
-                                   ( const tnlParallelReductionDiffL2Norm< float, long int >& operation,
+                                   ( tnlParallelReductionDiffL2Norm< float, long int >& operation,
                                      const typename tnlParallelReductionDiffL2Norm< float, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffL2Norm< float, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffL2Norm< float, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffL2Norm< float, long int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffL2Norm< double, long int > >
-                                   ( const tnlParallelReductionDiffL2Norm< double, long int>& operation,
+                                   ( tnlParallelReductionDiffL2Norm< double, long int>& operation,
                                      const typename tnlParallelReductionDiffL2Norm< double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffL2Norm< double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffL2Norm< double, long int > :: RealType* deviceInput2,
@@ -77,7 +77,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffL2Norm< double, lon
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffL2Norm< long double, long int > >
-                                   ( const tnlParallelReductionDiffL2Norm< long double, long int>& operation,
+                                   ( tnlParallelReductionDiffL2Norm< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffL2Norm< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffL2Norm< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffL2Norm< long double, long int > :: RealType* deviceInput2,
diff --git a/src/core/cuda/cuda-reduction-diff-lp-norm_impl.cu b/src/core/cuda/cuda-reduction-diff-lp-norm_impl.cu
index 2359564477..a762c1d194 100644
--- a/src/core/cuda/cuda-reduction-diff-lp-norm_impl.cu
+++ b/src/core/cuda/cuda-reduction-diff-lp-norm_impl.cu
@@ -24,14 +24,14 @@
  * Diff Lp Norm
  */
 template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< float, int > >
-                                   ( const tnlParallelReductionDiffLpNorm< float, int >& operation,
+                                   ( tnlParallelReductionDiffLpNorm< float, int >& operation,
                                      const typename tnlParallelReductionDiffLpNorm< float, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffLpNorm< float, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffLpNorm< float, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffLpNorm< float, int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< double, int > >
-                                   ( const tnlParallelReductionDiffLpNorm< double, int>& operation,
+                                   ( tnlParallelReductionDiffLpNorm< double, int>& operation,
                                      const typename tnlParallelReductionDiffLpNorm< double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffLpNorm< double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffLpNorm< double, int > :: RealType* deviceInput2,
@@ -39,7 +39,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< double, int
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< long double, int > >
-                                   ( const tnlParallelReductionDiffLpNorm< long double, int>& operation,
+                                   ( tnlParallelReductionDiffLpNorm< long double, int>& operation,
                                      const typename tnlParallelReductionDiffLpNorm< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffLpNorm< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffLpNorm< long double, int > :: RealType* deviceInput2,
@@ -48,28 +48,28 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< long double
 
 #ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< char, long int > >
-                                   ( const tnlParallelReductionDiffLpNorm< char, long int >& operation,
+                                   ( tnlParallelReductionDiffLpNorm< char, long int >& operation,
                                      const typename tnlParallelReductionDiffLpNorm< char, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffLpNorm< char, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffLpNorm< char, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffLpNorm< char, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< int, long int > >
-                                   ( const tnlParallelReductionDiffLpNorm< int, long int >& operation,
+                                   ( tnlParallelReductionDiffLpNorm< int, long int >& operation,
                                      const typename tnlParallelReductionDiffLpNorm< int, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffLpNorm< int, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffLpNorm< int, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffLpNorm< int, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< float, long int > >
-                                   ( const tnlParallelReductionDiffLpNorm< float, long int >& operation,
+                                   ( tnlParallelReductionDiffLpNorm< float, long int >& operation,
                                      const typename tnlParallelReductionDiffLpNorm< float, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffLpNorm< float, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffLpNorm< float, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffLpNorm< float, long int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< double, long int > >
-                                   ( const tnlParallelReductionDiffLpNorm< double, long int>& operation,
+                                   ( tnlParallelReductionDiffLpNorm< double, long int>& operation,
                                      const typename tnlParallelReductionDiffLpNorm< double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffLpNorm< double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffLpNorm< double, long int > :: RealType* deviceInput2,
@@ -77,7 +77,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< double, lon
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffLpNorm< long double, long int > >
-                                   ( const tnlParallelReductionDiffLpNorm< long double, long int>& operation,
+                                   ( tnlParallelReductionDiffLpNorm< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffLpNorm< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffLpNorm< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffLpNorm< long double, long int > :: RealType* deviceInput2,
diff --git a/src/core/cuda/cuda-reduction-diff-max_impl.cu b/src/core/cuda/cuda-reduction-diff-max_impl.cu
index fe91ae6ef6..822d9e779c 100644
--- a/src/core/cuda/cuda-reduction-diff-max_impl.cu
+++ b/src/core/cuda/cuda-reduction-diff-max_impl.cu
@@ -25,28 +25,28 @@
  */
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< char, int > >
-                                   ( const tnlParallelReductionDiffMax< char, int >& operation,
+                                   ( tnlParallelReductionDiffMax< char, int >& operation,
                                      const typename tnlParallelReductionDiffMax< char, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMax< char, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMax< char, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMax< char, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< int, int > >
-                                   ( const tnlParallelReductionDiffMax< int, int >& operation,
+                                   ( tnlParallelReductionDiffMax< int, int >& operation,
                                      const typename tnlParallelReductionDiffMax< int, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMax< int, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMax< int, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMax< int, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< float, int > >
-                                   ( const tnlParallelReductionDiffMax< float, int >& operation,
+                                   ( tnlParallelReductionDiffMax< float, int >& operation,
                                      const typename tnlParallelReductionDiffMax< float, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMax< float, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMax< float, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMax< float, int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< double, int > >
-                                   ( const tnlParallelReductionDiffMax< double, int>& operation,
+                                   ( tnlParallelReductionDiffMax< double, int>& operation,
                                      const typename tnlParallelReductionDiffMax< double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMax< double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMax< double, int > :: RealType* deviceInput2,
@@ -54,7 +54,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< double, int >
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< long double, int > >
-                                   ( const tnlParallelReductionDiffMax< long double, int>& operation,
+                                   ( tnlParallelReductionDiffMax< long double, int>& operation,
                                      const typename tnlParallelReductionDiffMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMax< long double, int > :: RealType* deviceInput2,
@@ -63,28 +63,28 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< long double, i
 
 #ifdef INSTANTIATE_LONG_INT                                     
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< char, long int > >
-                                   ( const tnlParallelReductionDiffMax< char, long int >& operation,
+                                   ( tnlParallelReductionDiffMax< char, long int >& operation,
                                      const typename tnlParallelReductionDiffMax< char, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMax< char, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMax< char, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMax< char, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< int, long int > >
-                                   ( const tnlParallelReductionDiffMax< int, long int >& operation,
+                                   ( tnlParallelReductionDiffMax< int, long int >& operation,
                                      const typename tnlParallelReductionDiffMax< int, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMax< int, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMax< int, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMax< int, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< float, long int > >
-                                   ( const tnlParallelReductionDiffMax< float, long int >& operation,
+                                   ( tnlParallelReductionDiffMax< float, long int >& operation,
                                      const typename tnlParallelReductionDiffMax< float, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMax< float, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMax< float, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMax< float, long int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< double, long int > >
-                                   ( const tnlParallelReductionDiffMax< double, long int>& operation,
+                                   ( tnlParallelReductionDiffMax< double, long int>& operation,
                                      const typename tnlParallelReductionDiffMax< double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMax< double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMax< double, long int > :: RealType* deviceInput2,
@@ -92,7 +92,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< double, long i
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMax< long double, long int > >
-                                   ( const tnlParallelReductionDiffMax< long double, long int>& operation,
+                                   ( tnlParallelReductionDiffMax< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMax< long double, long int > :: RealType* deviceInput2,
diff --git a/src/core/cuda/cuda-reduction-diff-min_impl.cu b/src/core/cuda/cuda-reduction-diff-min_impl.cu
index ed13335b8b..5973ad0bcf 100644
--- a/src/core/cuda/cuda-reduction-diff-min_impl.cu
+++ b/src/core/cuda/cuda-reduction-diff-min_impl.cu
@@ -26,28 +26,28 @@
  */
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< char, int > >
-                                   ( const tnlParallelReductionDiffMin< char, int >& operation,
+                                   ( tnlParallelReductionDiffMin< char, int >& operation,
                                      const typename tnlParallelReductionDiffMin< char, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMin< char, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMin< char, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMin< char, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< int, int > >
-                                   ( const tnlParallelReductionDiffMin< int, int >& operation,
+                                   ( tnlParallelReductionDiffMin< int, int >& operation,
                                      const typename tnlParallelReductionDiffMin< int, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMin< int, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMin< int, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMin< int, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< float, int > >
-                                   ( const tnlParallelReductionDiffMin< float, int >& operation,
+                                   ( tnlParallelReductionDiffMin< float, int >& operation,
                                      const typename tnlParallelReductionDiffMin< float, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMin< float, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMin< float, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMin< float, int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< double, int > >
-                                   ( const tnlParallelReductionDiffMin< double, int>& operation,
+                                   ( tnlParallelReductionDiffMin< double, int>& operation,
                                      const typename tnlParallelReductionDiffMin< double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMin< double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMin< double, int > :: RealType* deviceInput2,
@@ -55,7 +55,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< double, int >
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< long double, int > >
-                                   ( const tnlParallelReductionDiffMin< long double, int>& operation,
+                                   ( tnlParallelReductionDiffMin< long double, int>& operation,
                                      const typename tnlParallelReductionDiffMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMin< long double, int > :: RealType* deviceInput2,
@@ -64,28 +64,28 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< long double, i
 
 #ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< char, long int > >
-                                   ( const tnlParallelReductionDiffMin< char, long int >& operation,
+                                   ( tnlParallelReductionDiffMin< char, long int >& operation,
                                      const typename tnlParallelReductionDiffMin< char, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMin< char, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMin< char, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMin< char, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< int, long int > >
-                                   ( const tnlParallelReductionDiffMin< int, long int >& operation,
+                                   ( tnlParallelReductionDiffMin< int, long int >& operation,
                                      const typename tnlParallelReductionDiffMin< int, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMin< int, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMin< int, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMin< int, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< float, long int > >
-                                   ( const tnlParallelReductionDiffMin< float, long int >& operation,
+                                   ( tnlParallelReductionDiffMin< float, long int >& operation,
                                      const typename tnlParallelReductionDiffMin< float, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMin< float, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMin< float, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffMin< float, long int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< double, long int > >
-                                   ( const tnlParallelReductionDiffMin< double, long int>& operation,
+                                   ( tnlParallelReductionDiffMin< double, long int>& operation,
                                      const typename tnlParallelReductionDiffMin< double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMin< double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMin< double, long int > :: RealType* deviceInput2,
@@ -93,7 +93,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< double, long i
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffMin< long double, long int > >
-                                   ( const tnlParallelReductionDiffMin< long double, long int>& operation,
+                                   ( tnlParallelReductionDiffMin< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffMin< long double, long int > :: RealType* deviceInput2,
diff --git a/src/core/cuda/cuda-reduction-diff-sum_impl.cu b/src/core/cuda/cuda-reduction-diff-sum_impl.cu
index aa08778ea7..8f2eb27587 100644
--- a/src/core/cuda/cuda-reduction-diff-sum_impl.cu
+++ b/src/core/cuda/cuda-reduction-diff-sum_impl.cu
@@ -25,28 +25,28 @@
  */
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< char, int > >
-                                   ( const tnlParallelReductionDiffSum< char, int >& operation,
+                                   ( tnlParallelReductionDiffSum< char, int >& operation,
                                      const typename tnlParallelReductionDiffSum< char, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffSum< char, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffSum< char, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffSum< char, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< int, int > >
-                                   ( const tnlParallelReductionDiffSum< int, int >& operation,
+                                   ( tnlParallelReductionDiffSum< int, int >& operation,
                                      const typename tnlParallelReductionDiffSum< int, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffSum< int, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffSum< int, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffSum< int, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< float, int > >
-                                   ( const tnlParallelReductionDiffSum< float, int >& operation,
+                                   ( tnlParallelReductionDiffSum< float, int >& operation,
                                      const typename tnlParallelReductionDiffSum< float, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffSum< float, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffSum< float, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffSum< float, int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< double, int > >
-                                   ( const tnlParallelReductionDiffSum< double, int>& operation,
+                                   ( tnlParallelReductionDiffSum< double, int>& operation,
                                      const typename tnlParallelReductionDiffSum< double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffSum< double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffSum< double, int > :: RealType* deviceInput2,
@@ -54,7 +54,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< double, int >
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< long double, int > >
-                                   ( const tnlParallelReductionDiffSum< long double, int>& operation,
+                                   ( tnlParallelReductionDiffSum< long double, int>& operation,
                                      const typename tnlParallelReductionDiffSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionDiffSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffSum< long double, int > :: RealType* deviceInput2,
@@ -63,28 +63,28 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< long double, i
 
 #ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< char, long int > >
-                                   ( const tnlParallelReductionDiffSum< char, long int >& operation,
+                                   ( tnlParallelReductionDiffSum< char, long int >& operation,
                                      const typename tnlParallelReductionDiffSum< char, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffSum< char, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffSum< char, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffSum< char, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< int, long int > >
-                                   ( const tnlParallelReductionDiffSum< int, long int >& operation,
+                                   ( tnlParallelReductionDiffSum< int, long int >& operation,
                                      const typename tnlParallelReductionDiffSum< int, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffSum< int, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffSum< int, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffSum< int, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< float, long int > >
-                                   ( const tnlParallelReductionDiffSum< float, long int >& operation,
+                                   ( tnlParallelReductionDiffSum< float, long int >& operation,
                                      const typename tnlParallelReductionDiffSum< float, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffSum< float, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffSum< float, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionDiffSum< float, long int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< double, long int > >
-                                   ( const tnlParallelReductionDiffSum< double, long int>& operation,
+                                   ( tnlParallelReductionDiffSum< double, long int>& operation,
                                      const typename tnlParallelReductionDiffSum< double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffSum< double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffSum< double, long int > :: RealType* deviceInput2,
@@ -92,7 +92,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< double, long i
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionDiffSum< long double, long int > >
-                                   ( const tnlParallelReductionDiffSum< long double, long int>& operation,
+                                   ( tnlParallelReductionDiffSum< long double, long int>& operation,
                                      const typename tnlParallelReductionDiffSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionDiffSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionDiffSum< long double, long int > :: RealType* deviceInput2,
diff --git a/src/core/cuda/cuda-reduction-equalities_impl.cu b/src/core/cuda/cuda-reduction-equalities_impl.cu
index 6bf7f02630..398012099e 100644
--- a/src/core/cuda/cuda-reduction-equalities_impl.cu
+++ b/src/core/cuda/cuda-reduction-equalities_impl.cu
@@ -24,28 +24,28 @@
  * Equalities
  */
 template bool reductionOnCudaDevice< tnlParallelReductionEqualities< char, int > >
-                                   ( const tnlParallelReductionEqualities< char, int >& operation,
+                                   ( tnlParallelReductionEqualities< char, int >& operation,
                                      const typename tnlParallelReductionEqualities< char, int > :: IndexType size,
                                      const typename tnlParallelReductionEqualities< char, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionEqualities< char, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionEqualities< char, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionEqualities< int, int > >
-                                   ( const tnlParallelReductionEqualities< int, int >& operation,
+                                   ( tnlParallelReductionEqualities< int, int >& operation,
                                      const typename tnlParallelReductionEqualities< int, int > :: IndexType size,
                                      const typename tnlParallelReductionEqualities< int, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionEqualities< int, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionEqualities< int, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionEqualities< float, int > >
-                                   ( const tnlParallelReductionEqualities< float, int >& operation,
+                                   ( tnlParallelReductionEqualities< float, int >& operation,
                                      const typename tnlParallelReductionEqualities< float, int > :: IndexType size,
                                      const typename tnlParallelReductionEqualities< float, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionEqualities< float, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionEqualities< float, int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, int > >
-                                   ( const tnlParallelReductionEqualities< double, int>& operation,
+                                   ( tnlParallelReductionEqualities< double, int>& operation,
                                      const typename tnlParallelReductionEqualities< double, int > :: IndexType size,
                                      const typename tnlParallelReductionEqualities< double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionEqualities< double, int > :: RealType* deviceInput2,
@@ -53,7 +53,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, int
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, int > >
-                                   ( const tnlParallelReductionEqualities< long double, int>& operation,
+                                   ( tnlParallelReductionEqualities< long double, int>& operation,
                                      const typename tnlParallelReductionEqualities< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput2,
@@ -62,28 +62,28 @@ template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double
 
 #ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionEqualities< char, long int > >
-                                   ( const tnlParallelReductionEqualities< char, long int >& operation,
+                                   ( tnlParallelReductionEqualities< char, long int >& operation,
                                      const typename tnlParallelReductionEqualities< char, long int > :: IndexType size,
                                      const typename tnlParallelReductionEqualities< char, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionEqualities< char, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionEqualities< char, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionEqualities< int, long int > >
-                                   ( const tnlParallelReductionEqualities< int, long int >& operation,
+                                   ( tnlParallelReductionEqualities< int, long int >& operation,
                                      const typename tnlParallelReductionEqualities< int, long int > :: IndexType size,
                                      const typename tnlParallelReductionEqualities< int, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionEqualities< int, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionEqualities< int, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionEqualities< float, long int > >
-                                   ( const tnlParallelReductionEqualities< float, long int >& operation,
+                                   ( tnlParallelReductionEqualities< float, long int >& operation,
                                      const typename tnlParallelReductionEqualities< float, long int > :: IndexType size,
                                      const typename tnlParallelReductionEqualities< float, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionEqualities< float, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionEqualities< float, long int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, long int > >
-                                   ( const tnlParallelReductionEqualities< double, long int>& operation,
+                                   ( tnlParallelReductionEqualities< double, long int>& operation,
                                      const typename tnlParallelReductionEqualities< double, long int > :: IndexType size,
                                      const typename tnlParallelReductionEqualities< double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionEqualities< double, long int > :: RealType* deviceInput2,
@@ -91,7 +91,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, lon
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, long int > >
-                                   ( const tnlParallelReductionEqualities< long double, long int>& operation,
+                                   ( tnlParallelReductionEqualities< long double, long int>& operation,
                                      const typename tnlParallelReductionEqualities< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput2,
diff --git a/src/core/cuda/cuda-reduction-inequalities_impl.cu b/src/core/cuda/cuda-reduction-inequalities_impl.cu
index 828c88af10..479d9b2a31 100644
--- a/src/core/cuda/cuda-reduction-inequalities_impl.cu
+++ b/src/core/cuda/cuda-reduction-inequalities_impl.cu
@@ -24,28 +24,28 @@
  * Inequalities
  */
 template bool reductionOnCudaDevice< tnlParallelReductionInequalities< char, int > >
-                                   ( const tnlParallelReductionInequalities< char, int >& operation,
+                                   ( tnlParallelReductionInequalities< char, int >& operation,
                                      const typename tnlParallelReductionInequalities< char, int > :: IndexType size,
                                      const typename tnlParallelReductionInequalities< char, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionInequalities< char, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionInequalities< char, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionInequalities< int, int > >
-                                   ( const tnlParallelReductionInequalities< int, int >& operation,
+                                   ( tnlParallelReductionInequalities< int, int >& operation,
                                      const typename tnlParallelReductionInequalities< int, int > :: IndexType size,
                                      const typename tnlParallelReductionInequalities< int, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionInequalities< int, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionInequalities< int, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionInequalities< float, int > >
-                                   ( const tnlParallelReductionInequalities< float, int >& operation,
+                                   ( tnlParallelReductionInequalities< float, int >& operation,
                                      const typename tnlParallelReductionInequalities< float, int > :: IndexType size,
                                      const typename tnlParallelReductionInequalities< float, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionInequalities< float, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionInequalities< float, int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, int > >
-                                   ( const tnlParallelReductionInequalities< double, int>& operation,
+                                   ( tnlParallelReductionInequalities< double, int>& operation,
                                      const typename tnlParallelReductionInequalities< double, int > :: IndexType size,
                                      const typename tnlParallelReductionInequalities< double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionInequalities< double, int > :: RealType* deviceInput2,
@@ -53,7 +53,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, i
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, int > >
-                                   ( const tnlParallelReductionInequalities< long double, int>& operation,
+                                   ( tnlParallelReductionInequalities< long double, int>& operation,
                                      const typename tnlParallelReductionInequalities< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput2,
@@ -62,28 +62,28 @@ template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long doub
 
 #ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionInequalities< char, long int > >
-                                   ( const tnlParallelReductionInequalities< char, long int >& operation,
+                                   ( tnlParallelReductionInequalities< char, long int >& operation,
                                      const typename tnlParallelReductionInequalities< char, long int > :: IndexType size,
                                      const typename tnlParallelReductionInequalities< char, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionInequalities< char, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionInequalities< char, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionInequalities< int, long int > >
-                                   ( const tnlParallelReductionInequalities< int, long int >& operation,
+                                   ( tnlParallelReductionInequalities< int, long int >& operation,
                                      const typename tnlParallelReductionInequalities< int, long int > :: IndexType size,
                                      const typename tnlParallelReductionInequalities< int, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionInequalities< int, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionInequalities< int, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionInequalities< float, long int > >
-                                   ( const tnlParallelReductionInequalities< float, long int >& operation,
+                                   ( tnlParallelReductionInequalities< float, long int >& operation,
                                      const typename tnlParallelReductionInequalities< float, long int > :: IndexType size,
                                      const typename tnlParallelReductionInequalities< float, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionInequalities< float, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionInequalities< float, long int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, long int > >
-                                   ( const tnlParallelReductionInequalities< double, long int>& operation,
+                                   ( tnlParallelReductionInequalities< double, long int>& operation,
                                      const typename tnlParallelReductionInequalities< double, long int > :: IndexType size,
                                      const typename tnlParallelReductionInequalities< double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionInequalities< double, long int > :: RealType* deviceInput2,
@@ -91,7 +91,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, l
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, long int > >
-                                   ( const tnlParallelReductionInequalities< long double, long int>& operation,
+                                   ( tnlParallelReductionInequalities< long double, long int>& operation,
                                      const typename tnlParallelReductionInequalities< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput2,
diff --git a/src/core/cuda/cuda-reduction-l2-norm_impl.cu b/src/core/cuda/cuda-reduction-l2-norm_impl.cu
index 29e6b265c1..214a15b53b 100644
--- a/src/core/cuda/cuda-reduction-l2-norm_impl.cu
+++ b/src/core/cuda/cuda-reduction-l2-norm_impl.cu
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          cuda-reduction-lp-norm_impl.cu  -  description
+                          cuda-reduction-l2-norm_impl.cu  -  description
                              -------------------
     begin                : Jan 19, 2014
     copyright            : (C) 2014 by Tomas Oberhuber
@@ -24,14 +24,14 @@
  * L2 Norm
  */
 template bool reductionOnCudaDevice< tnlParallelReductionL2Norm< float, int > >
-                                   ( const tnlParallelReductionL2Norm< float, int >& operation,
+                                   ( tnlParallelReductionL2Norm< float, int >& operation,
                                      const typename tnlParallelReductionL2Norm< float, int > :: IndexType size,
                                      const typename tnlParallelReductionL2Norm< float, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionL2Norm< float, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionL2Norm< float, int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionL2Norm< double, int > >
-                                   ( const tnlParallelReductionL2Norm< double, int>& operation,
+                                   ( tnlParallelReductionL2Norm< double, int>& operation,
                                      const typename tnlParallelReductionL2Norm< double, int > :: IndexType size,
                                      const typename tnlParallelReductionL2Norm< double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionL2Norm< double, int > :: RealType* deviceInput2,
@@ -39,7 +39,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionL2Norm< double, int > >
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionL2Norm< long double, int > >
-                                   ( const tnlParallelReductionL2Norm< long double, int>& operation,
+                                   ( tnlParallelReductionL2Norm< long double, int>& operation,
                                      const typename tnlParallelReductionL2Norm< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionL2Norm< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionL2Norm< long double, int > :: RealType* deviceInput2,
@@ -48,21 +48,21 @@ template bool reductionOnCudaDevice< tnlParallelReductionL2Norm< long double, in
 
 #ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionL2Norm< int, long int > >
-                                   ( const tnlParallelReductionL2Norm< int, long int >& operation,
+                                   ( tnlParallelReductionL2Norm< int, long int >& operation,
                                      const typename tnlParallelReductionL2Norm< int, long int > :: IndexType size,
                                      const typename tnlParallelReductionL2Norm< int, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionL2Norm< int, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionL2Norm< int, long int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionL2Norm< float, long int > >
-                                   ( const tnlParallelReductionL2Norm< float, long int >& operation,
+                                   ( tnlParallelReductionL2Norm< float, long int >& operation,
                                      const typename tnlParallelReductionL2Norm< float, long int > :: IndexType size,
                                      const typename tnlParallelReductionL2Norm< float, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionL2Norm< float, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionL2Norm< float, long int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionL2Norm< double, long int > >
-                                   ( const tnlParallelReductionL2Norm< double, long int>& operation,
+                                   ( tnlParallelReductionL2Norm< double, long int>& operation,
                                      const typename tnlParallelReductionL2Norm< double, long int > :: IndexType size,
                                      const typename tnlParallelReductionL2Norm< double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionL2Norm< double, long int > :: RealType* deviceInput2,
@@ -70,7 +70,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionL2Norm< double, long in
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionL2Norm< long double, long int > >
-                                   ( const tnlParallelReductionL2Norm< long double, long int>& operation,
+                                   ( tnlParallelReductionL2Norm< long double, long int>& operation,
                                      const typename tnlParallelReductionL2Norm< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionL2Norm< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionL2Norm< long double, long int > :: RealType* deviceInput2,
diff --git a/src/core/cuda/cuda-reduction-lp-norm_impl.cu b/src/core/cuda/cuda-reduction-lp-norm_impl.cu
index a5f5d6644c..4de9501590 100644
--- a/src/core/cuda/cuda-reduction-lp-norm_impl.cu
+++ b/src/core/cuda/cuda-reduction-lp-norm_impl.cu
@@ -24,14 +24,14 @@
  * Lp Norm
  */
 template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< float, int > >
-                                   ( const tnlParallelReductionLpNorm< float, int >& operation,
+                                   ( tnlParallelReductionLpNorm< float, int >& operation,
                                      const typename tnlParallelReductionLpNorm< float, int > :: IndexType size,
                                      const typename tnlParallelReductionLpNorm< float, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLpNorm< float, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLpNorm< float, int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, int > >
-                                   ( const tnlParallelReductionLpNorm< double, int>& operation,
+                                   ( tnlParallelReductionLpNorm< double, int>& operation,
                                      const typename tnlParallelReductionLpNorm< double, int > :: IndexType size,
                                      const typename tnlParallelReductionLpNorm< double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLpNorm< double, int > :: RealType* deviceInput2,
@@ -39,7 +39,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, int > >
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, int > >
-                                   ( const tnlParallelReductionLpNorm< long double, int>& operation,
+                                   ( tnlParallelReductionLpNorm< long double, int>& operation,
                                      const typename tnlParallelReductionLpNorm< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput2,
@@ -48,21 +48,21 @@ template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, in
 
 #ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< int, long int > >
-                                   ( const tnlParallelReductionLpNorm< int, long int >& operation,
+                                   ( tnlParallelReductionLpNorm< int, long int >& operation,
                                      const typename tnlParallelReductionLpNorm< int, long int > :: IndexType size,
                                      const typename tnlParallelReductionLpNorm< int, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLpNorm< int, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLpNorm< int, long int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< float, long int > >
-                                   ( const tnlParallelReductionLpNorm< float, long int >& operation,
+                                   ( tnlParallelReductionLpNorm< float, long int >& operation,
                                      const typename tnlParallelReductionLpNorm< float, long int > :: IndexType size,
                                      const typename tnlParallelReductionLpNorm< float, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLpNorm< float, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLpNorm< float, long int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, long int > >
-                                   ( const tnlParallelReductionLpNorm< double, long int>& operation,
+                                   ( tnlParallelReductionLpNorm< double, long int>& operation,
                                      const typename tnlParallelReductionLpNorm< double, long int > :: IndexType size,
                                      const typename tnlParallelReductionLpNorm< double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLpNorm< double, long int > :: RealType* deviceInput2,
@@ -70,7 +70,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, long in
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, long int > >
-                                   ( const tnlParallelReductionLpNorm< long double, long int>& operation,
+                                   ( tnlParallelReductionLpNorm< long double, long int>& operation,
                                      const typename tnlParallelReductionLpNorm< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput2,
diff --git a/src/core/cuda/cuda-reduction-max_impl.cu b/src/core/cuda/cuda-reduction-max_impl.cu
index cba153c81b..cfca4156f9 100644
--- a/src/core/cuda/cuda-reduction-max_impl.cu
+++ b/src/core/cuda/cuda-reduction-max_impl.cu
@@ -25,28 +25,28 @@
  */
 
 template bool reductionOnCudaDevice< tnlParallelReductionMax< char, int > >
-                                   ( const tnlParallelReductionMax< char, int >& operation,
+                                   ( tnlParallelReductionMax< char, int >& operation,
                                      const typename tnlParallelReductionMax< char, int > :: IndexType size,
                                      const typename tnlParallelReductionMax< char, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMax< char, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMax< char, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionMax< int, int > >
-                                   ( const tnlParallelReductionMax< int, int >& operation,
+                                   ( tnlParallelReductionMax< int, int >& operation,
                                      const typename tnlParallelReductionMax< int, int > :: IndexType size,
                                      const typename tnlParallelReductionMax< int, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMax< int, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMax< int, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionMax< float, int > >
-                                   ( const tnlParallelReductionMax< float, int >& operation,
+                                   ( tnlParallelReductionMax< float, int >& operation,
                                      const typename tnlParallelReductionMax< float, int > :: IndexType size,
                                      const typename tnlParallelReductionMax< float, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMax< float, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMax< float, int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionMax< double, int > >
-                                   ( const tnlParallelReductionMax< double, int>& operation,
+                                   ( tnlParallelReductionMax< double, int>& operation,
                                      const typename tnlParallelReductionMax< double, int > :: IndexType size,
                                      const typename tnlParallelReductionMax< double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMax< double, int > :: RealType* deviceInput2,
@@ -54,7 +54,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionMax< double, int > >
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, int > >
-                                   ( const tnlParallelReductionMax< long double, int>& operation,
+                                   ( tnlParallelReductionMax< long double, int>& operation,
                                      const typename tnlParallelReductionMax< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionMax< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMax< long double, int > :: RealType* deviceInput2,
@@ -63,28 +63,28 @@ template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, int >
 
 #ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionMax< char, long int > >
-                                   ( const tnlParallelReductionMax< char, long int >& operation,
+                                   ( tnlParallelReductionMax< char, long int >& operation,
                                      const typename tnlParallelReductionMax< char, long int > :: IndexType size,
                                      const typename tnlParallelReductionMax< char, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMax< char, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMax< char, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionMax< int, long int > >
-                                   ( const tnlParallelReductionMax< int, long int >& operation,
+                                   ( tnlParallelReductionMax< int, long int >& operation,
                                      const typename tnlParallelReductionMax< int, long int > :: IndexType size,
                                      const typename tnlParallelReductionMax< int, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMax< int, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMax< int, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionMax< float, long int > >
-                                   ( const tnlParallelReductionMax< float, long int >& operation,
+                                   ( tnlParallelReductionMax< float, long int >& operation,
                                      const typename tnlParallelReductionMax< float, long int > :: IndexType size,
                                      const typename tnlParallelReductionMax< float, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMax< float, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMax< float, long int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionMax< double, long int > >
-                                   ( const tnlParallelReductionMax< double, long int>& operation,
+                                   ( tnlParallelReductionMax< double, long int>& operation,
                                      const typename tnlParallelReductionMax< double, long int > :: IndexType size,
                                      const typename tnlParallelReductionMax< double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMax< double, long int > :: RealType* deviceInput2,
@@ -92,7 +92,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionMax< double, long int >
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, long int > >
-                                   ( const tnlParallelReductionMax< long double, long int>& operation,
+                                   ( tnlParallelReductionMax< long double, long int>& operation,
                                      const typename tnlParallelReductionMax< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionMax< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMax< long double, long int > :: RealType* deviceInput2,
diff --git a/src/core/cuda/cuda-reduction-min_impl.cu b/src/core/cuda/cuda-reduction-min_impl.cu
index dc5a1f4140..535b38a32f 100644
--- a/src/core/cuda/cuda-reduction-min_impl.cu
+++ b/src/core/cuda/cuda-reduction-min_impl.cu
@@ -25,28 +25,28 @@
  */
 
 template bool reductionOnCudaDevice< tnlParallelReductionMin< char, int > >
-                                   ( const tnlParallelReductionMin< char, int >& operation,
+                                   ( tnlParallelReductionMin< char, int >& operation,
                                      const typename tnlParallelReductionMin< char, int > :: IndexType size,
                                      const typename tnlParallelReductionMin< char, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMin< char, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMin< char, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionMin< int, int > >
-                                   ( const tnlParallelReductionMin< int, int >& operation,
+                                   ( tnlParallelReductionMin< int, int >& operation,
                                      const typename tnlParallelReductionMin< int, int > :: IndexType size,
                                      const typename tnlParallelReductionMin< int, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMin< int, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMin< int, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionMin< float, int > >
-                                   ( const tnlParallelReductionMin< float, int >& operation,
+                                   ( tnlParallelReductionMin< float, int >& operation,
                                      const typename tnlParallelReductionMin< float, int > :: IndexType size,
                                      const typename tnlParallelReductionMin< float, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMin< float, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMin< float, int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionMin< double, int > >
-                                   ( const tnlParallelReductionMin< double, int>& operation,
+                                   ( tnlParallelReductionMin< double, int>& operation,
                                      const typename tnlParallelReductionMin< double, int > :: IndexType size,
                                      const typename tnlParallelReductionMin< double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMin< double, int > :: RealType* deviceInput2,
@@ -54,7 +54,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionMin< double, int > >
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, int > >
-                                   ( const tnlParallelReductionMin< long double, int>& operation,
+                                   ( tnlParallelReductionMin< long double, int>& operation,
                                      const typename tnlParallelReductionMin< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput2,
@@ -63,28 +63,28 @@ template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, int >
 
 #ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionMin< char, long int > >
-                                   ( const tnlParallelReductionMin< char, long int >& operation,
+                                   ( tnlParallelReductionMin< char, long int >& operation,
                                      const typename tnlParallelReductionMin< char, long int > :: IndexType size,
                                      const typename tnlParallelReductionMin< char, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMin< char, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMin< char, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionMin< int, long int > >
-                                   ( const tnlParallelReductionMin< int, long int >& operation,
+                                   ( tnlParallelReductionMin< int, long int >& operation,
                                      const typename tnlParallelReductionMin< int, long int > :: IndexType size,
                                      const typename tnlParallelReductionMin< int, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMin< int, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMin< int, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionMin< float, long int > >
-                                   ( const tnlParallelReductionMin< float, long int >& operation,
+                                   ( tnlParallelReductionMin< float, long int >& operation,
                                      const typename tnlParallelReductionMin< float, long int > :: IndexType size,
                                      const typename tnlParallelReductionMin< float, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMin< float, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionMin< float, long int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionMin< double, long int > >
-                                   ( const tnlParallelReductionMin< double, long int>& operation,
+                                   ( tnlParallelReductionMin< double, long int>& operation,
                                      const typename tnlParallelReductionMin< double, long int > :: IndexType size,
                                      const typename tnlParallelReductionMin< double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMin< double, long int > :: RealType* deviceInput2,
@@ -92,7 +92,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionMin< double, long int >
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, long int > >
-                                   ( const tnlParallelReductionMin< long double, long int>& operation,
+                                   ( tnlParallelReductionMin< long double, long int>& operation,
                                      const typename tnlParallelReductionMin< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput2,
diff --git a/src/core/cuda/cuda-reduction-or_impl.cu b/src/core/cuda/cuda-reduction-or_impl.cu
index 811ec445fd..f312e5ff45 100644
--- a/src/core/cuda/cuda-reduction-or_impl.cu
+++ b/src/core/cuda/cuda-reduction-or_impl.cu
@@ -24,28 +24,28 @@
  * Logical OR
  */
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< char, int > >
-                                   ( const tnlParallelReductionLogicalOr< char, int >& operation,
+                                   ( tnlParallelReductionLogicalOr< char, int >& operation,
                                      const typename tnlParallelReductionLogicalOr< char, int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalOr< char, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalOr< char, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalOr< char, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< int, int > >
-                                   ( const tnlParallelReductionLogicalOr< int, int >& operation,
+                                   ( tnlParallelReductionLogicalOr< int, int >& operation,
                                      const typename tnlParallelReductionLogicalOr< int, int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalOr< int, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalOr< int, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalOr< int, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< float, int > >
-                                   ( const tnlParallelReductionLogicalOr< float, int >& operation,
+                                   ( tnlParallelReductionLogicalOr< float, int >& operation,
                                      const typename tnlParallelReductionLogicalOr< float, int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalOr< float, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalOr< float, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalOr< float, int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, int > >
-                                   ( const tnlParallelReductionLogicalOr< double, int>& operation,
+                                   ( tnlParallelReductionLogicalOr< double, int>& operation,
                                      const typename tnlParallelReductionLogicalOr< double, int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalOr< double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalOr< double, int > :: RealType* deviceInput2,
@@ -53,7 +53,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, int
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, int > >
-                                   ( const tnlParallelReductionLogicalOr< long double, int>& operation,
+                                   ( tnlParallelReductionLogicalOr< long double, int>& operation,
                                      const typename tnlParallelReductionLogicalOr< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput2,
@@ -62,28 +62,28 @@ template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double,
 
 #ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< char, long int > >
-                                   ( const tnlParallelReductionLogicalOr< char, long int >& operation,
+                                   ( tnlParallelReductionLogicalOr< char, long int >& operation,
                                      const typename tnlParallelReductionLogicalOr< char, long int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalOr< char, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalOr< char, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalOr< char, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< int, long int > >
-                                   ( const tnlParallelReductionLogicalOr< int, long int >& operation,
+                                   ( tnlParallelReductionLogicalOr< int, long int >& operation,
                                      const typename tnlParallelReductionLogicalOr< int, long int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalOr< int, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalOr< int, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalOr< int, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< float, long int > >
-                                   ( const tnlParallelReductionLogicalOr< float, long int >& operation,
+                                   ( tnlParallelReductionLogicalOr< float, long int >& operation,
                                      const typename tnlParallelReductionLogicalOr< float, long int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalOr< float, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalOr< float, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionLogicalOr< float, long int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, long int > >
-                                   ( const tnlParallelReductionLogicalOr< double, long int>& operation,
+                                   ( tnlParallelReductionLogicalOr< double, long int>& operation,
                                      const typename tnlParallelReductionLogicalOr< double, long int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalOr< double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalOr< double, long int > :: RealType* deviceInput2,
@@ -91,7 +91,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, long
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, long int > >
-                                   ( const tnlParallelReductionLogicalOr< long double, long int>& operation,
+                                   ( tnlParallelReductionLogicalOr< long double, long int>& operation,
                                      const typename tnlParallelReductionLogicalOr< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput2,
diff --git a/src/core/cuda/cuda-reduction-scalar-product_impl.cu b/src/core/cuda/cuda-reduction-scalar-product_impl.cu
index 082d65540a..6de27958bd 100644
--- a/src/core/cuda/cuda-reduction-scalar-product_impl.cu
+++ b/src/core/cuda/cuda-reduction-scalar-product_impl.cu
@@ -24,28 +24,28 @@
  * ScalarProduct
  */
 template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< char, int > >
-                                   ( const tnlParallelReductionScalarProduct< char, int >& operation,
+                                   ( tnlParallelReductionScalarProduct< char, int >& operation,
                                      const typename tnlParallelReductionScalarProduct< char, int > :: IndexType size,
                                      const typename tnlParallelReductionScalarProduct< char, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionScalarProduct< char, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionScalarProduct< char, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< int, int > >
-                                   ( const tnlParallelReductionScalarProduct< int, int >& operation,
+                                   ( tnlParallelReductionScalarProduct< int, int >& operation,
                                      const typename tnlParallelReductionScalarProduct< int, int > :: IndexType size,
                                      const typename tnlParallelReductionScalarProduct< int, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionScalarProduct< int, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionScalarProduct< int, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< float, int > >
-                                   ( const tnlParallelReductionScalarProduct< float, int >& operation,
+                                   ( tnlParallelReductionScalarProduct< float, int >& operation,
                                      const typename tnlParallelReductionScalarProduct< float, int > :: IndexType size,
                                      const typename tnlParallelReductionScalarProduct< float, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionScalarProduct< float, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionScalarProduct< float, int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< double, int > >
-                                   ( const tnlParallelReductionScalarProduct< double, int>& operation,
+                                   ( tnlParallelReductionScalarProduct< double, int>& operation,
                                      const typename tnlParallelReductionScalarProduct< double, int > :: IndexType size,
                                      const typename tnlParallelReductionScalarProduct< double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionScalarProduct< double, int > :: RealType* deviceInput2,
@@ -53,7 +53,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< double,
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< long double, int > >
-                                   ( const tnlParallelReductionScalarProduct< long double, int>& operation,
+                                   ( tnlParallelReductionScalarProduct< long double, int>& operation,
                                      const typename tnlParallelReductionScalarProduct< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionScalarProduct< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionScalarProduct< long double, int > :: RealType* deviceInput2,
@@ -62,28 +62,28 @@ template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< long dou
 
 #ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< char, long int > >
-                                   ( const tnlParallelReductionScalarProduct< char, long int >& operation,
+                                   ( tnlParallelReductionScalarProduct< char, long int >& operation,
                                      const typename tnlParallelReductionScalarProduct< char, long int > :: IndexType size,
                                      const typename tnlParallelReductionScalarProduct< char, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionScalarProduct< char, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionScalarProduct< char, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< int, long int > >
-                                   ( const tnlParallelReductionScalarProduct< int, long int >& operation,
+                                   ( tnlParallelReductionScalarProduct< int, long int >& operation,
                                      const typename tnlParallelReductionScalarProduct< int, long int > :: IndexType size,
                                      const typename tnlParallelReductionScalarProduct< int, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionScalarProduct< int, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionScalarProduct< int, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< float, long int > >
-                                   ( const tnlParallelReductionScalarProduct< float, long int >& operation,
+                                   ( tnlParallelReductionScalarProduct< float, long int >& operation,
                                      const typename tnlParallelReductionScalarProduct< float, long int > :: IndexType size,
                                      const typename tnlParallelReductionScalarProduct< float, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionScalarProduct< float, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionScalarProduct< float, long int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< double, long int > >
-                                   ( const tnlParallelReductionScalarProduct< double, long int>& operation,
+                                   ( tnlParallelReductionScalarProduct< double, long int>& operation,
                                      const typename tnlParallelReductionScalarProduct< double, long int > :: IndexType size,
                                      const typename tnlParallelReductionScalarProduct< double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionScalarProduct< double, long int > :: RealType* deviceInput2,
@@ -91,7 +91,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< double,
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionScalarProduct< long double, long int > >
-                                   ( const tnlParallelReductionScalarProduct< long double, long int>& operation,
+                                   ( tnlParallelReductionScalarProduct< long double, long int>& operation,
                                      const typename tnlParallelReductionScalarProduct< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionScalarProduct< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionScalarProduct< long double, long int > :: RealType* deviceInput2,
diff --git a/src/core/cuda/cuda-reduction-sum_impl.cu b/src/core/cuda/cuda-reduction-sum_impl.cu
index 8447ea5f00..62ed0a2beb 100644
--- a/src/core/cuda/cuda-reduction-sum_impl.cu
+++ b/src/core/cuda/cuda-reduction-sum_impl.cu
@@ -25,28 +25,28 @@
  */
 
 template bool reductionOnCudaDevice< tnlParallelReductionSum< char, int > >
-                                   ( const tnlParallelReductionSum< char, int >& operation,
+                                   ( tnlParallelReductionSum< char, int >& operation,
                                      const typename tnlParallelReductionSum< char, int > :: IndexType size,
                                      const typename tnlParallelReductionSum< char, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionSum< char, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< char, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionSum< int, int > >
-                                   ( const tnlParallelReductionSum< int, int >& operation,
+                                   ( tnlParallelReductionSum< int, int >& operation,
                                      const typename tnlParallelReductionSum< int, int > :: IndexType size,
                                      const typename tnlParallelReductionSum< int, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionSum< int, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< int, int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionSum< float, int > >
-                                   ( const tnlParallelReductionSum< float, int >& operation,
+                                   ( tnlParallelReductionSum< float, int >& operation,
                                      const typename tnlParallelReductionSum< float, int > :: IndexType size,
                                      const typename tnlParallelReductionSum< float, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionSum< float, int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< float, int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionSum< double, int > >
-                                   ( const tnlParallelReductionSum< double, int>& operation,
+                                   ( tnlParallelReductionSum< double, int>& operation,
                                      const typename tnlParallelReductionSum< double, int > :: IndexType size,
                                      const typename tnlParallelReductionSum< double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionSum< double, int > :: RealType* deviceInput2,
@@ -54,7 +54,7 @@ template bool reductionOnCudaDevice< tnlParallelReductionSum< double, int > >
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, int > >
-                                   ( const tnlParallelReductionSum< long double, int>& operation,
+                                   ( tnlParallelReductionSum< long double, int>& operation,
                                      const typename tnlParallelReductionSum< long double, int > :: IndexType size,
                                      const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput2,
@@ -63,28 +63,28 @@ template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, int >
 
 #ifdef INSTANTIATE_LONG_INT
 template bool reductionOnCudaDevice< tnlParallelReductionSum< char, long int > >
-                                   ( const tnlParallelReductionSum< char, long int >& operation,
+                                   ( tnlParallelReductionSum< char, long int >& operation,
                                      const typename tnlParallelReductionSum< char, long int > :: IndexType size,
                                      const typename tnlParallelReductionSum< char, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionSum< char, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< char, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionSum< int, long int > >
-                                   ( const tnlParallelReductionSum< int, long int >& operation,
+                                   ( tnlParallelReductionSum< int, long int >& operation,
                                      const typename tnlParallelReductionSum< int, long int > :: IndexType size,
                                      const typename tnlParallelReductionSum< int, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionSum< int, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< int, long int > :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionSum< float, long int > >
-                                   ( const tnlParallelReductionSum< float, long int >& operation,
+                                   ( tnlParallelReductionSum< float, long int >& operation,
                                      const typename tnlParallelReductionSum< float, long int > :: IndexType size,
                                      const typename tnlParallelReductionSum< float, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionSum< float, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< float, long int> :: ResultType& result );
 
 template bool reductionOnCudaDevice< tnlParallelReductionSum< double, long int > >
-                                   ( const tnlParallelReductionSum< double, long int>& operation,
+                                   ( tnlParallelReductionSum< double, long int>& operation,
                                      const typename tnlParallelReductionSum< double, long int > :: IndexType size,
                                      const typename tnlParallelReductionSum< double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionSum< double, long int > :: RealType* deviceInput2,
@@ -92,11 +92,11 @@ template bool reductionOnCudaDevice< tnlParallelReductionSum< double, long int >
 
 #ifdef INSTANTIATE_LONG_DOUBLE
 template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, long int > >
-                                   ( const tnlParallelReductionSum< long double, long int>& operation,
+                                   ( tnlParallelReductionSum< long double, long int>& operation,
                                      const typename tnlParallelReductionSum< long double, long int > :: IndexType size,
                                      const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput1,
                                      const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput2,
                                      typename tnlParallelReductionSum< long double, long int> :: ResultType& result );
 #endif                                     
 #endif                                     
-#endif
\ No newline at end of file
+#endif
diff --git a/src/core/cuda/cuda-reduction_impl.h b/src/core/cuda/cuda-reduction_impl.h
index 389d166ca9..e8795ec7bc 100644
--- a/src/core/cuda/cuda-reduction_impl.h
+++ b/src/core/cuda/cuda-reduction_impl.h
@@ -42,14 +42,14 @@ using namespace std;
  * are reduced on CPU. The constant must not be larger
  * than maximal CUDA grid size.
  */
-const int minGPUReductionDataSize = 128;//65536; //16384;//1024;//256;
+const int minGPUReductionDataSize = 256;//65536; //16384;//1024;//256;
 
 //static tnlCudaReductionBuffer cudaReductionBuffer( 8 * minGPUReductionDataSize );
 
 #ifdef HAVE_CUDA
 
 template< typename Operation, int blockSize >
-__global__ void tnlCUDAReductionKernel( const Operation operation,
+__global__ void tnlCUDAReductionKernel( Operation operation,
                                         const typename Operation :: IndexType size,
                                         const typename Operation :: RealType* input1,
                                         const typename Operation :: RealType* input2,
@@ -60,7 +60,7 @@ __global__ void tnlCUDAReductionKernel( const Operation operation,
 };
 
 template< typename Operation >
-typename Operation::IndexType reduceOnCudaDevice( const Operation& operation,
+typename Operation::IndexType reduceOnCudaDevice( Operation& operation,
                                                   const typename Operation::IndexType size,
                                                   const typename Operation::RealType* input1,
                                                   const typename Operation::RealType* input2,
@@ -135,7 +135,7 @@ typename Operation::IndexType reduceOnCudaDevice( const Operation& operation,
 #endif
 
 template< typename Operation >
-bool reductionOnCudaDevice( const Operation& operation,
+bool reductionOnCudaDevice( Operation& operation,
                             const typename Operation :: IndexType size,
                             const typename Operation :: RealType* deviceInput1,
                             const typename Operation :: RealType* deviceInput2,
@@ -231,7 +231,6 @@ bool reductionOnCudaDevice( const Operation& operation,
 /****
  * Sum
  */
-
 extern template bool reductionOnCudaDevice< tnlParallelReductionSum< char, int > >
                                    ( const tnlParallelReductionSum< char, int >& operation,
                                      const typename tnlParallelReductionSum< char, int > :: IndexType size,
@@ -311,7 +310,6 @@ extern template bool reductionOnCudaDevice< tnlParallelReductionSum< long double
 /****
  * Min
  */
-
 extern template bool reductionOnCudaDevice< tnlParallelReductionMin< char, int > >
                                    ( const tnlParallelReductionMin< char, int >& operation,
                                      const typename tnlParallelReductionMin< char, int > :: IndexType size,
diff --git a/src/core/cuda/reduction-operations.h b/src/core/cuda/reduction-operations.h
index 8733bcd881..b328d5b3f6 100644
--- a/src/core/cuda/reduction-operations.h
+++ b/src/core/cuda/reduction-operations.h
@@ -203,17 +203,17 @@ class tnlParallelReductionSum
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
-                            const RealType* data2 ) const
+                            const RealType* data2 )
    {
       return current + data1[ idx ];
    };
    
-   __cuda_callable__ ResultType initialValue() const { return 0; };
+   __cuda_callable__ ResultType initialValue() { return 0; };
    
    __cuda_callable__ void cudaFirstReduction( ResultType& result, 
                                               const IndexType index,
                                               const RealType* data1,
-                                              const RealType* data2 ) const
+                                              const RealType* data2 )
    {
       result += data1[ index ];
    }
@@ -221,13 +221,13 @@ class tnlParallelReductionSum
 #ifdef HAVE_CUDA
 
    __device__ void commonReductionOnDevice( ResultType& result,
-                                            const ResultType& data ) const
+                                            const ResultType& data )
    {
       result += data;
    };
    
    __device__ void commonReductionOnDevice( volatile ResultType& result,
-                                            volatile const ResultType& data ) const
+                                            volatile const ResultType& data )
    {
       result += data;
    };
@@ -248,30 +248,30 @@ class tnlParallelReductionMin
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
-                            const RealType* data2 ) const
+                            const RealType* data2 )
    {
       return Min( current, data1[ idx ] );
    };
 
-   __cuda_callable__ ResultType initialValue() const { return tnlMaxValue< ResultType>(); };
+   __cuda_callable__ ResultType initialValue() { return tnlMaxValue< ResultType>(); };
    
    __cuda_callable__ void cudaFirstReduction( ResultType& result, 
                                               const IndexType index,
                                               const RealType* data1,
-                                              const RealType* data2 ) const
+                                              const RealType* data2 )
    {
       result = tnlCudaMin( result, data1[ index ] );
    }
    
 #ifdef HAVE_CUDA   
    __device__ void commonReductionOnDevice( ResultType& result,
-                                            const ResultType& data ) const
+                                            const ResultType& data )
    {
       result = tnlCudaMin( result, data );
    };
    
    __device__ void commonReductionOnDevice( volatile ResultType& result,
-                                            volatile const ResultType& data ) const
+                                            volatile const ResultType& data )
    {
       result = tnlCudaMin( result, data );
    };
@@ -293,30 +293,30 @@ class tnlParallelReductionMax
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
-                            const RealType* data2 ) const
+                            const RealType* data2 )
    {
       return Max( current, data1[ idx ] );
    };
 
-   __cuda_callable__ ResultType initialValue() const { return tnlMinValue< ResultType>(); };
+   __cuda_callable__ ResultType initialValue() { return tnlMinValue< ResultType>(); };
    
    __cuda_callable__ void cudaFirstReduction( ResultType& result, 
                                               const IndexType index,
                                               const RealType* data1,
-                                              const RealType* data2 ) const
+                                              const RealType* data2 )
    {
       result = tnlCudaMax( result, data1[ index ] );
    }   
    
 #ifdef HAVE_CUDA   
    __device__ void commonReductionOnDevice( ResultType& result,
-                                            const ResultType& data ) const
+                                            const ResultType& data )
    {
       result = tnlCudaMax( result, data );
    };
 
    __device__ void commonReductionOnDevice( volatile ResultType& result,
-                                            volatile const ResultType& data ) const
+                                            volatile const ResultType& data )
    {
       result = tnlCudaMax( result, data );
    };   
@@ -336,17 +336,17 @@ class tnlParallelReductionLogicalAnd
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
-                            const RealType* data2 ) const
+                            const RealType* data2 )
    {
       return current && data1[ idx ];
    };
 
-   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) true; };
+   __cuda_callable__ ResultType initialValue() { return ( ResultType ) true; };
    
    __cuda_callable__ void cudaFirstReduction( ResultType& result, 
                                               const IndexType index,
                                               const RealType* data1,
-                                              const RealType* data2 ) const
+                                              const RealType* data2 )
    {
       result = result && data1[ index ];
    }
@@ -354,13 +354,13 @@ class tnlParallelReductionLogicalAnd
    
 #ifdef HAVE_CUDA   
    __device__ void commonReductionOnDevice( ResultType& result,
-                                            const ResultType& data ) const
+                                            const ResultType& data )
    {
       result = result && data;
    };
    
    __device__ void commonReductionOnDevice( volatile ResultType& result,
-                                            volatile const ResultType& data ) const
+                                            volatile const ResultType& data )
    {
       result = result && data;
    };
@@ -383,31 +383,31 @@ class tnlParallelReductionLogicalOr
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
-                            const RealType* data2 ) const
+                            const RealType* data2 )
    {
       return current || data1[ idx ];
    };
    
-   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) false; };
+   __cuda_callable__ ResultType initialValue() { return ( ResultType ) false; };
    
    __cuda_callable__ void cudaFirstReduction( ResultType& result, 
                                               const IndexType index,
                                               const RealType* data1,
-                                              const RealType* data2 ) const
+                                              const RealType* data2 )
    {
       result = result || data1[ index ];
    }
 
 
-#ifdef HAVE_CUDA   
+#ifdef HAVE_CUDA
    __device__ void commonReductionOnDevice( ResultType& result,
-                                            const ResultType& data ) const
+                                            const ResultType& data )
    {
       result = result || data;
    };
    
    __device__ void commonReductionOnDevice( volatile ResultType& result,
-                                            volatile const ResultType& data ) const
+                                            volatile const ResultType& data )
    {
       result = result || data;
    };
@@ -427,17 +427,17 @@ class tnlParallelReductionAbsSum : public tnlParallelReductionSum< Real, Index >
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
-                            const RealType* data2 ) const
+                            const RealType* data2 )
    {
       return current + tnlAbs( data1[ idx ] );
    };
 
-   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
+   __cuda_callable__ ResultType initialValue() { return ( ResultType ) 0; };
 
    __cuda_callable__ void cudaFirstReduction( ResultType& result, 
                                               const IndexType index,
                                               const RealType* data1,
-                                              const RealType* data2 ) const
+                                              const RealType* data2 )
    {
       result += tnlCudaAbs( data1[ index ] );
    }
@@ -456,17 +456,17 @@ class tnlParallelReductionAbsMin : public tnlParallelReductionMin< Real, Index >
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
-                            const RealType* data2 ) const
+                            const RealType* data2 )
    {
       return Min( current, tnlAbs( data1[ idx ] ) );
    };
 
-   __cuda_callable__ ResultType initialValue() const { return tnlMaxValue< ResultType>(); };
+   __cuda_callable__ ResultType initialValue() { return tnlMaxValue< ResultType>(); };
    
    __cuda_callable__ void cudaFirstReduction( ResultType& result, 
                                               const IndexType index,
                                               const RealType* data1,
-                                              const RealType* data2 ) const
+                                              const RealType* data2 )
    {
       result = tnlCudaMin( result, tnlCudaAbs( data1[ index ] ) );
    }   
@@ -485,17 +485,17 @@ class tnlParallelReductionAbsMax : public tnlParallelReductionMax< Real, Index >
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
-                            const RealType* data2 ) const
+                            const RealType* data2 )
    {
       return Max( current, tnlAbs( data1[ idx ] ) );
    };
 
-   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
+   __cuda_callable__ ResultType initialValue() { return ( ResultType ) 0; };
 
    __cuda_callable__ void cudaFirstReduction( ResultType& result, 
                                               const IndexType index,
                                               const RealType* data1,
-                                              const RealType* data2 ) const
+                                              const RealType* data2 )
    {
       result = tnlCudaMax( result, tnlCudaAbs( data1[ index ] ) );
    }   
@@ -514,19 +514,21 @@ class tnlParallelReductionL2Norm : public tnlParallelReductionSum< Real, Index >
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
-                            const RealType* data2 ) const
+                            const RealType* data2 )
    {
-      return current + data1[ idx ] * data1[ idx ];
+      const RealType& aux = data1[ idx ];
+      return current + aux * aux;
    };
 
-   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
+   __cuda_callable__ ResultType initialValue() { return ( ResultType ) 0; };
    
    __cuda_callable__ void cudaFirstReduction( ResultType& result, 
                                               const IndexType index,
                                               const RealType* data1,
-                                              const RealType* data2 ) const
+                                              const RealType* data2 )
    {
-      result += data1[ index ] * data1[ index ];
+      const RealType& aux = data1[ index ];
+      result += aux * aux;
    }
 };
 
@@ -549,17 +551,17 @@ class tnlParallelReductionLpNorm : public tnlParallelReductionSum< Real, Index >
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
-                            const RealType* data2 ) const
+                            const RealType* data2 )
    {
       return current + pow( tnlAbs( data1[ idx ] ), p );
    };
 
-   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
+   __cuda_callable__ ResultType initialValue() { return ( ResultType ) 0; };
    
    __cuda_callable__ void cudaFirstReduction( ResultType& result, 
                                               const IndexType index,
                                               const RealType* data1,
-                                              const RealType* data2 ) const
+                                              const RealType* data2 )
    {
       result += tnlCudaPow( tnlCudaAbs( data1[ index ] ), p );
    }
@@ -582,17 +584,17 @@ class tnlParallelReductionEqualities : public tnlParallelReductionLogicalAnd< bo
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
-                            const RealType* data2 ) const
+                            const RealType* data2 )
    {
       return current && ( data1[ idx ] == data2[ idx ] );
    };
 
-   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) true; }; 
+   __cuda_callable__ ResultType initialValue() { return ( ResultType ) true; }; 
    
    __cuda_callable__ void cudaFirstReduction( ResultType& result, 
                                               const IndexType index,
                                               const RealType* data1,
-                                              const RealType* data2 ) const
+                                              const RealType* data2 )
    {
       result = result && ( data1[ index ] == data2[ index ] );
    }
@@ -611,17 +613,17 @@ class tnlParallelReductionInequalities : public tnlParallelReductionLogicalAnd<
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
-                            const RealType* data2 ) const
+                            const RealType* data2 )
    {
       return current && ( data1[ idx ] != data2[ idx ] );
    };
 
-   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) false; };
+   __cuda_callable__ ResultType initialValue() { return ( ResultType ) true; }; // identity for &&: a false seed forces every result to false
    
    __cuda_callable__ void cudaFirstReduction( ResultType& result, 
                                               const IndexType index,
                                               const RealType* data1,
-                                              const RealType* data2 ) const
+                                              const RealType* data2 )
    {
       result = result && ( data1[ index ] != data2[ index ] );
    }
@@ -640,17 +642,17 @@ class tnlParallelReductionScalarProduct : public tnlParallelReductionSum< Real,
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
-                            const RealType* data2 ) const
+                            const RealType* data2 )
    {
       return current + ( data1[ idx ] * data2[ idx ] );
    };
 
-   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
+   __cuda_callable__ ResultType initialValue() { return ( ResultType ) 0; };
    
    __cuda_callable__ inline void cudaFirstReduction( ResultType& result, 
                                                  const IndexType index,
                                                  const RealType* data1,
-                                                 const RealType* data2 ) const
+                                                 const RealType* data2 )
    {
       result += data1[ index ] * data2[ index ];
    }   
@@ -669,17 +671,17 @@ class tnlParallelReductionDiffSum : public tnlParallelReductionSum< Real, Index
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
-                            const RealType* data2 ) const
+                            const RealType* data2 )
    {
       return current + ( data1[ idx ] - data2[ idx ] );
    };
    
-   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };   
+   __cuda_callable__ ResultType initialValue() { return ( ResultType ) 0; };   
    
    __cuda_callable__ void cudaFirstReduction( ResultType& result, 
                                           const IndexType index,
                                           const RealType* data1,
-                                          const RealType* data2 ) const
+                                          const RealType* data2 )
    {
       result += data1[ index ] - data2[ index ];
    }   
@@ -698,17 +700,17 @@ class tnlParallelReductionDiffMin : public tnlParallelReductionMin< Real, Index
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
-                            const RealType* data2 ) const
+                            const RealType* data2 )
    {
       return Min( current, data1[ idx ] - data2[ idx ] );
    };
 
-   __cuda_callable__ ResultType initialValue() const { return tnlMaxValue< ResultType>(); };
+   __cuda_callable__ ResultType initialValue() { return tnlMaxValue< ResultType>(); };
    
    __cuda_callable__ void cudaFirstReduction( ResultType& result, 
                                           const IndexType index,
                                           const RealType* data1,
-                                          const RealType* data2 ) const
+                                          const RealType* data2 )
    {
       result = tnlCudaMin( result, data1[ index ] - data2[ index ] );
    }
@@ -727,17 +729,17 @@ class tnlParallelReductionDiffMax : public tnlParallelReductionMax< Real, Index
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
-                            const RealType* data2 ) const
+                            const RealType* data2 )
    {
       return Max( current, data1[ idx ] - data2[ idx ] );
    };
 
-   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
+   __cuda_callable__ ResultType initialValue() { return ( ResultType ) 0; }; // NOTE(review): if this value seeds the reduction, 0 is not neutral for Max over signed differences (all-negative differences would yield 0); DiffMin starts from tnlMaxValue - confirm whether the lowest ResultType value is intended here.
    
    __cuda_callable__ void cudaFirstReduction( ResultType& result, 
                                               const IndexType index,
                                               const RealType* data1,
-                                              const RealType* data2 ) const
+                                              const RealType* data2 )
    {
       result = tnlCudaMax( result, data1[ index ] - data2[ index ] );
    }
@@ -756,17 +758,17 @@ class tnlParallelReductionDiffAbsSum : public tnlParallelReductionMax< Real, Ind
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
-                            const RealType* data2 ) const
+                            const RealType* data2 )
    {
       return current + tnlAbs( data1[ idx ] - data2[ idx ] );
    };
 
-   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
+   __cuda_callable__ ResultType initialValue() { return ( ResultType ) 0; };
    
    __cuda_callable__ void cudaFirstReduction( ResultType& result, 
                                           const IndexType index,
                                           const RealType* data1,
-                                          const RealType* data2 ) const
+                                          const RealType* data2 )
    {
       result += tnlCudaAbs( data1[ index ] - data2[ index ] );
    }
@@ -785,17 +787,17 @@ class tnlParallelReductionDiffAbsMin : public tnlParallelReductionMin< Real, Ind
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
-                            const RealType* data2 ) const
+                            const RealType* data2 )
    {
       return Min( current, tnlAbs( data1[ idx ] - data2[ idx ] ) );
    };
 
-   __cuda_callable__ ResultType initialValue() const { return tnlMaxValue< ResultType>(); };
+   __cuda_callable__ ResultType initialValue() { return tnlMaxValue< ResultType>(); };
    
    __cuda_callable__ void cudaFirstReduction( ResultType& result, 
                                           const IndexType index,
                                           const RealType* data1,
-                                          const RealType* data2 ) const
+                                          const RealType* data2 )
    {
       result = tnlCudaMin( result, tnlCudaAbs( data1[ index ] - data2[ index ] ) );
    }
@@ -814,17 +816,17 @@ class tnlParallelReductionDiffAbsMax : public tnlParallelReductionMax< Real, Ind
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
-                            const RealType* data2 ) const
+                            const RealType* data2 )
    {
       return Max( current, tnlAbs( data1[ idx ] - data2[ idx ] ) );
    };
 
-   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
+   __cuda_callable__ ResultType initialValue() { return ( ResultType ) 0; };
    
    __cuda_callable__ void cudaFirstReduction( ResultType& result, 
                                           const IndexType index,
                                           const RealType* data1,
-                                          const RealType* data2 ) const
+                                          const RealType* data2 )
    {
       result = tnlCudaMax( result, tnlCudaAbs( data1[ index ] - data2[ index ] ) );
    }
@@ -843,22 +845,26 @@ class tnlParallelReductionDiffL2Norm : public tnlParallelReductionSum< Real, Ind
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
-                            const RealType* data2 ) const
+                            const RealType* data2 )
    {
-      const RealType aux( data2[ idx ] - data1[ idx ]  );
+      this->aux = data2[ idx ] - data1[ idx ];
       return current + aux * aux;
    };
 
-   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
+   __cuda_callable__ ResultType initialValue() { return ( ResultType ) 0; };
    
    __cuda_callable__ void cudaFirstReduction( ResultType& result, 
                                               const IndexType index,
                                               const RealType* data1,
-                                              const RealType* data2 ) const
+                                              const RealType* data2 )
    {
-      const RealType aux( data2[ index ] - data1[ index ]  );
+      this->aux = data2[ index ] - data1[ index ];
       result += aux * aux;
    }
+   
+   protected:
+      // Scratch slot for the current element difference; overwritten by every reduceOnHost / cudaFirstReduction call.
+      RealType aux; // NOTE(review): this makes the functor stateful - confirm no single instance is shared between concurrent callers.
 };
 
 template< typename Real, typename Index >
@@ -879,17 +885,17 @@ class tnlParallelReductionDiffLpNorm : public tnlParallelReductionSum< Real, Ind
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
-                            const RealType* data2 ) const
+                            const RealType* data2 )
    {
       return current + pow( tnlAbs( data1[ idx ] - data2[ idx ] ), p );
    };
 
-   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
+   __cuda_callable__ ResultType initialValue() { return ( ResultType ) 0; };
    
    __cuda_callable__ void cudaFirstReduction( ResultType& result, 
                                               const IndexType index,
                                               const RealType* data1,
-                                              const RealType* data2 ) const
+                                              const RealType* data2 )
    {
       result += tnlCudaPow( tnlCudaAbs( data1[ index ] - data2[ index ] ), p );
    }
diff --git a/src/core/cuda/tnlCudaReduction.h b/src/core/cuda/tnlCudaReduction.h
index 9b7bf7ab2c..d5084cff8b 100644
--- a/src/core/cuda/tnlCudaReduction.h
+++ b/src/core/cuda/tnlCudaReduction.h
@@ -30,7 +30,7 @@ class tnlCUDAReduction
       typedef typename Operation::ResultType ResultType;
 
       
-      __device__ static void reduce( const Operation operation,
+      __device__ static void reduce( Operation& operation,
                                      const IndexType size,
                                      const RealType* input1,
                                      const RealType* input2,
@@ -47,7 +47,7 @@ class tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockS
       typedef typename Operation::RealType RealType;
       typedef typename Operation::ResultType ResultType;
       
-      __device__ static void reduce( const Operation operation,
+      __device__ static void reduce( Operation& operation, // non-const reference, matching the out-of-class definition in tnlCudaReduction_impl.h
                                      const IndexType size,
                                      const RealType* input1,
                                      const RealType* input2,
diff --git a/src/core/cuda/tnlCudaReduction_impl.h b/src/core/cuda/tnlCudaReduction_impl.h
index 5187141a1a..5fa295f65e 100644
--- a/src/core/cuda/tnlCudaReduction_impl.h
+++ b/src/core/cuda/tnlCudaReduction_impl.h
@@ -22,7 +22,7 @@ template< typename Operation, int blockSize >
 __device__
 void
 tnlCUDAReduction< Operation, blockSize >::
-reduce( const Operation operation,
+reduce( Operation& operation,
         const IndexType size,
         const RealType* input1,
         const RealType* input2,
@@ -158,7 +158,7 @@ template< typename Real, typename Index, int blockSize >
 __device__
 void
 tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize >::
-reduce( const Operation operation,
+reduce( Operation& operation,
         const IndexType size,
         const RealType* input1,
         const RealType* input2,
@@ -182,7 +182,7 @@ reduce( const Operation operation,
     * sequential reduction.
     */
    sdata[ tid ] = ( RealType ) 0;
-   while( gid + 4 * gridSize < size )
+   /* NOTE(review): this comment block disables the 4x and 2x unrolled accumulation loops, leaving only the plain loop after it - confirm this is intentional and not a debugging leftover. while( gid + 4 * gridSize < size )
    {
       sdata[ tid ] += input1[ gid                ] * input2[ gid ];
       sdata[ tid ] += input1[ gid + gridSize     ] * input2[ gid + gridSize ];
@@ -195,7 +195,7 @@ reduce( const Operation operation,
       sdata[ tid ] += input1[ gid            ] * input2[ gid ];
       sdata[ tid ] += input1[ gid + gridSize ] * input2[ gid + gridSize ];
       gid += 2*gridSize;
-   }
+   }*/
    while( gid < size )
    {
       sdata[ tid ] += input1[ gid ] * input2[ gid ];
diff --git a/src/core/vectors/CMakeLists.txt b/src/core/vectors/CMakeLists.txt
index 63b3ef39b8..72b06ef4ab 100755
--- a/src/core/vectors/CMakeLists.txt
+++ b/src/core/vectors/CMakeLists.txt
@@ -18,7 +18,7 @@ set( headers tnlVector.h
 
 SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/core/vectors )
 set( common_SOURCES
-     ${CURRENT_DIR}/tnlVectorOperations_impl.cpp
+     ${CURRENT_DIR}/tnlVectorOperationsHost_impl.cpp
      ${CURRENT_DIR}/tnlMultiVector_impl.cpp
      ${CURRENT_DIR}/tnlSharedVector_impl.cpp
      ${CURRENT_DIR}/tnlVector_impl.cpp
@@ -27,14 +27,19 @@ set( common_SOURCES
 IF( BUILD_CUDA )
    set( tnl_core_vectors_CUDA__SOURCES
         ${common_SOURCES}
-        ${CURRENT_DIR}/tnlVectorOperations_impl.cu
+        ${CURRENT_DIR}/tnlVectorOperationsCuda_impl.cu
         ${CURRENT_DIR}/tnlVector_impl.cu
         ${CURRENT_DIR}/tnlStaticVector_impl.cu 
         PARENT_SCOPE )
+   set( tnl_core_vectors_SOURCES     
+        ${common_SOURCES}
+        PARENT_SCOPE )
+else()
+   set( tnl_core_vectors_SOURCES     
+        ${common_SOURCES}
+        ${CURRENT_DIR}/tnlVectorOperationsCuda_impl.cpp
+        PARENT_SCOPE )
 ENDIF()    
 
-set( tnl_core_vectors_SOURCES     
-     ${common_SOURCES}
-     PARENT_SCOPE )
 
 INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/core/vectors )
\ No newline at end of file
diff --git a/src/core/vectors/tnlVectorOperationsCuda_impl.cpp b/src/core/vectors/tnlVectorOperationsCuda_impl.cpp
new file mode 100644
index 0000000000..c9b21196d2
--- /dev/null
+++ b/src/core/vectors/tnlVectorOperationsCuda_impl.cpp
@@ -0,0 +1,328 @@
+/***************************************************************************
+                          tnlVectorOperationsCuda_impl.cpp  -  explicit instantiations of tnlVectorOperations< tnlCuda >
+                             -------------------
+    begin                : Dec 10, 2015
+    copyright            : (C) 2015 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#include <core/vectors/tnlVectorOperations.h> 
+
+#ifdef TEMPLATE_EXPLICIT_INSTANTIATION
+
+/****
+ * Max
+ */
+template int         tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< int, tnlCuda, int >& v );
+template long int    tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< float, tnlCuda, int >& v );
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+template int         tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< int, tnlCuda, long int >& v );
+template long int    tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< float, tnlCuda, long int >& v );
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
+
+/****
+ * Min
+ */
+template int         tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< int, tnlCuda, int >& v );
+template long int    tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< float, tnlCuda, int >& v );
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+template int         tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< int, tnlCuda, long int >& v );
+template long int    tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< float, tnlCuda, long int >& v );
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
+
+/****
+ * Abs max
+ */
+template int         tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< int, tnlCuda, int >& v );
+template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< float, tnlCuda, int >& v );
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+template int         tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< int, tnlCuda, long int >& v );
+template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< float, tnlCuda, long int >& v );
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
+
+
+/****
+ * Abs min
+ */
+template int         tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< int, tnlCuda, int >& v );
+template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< float, tnlCuda, int >& v );
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+template int         tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< int, tnlCuda, long int >& v );
+template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< float, tnlCuda, long int >& v );
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
+
+/****
+ * L2 norm
+ */
+template int         tnlVectorOperations< tnlCuda >::getVectorL2Norm( const tnlVector< int, tnlCuda, int >& v );
+template long int    tnlVectorOperations< tnlCuda >::getVectorL2Norm( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorL2Norm( const tnlVector< float, tnlCuda, int >& v );
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorL2Norm( const tnlVector< double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorL2Norm( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+template int         tnlVectorOperations< tnlCuda >::getVectorL2Norm( const tnlVector< int, tnlCuda, long int >& v );
+template long int    tnlVectorOperations< tnlCuda >::getVectorL2Norm( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorL2Norm( const tnlVector< float, tnlCuda, long int >& v );
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorL2Norm( const tnlVector< double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorL2Norm( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
+
+/****
+ * L1 norm
+ */
+template int         tnlVectorOperations< tnlCuda >::getVectorL1Norm( const tnlVector< int, tnlCuda, int >& v );
+template long int    tnlVectorOperations< tnlCuda >::getVectorL1Norm( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorL1Norm( const tnlVector< float, tnlCuda, int >& v );
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorL1Norm( const tnlVector< double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorL1Norm( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+template int         tnlVectorOperations< tnlCuda >::getVectorL1Norm( const tnlVector< int, tnlCuda, long int >& v );
+template long int    tnlVectorOperations< tnlCuda >::getVectorL1Norm( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorL1Norm( const tnlVector< float, tnlCuda, long int >& v );
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorL1Norm( const tnlVector< double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorL1Norm( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
+
+/****
+ * Lp norm
+ */
+template int         tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< int, tnlCuda, int >& v, const int& p );
+template long int    tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long int, tnlCuda, int >& v, const long int& p );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< float, tnlCuda, int >& v, const float& p );
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< double, tnlCuda, int >& v, const double& p );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long double, tnlCuda, int >& v, const long double& p );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+template int         tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< int, tnlCuda, long int >& v, const int& p );
+template long int    tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long int, tnlCuda, long int >& v, const long int& p );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< float, tnlCuda, long int >& v, const float& p );
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< double, tnlCuda, long int >& v, const double& p );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long double, tnlCuda, long int >& v, const long double& p );
+#endif
+#endif
+
+
+
+/****
+ * Sum
+ */
+template int         tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< int, tnlCuda, int >& v );
+template long int    tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< float, tnlCuda, int >& v );
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+template int         tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< int, tnlCuda, long int >& v );
+template long int    tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< float, tnlCuda, long int >& v );
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
+
+/****
+ * Difference max
+ */
+template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
+template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
+template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
+
+/****
+ * Difference min
+ */
+template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
+template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
+template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
+
+/****
+ * Difference abs max
+ */
+template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
+template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
+template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
+
+
+/****
+ * Difference abs min
+ */
+template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
+template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
+template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
+#endif
+#endif
+        
+#endif // TEMPLATE_EXPLICIT_INSTANTIATION
+ 
+
diff --git a/src/core/vectors/tnlVectorOperations_impl.cu b/src/core/vectors/tnlVectorOperationsCuda_impl.cu
similarity index 85%
rename from src/core/vectors/tnlVectorOperations_impl.cu
rename to src/core/vectors/tnlVectorOperationsCuda_impl.cu
index b8af43c24d..6190d289a3 100644
--- a/src/core/vectors/tnlVectorOperations_impl.cu
+++ b/src/core/vectors/tnlVectorOperationsCuda_impl.cu
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          tnlVectorOperations_impl.cu  -  description
+                          tnlVectorOperationsCuda_impl.cu  -  description
                              -------------------
     begin                : Jul 20, 2013
     copyright            : (C) 2013 by Tomas Oberhuber
@@ -120,6 +120,55 @@ template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlV
 #endif
 #endif
 
+/****
+ * L2 norm: explicit instantiations of getVectorL2Norm for tnlCuda vectors (float, long double and long int index variants are guarded by INSTANTIATE_* macros)
+ */
+template int         tnlVectorOperations< tnlCuda >::getVectorL2Norm( const tnlVector< int, tnlCuda, int >& v );
+template long int    tnlVectorOperations< tnlCuda >::getVectorL2Norm( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorL2Norm( const tnlVector< float, tnlCuda, int >& v );
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorL2Norm( const tnlVector< double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorL2Norm( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+template int         tnlVectorOperations< tnlCuda >::getVectorL2Norm( const tnlVector< int, tnlCuda, long int >& v );
+template long int    tnlVectorOperations< tnlCuda >::getVectorL2Norm( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorL2Norm( const tnlVector< float, tnlCuda, long int >& v );
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorL2Norm( const tnlVector< double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorL2Norm( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
+
+/****
+ * L1 norm: explicit instantiations of getVectorL1Norm for tnlCuda vectors (float, long double and long int index variants are guarded by INSTANTIATE_* macros)
+ */
+template int         tnlVectorOperations< tnlCuda >::getVectorL1Norm( const tnlVector< int, tnlCuda, int >& v );
+template long int    tnlVectorOperations< tnlCuda >::getVectorL1Norm( const tnlVector< long int, tnlCuda, int >& v );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorL1Norm( const tnlVector< float, tnlCuda, int >& v );
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorL1Norm( const tnlVector< double, tnlCuda, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorL1Norm( const tnlVector< long double, tnlCuda, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+template int         tnlVectorOperations< tnlCuda >::getVectorL1Norm( const tnlVector< int, tnlCuda, long int >& v );
+template long int    tnlVectorOperations< tnlCuda >::getVectorL1Norm( const tnlVector< long int, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlCuda >::getVectorL1Norm( const tnlVector< float, tnlCuda, long int >& v );
+#endif
+template double      tnlVectorOperations< tnlCuda >::getVectorL1Norm( const tnlVector< double, tnlCuda, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlCuda >::getVectorL1Norm( const tnlVector< long double, tnlCuda, long int >& v );
+#endif
+#endif
 
 /****
  * Lp norm
@@ -146,6 +195,8 @@ template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlV
 #endif
 #endif
 
+
+
 /****
  * Sum
  */
diff --git a/src/core/vectors/tnlVectorOperations_impl.cpp b/src/core/vectors/tnlVectorOperationsHost_impl.cpp
similarity index 52%
rename from src/core/vectors/tnlVectorOperations_impl.cpp
rename to src/core/vectors/tnlVectorOperationsHost_impl.cpp
index dbd2275c17..6b95318297 100644
--- a/src/core/vectors/tnlVectorOperations_impl.cpp
+++ b/src/core/vectors/tnlVectorOperationsHost_impl.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          tnlVectorOperations_impl.cpp  -  description
+                          tnlVectorOperationsHost_impl.cpp  -  description
                              -------------------
     begin                : Jul 20, 2013
     copyright            : (C) 2013 by Tomas Oberhuber
@@ -119,6 +119,57 @@ template long double tnlVectorOperations< tnlHost >::getVectorAbsMin( const tnlV
 #endif
 #endif
 
+/****
+ * L1 norm: explicit instantiations of getVectorL1Norm for tnlHost vectors (float, long double and long int index variants are guarded by INSTANTIATE_* macros)
+ */
+template int         tnlVectorOperations< tnlHost >::getVectorL1Norm( const tnlVector< int, tnlHost, int >& v );
+template long int    tnlVectorOperations< tnlHost >::getVectorL1Norm( const tnlVector< long int, tnlHost, int >& v );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlHost >::getVectorL1Norm( const tnlVector< float, tnlHost, int >& v );
+#endif
+template double      tnlVectorOperations< tnlHost >::getVectorL1Norm( const tnlVector< double, tnlHost, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlHost >::getVectorL1Norm( const tnlVector< long double, tnlHost, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+template int         tnlVectorOperations< tnlHost >::getVectorL1Norm( const tnlVector< int, tnlHost, long int >& v );
+template long int    tnlVectorOperations< tnlHost >::getVectorL1Norm( const tnlVector< long int, tnlHost, long int >& v );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlHost >::getVectorL1Norm( const tnlVector< float, tnlHost, long int >& v );
+#endif
+template double      tnlVectorOperations< tnlHost >::getVectorL1Norm( const tnlVector< double, tnlHost, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlHost >::getVectorL1Norm( const tnlVector< long double, tnlHost, long int >& v );
+#endif
+#endif
+
+/****
+ * L2 norm: explicit instantiations of getVectorL2Norm for tnlHost vectors (float, long double and long int index variants are guarded by INSTANTIATE_* macros)
+ */
+template int         tnlVectorOperations< tnlHost >::getVectorL2Norm( const tnlVector< int, tnlHost, int >& v );
+template long int    tnlVectorOperations< tnlHost >::getVectorL2Norm( const tnlVector< long int, tnlHost, int >& v );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlHost >::getVectorL2Norm( const tnlVector< float, tnlHost, int >& v );
+#endif
+template double      tnlVectorOperations< tnlHost >::getVectorL2Norm( const tnlVector< double, tnlHost, int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlHost >::getVectorL2Norm( const tnlVector< long double, tnlHost, int >& v );
+#endif
+
+#ifdef INSTANTIATE_LONG_INT
+template int         tnlVectorOperations< tnlHost >::getVectorL2Norm( const tnlVector< int, tnlHost, long int >& v );
+template long int    tnlVectorOperations< tnlHost >::getVectorL2Norm( const tnlVector< long int, tnlHost, long int >& v );
+#ifdef INSTANTIATE_FLOAT
+template float       tnlVectorOperations< tnlHost >::getVectorL2Norm( const tnlVector< float, tnlHost, long int >& v );
+#endif
+template double      tnlVectorOperations< tnlHost >::getVectorL2Norm( const tnlVector< double, tnlHost, long int >& v );
+#ifdef INSTANTIATE_LONG_DOUBLE
+template long double tnlVectorOperations< tnlHost >::getVectorL2Norm( const tnlVector< long double, tnlHost, long int >& v );
+#endif
+#endif
+
+
 /****
  * Lp norm
  */
@@ -144,6 +195,8 @@ template long double tnlVectorOperations< tnlHost >::getVectorLpNorm( const tnlV
 #endif
 #endif
 
+
+
 /****
  * Sum
  */
@@ -269,258 +322,6 @@ template long double tnlVectorOperations< tnlHost >::getVectorDifferenceAbsMin(
 #endif
 #endif
 
-/****
- * Max
- */
-template int         tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< int, tnlCuda, int >& v );
-template long int    tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long int, tnlCuda, int >& v );
-#ifdef INSTANTIATE_FLOAT
-template float       tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< float, tnlCuda, int >& v );
-#endif
-template double      tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< double, tnlCuda, int >& v );
-#ifdef INSTANTIATE_LONG_DOUBLE
-template long double tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long double, tnlCuda, int >& v );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-template int         tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< int, tnlCuda, long int >& v );
-template long int    tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long int, tnlCuda, long int >& v );
-#ifdef INSTANTIATE_FLOAT
-template float       tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< float, tnlCuda, long int >& v );
-#endif
-template double      tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< double, tnlCuda, long int >& v );
-#ifdef INSTANTIATE_LONG_DOUBLE
-template long double tnlVectorOperations< tnlCuda >::getVectorMax( const tnlVector< long double, tnlCuda, long int >& v );
-#endif
-#endif
-
-
-/****
- * Min
- */
-template int         tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< int, tnlCuda, int >& v );
-template long int    tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long int, tnlCuda, int >& v );
-#ifdef INSTANTIATE_FLOAT
-template float       tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< float, tnlCuda, int >& v );
-#endif
-template double      tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< double, tnlCuda, int >& v );
-#ifdef INSTANTIATE_LONG_DOUBLE
-template long double tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long double, tnlCuda, int >& v );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-template int         tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< int, tnlCuda, long int >& v );
-template long int    tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long int, tnlCuda, long int >& v );
-#ifdef INSTANTIATE_FLOAT
-template float       tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< float, tnlCuda, long int >& v );
-#endif
-template double      tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< double, tnlCuda, long int >& v );
-#ifdef INSTANTIATE_LONG_DOUBLE
-template long double tnlVectorOperations< tnlCuda >::getVectorMin( const tnlVector< long double, tnlCuda, long int >& v );
-#endif
-#endif
-
-/****
- * Abs max
- */
-template int         tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< int, tnlCuda, int >& v );
-template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long int, tnlCuda, int >& v );
-#ifdef INSTANTIATE_FLOAT
-template float       tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< float, tnlCuda, int >& v );
-#endif
-template double      tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< double, tnlCuda, int >& v );
-#ifdef INSTANTIATE_LONG_DOUBLE
-template long double tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long double, tnlCuda, int >& v );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-template int         tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< int, tnlCuda, long int >& v );
-template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long int, tnlCuda, long int >& v );
-#ifdef INSTANTIATE_FLOAT
-template float       tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< float, tnlCuda, long int >& v );
-#endif
-template double      tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< double, tnlCuda, long int >& v );
-#ifdef INSTANTIATE_LONG_DOUBLE
-template long double tnlVectorOperations< tnlCuda >::getVectorAbsMax( const tnlVector< long double, tnlCuda, long int >& v );
-#endif
-#endif
-
-/****
- * Abs min
- */
-template int         tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< int, tnlCuda, int >& v );
-template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long int, tnlCuda, int >& v );
-#ifdef INSTANTIATE_FLOAT
-template float       tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< float, tnlCuda, int >& v );
-#endif
-template double      tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< double, tnlCuda, int >& v );
-#ifdef INSTANTIATE_LONG_DOUBLE
-template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long double, tnlCuda, int >& v );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-template int         tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< int, tnlCuda, long int >& v );
-template long int    tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long int, tnlCuda, long int >& v );
-#ifdef INSTANTIATE_FLOAT
-template float       tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< float, tnlCuda, long int >& v );
-#endif
-template double      tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< double, tnlCuda, long int >& v );
-#ifdef INSTANTIATE_LONG_DOUBLE
-template long double tnlVectorOperations< tnlCuda >::getVectorAbsMin( const tnlVector< long double, tnlCuda, long int >& v );
-#endif
-#endif
-
-/****
- * Lp norm
- */
-template int         tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< int, tnlCuda, int >& v, const int& p );
-template long int    tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long int, tnlCuda, int >& v, const long int& p );
-#ifdef INSTANTIATE_FLOAT
-template float       tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< float, tnlCuda, int >& v, const float& p );
-#endif
-template double      tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< double, tnlCuda, int >& v, const double& p );
-#ifdef INSTANTIATE_LONG_DOUBLE
-template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long double, tnlCuda, int >& v, const long double& p );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-template int         tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< int, tnlCuda, long int >& v, const int& p );
-template long int    tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long int, tnlCuda, long int >& v, const long int& p );
-#ifdef INSTANTIATE_FLOAT
-template float       tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< float, tnlCuda, long int >& v, const float& p );
-#endif
-template double      tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< double, tnlCuda, long int >& v, const double& p );
-#ifdef INSTANTIATE_LONG_DOUBLE
-template long double tnlVectorOperations< tnlCuda >::getVectorLpNorm( const tnlVector< long double, tnlCuda, long int >& v, const long double& p );
-#endif
-#endif
-
-/****
- * Sum
- */
-template int         tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< int, tnlCuda, int >& v );
-template long int    tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long int, tnlCuda, int >& v );
-#ifdef INSTANTIATE_FLOAT
-template float       tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< float, tnlCuda, int >& v );
-#endif
-template double      tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< double, tnlCuda, int >& v );
-#ifdef INSTANTIATE_LONG_DOUBLE
-template long double tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long double, tnlCuda, int >& v );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-template int         tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< int, tnlCuda, long int >& v );
-template long int    tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long int, tnlCuda, long int >& v );
-#ifdef INSTANTIATE_FLOAT
-template float       tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< float, tnlCuda, long int >& v );
-#endif
-template double      tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< double, tnlCuda, long int >& v );
-#ifdef INSTANTIATE_LONG_DOUBLE
-template long double tnlVectorOperations< tnlCuda >::getVectorSum( const tnlVector< long double, tnlCuda, long int >& v );
-#endif
-#endif
-
-
-/****
- * Difference max
- */
-template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
-template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
-#ifdef INSTANTIATE_FLOAT
-template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
-#endif
-template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
-#ifdef INSTANTIATE_LONG_DOUBLE
-template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
-template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
-#ifdef INSTANTIATE_FLOAT
-template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
-#endif
-template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
-#ifdef INSTANTIATE_LONG_DOUBLE
-template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMax( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
-#endif
-#endif
-
-
-/****
- * Difference min
- */
-template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
-template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
-#ifdef INSTANTIATE_FLOAT
-template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
-#endif
-template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
-#ifdef INSTANTIATE_LONG_DOUBLE
-template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
-template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
-#ifdef INSTANTIATE_FLOAT
-template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
-#endif
-template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
-#ifdef INSTANTIATE_LONG_DOUBLE
-template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceMin( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
-#endif
-#endif
-
-/****
- * Difference abs max
- */
-template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
-template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
-#ifdef INSTANTIATE_FLOAT
-template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
-#endif
-template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
-#ifdef INSTANTIATE_LONG_DOUBLE
-template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
-template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
-#ifdef INSTANTIATE_FLOAT
-template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
-#endif
-template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
-#ifdef INSTANTIATE_LONG_DOUBLE
-template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMax( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
-#endif
-#endif
-
-/****
- * Difference abs min
- */
-template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< int, tnlCuda, int >& v1, const tnlVector< int, tnlCuda, int >& v2 );
-template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long int, tnlCuda, int >& v1, const tnlVector< long int, tnlCuda, int >& v2 );
-#ifdef INSTANTIATE_FLOAT
-template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< float, tnlCuda, int >& v1,  const tnlVector< float, tnlCuda, int >& v2);
-#endif
-template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< double, tnlCuda, int >& v1, const tnlVector< double, tnlCuda, int >& v2 );
-#ifdef INSTANTIATE_LONG_DOUBLE
-template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlCuda, int >& v1, const tnlVector< long double, tnlCuda, int >& v2 );
-#endif
-
-#ifdef INSTANTIATE_LONG_INT
-template int         tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< int, tnlCuda, long int >& v1, const tnlVector< int, tnlCuda, long int >& v2 );
-template long int    tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long int, tnlCuda, long int >& v1, const tnlVector< long int, tnlCuda, long int >& v2 );
-#ifdef INSTANTIATE_FLOAT
-template float       tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< float, tnlCuda, long int >& v1, const tnlVector< float, tnlCuda, long int >& v2 );
-#endif
-template double      tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< double, tnlCuda, long int >& v1, const tnlVector< double, tnlCuda, long int >& v2 );
-#ifdef INSTANTIATE_LONG_DOUBLE
-template long double tnlVectorOperations< tnlCuda >::getVectorDifferenceAbsMin( const tnlVector< long double, tnlCuda, long int >& v1, const tnlVector< long double, tnlCuda, long int >& v2 );
-#endif
-#endif
 
 #endif
 
diff --git a/tests/benchmarks/tnl-cuda-benchmarks.h b/tests/benchmarks/tnl-cuda-benchmarks.h
index d043077408..6da40ffd0c 100644
--- a/tests/benchmarks/tnl-cuda-benchmarks.h
+++ b/tests/benchmarks/tnl-cuda-benchmarks.h
@@ -21,6 +21,7 @@
 #include <tnlConfig.h>
 #include <core/vectors/tnlVector.h>
 #include <core/tnlTimerRT.h>
+#include <matrices/tnlSlicedEllpackMatrix.h>
 
 #ifdef HAVE_CUBLAS
 //#include <cublas.h>
@@ -115,7 +116,6 @@ int main( int argc, char* argv[] )
    timer.start();
    for( int i = 0; i < loops; i++ )
       resultDevice = deviceVector.scalarProduct( deviceVector );
-   cout << "Time: " << timer.getTime() << endl;
    timer.stop();
    bandwidth = 2 * datasetSize / timer.getTime();
    cout << "Time: " << timer.getTime() << " bandwidth: " << bandwidth << " GB/sec." << endl;
@@ -142,7 +142,6 @@ int main( int argc, char* argv[] )
    bandwidth = 2 * datasetSize / timer.getTime();
    cout << "Time: " << timer.getTime() << " bandwidth: " << bandwidth << " GB/sec." << endl;
 #endif    
-#endif
 
    cout << "Benchmarking L2 norm on CPU: ";
    timer.reset();
@@ -150,7 +149,7 @@ int main( int argc, char* argv[] )
    for( int i = 0; i < loops; i++ )
      resultHost = hostVector.lpNorm( 2.0 );
    timer.stop();
-   bandwidth = 2 * datasetSize / timer.getTime();
+   bandwidth = datasetSize / timer.getTime();
    cout << bandwidth << " GB/sec." << endl;
     
    cout << "Benchmarking L2 norm on GPU: " << endl;
@@ -158,9 +157,8 @@ int main( int argc, char* argv[] )
    timer.start();
    for( int i = 0; i < loops; i++ )
       resultDevice = deviceVector.lpNorm( 2.0 );
-   cout << "Time: " << timer.getTime() << endl;
    timer.stop();
-   bandwidth = 2 * datasetSize / timer.getTime();
+   bandwidth = datasetSize / timer.getTime();
    cout << "Time: " << timer.getTime() << " bandwidth: " << bandwidth << " GB/sec." << endl;
    if( resultHost != resultDevice )
    {
@@ -168,7 +166,7 @@ int main( int argc, char* argv[] )
       //return EXIT_FAILURE;
    }
 
-   
+   /*
    cout << "Benchmarking prefix-sum on CPU ..." << endl;
    timer.reset();
    timer.start();
@@ -192,8 +190,57 @@ int main( int argc, char* argv[] )
       {
          cerr << "Error in prefix sum at position " << i << ":  " << hostVector.getElement( i ) << " != " << deviceVector.getElement( i ) << endl;
       }
-
+*/
+   /****
+    * Sliced Ellpack test
+    */
+   const int elementsPerRow( 5 );
+   tnlSlicedEllpackMatrix< double, tnlHost > hostMatrix;
+   tnlSlicedEllpackMatrix< double, tnlCuda > deviceMatrix;
+   tnlVector< int, tnlHost, int > hostRowLengths;
+   tnlVector< int, tnlCuda, int > deviceRowLengths;
+   hostRowLengths.setSize( size );
+   deviceRowLengths.setSize( size );
+   hostMatrix.setDimensions( size, size );
+   deviceMatrix.setDimensions( size, size );
+   hostRowLengths.setValue( elementsPerRow );
+   deviceRowLengths.setValue( elementsPerRow );
+   hostMatrix.setCompressedRowsLengths( hostRowLengths );
+   deviceMatrix.setCompressedRowsLengths( deviceRowLengths );
+   int elements( 0 );
+   for( int row = 0; row < size; row++ )
+   {
+      int col = Max( 0, row - elementsPerRow / 2 );   
+      for( int element = 0; element < elementsPerRow; element++ )
+      {
+         if( col + element < size )
+         {
+            hostMatrix.setElement( row, col + element, 1.0 );
+            deviceMatrix.setElement( row, col + element, 1.0 );
+            elements++;
+         }
+      }      
+   }
+   datasetSize = loops * elements * sizeof( double ) / oneGB;
+   cout << "Benchmarking SpMV on CPU: ";
+   timer.reset();
+   for( int i = 0; i < loops; i++ )
+      hostMatrix.vectorProduct( hostVector, hostVector2 );
+   timer.stop();
+   double hostTime = timer.getTime();
+   bandwidth = 2 * datasetSize / loops / timer.getTime();
+   cout << timer.getTime() << " => " << bandwidth << " GB/s" << endl;
+   
+   cout << "Benchmarking SpMV on GPU: ";
+   timer.reset();
+   for( int i = 0; i < loops; i++ )
+      deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
+   timer.stop();
+   bandwidth = 2 * datasetSize / loops / timer.getTime();
+   cout << timer.getTime() << " => " << bandwidth << " GB/s" << " speedup " << hostTime / timer.getTime() << endl;
+   
    return EXIT_SUCCESS;
+#endif
 }
 
 #endif /* TNLCUDABENCHMARKS_H_ */
-- 
GitLab