diff --git a/src/core/cuda/cuda-prefix-sum_impl.h b/src/core/cuda/cuda-prefix-sum_impl.h
index 75d86ece9e0742f4de062b08deacd85e809cb3c6..ad2d02aa3341c21056b9e2994df93918aa38db52 100644
--- a/src/core/cuda/cuda-prefix-sum_impl.h
+++ b/src/core/cuda/cuda-prefix-sum_impl.h
@@ -86,8 +86,8 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
    while( chunkPointer < chunkSize &&
           chunkOffset + chunkPointer < lastElementInBlock )
    {
-      operation.performInPlace( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ],
-                                sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] );
+      operation.commonReductionOnDevice( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ],
+                                         sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] );
       auxData[ threadIdx. x ] =
          sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ];
       chunkPointer ++;
@@ -100,7 +100,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
    const int warpIdx = threadIdx. x / tnlCuda::getWarpSize();
    for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 )
       if( threadInWarpIdx >= stride && threadIdx. x < numberOfChunks )
-         operation.performInPlace( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] );
+         operation.commonReductionOnDevice( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] );
 
    if( threadInWarpIdx == tnlCuda::getWarpSize() - 1 )
       warpSums[ warpIdx ] = auxData[ threadIdx. x ];
@@ -112,14 +112,14 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
    if( warpIdx == 0 )
      for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 )
         if( threadInWarpIdx >= stride )
-           operation.performInPlace( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] );
+           operation.commonReductionOnDevice( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] );
    __syncthreads();
 
    /****
    * Shift the warp prefix-sums.
    */
    if( warpIdx > 0 )
-      operation.performInPlace( auxData[ threadIdx. x ], warpSums[ warpIdx - 1 ] );
+      operation.commonReductionOnDevice( auxData[ threadIdx. x ], warpSums[ warpIdx - 1 ] );
 
    /***
    * Store the result back in global memory.
@@ -132,7 +132,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
       DataType chunkShift( operation.initialValue() );
      if( chunkIdx > 0 )
         chunkShift = auxData[ chunkIdx - 1 ];
-      operation.performInPlace( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift );
+      operation.commonReductionOnDevice( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift );
       output[ blockOffset + idx ] = sharedData[ tnlCuda::getInterleaving( idx ) ];
       idx += blockDim. x;
    }
@@ -141,10 +141,15 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
    if( threadIdx. x == 0 )
    {
       if( prefixSumType == exclusivePrefixSum )
-         auxArray[ blockIdx. x ] =
-            operation.commonReductionOnDevice( tnlCuda::getInterleaving( lastElementInBlock - 1 ),
-                                               tnlCuda::getInterleaving( lastElementInBlock ),
-                                               sharedData );
+      {
+         /*auxArray[ blockIdx. x ] = operation.commonReductionOnDevice( tnlCuda::getInterleaving( lastElementInBlock - 1 ),
+                                                                        tnlCuda::getInterleaving( lastElementInBlock ),
+                                                                        sharedData );*/
+         DataType aux = operation.initialValue();
+         operation.commonReductionOnDevice( aux, sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ] );
+         operation.commonReductionOnDevice( aux, sharedData[ tnlCuda::getInterleaving( lastElementInBlock ) ] );
+         auxArray[ blockIdx. x ] = aux;
+      }
       else
          auxArray[ blockIdx. x ] = sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ];
    }
diff --git a/src/core/cuda/cuda-reduction_impl.h b/src/core/cuda/cuda-reduction_impl.h
index 5cc93fd8add91780acedc5eb079fa33733ffe226..434131239ca8507ea0deca3dca6392aca9225b73 100644
--- a/src/core/cuda/cuda-reduction_impl.h
+++ b/src/core/cuda/cuda-reduction_impl.h
@@ -37,27 +37,25 @@ using namespace std;
 
-
 /****
  * Arrays smaller than the following constant
  * are reduced on CPU. The constant must not be larger
  * than maximal CUDA grid size.
  */
-const int minGPUReductionDataSize = 256;//65536; //16384;//1024;//256;
+const int minGPUReductionDataSize = 128;//65536; //16384;//1024;//256;
 
 static tnlCudaReductionBuffer cudaReductionBuffer( 8 * minGPUReductionDataSize );
 
 #ifdef HAVE_CUDA
-
-template< typename Operation, int blockSize, bool isSizePow2 >
+template< typename Operation, int blockSize >
 __global__ void tnlCUDAReductionKernel( const Operation operation,
                                         const typename Operation :: IndexType size,
                                         const typename Operation :: RealType* input1,
                                         const typename Operation :: RealType* input2,
                                         typename Operation :: ResultType* output )
 {
-   typedef tnlCUDAReduction< Operation, blockSize, isSizePow2 > Reduction;
+   typedef tnlCUDAReduction< Operation, blockSize > Reduction;
    Reduction::reduce( operation, size, input1, input2, output );
 };
@@ -73,124 +71,61 @@ typename Operation::IndexType reduceOnCudaDevice( const Operation& operation,
    typedef typename Operation :: ResultType ResultType;
 
    const IndexType desGridSize( minGPUReductionDataSize );
-   dim3 blockSize( 256 ), gridSize( 0 );
-
+   dim3 blockSize( 256 ), gridSize( 0 );
    gridSize. x = Min( tnlCuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
-
-   /*#ifdef CUDA_REDUCTION_PROFILING
-      tnlTimerRT timer;
-      timer.reset();
-      timer.start();
-   #endif */
 
    if( ! cudaReductionBuffer.setSize( gridSize.x * sizeof( ResultType ) ) )
       return false;
-   output = cudaReductionBuffer.template getData< ResultType >();
-
-
+   output = cudaReductionBuffer.template getData< ResultType >();
    IndexType shmem = blockSize.x * sizeof( ResultType );
+
    /***
    * Depending on the blockSize we generate appropriate template instance.
    */
-
-   if( isPow2( size ) )
-   {
-      switch( blockSize.x )
-      {
-         case 512:
-            tnlCUDAReductionKernel< Operation, 512, true >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 256:
-            tnlCUDAReductionKernel< Operation, 256, true >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 128:
-            tnlCUDAReductionKernel< Operation, 128, true >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 64:
-            tnlCUDAReductionKernel< Operation, 64, true >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 32:
-            tnlCUDAReductionKernel< Operation, 32, true >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 16:
-            tnlCUDAReductionKernel< Operation, 16, true >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 8:
-            tnlCUDAReductionKernel< Operation, 8, true >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 4:
-            tnlCUDAReductionKernel< Operation, 4, true >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 2:
-            tnlCUDAReductionKernel< Operation, 2, true >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 1:
-            tnlAssert( false, cerr << "blockSize should not be 1." << endl );
-         default:
-            tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
-      }
-   }
-   else
+   switch( blockSize.x )
    {
-      switch( blockSize.x )
-      {
-         case 512:
-            tnlCUDAReductionKernel< Operation, 512, false >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 256:
-            tnlCUDAReductionKernel< Operation, 256, false >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 128:
-            tnlCUDAReductionKernel< Operation, 128, false >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 64:
-            tnlCUDAReductionKernel< Operation, 64, false >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 32:
-            tnlCUDAReductionKernel< Operation, 32, false >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 16:
-            tnlCUDAReductionKernel< Operation, 16, false >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 8:
-            tnlCUDAReductionKernel< Operation, 8, false >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 4:
-            tnlCUDAReductionKernel< Operation, 4, false >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 2:
-            tnlCUDAReductionKernel< Operation, 2, false >
-            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-            break;
-         case 1:
-            tnlAssert( false, cerr << "blockSize should not be 1." << endl );
-         default:
-            tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
-      }
+      case 512:
+         tnlCUDAReductionKernel< Operation, 512 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 256:
+         tnlCUDAReductionKernel< Operation, 256 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 128:
+         tnlCUDAReductionKernel< Operation, 128 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 64:
+         tnlCUDAReductionKernel< Operation, 64 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 32:
+         tnlCUDAReductionKernel< Operation, 32 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 16:
+         tnlCUDAReductionKernel< Operation, 16 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 8:
+         tnlCUDAReductionKernel< Operation, 8 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 4:
+         tnlCUDAReductionKernel< Operation, 4 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 2:
+         tnlCUDAReductionKernel< Operation, 2 >
+         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
+         break;
+      case 1:
+         tnlAssert( false, cerr << "blockSize should not be 1." << endl );
+      default:
+         tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
    }
    //checkCudaDevice;
-   /*#ifdef CUDA_REDUCTION_PROFILING
-      //cudaThreadSynchronize();
-      timer.stop();
-      cout << " Main reduction on GPU took " << timer.getTime() << " sec. " << endl;
-   #endif */
    return gridSize. x;
 }
 #endif
@@ -222,62 +157,64 @@ bool reductionOnCudaDevice( const Operation& operation,
       if( deviceInput2 && !
          tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< RealType, RealType, IndexType >( hostArray2, deviceInput2, size ) )
         return false;
-      result = operation. initialValueOnHost( 0, hostArray1, hostArray2 );
-      for( IndexType i = 1; i < size; i ++ )
-         result = operation. reduceOnHost( i, result, hostArray1, hostArray2 );
+      result = operation.initialValue();
+      for( IndexType i = 0; i < size; i ++ )
+         result = operation.reduceOnHost( i, result, hostArray1, hostArray2 );
       return true;
    }
 
+   #ifdef CUDA_REDUCTION_PROFILING
+      tnlTimerRT timer;
+      timer.reset();
+      timer.start();
+   #endif
+
    /****
    * Reduce the data on the CUDA device.
-   */
-#ifdef CUDA_REDUCTION_PROFILING
-   tnlTimerRT timer;
-   timer.reset();
-   timer.start();
-#endif
+   */
   ResultType* deviceAux1( 0 );
   IndexType reducedSize = reduceOnCudaDevice( operation,
                                               size,
                                               deviceInput1,
                                               deviceInput2,
                                               deviceAux1 );
-#ifdef CUDA_REDUCTION_PROFILING
-   timer.stop();
-   cout << " Reduction on GPU to size " << reducedSize << " took " << timer.getTime() << " sec. " << endl;
-#endif
+   #ifdef CUDA_REDUCTION_PROFILING
+      timer.stop();
+      cout << " Reduction on GPU to size " << reducedSize << " took " << timer.getTime() << " sec. " << endl;
+      timer.reset();
+      timer.start();
+   #endif
 
   /***
   * Transfer the reduced data from device to host.
   */
-#ifdef CUDA_REDUCTION_PROFILING
-   timer.reset();
-   timer.start();
-#endif
   ResultType resultArray[ minGPUReductionDataSize ];
   if( ! tnlArrayOperations< tnlHost, tnlCuda >::copyMemory< ResultType, ResultType, IndexType >( resultArray, deviceAux1, reducedSize ) )
      return false;
-#ifdef CUDA_REDUCTION_PROFILING
-   timer.stop();
-   cout << " Transferring data to CPU took " << timer.getTime() << " sec. " << endl;
-#endif
+
+   #ifdef CUDA_REDUCTION_PROFILING
+      timer.stop();
+      cout << " Transferring data to CPU took " << timer.getTime() << " sec. " << endl;
+   #endif
 
+   #ifdef CUDA_REDUCTION_PROFILING
+      timer.reset();
+      timer.start();
+   #endif
+
   /***
   * Reduce the data on the host system.
-  */
+  */
   LaterReductionOperation laterReductionOperation;
-#ifdef CUDA_REDUCTION_PROFILING
-   timer.reset();
-   timer.start();
-#endif
-   result = laterReductionOperation. initialValueOnHost( 0, resultArray, ( ResultType* ) 0 );
-   for( IndexType i = 1; i < reducedSize; i ++ )
-      result = laterReductionOperation. reduceOnHost( i, result, resultArray, ( ResultType*) 0 );
-#ifdef CUDA_REDUCTION_PROFILING
-   cudaThreadSynchronize();
-   timer.stop();
-   cout << " Reduction of small data set on CPU took " << timer.getTime() << " sec. " << endl;
-#endif
+   result = laterReductionOperation. initialValue();
+   for( IndexType i = 0; i < reducedSize; i ++ )
+      result = laterReductionOperation.reduceOnHost( i, result, resultArray, ( ResultType*) 0 );
+
+   #ifdef CUDA_REDUCTION_PROFILING
+      timer.stop();
+      cout << " Reduction of small data set on CPU took " << timer.getTime() << " sec. " << endl;
+   #endif
+
   return checkCudaDevice;
 #else
   tnlCudaSupportMissingMessage;;
diff --git a/src/core/cuda/reduction-operations.h b/src/core/cuda/reduction-operations.h
index 04566a8d8627dae8dc767630439da5830292bd9a..f804847f2e8ba12be96f82a965dc6a98eb1df237 100644
--- a/src/core/cuda/reduction-operations.h
+++ b/src/core/cuda/reduction-operations.h
@@ -200,13 +200,6 @@ class tnlParallelReductionSum
    typedef Real ResultType;
    typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return data1[ idx ];
-   };
-
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
@@ -226,46 +219,6 @@ class tnlParallelReductionSum
    }
 
 #ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ] + data1[ idx2 ];
-   };
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ];
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] + data2[ idx2 ] + data2[ idx3 ];
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] + data2[ idx2 ];
-   };
-
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  volatile const ResultType* data ) const
-   {
-      return data[ idx1 ] + data[ idx2 ];
-   };
 
    __device__ ResultType commonReductionOnDevice( ResultType& result,
                                                   const ResultType& data ) const
@@ -279,13 +232,6 @@ class tnlParallelReductionSum
       result += data;
    };
 
-
-   __device__ void performInPlace( ResultType& a,
-                                   const ResultType& b ) const
-   {
-      a += b;
-   }
-
 #endif
 };
@@ -299,13 +245,6 @@ class tnlParallelReductionMin
    typedef Real ResultType;
    typedef tnlParallelReductionMin< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return data1[ idx ];
-   };
-
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
@@ -324,41 +263,7 @@ class tnlParallelReductionMin
       result = tnlCudaMin( result, data1[ index ] );
    }
 
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ], data1[ idx2 ] );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ];
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ], tnlCudaMin( data2[ idx2 ], data2[ idx3 ] ) );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ], data2[ idx2 ] );
-   };
-
+#ifdef HAVE_CUDA
    __device__ ResultType commonReductionOnDevice( ResultType& result,
                                                   const ResultType& data ) const
    {
@@ -385,13 +290,6 @@ class tnlParallelReductionMax
    typedef Real ResultType;
    typedef tnlParallelReductionMax< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return data1[ idx ];
-   };
-
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
@@ -410,48 +308,7 @@ class tnlParallelReductionMax
       result = tnlCudaMax( result, data1[ index ] );
    }
 
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ], data1[ idx2 ] );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ];
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ], tnlCudaMax( data2[ idx2 ], data2[ idx3 ] ) );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ], data2[ idx2 ] );
-   };
-
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  volatile const ResultType* data ) const
-   {
-      return tnlCudaMax( data[ idx1 ], data[ idx2 ] );
-   };
-
+#ifdef HAVE_CUDA
    __device__ ResultType commonReductionOnDevice( ResultType& result,
                                                   const ResultType& data ) const
    {
@@ -467,216 +324,112 @@ };
 };
 
 template< typename Real, typename Index >
-class tnlParallelReductionAbsSum : public tnlParallelReductionSum< Real, Index >
+class tnlParallelReductionLogicalAnd
 {
    public:
 
    typedef Real RealType;
    typedef Index IndexType;
    typedef Real ResultType;
-   typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
-
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return tnlAbs( data1[ idx ] );
-   };
+   typedef tnlParallelReductionLogicalAnd< Real, Index > LaterReductionOperation;
 
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
                             const RealType* data2 ) const
    {
-      return current + tnlAbs( data1[ idx ] );
+      return current && data1[ idx ];
    };
 
-   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
-
+   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) true; };
+
    __cuda_callable__ void cudaFirstReduction( ResultType& result,
                                               const IndexType index,
                                               const RealType* data1,
                                               const RealType* data2 ) const
    {
-      result += tnlCudaAbs( data1[ index ] );
+      result = result && data1[ index ];
    }
 
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaAbs( data1[ idx1 ] ) + tnlCudaAbs( data1[ idx2 ] );
-   };
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaAbs( data1[ idx1 ] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] + tnlCudaAbs( data2[ idx2 ] ) + tnlCudaAbs( data2[ idx3 ] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] + tnlCudaAbs( data2[ idx2 ] );
-   };
-
-   /*__device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                    const IndexType idx2,
-                                                    volatile const ResultType* data ) const
+#ifdef HAVE_CUDA
+   __device__ ResultType commonReductionOnDevice( ResultType& result,
+                                                  const ResultType& data ) const
    {
-      return data[ idx1 ] + data[ idx2 ];
+      result = result && data;
    };
 
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
+   __device__ ResultType commonReductionOnDevice( volatile ResultType& result,
+                                                  volatile const ResultType& data ) const
    {
-      return data1 + data2;
+      result = result && data;
    };
 
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return data1 + data2;
-   };*/
-
-
+
 #endif
 };
+
 template< typename Real, typename Index >
-class tnlParallelReductionAbsMin : public tnlParallelReductionMin< Real, Index >
+class tnlParallelReductionLogicalOr
 {
    public:
 
    typedef Real RealType;
    typedef Index IndexType;
    typedef Real ResultType;
-   typedef tnlParallelReductionMin< Real, Index > LaterReductionOperation;
-
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return tnlAbs( data1[ idx ] );
-   };
+   typedef tnlParallelReductionLogicalOr< Real, Index > LaterReductionOperation;
 
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
                             const RealType* data2 ) const
    {
-      return Min( current, tnlAbs( data1[ idx ] ) );
+      return current || data1[ idx ];
    };
-
-   __cuda_callable__ ResultType initialValue() const { return tnlMaxValue< ResultType>(); };
+
+   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) false; };
 
    __cuda_callable__ void cudaFirstReduction( ResultType& result,
                                               const IndexType index,
                                               const RealType* data1,
                                               const RealType* data2 ) const
    {
-      result = tnlCudaMin( result, tnlCudaAbs( data1[ index ] ) );
-   }
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaMin( tnlCudaAbs( data1[ idx1 ] ), tnlCudaAbs( data1[ idx2 ] ) );
+      result = result || data1[ index ];
    }
 
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaAbs( data1[ idx1 ] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ], tnlCudaMin( tnlCudaAbs( data2[ idx2 ] ), tnlCudaAbs( data2[ idx3 ] ) ) );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ], tnlCudaAbs( data2[ idx2 ] ) );
-   };
-   /*__device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                    const IndexType idx2,
-                                                    volatile const ResultType* data ) const
+#ifdef HAVE_CUDA
+   __device__ ResultType commonReductionOnDevice( ResultType& result,
+                                                  const ResultType& data ) const
    {
-      volatile ResultType aux = tnlCudaAbs( data[ idx2 ] );
-      return tnlCudaMin( data[ idx1 ], aux );
+      result = result || data;
    };
 
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
+   __device__ ResultType commonReductionOnDevice( volatile ResultType& result,
+                                                  volatile const ResultType& data ) const
    {
-      return tnlCudaMin( data1, data2 );
+      result = result || data;
    };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return tnlCudaMin( data1, data2 );
-   };*/
-
-
 #endif
 };
 
 template< typename Real, typename Index >
-class tnlParallelReductionAbsMax : public tnlParallelReductionMax< Real, Index >
+class tnlParallelReductionAbsSum : public tnlParallelReductionSum< Real, Index >
 {
    public:
 
    typedef Real RealType;
    typedef Index IndexType;
    typedef Real ResultType;
-   typedef tnlParallelReductionMax< Real, Index > LaterReductionOperation;
-
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return tnlAbs( data1[ idx ] );
-   };
+   typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
 
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
                             const RealType* data2 ) const
    {
-      return Max( current, tnlAbs( data1[ idx ] ) );
+      return current + tnlAbs( data1[ idx ] );
    };
 
    __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
@@ -686,179 +439,82 @@ class tnlParallelReductionAbsMax : public tnlParallelReductionMax< Real, Index >
                                               const RealType* data1,
                                               const RealType* data2 ) const
    {
-      result = tnlCudaMax( result, tnlCudaAbs( data1[ index ] ) );
-   }
-
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaMax( tnlCudaAbs( data1[ idx1 ] ), tnlCudaAbs( data1[ idx2 ] ) );
+      result += tnlCudaAbs( data1[ index ] );
    }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaAbs( data1[ idx1 ] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ], tnlCudaMax( tnlCudaAbs( data2[ idx2 ] ), tnlCudaAbs( data2[ idx3 ] ) ) );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ], tnlCudaAbs( data2[ idx2 ] ) );
-   };
-
-   /*__device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                    const IndexType idx2,
-                                                    volatile const ResultType* data ) const
-   {
-      volatile ResultType aux = tnlCudaAbs( data[ idx2 ] );
-      return tnlCudaMax( data[ idx1 ], aux );
-   };
-
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
-   {
-      return tnlCudaMax( data1, data2 );
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return tnlCudaMax( data1, data2 );
-   };*/
-
-
-#endif
 };
 
 template< typename Real, typename Index >
-class tnlParallelReductionLogicalAnd
+class tnlParallelReductionAbsMin : public tnlParallelReductionMin< Real, Index >
 {
    public:
 
    typedef Real RealType;
    typedef Index IndexType;
    typedef Real ResultType;
-   typedef tnlParallelReductionLogicalAnd< Real, Index > LaterReductionOperation;
-
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return data1[ idx ];
-   };
+   typedef tnlParallelReductionMin< Real, Index > LaterReductionOperation;
 
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
                             const RealType* data2 ) const
    {
-      return current && data1[ idx ];
+      return Min( current, tnlAbs( data1[ idx ] ) );
    };
 
-   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) true; };
+   __cuda_callable__ ResultType initialValue() const { return tnlMaxValue< ResultType>(); };
 
    __cuda_callable__ void cudaFirstReduction( ResultType& result,
                                               const IndexType index,
                                               const RealType* data1,
                                               const RealType* data2 ) const
    {
-      result = result && data1[ index ];
-   }
-
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ] && data1[ idx2 ];
-   }
+      result = tnlCudaMin( result, tnlCudaAbs( data1[ index ] ) );
+   }
+};
 
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ];
-   };
+template< typename Real, typename Index >
+class tnlParallelReductionAbsMax : public tnlParallelReductionMax< Real, Index >
+{
+   public:
 
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] && data2[ idx2 ] && data2[ idx3 ];
-   };
+   typedef Real RealType;
+   typedef Index IndexType;
+   typedef Real ResultType;
+   typedef tnlParallelReductionMax< Real, Index > LaterReductionOperation;
 
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
+   ResultType reduceOnHost( const IndexType idx,
+                            const ResultType& current,
+                            const RealType* data1,
+                            const RealType* data2 ) const
    {
-      return data1[ idx1 ] && data2[ idx2 ];
+      return Max( current, tnlAbs( data1[ idx ] ) );
    };
 
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  volatile const ResultType* data ) const
-   {
-      return data[ idx1 ] && data[ idx2 ];
-   };
-
-   __device__ ResultType commonReductionOnDevice( ResultType& result,
-                                                  const ResultType& data ) const
-   {
-      result = result && data;
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile ResultType& result,
-                                                  volatile const ResultType& data ) const
-   {
-      result = result && data;
-   };
-
+   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
-#endif
+   __cuda_callable__ void cudaFirstReduction( ResultType& result,
+                                              const IndexType index,
+                                              const RealType* data1,
+                                              const RealType* data2 ) const
+   {
+      result = tnlCudaMax( result, tnlCudaAbs( data1[ index ] ) );
+   }
 };
 
 template< typename Real, typename Index >
-class tnlParallelReductionLogicalOr
+class tnlParallelReductionLpNorm : public tnlParallelReductionSum< Real, Index >
 {
    public:
 
   typedef Real RealType;
   typedef Index IndexType;
   typedef Real ResultType;
-   typedef tnlParallelReductionLogicalOr< Real, Index > LaterReductionOperation;
+   typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
+   void setPower( const RealType& p )
   {
-      return data1[ idx ];
+      this -> p = p;
   };
 
   ResultType reduceOnHost( const IndexType idx,
@@ -866,200 +522,34 @@ class tnlParallelReductionLogicalOr
                            const RealType* data1,
                            const RealType* data2 ) const
   {
-      return current || data1[ idx ];
+      return current + pow( tnlAbs( data1[ idx ] ), p );
   };
-
-   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) false; };
+
+   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
 
   __cuda_callable__ void cudaFirstReduction( ResultType& result,
                                              const IndexType index,
                                              const RealType* data1,
                                              const RealType* data2 ) const
   {
-      result = result || data1[ index ];
+      result += tnlCudaPow( tnlCudaAbs( data1[ index ] ), p );
   }
+
+   protected:
+
+   RealType p;
+};
 
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ] || data1[ idx2 ];
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ];
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] || data2[ idx2 ] || data2[ idx3 ];
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] || data2[ idx2 ];
-   };
-
-   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                  const IndexType idx2,
-                                                  volatile const ResultType* data ) const
-   {
-      return data[ idx1 ] || data[ idx2 ];
-   };
-
-   __device__ ResultType commonReductionOnDevice( ResultType& result,
-                                                  const ResultType& data ) const
-   {
-      result = result || data;
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile ResultType& result,
-                                                  volatile const ResultType& data ) const
-   {
-      result = result || data;
-   };
-
-
-#endif
-};
-
-template< typename Real, typename Index >
-class tnlParallelReductionLpNorm : public tnlParallelReductionSum< Real, Index >
-{
-   public:
-
-   typedef Real RealType;
-   typedef Index IndexType;
-   typedef Real ResultType;
-   typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
-
-   void setPower( const RealType& p )
-   {
-      this -> p = p;
-   };
-
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return pow( tnlAbs( data1[ idx ] ), p );
-   };
-
-   ResultType reduceOnHost( const IndexType idx,
-                            const ResultType& current,
-                            const RealType* data1,
-                            const RealType* data2 ) const
-   {
-      return current + pow( tnlAbs( data1[ idx ] ), p );
-   };
-
-   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
-
-   __cuda_callable__ void cudaFirstReduction( ResultType& result,
-                                              const IndexType index,
-                                              const RealType* data1,
-                                              const RealType* data2 ) const
-   {
-      result += tnlCudaPow( tnlCudaAbs( data1[ index ] ), p );
-   }
-
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaPow( tnlCudaAbs( data1[ idx1 ] ), p ) + tnlCudaPow( tnlCudaAbs( data1[ idx2 ] ), p );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaPow( tnlCudaAbs( data1[ idx1 ] ), p );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] +
-             tnlCudaPow( tnlCudaAbs( data2[ idx2 ] ), p ) +
-             tnlCudaPow( tnlCudaAbs( data2[ idx3 ] ), p );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] + tnlCudaPow( tnlCudaAbs( data2[ idx2 ] ), p );
-   };
-
-   /*__device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                    const IndexType idx2,
-                                                    volatile const ResultType* data ) const
-   {
-      return data[ idx1 ] + data[ idx2 ];
-   };
-
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
-   {
-      return data1 + data2;
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return data1 + data2;
-   };*/
-
-
-#endif
-
-   protected:
-
-   RealType p;
-};
-
-template< typename Real, typename Index >
-class tnlParallelReductionEqualities : public tnlParallelReductionLogicalAnd< bool, Index >
-{
-   public:
+template< typename Real, typename Index >
+class tnlParallelReductionEqualities : public tnlParallelReductionLogicalAnd< bool, Index >
+{
+   public:
 
   typedef Real RealType;
   typedef Index IndexType;
   typedef bool ResultType;
   typedef tnlParallelReductionLogicalAnd< bool, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return ( data1[ idx ] == data2[ idx ] );
-   };
-
   ResultType reduceOnHost( const IndexType idx,
                            const ResultType& current,
                            const RealType* data1,
@@ -1077,66 +567,6 @@ class tnlParallelReductionEqualities : public tnlParallelReductionLogicalAnd< bo
   {
      result = result && ( data1[ index ] == data2[ index ] );
   }
-
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return ( data1[ idx1 ] == data2[ idx1 ] ) && ( data1[ idx2 ] == data2[ idx2] );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return ( data1[ idx1 ]== data2[ idx1 ] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] &&
-             ( data2[ idx2 ] == data2[ idx2] ) &&
-             ( data2[ idx3 ] == data3[ idx3] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] && ( data2[ idx2 ] == data3[ idx2 ] );
-   };
-
-   /* __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     volatile const ResultType* data ) const
-   {
-      return data[ idx1 ] && data[ idx2 ];
-   };
-
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
-   {
-      return data1 && data2;
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return data1 && data2;
-   };*/
-
-
-#endif
 };
 
 template< typename Real, typename Index >
@@ -1149,13 +579,6 @@ class tnlParallelReductionInequalities : public tnlParallelReductionLogicalAnd<
    typedef bool ResultType;
    typedef tnlParallelReductionLogicalAnd< bool, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return ( data1[ idx ] != data2[ idx ] );
-   };
-
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
@@ -1173,66 +596,6 @@ class tnlParallelReductionInequalities : public tnlParallelReductionLogicalAnd<
    {
      result = result && ( data1[ index ] != data2[ index ] );
    }
-
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return ( data1[ idx1 ] != data2[ idx1 ] ) && ( data1[ idx2 ] != data2[ idx2] );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return ( data1[ idx1 ] != data2[ idx1 ] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] &&
-             ( data2[ idx2 ] != data2[ idx2] ) &&
-             ( data2[ idx3 ] != data3[ idx3] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] && ( data2[ idx2 ] != data3[ idx2 ] );
-   };
-
-   /*__device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                    const IndexType idx2,
-                                                    volatile const ResultType* data ) const
-   {
-      return data[ idx1 ] && data[ idx2 ];
-   };
-
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
-   {
-      return data1 && data2;
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return data1 && data2;
-   };*/
-
-
-#endif
 };
 
 template< typename Real, typename Index >
@@ -1245,13 +608,6 @@ class tnlParallelReductionScalarProduct : public tnlParallelReductionSum< Real,
    typedef Real ResultType;
    typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return data1[ idx ] * data2[ idx ];
-   };
-
    ResultType reduceOnHost( const IndexType idx,
                             const ResultType& current,
                             const RealType* data1,
@@ -1268,66 +624,7 @@ class tnlParallelReductionScalarProduct : public tnlParallelReductionSum< Real,
                             const RealType* data2 ) const
    {
      result += data1[ index ] * data2[ index ];
-   }
-
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return ( data1[ idx1 ] * data2[ idx1 ] ) + ( data1[ idx2 ] * data2[ idx2] );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return ( data1[ idx1 ] * data2[ idx1 ] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] +
-             ( data2[ idx2 ] * data3[ idx2] ) +
-             ( data2[ idx3 ] * data3[ idx3] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] + ( data2[ idx2 ] * data3[ idx2 ] );
-   };
-
-   /*__device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                    const IndexType idx2,
-                                                    volatile const ResultType* data ) const
-   {
-      return data[ idx1 ] + data[ idx2 ];
-   };
-
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
-   {
-      return data1 + data2;
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return data1 + data2;
-   };*/
-
-#endif
+   }
 };
 
 template< typename Real, typename Index >
@@ -1340,13 +637,6 @@ class tnlParallelReductionDiffSum : public tnlParallelReductionSum< Real, Index
    typedef Real ResultType;
    typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return data1[ idx ] - data2[ idx ];
-   };
-
   ResultType reduceOnHost( const IndexType idx,
                            const ResultType& current,
                            const RealType* data1,
@@ -1363,67 +653,7 @@ class tnlParallelReductionDiffSum : public tnlParallelReductionSum< Real, Index
                            const RealType* data2 ) const
   {
      result += data1[ index ] - data2[ index ];
-   }
-
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return ( data1[ idx1 ] - data2[ idx1 ] ) + ( data1[ idx2 ] - data2[ idx2 ] );
-   };
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ] - data2[ idx1 ];
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] +
-             ( data2[ idx2 ] - data3[ idx2 ] ) +
-             ( data2[ idx3 ] - data3[ idx3 ] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] + ( data2[ idx2 ] - data3[ idx2 ] );
-   };
-
-   /*__device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                    const IndexType idx2,
-                                                    volatile const ResultType* data ) const
-   {
-      return data[ idx1 ] + data[ idx2 ];
-   };
-
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
-   {
-      return data1 + data2;
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return data1 + data2;
-   };*/
-
-
-#endif
+   }
 };
 
 template< typename Real, typename Index >
@@ -1436,13 +666,6 @@ class tnlParallelReductionDiffMin : public tnlParallelReductionMin< Real, Index
    typedef Real ResultType;
    typedef tnlParallelReductionMin< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return data1[ idx ] - data2[ idx ];
-   };
-
   ResultType reduceOnHost( const IndexType idx,
                            const ResultType& current,
                            const RealType* data1,
@@ -1460,66 +683,6 @@ class tnlParallelReductionDiffMin : public tnlParallelReductionMin< Real, Index
   {
      result = tnlCudaMin( result, data1[ index ] - data2[ index ] );
   }
-
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ] - data2[ idx1 ], data1[ idx2 ] - data2[ idx2 ] );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ] - data2[ idx1 ];
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ],
-                         tnlCudaMin( data2[ idx2 ] - data3[ idx2 ],
-                                     data2[ idx3 ] - data3[ idx3 ] ) );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ], data2[ idx2 ] - data3[ idx2 ] );
-   };
-
-   /* __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     volatile const ResultType* data ) const
-   {
-      return tnlCudaMin( data[ idx1 ], data[ idx2 ] );
-   };
-
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
-   {
-      return tnlCudaMin( data1, data2 );
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return tnlCudaMin( data1, data2 );
-   };*/
-
-
-#endif
 };
 
 template< typename Real, typename Index >
@@ -1532,13 +695,6 @@ class tnlParallelReductionDiffMax : public tnlParallelReductionMax< Real, Index
    typedef Real ResultType;
    typedef tnlParallelReductionMax< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return data1[ idx ] - data2[ idx ];
-   };
-
   ResultType reduceOnHost( const IndexType idx,
                            const ResultType& current,
                            const RealType* data1,
@@ -1556,67 +712,6 @@ class tnlParallelReductionDiffMax : public tnlParallelReductionMax< Real, Index
   {
      result = tnlCudaMax( result, data1[ index ] - data2[ index ] );
   }
-
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ] - data2[ idx1 ],
-                         data1[ idx2 ] - data2[ idx2 ] );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return data1[ idx1 ] - data2[ idx1 ];
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ],
-                         tnlCudaMax( data2[ idx2 ] - data3[ idx2 ],
-                                     data2[ idx3 ] - data3[ idx3 ] ) );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ], data2[ idx2 ] - data3[ idx2 ] );
-   };
-
-   /* __device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     volatile const ResultType* data ) const
-   {
-      return tnlCudaMax( data[ idx1 ], data[ idx2 ] );
-   };
-
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
-   {
-      return tnlCudaMax( data1, data2 );
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return tnlCudaMax( data1, data2 );
-   };*/
-
-
-#endif
 };
 
 template< typename Real, typename Index >
@@ -1629,13 +724,6 @@ class tnlParallelReductionDiffAbsSum : public tnlParallelReductionMax< Real, Ind
    typedef Real ResultType;
    typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return tnlAbs( data1[ idx ] - data2[ idx ] );
-   };
-
   ResultType reduceOnHost( const IndexType idx,
                            const ResultType& current,
                            const RealType* data1,
@@ -1653,67 +741,6 @@ class tnlParallelReductionDiffAbsSum : public tnlParallelReductionMax< Real, Ind
   {
      result += tnlCudaAbs( data1[ index ] - data2[ index ] );
   }
-
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] ) + tnlCudaAbs( data1[ idx2 ] - data2[ idx2 ] );
-   };
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] +
-             tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ) +
-             tnlCudaAbs( data2[ idx3 ] - data3[ idx3 ] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] +
-             tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] );
-   };
-
-   /*__device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                    const IndexType idx2,
-                                                    volatile const ResultType* data ) const
-   {
-      return data[ idx1 ] + data[ idx2 ];
-   };
-
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
-   {
-      return data1 + data2;
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return data1 + data2;
-   };*/
-
-
-#endif
 };
 
 template< typename Real, typename Index >
@@ -1726,13 +753,6 @@ class tnlParallelReductionDiffAbsMin : public tnlParallelReductionMin< Real, Ind
    typedef Real ResultType;
    typedef tnlParallelReductionMin< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return tnlAbs( data1[ idx ] - data2[ idx ] );
-   };
-
   ResultType reduceOnHost( const IndexType idx,
                            const ResultType& current,
                            const RealType* data1,
@@ -1750,69 +770,6 @@ class tnlParallelReductionDiffAbsMin : public tnlParallelReductionMin< Real, Ind
   {
      result = tnlCudaMin( result, tnlCudaAbs( data1[ index ] - data2[ index ] ) );
   }
-
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaMin( tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] ),
-                         tnlCudaAbs( data1[ idx2 ] - data2[ idx2 ] ) );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ],
-                         tnlCudaMin( tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ),
-                                     tnlCudaAbs( data2[ idx3 ] - data3[ idx3 ] ) ) );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMin( data1[ idx1 ],
-                         tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ) );
-   };
-
-   /*__device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                    const IndexType idx2,
-                                                    volatile const ResultType* data ) const
-   {
-      //return tnlCudaMin( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
-      return tnlCudaMin( data[ idx1 ], data[ idx2 ] );
-   };
-
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
-   {
-      return tnlCudaMin( data1, data2 );
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return tnlCudaMin( data1, data2 );
-   };*/
-
-
-#endif
 };
 
 template< typename Real, typename Index >
@@ -1825,13 +782,6 @@ class tnlParallelReductionDiffAbsMax : public tnlParallelReductionMax< Real, Ind
    typedef Real ResultType;
    typedef tnlParallelReductionMax< Real, Index > LaterReductionOperation;
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return tnlAbs( data1[ idx ] -data2[ idx ] );
-   };
-
   ResultType reduceOnHost( const IndexType idx,
                            const ResultType& current,
                            const RealType* data1,
@@ -1849,69 +799,6 @@ class tnlParallelReductionDiffAbsMax : public tnlParallelReductionMax< Real, Ind
   {
      result = tnlCudaMax( result, tnlCudaAbs( data1[ index ] - data2[ index ] ) );
   }
-
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaMax( tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] ),
-                         tnlCudaAbs( data1[ idx2 ] - data2[ idx2 ] ) );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ],
-                         tnlCudaMax( tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ),
-                                     tnlCudaAbs( data2[ idx3 ] - data3[ idx3 ] ) ) );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return tnlCudaMax( data1[ idx1 ],
-                         tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ) );
-   };
-
-   /*__device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                    const IndexType idx2,
-                                                    volatile const ResultType* data ) const
-   {
-      //return tnlCudaMax( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
-      return tnlCudaMax( data[ idx1 ], data[ idx2 ] );
-   };
-
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
-   {
-      return tnlCudaMax( data1, data2 );
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return tnlCudaMax( data1, data2 );
-   };*/
-
-
-#endif
 };
 
 template< typename Real, typename Index >
@@ -1929,13 +816,6 @@ class tnlParallelReductionDiffLpNorm : public tnlParallelReductionSum< Real, Ind
      this -> p = p;
   };
 
-   ResultType initialValueOnHost( const IndexType idx,
-                                  const RealType* data1,
-                                  const RealType* data2 ) const
-   {
-      return pow( tnlAbs( data1[ idx ] - data2[ idx ] ), p );
-   };
-
   ResultType reduceOnHost( const IndexType idx,
                            const ResultType& current,
                            const RealType* data1,
@@ -1947,74 +827,13 @@ class tnlParallelReductionDiffLpNorm : public tnlParallelReductionSum< Real, Ind
   __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };
 
   __cuda_callable__ void cudaFirstReduction( ResultType& result,
-                                       const IndexType index,
-                                       const RealType* data1,
-                                       const RealType* data2 ) const
+                                              const IndexType index,
+                                              const RealType* data1,
+                                              const RealType* data2 ) const
   {
      result += tnlCudaPow( tnlCudaAbs( data1[ index ] - data2[ index ] ), p );
   }
-
-#ifdef HAVE_CUDA
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const IndexType idx2,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaPow( tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] ), p ) +
-             tnlCudaPow( tnlCudaAbs( data1[ idx2 ] - data2[ idx2 ] ), p );
-   }
-
-   __device__ ResultType initialValueOnDevice( const IndexType idx1,
-                                               const RealType* data1,
-                                               const RealType* data2 ) const
-   {
-      return tnlCudaPow( tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] ), p );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const IndexType idx3,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] +
-             tnlCudaPow( tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ), p ) +
-             tnlCudaPow( tnlCudaAbs( data2[ idx3 ] - data3[ idx3 ] ), p );
-   };
-
-   __device__ ResultType cudaFirstReductionOnDevice( const IndexType idx1,
-                                                     const IndexType idx2,
-                                                     const ResultType* data1,
-                                                     const RealType* data2,
-                                                     const RealType* data3 ) const
-   {
-      return data1[ idx1 ] + tnlCudaPow( tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ), p );
-   };
-
-   /*__device__ ResultType commonReductionOnDevice( const IndexType idx1,
-                                                    const IndexType idx2,
-                                                    volatile const ResultType* data ) const
-   {
-      return data[ idx1 ] + data[ idx2 ];
-   };
-
-   __device__ ResultType commonReductionOnDevice( const ResultType& data1,
-                                                  const ResultType& data2 ) const
-   {
-      return data1 + data2;
-   };
-
-   __device__ ResultType commonReductionOnDevice( volatile const ResultType& data1,
-                                                  volatile const ResultType& data2 ) const
-   {
-      return data1 + data2;
-   };*/
-
-
-#endif
-
   protected:
 
   RealType p;
diff --git a/src/core/cuda/tnlCudaReduction.h b/src/core/cuda/tnlCudaReduction.h
index 563fc8fe124eae21122fc7fd709c52ccf04c293c..9b7bf7ab2cb234ecbeae74f4a571cc8383720cb0 100644
--- a/src/core/cuda/tnlCudaReduction.h
+++ b/src/core/cuda/tnlCudaReduction.h
@@ -20,7 +20,7 @@
 
 #ifdef HAVE_CUDA
 
-template< typename Operation, int blockSize, bool isSizePow2 >
+template< typename Operation, int blockSize >
 class tnlCUDAReduction
 {
    public:
@@ -37,8 +37,8 @@ class tnlCUDAReduction
                        ResultType* output );
 };
 
-/*template< typename Real, typename Index, int blockSize, bool isSizePow2 >
-class tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize, isSizePow2 >
+/*template< typename Real, typename Index, int blockSize >
+class tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize >
 {
    public:
diff --git a/src/core/cuda/tnlCudaReduction_impl.h b/src/core/cuda/tnlCudaReduction_impl.h
index 20b47bbf46987d9f61992e9f25a9aa48c41e8b36..479ef7972505a2dbe7629886b2535b094371fde1 100644
--- a/src/core/cuda/tnlCudaReduction_impl.h
+++ b/src/core/cuda/tnlCudaReduction_impl.h
@@ -18,10 +18,10 @@
 #ifndef TNLCUDAREDUCTION_IMPL_H
 #define TNLCUDAREDUCTION_IMPL_H
 
-template< typename Operation, int blockSize, bool isSizePow2 >
+template< typename Operation, int blockSize >
 __device__
 void
-tnlCUDAReduction< Operation, blockSize, isSizePow2 >::
+tnlCUDAReduction< Operation, blockSize >::
 reduce( const Operation operation,
         const IndexType size,
         const RealType* input1,
@@ -154,10 +154,10 @@ reduce( const Operation operation,
 
 #ifdef UNDEF
 
-template< typename Real, typename Index, int blockSize, bool isSizePow2 >
+template< typename Real, typename Index, int blockSize >
 __device__
 void
-tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize, isSizePow2 >::
+tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize >::
 reduce( const Operation operation,
         const IndexType size,
         const RealType* input1,
diff --git a/tests/benchmarks/tnl-cuda-benchmarks.h b/tests/benchmarks/tnl-cuda-benchmarks.h
index dc82af7423c0cfb0ef91675ef3fda1d365cf9d7a..38c108b5d90aad5232d0428c8ffa1bb9878526e6 100644
--- a/tests/benchmarks/tnl-cuda-benchmarks.h
+++ b/tests/benchmarks/tnl-cuda-benchmarks.h
@@ -100,20 +100,19 @@ int main( int argc, char* argv[] )
       cout << bandwidth << " GB/sec."
 */

-   Real resultHost, resultDevice;
-   cout << "Benchmarking scalar product on CPU: ";
-   timer.reset();
-   timer.start();
-   for( int i = 0; i < loops; i++ )
-      resultHost = hostVector.scalarProduct( hostVector2 );
-   timer.stop();
-   bandwidth = 2 * datasetSize / timer.getTime();
-   cout << bandwidth << " GB/sec." << endl;
+   Real resultHost, resultDevice;
+   cout << "Benchmarking scalar product on CPU: ";
+   timer.reset();
+   timer.start();
+   for( int i = 0; i < loops; i++ )
+      resultHost = hostVector.scalarProduct( hostVector2 );
+   timer.stop();
+   bandwidth = 2 * datasetSize / timer.getTime();
+   cout << bandwidth << " GB/sec." << endl;

    cout << "Benchmarking scalar product on GPU: " << endl;
    timer.reset();
    timer.start();
-   cout << "Time: " << timer.getTime() << endl;
    for( int i = 0; i < loops; i++ )
       resultDevice = deviceVector.scalarProduct( deviceVector );
    cout << "Time: " << timer.getTime() << endl;
@@ -144,6 +143,31 @@ int main( int argc, char* argv[] )
    cout << "Time: " << timer.getTime() << " bandwidth: " << bandwidth << " GB/sec." << endl;
 #endif
 #endif

+
+   cout << "Benchmarking prefix-sum on CPU ..." << endl;
+   timer.reset();
+   timer.start();
+   hostVector.computePrefixSum();
+   timer.stop();
+   // The prefix-sum reads and writes each element once, hence the factor of 2.
+   bandwidth = 2 * datasetSize / timer.getTime();
+   cout << "Time: " << timer.getTime() << " bandwidth: " << bandwidth << " GB/sec." << endl;
+
+   cout << "Benchmarking prefix-sum on GPU ..." << endl;
+   timer.reset();
+   timer.start();
+   deviceVector.computePrefixSum();
+   timer.stop();
+   bandwidth = 2 * datasetSize / timer.getTime();
+   cout << "Time: " << timer.getTime() << " bandwidth: " << bandwidth << " GB/sec." << endl;
+
+   // Compare the GPU result against the CPU reference element by element.
+   for( int i = 0; i < size; i++ )
+      if( hostVector.getElement( i ) != deviceVector.getElement( i ) )
+      {
+         cerr << "Error in prefix sum at position " << i << ": " << hostVector.getElement( i ) << " != " << deviceVector.getElement( i ) << endl;
+      }
+
    return EXIT_SUCCESS;
 }
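
The interface this patch converges on is the same in all three source files: each operation keeps its host-side reduceOnHost, the __cuda_callable__ pair initialValue() / cudaFirstReduction(), and a single in-place commonReductionOnDevice( result, item ) that both the reduction kernel and the prefix-sum kernels call. As a sketch only — the class name below is hypothetical, the exact qualifier and signature of commonReductionOnDevice are inferred from its call sites in cuda-prefix-sum_impl.h, and tnlCudaMax/tnlCudaAbs are the library's existing device helpers — a diff-abs-max operation written against the slimmed-down interface would look roughly like this:

// Hypothetical sketch, not part of the patch: a reduction operation computing
// max_i | data1[ i ] - data2[ i ] | against the reduced interface.
template< typename Real, typename Index >
class exampleParallelReductionDiffAbsMax
{
   public:

      typedef Real RealType;
      typedef Index IndexType;
      typedef Real ResultType;

      // | x | >= 0, so zero is a safe identity element for the maximum.
      __cuda_callable__ ResultType initialValue() const { return ( ResultType ) 0; };

      // First phase: fold one pair of input elements into the running result.
      __cuda_callable__ void cudaFirstReduction( ResultType& result,
                                                 const IndexType index,
                                                 const RealType* data1,
                                                 const RealType* data2 ) const
      {
         result = tnlCudaMax( result, tnlCudaAbs( data1[ index ] - data2[ index ] ) );
      }

      // Tree phase: combine two partial results in place, the way the kernels
      // invoke it ( operation.commonReductionOnDevice( a, b ) updates a ).
      __device__ void commonReductionOnDevice( ResultType& result,
                                               const ResultType& data ) const
      {
         result = tnlCudaMax( result, data );
      }
};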