diff --git a/src/core/cuda/reduction-operations.h b/src/core/cuda/reduction-operations.h
new file mode 100644
index 0000000000000000000000000000000000000000..de7488f5864cd906618e9c0258ed20a9b67180ca
--- /dev/null
+++ b/src/core/cuda/reduction-operations.h
@@ -0,0 +1,226 @@
+/***************************************************************************
+                          reduction-operations.h  -  description
+                             -------------------
+    begin                : Mar 22, 2013
+    copyright            : (C) 2013 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#ifndef REDUCTION_OPERATIONS_H_
+#define REDUCTION_OPERATIONS_H_
+
+#ifdef HAVE_CUDA
+#include <cuda.h>
+#include <core/mfuncs.h>
+
+enum tnlTupleOperation {  tnlParallelReductionMax,
+                          tnlParallelReductionAbsMin,
+                          tnlParallelReductionAbsMax,
+                          tnlParallelReductionAbsSum,
+                          tnlParallelReductionLpNorm,
+                          tnlParallelReductionSdot };
+
+
+/***
+ * This function returns minimum of two numbers stored on the device.
+ */
+template< class T > __device__ T tnlCudaMin( const T& a,
+                                             const T& b )
+{
+   return a < b ? a : b;
+}
+
+__device__ int tnlCudaMin( const int& a,
+                           const int& b )
+{
+   return min( a, b );
+}
+
+__device__ float tnlCudaMin( const float& a,
+                             const float& b )
+{
+   return fminf( a, b );
+}
+
+__device__ double tnlCudaMin( const double& a,
+                              const double& b )
+{
+   return fmin( a, b );
+}
+
+/***
+ * This function returns maximum of two numbers stored on the device.
+ */
+template< class T > __device__ T tnlCudaMax( const T& a,
+                                             const T& b )
+{
+   return a > b ? a : b;
+}
+
+__device__ int tnlCudaMax( const int& a,
+                           const int& b )
+{
+   return max( a, b );
+}
+
+__device__ float tnlCudaMax( const float& a,
+                             const float& b )
+{
+   return fmaxf( a, b );
+}
+
+__device__ double tnlCudaMax( const double& a,
+                              const double& b )
+{
+   return fmax( a, b );
+}
+
+/***
+ * This function returns absolute value of given number on the device.
+ */
+__device__ int tnlCudaAbs( const int& a )
+{
+   return abs( a );
+}
+
+__device__ float tnlCudaAbs( const float& a )
+{
+   return fabs( a );
+}
+
+__device__ double tnlCudaAbs( const double& a )
+{
+   return fabs( a );
+}
+
+template< typename Real, typename Index >
+class tnlParallelReductionSum
+{
+   public:
+
+   typedef Real RealType;
+   typedef Index IndexType;
+
+
+   RealType reduceOnHost( const RealType& data1,
+                          const RealType& data2 ) const
+   {
+      return data1 + data2;
+   }
+
+   __device__ RealType initialValueOnDevice( const IndexType idx1,
+                                             const IndexType idx2,
+                                             const RealType* data1,
+                                             const RealType* data2 ) const
+   {
+      return data1[ idx1 ] + data1[ idx2 ];
+   }
+
+   __device__ RealType initialValueOnDevice( const IndexType idx1,
+                                             const RealType* data1,
+                                             const RealType* data2 ) const
+   {
+      return data1[ idx1 ];
+   };
+
+   __device__ RealType firstReductionOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const IndexType idx3,
+                                               const RealType* data1,
+                                               const RealType* data2,
+                                               const RealType* data3 ) const
+   {
+      return data1[ idx1 ] + data2[ idx2 ] + data2[ idx3 ];
+   };
+
+   __device__ RealType firstReductionOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const RealType* data1,
+                                               const RealType* data2,
+                                               const RealType* data3 ) const
+   {
+      return data1[ idx1 ] + data2[ idx2 ];
+   };
+
+   __device__ RealType commonReductionOnDevice( const IndexType idx1,
+                                                const IndexType idx2,
+                                                const RealType* data ) const
+   {
+      return data[ idx1 ] + data[ idx2 ];
+   };
+};
+
+template< typename Real, typename Index >
+class tnlParallelReductionMin
+{
+   public:
+
+   typedef Real RealType;
+   typedef Index IndexType;
+
+
+   RealType reduceOnHost( const RealType& data1,
+                          const RealType& data2 ) const
+   {
+      return Min( data1, data2 );
+   }
+
+   __device__ RealType initialValueOnDevice( const IndexType idx1,
+                                             const IndexType idx2,
+                                             const RealType* data1,
+                                             const RealType* data2 ) const
+   {
+      return tnlCudaMin( data1[ idx1 ], data1[ idx2 ] );
+   }
+
+   __device__ RealType initialValueOnDevice( const IndexType idx1,
+                                             const RealType* data1,
+                                             const RealType* data2 ) const
+   {
+      return data1[ idx1 ];
+   };
+
+   __device__ RealType firstReductionOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const IndexType idx3,
+                                               const RealType* data1,
+                                               const RealType* data2,
+                                               const RealType* data3 ) const
+   {
+      return tnlCudaMin( data1[ idx1 ], tnlCudaMin(  data2[ idx2 ],  data2[ idx3 ] ) );
+   };
+
+   __device__ RealType firstReductionOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const RealType* data1,
+                                               const RealType* data2,
+                                               const RealType* data3 ) const
+   {
+      return tnlCudaMin( data1[ idx1 ], data2[ idx2 ] );
+   };
+
+   __device__ RealType commonReductionOnDevice( const IndexType idx1,
+                                                const IndexType idx2,
+                                                const RealType* data ) const
+   {
+      return tnlCudaMin( data[ idx1 ], data[ idx2 ] );
+   };
+};
+
+
+
+
+#include <implementation/core/cuda/reduction-operations_impl.h>
+
+#endif
+
+#endif /* REDUCTION_OPERATIONS_H_ */
diff --git a/src/implementation/core/CMakeLists.txt b/src/implementation/core/CMakeLists.txt
index 29eba8ff9682f270236623a19c54262684dc7133..10e99346155a315c0daf053c3bd1375a9e6fdd5b 100755
--- a/src/implementation/core/CMakeLists.txt
+++ b/src/implementation/core/CMakeLists.txt
@@ -21,8 +21,15 @@ SET( headers cuda-long-vector-kernels.h
 SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/implementation/core )
 IF( BUILD_CUDA )
    set( tnl_implementation_core_CUDA__SOURCES
+        ${tnl_implementation_core_cuda_CUDA__SOURCES}
         ${CURRENT_DIR}/tnlArray_impl.cu
-        ${CURRENT_DIR}/tnlVector_impl.cu )
+        ${CURRENT_DIR}/tnlVector_impl.cu )
+   # Note: set( ... PARENT_SCOPE ) does not modify the variable in the
+   # current scope, where it is expanded again below; set it locally
+   # first and then export a copy to the parent scope.
+   set( tnl_implementation_core_CUDA__SOURCES
+        ${tnl_implementation_core_CUDA__SOURCES}
+        PARENT_SCOPE )
 ENDIF()    
 set( tnl_implementation_core_SOURCES
      ${tnl_implementation_core_CUDA__SOURCES}
diff --git a/src/implementation/core/cuda-long-vector-kernels.h b/src/implementation/core/cuda-long-vector-kernels.h
index 388afec7010a0e44defbd82785d69b5337011b06..c9f49847ff2b7855b3bd5e4b8cb32596c20bc0fe 100644
--- a/src/implementation/core/cuda-long-vector-kernels.h
+++ b/src/implementation/core/cuda-long-vector-kernels.h
@@ -22,18 +22,12 @@
 #include <cuda.h>
 #endif
 #include <iostream>
-#include <core/tnlVector.h>
+#include <core/tnlAssert.h>
+#include <core/cuda/reduction-operations.h>
+#include <implementation/core/memory-operations.h>
 
 using namespace std;
 
-enum tnlTupleOperation { tnlParallelReductionMin = 1,
-                          tnlParallelReductionMax,
-                          tnlParallelReductionSum,
-                          tnlParallelReductionAbsMin,
-                          tnlParallelReductionAbsMax,
-                          tnlParallelReductionAbsSum,
-                          tnlParallelReductionLpNorm,
-                          tnlParallelReductionSdot };
 
 /****
  * This constant says that arrays smaller than its value
@@ -41,107 +35,24 @@ enum tnlTupleOperation { tnlParallelReductionMin = 1,
  */
 const int maxGPUReductionDataSize = 256;
 
-/****
- * The following kernels and functions have been adopted from
- *
- * M. Harris, “Optimizing parallel reduction in cuda,” NVIDIA CUDA SDK, 2007.
- *
- * The code was extended even for data arrays with size different from
- * a power of 2.
- *
- * For the educative and also testing/debugging reasons we have 6 version of this algorithm.
- * The slower version can be found as a part o ftesting code. See directory tests.
- * Version 1 is the slowest and version 6 is the fastest - teste on CUDA architecture 1.0 - 1.3.
- * Another improvements are possible for the future devices.
- *
- */
-
 #ifdef HAVE_CUDA
-/***
- * This function returns minimum of two numbers stored on the device.
- */
-template< class T > __device__ T tnlCudaMin( const T& a,
-                                             const T& b )
-{
-   return a < b ? a : b;
-}
-
-__device__ int tnlCudaMin( const int& a,
-                           const int& b )
-{
-   return min( a, b );
-}
-
-__device__ float tnlCudaMin( const float& a,
-                             const float& b )
-{
-   return fminf( a, b );
-}
-
-__device__ double tnlCudaMin( const double& a,
-                              const double& b )
-{
-   return fmin( a, b );
-}
-
-/***
- * This function returns maximum of two numbers stored on the device.
- */
-template< class T > __device__ T tnlCudaMax( const T& a,
-                                             const T& b )
-{
-   return a > b ? a : b;
-}
-
-__device__ int tnlCudaMax( const int& a,
-                           const int& b )
-{
-   return max( a, b );
-}
-
-__device__ float tnlCudaMax( const float& a,
-                             const float& b )
-{
-   return fmaxf( a, b );
-}
-
-__device__ double tnlCudaMax( const double& a,
-                              const double& b )
-{
-   return fmax( a, b );
-}
 
-/***
- * This function returns absolute value of given number on the device.
- */
-__device__ int tnlCudaAbs( const int& a )
-{
-   return abs( a );
-}
-
-__device__ float tnlCudaAbs( const float& a )
-{
-   return fabs( a );
-}
-
-__device__ double tnlCudaAbs( const double& a )
-{
-   return fabs( a );
-}
 
 /***
  * For each thread in block with thread ID smaller then s this function reduces
  * data elements with indecis tid and tid + s. Here we assume that for each
  * tid the tid + s element also exists i.e. we have even number of elements.
  */
-template< class T, tnlTupleOperation operation >
-__device__ void reduceAligned( unsigned int tid,
-                               unsigned int s,
-                               T* sdata )
+template< typename Operation >
+__device__ void reduceAligned( const Operation& operation,
+                               typename Operation :: IndexType tid,
+                               typename Operation :: IndexType  s,
+                               typename Operation :: RealType* sdata )
 {
    if( tid < s )
    {
-      if( operation == tnlParallelReductionMin )
+      sdata[ tid ] = operation. commonReductionOnDevice( tid, tid + s, sdata );
+      /*if( operation == tnlParallelReductionMin )
          sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
       if( operation == tnlParallelReductionMax )
          sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
@@ -153,7 +64,7 @@ __device__ void reduceAligned( unsigned int tid,
          sdata[ tid ] = tnlCudaMax( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) );
       if( operation == tnlParallelReductionLpNorm ||
           operation == tnlParallelReductionSdot )
-         sdata[ tid ] = sdata[ tid ] + sdata[ tid + s ];
+         sdata[ tid ] = sdata[ tid ] + sdata[ tid + s ];*/
    }
 }
 
@@ -163,15 +74,17 @@ __device__ void reduceAligned( unsigned int tid,
  * the previous algorithm. Thid one works even for odd number of elements but
  * it is a bit slower.
  */
-template< class T, tnlTupleOperation operation >
-__device__ void reduceNonAligned( unsigned int tid,
-                                  unsigned int s,
-                                  unsigned int n,
-                                  T* sdata )
+template< typename Operation >
+__device__ void reduceNonAligned( const Operation& operation,
+                                  typename Operation :: IndexType tid,
+                                  typename Operation :: IndexType s,
+                                  typename Operation :: IndexType n,
+                                  typename Operation :: RealType* sdata )
 {
    if( tid < s )
    {
-      if( operation == tnlParallelReductionMin )
+      sdata[ tid ] = operation. commonReductionOnDevice( tid, tid + s, sdata );
+      /*if( operation == tnlParallelReductionMin )
          sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
       if( operation == tnlParallelReductionMax )
          sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
@@ -183,7 +96,7 @@ __device__ void reduceNonAligned( unsigned int tid,
          sdata[ tid ] = tnlCudaMax( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) );
       if( operation == tnlParallelReductionLpNorm ||
           operation == tnlParallelReductionSdot )
-         sdata[ tid ] = sdata[ tid ] + sdata[ tid + s ];
+         sdata[ tid ] = sdata[ tid ] + sdata[ tid + s ];*/
    }
    /* This is for the case when we have odd number of elements.
     * The last one will be reduced using the thread with ID 0.
@@ -192,7 +105,8 @@ __device__ void reduceNonAligned( unsigned int tid,
       __syncthreads();
    if( 2 * s < n && tid == n - 1 )
    {
-      if( operation == tnlParallelReductionMin )
+      sdata[ 0 ] = operation. commonReductionOnDevice( 0, tid, sdata );
+      /*if( operation == tnlParallelReductionMin )
          sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] );
       if( operation == tnlParallelReductionMax )
          sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
@@ -204,7 +118,7 @@ __device__ void reduceNonAligned( unsigned int tid,
          sdata[ 0 ] = tnlCudaMax( tnlCudaAbs( sdata[ 0 ] ), tnlCudaAbs( sdata[ tid + s ] ) );
       if( operation == tnlParallelReductionLpNorm ||
           operation == tnlParallelReductionSdot )
-         sdata[ 0 ] = sdata[ 0 ] + sdata[ tid + s ];
+         sdata[ 0 ] = sdata[ 0 ] + sdata[ tid + s ];*/
 
    }
 }
@@ -224,16 +138,18 @@ __device__ void reduceNonAligned( unsigned int tid,
  *                     Each block of the grid writes one element in this array
  *                     (i.e. the size of this array equals the number of CUDA blocks).
  */
-template < typename Type, typename ParameterType, typename Index, tnlTupleOperation operation, int blockSize >
-__global__ void tnlCUDAReductionKernel( const Index size,
-                                        const Type* deviceInput,
-                                        const Type* deviceInput2,
-                                        Type* deviceOutput,
-                                        const ParameterType parameter,
-                                        Type* dbg_array1 = 0 )
+template < typename Operation, int blockSize >
+__global__ void tnlCUDAReductionKernel( const Operation operation,
+                                        const typename Operation :: IndexType size,
+                                        const typename Operation :: RealType* deviceInput,
+                                        const typename Operation :: RealType* deviceInput2,
+                                        typename Operation :: RealType* deviceOutput )
 {
    extern __shared__ __align__ ( 8 ) char __sdata[];
-   Type* sdata = reinterpret_cast< Type* >( __sdata );
+   
+   typedef typename Operation :: IndexType IndexType;
+   typedef typename Operation :: RealType RealType;
+   RealType* sdata = reinterpret_cast< RealType* >( __sdata );
 
    /***
     * Get thread id (tid) and global thread id (gid).
@@ -241,10 +157,10 @@ __global__ void tnlCUDAReductionKernel( const Index size,
     * gridSize is the number of element processed by all blocks at the
     * same time.
     */
-   unsigned int tid = threadIdx. x;
-   unsigned int gid = 2 * blockIdx. x * blockDim. x + threadIdx. x;
-   unsigned int lastTId = size - 2 * blockIdx. x * blockDim. x;
-   unsigned int gridSize = 2 * blockDim. x * gridDim.x;
+   IndexType tid = threadIdx. x;
+   IndexType gid = 2 * blockIdx. x * blockDim. x + threadIdx. x;
+   IndexType lastTId = size - 2 * blockIdx. x * blockDim. x;
+   IndexType gridSize = 2 * blockDim. x * gridDim.x;
 
    /***
     * Read data into the shared memory. We start with the
@@ -252,7 +168,8 @@ __global__ void tnlCUDAReductionKernel( const Index size,
     */
    if( gid + blockDim. x < size )
    {
-      if( operation == tnlParallelReductionMin )
+      sdata[ tid ] = operation. initialValueOnDevice( gid, gid + blockDim. x, deviceInput, deviceInput2 );
+      /*if( operation == tnlParallelReductionMin )
          sdata[ tid ] = tnlCudaMin( deviceInput[ gid ], deviceInput[ gid + blockDim. x ] );
       if( operation == tnlParallelReductionMax )
          sdata[ tid ] = tnlCudaMax( deviceInput[ gid ], deviceInput[ gid + blockDim. x ] );
@@ -267,22 +184,24 @@ __global__ void tnlCUDAReductionKernel( const Index size,
                         powf( tnlCudaAbs( deviceInput[ gid + blockDim. x ] ), parameter );
       if( operation == tnlParallelReductionSdot )
          sdata[ tid ] = deviceInput[ gid ] * deviceInput2[ gid ] +
-                        deviceInput[ gid + blockDim. x ] * deviceInput2[ gid + blockDim. x ];
+                        deviceInput[ gid + blockDim. x ] * deviceInput2[ gid + blockDim. x ];*/
    }
    else if( gid < size )
    {
-      if( operation == tnlParallelReductionLpNorm )
+      sdata[ tid ] = operation. initialValueOnDevice( gid, deviceInput, deviceInput2 );
+      /*if( operation == tnlParallelReductionLpNorm )
          sdata[ tid ] = powf( tnlCudaAbs( deviceInput[ gid ] ), parameter );
       else
          if( operation == tnlParallelReductionSdot )
             sdata[ tid ] = deviceInput[ gid ] * deviceInput2[ gid ];
          else
-            sdata[ tid ] = deviceInput[ gid ];
+            sdata[ tid ] = deviceInput[ gid ];*/
    }
    gid += gridSize;
    while( gid + blockDim. x < size )
    {
-      if( operation == tnlParallelReductionMin )
+      sdata[ tid ] = operation. firstReductionOnDevice( tid, gid, gid + blockDim. x, sdata, deviceInput, deviceInput2 );
+      /*if( operation == tnlParallelReductionMin )
          sdata[ tid ] = :: tnlCudaMin( sdata[ tid ], :: tnlCudaMin( deviceInput[ gid ], deviceInput[ gid + blockDim. x ] ) );
       if( operation == tnlParallelReductionMax )
          sdata[ tid ] = :: tnlCudaMax( sdata[ tid ], :: tnlCudaMax( deviceInput[ gid ], deviceInput[ gid + blockDim. x ] ) );
@@ -297,12 +216,13 @@ __global__ void tnlCUDAReductionKernel( const Index size,
                          powf( tnlCudaAbs( deviceInput[ gid + blockDim. x ] ), parameter );
       if( operation == tnlParallelReductionSdot )
          sdata[ tid ] += deviceInput[ gid ] * deviceInput2[ gid ] +
-                         deviceInput[ gid + blockDim. x] * deviceInput2[ gid + blockDim. x ];
+                         deviceInput[ gid + blockDim. x] * deviceInput2[ gid + blockDim. x ];*/
       gid += gridSize;
    }
    if( gid < size )
    {
-      if( operation == tnlParallelReductionMin )
+      sdata[ tid ] = operation. firstReductionOnDevice( tid, gid, sdata, deviceInput, deviceInput2 );
+      /*if( operation == tnlParallelReductionMin )
          sdata[ tid ] = :: tnlCudaMin( sdata[ tid ], deviceInput[ gid ] );
       if( operation == tnlParallelReductionMax )
          sdata[ tid ] = :: tnlCudaMax( sdata[ tid ], deviceInput[ gid ] );
@@ -315,7 +235,7 @@ __global__ void tnlCUDAReductionKernel( const Index size,
       if( operation == tnlParallelReductionLpNorm )
          sdata[ tid ] += powf( tnlCudaAbs( deviceInput[gid] ), parameter );
       if( operation == tnlParallelReductionSdot )
-         sdata[ tid ] += deviceInput[ gid ] * deviceInput2[ gid ];
+         sdata[ tid ] += deviceInput[ gid ] * deviceInput2[ gid ];*/
    }
    __syncthreads();
 
@@ -337,19 +257,19 @@ __global__ void tnlCUDAReductionKernel( const Index size,
       if( blockSize >= 512 )
       {
          if( tid < 256 )
-            reduceAligned< Type, operation >( tid, 256, sdata );
+            reduceAligned( operation, tid, 256, sdata );
          __syncthreads();
       }
       if( blockSize >= 256 )
       {
          if( tid < 128 )
-            reduceAligned< Type, operation >( tid, 128, sdata );
+            reduceAligned( operation, tid, 128, sdata );
          __syncthreads();
       }
       if( blockSize >= 128 )
       {
          if( tid <  64 )
-            reduceAligned< Type, operation >( tid, 64, sdata );
+            reduceAligned( operation, tid, 64, sdata );
          __syncthreads();
       }
 
@@ -359,17 +279,17 @@ __global__ void tnlCUDAReductionKernel( const Index size,
       if (tid < 32)
       {
          if( blockSize >= 64 )
-            reduceAligned< Type, operation >( tid, 32, sdata );
+            reduceAligned( operation, tid, 32, sdata );
          if( blockSize >= 32 )
-            reduceAligned< Type, operation >( tid, 16, sdata );
+            reduceAligned( operation, tid, 16, sdata );
          if( blockSize >= 16 )
-            reduceAligned< Type, operation >( tid,  8, sdata );
+            reduceAligned( operation, tid,  8, sdata );
          if( blockSize >=  8 )
-            reduceAligned< Type, operation >( tid,  4, sdata );
+            reduceAligned( operation, tid,  4, sdata );
          if( blockSize >=  4 )
-            reduceAligned< Type, operation >( tid,  2, sdata );
+            reduceAligned( operation, tid,  2, sdata );
          if( blockSize >=  2 )
-            reduceAligned< Type, operation >( tid,  1, sdata );
+            reduceAligned( operation, tid,  1, sdata );
       }
    }
    else
@@ -378,35 +298,35 @@ __global__ void tnlCUDAReductionKernel( const Index size,
       if( n >= 512 )
       {
          s = n / 2;
-         reduceNonAligned< Type, operation >( tid, s, n, sdata );
+         reduceNonAligned( operation, tid, s, n, sdata );
          n = s;
          __syncthreads();
       }
       if( n >= 256 )
       {
          s = n / 2;
-         reduceNonAligned< Type, operation >( tid, s, n, sdata );
+         reduceNonAligned( operation, tid, s, n, sdata );
          n = s;
          __syncthreads();
       }
       if( n >= 128 )
       {
          s = n / 2;
-         reduceNonAligned< Type, operation >( tid, s, n, sdata );
+         reduceNonAligned( operation, tid, s, n, sdata );
          n = s;
          __syncthreads();
       }
       if( n >= 64 )
       {
          s = n / 2;
-         reduceNonAligned< Type, operation >( tid, s, n, sdata );
+         reduceNonAligned( operation, tid, s, n, sdata );
          n = s;
          __syncthreads();
       }
       if( n >= 32 )
       {
          s = n / 2;
-         reduceNonAligned< Type, operation >( tid, s, n, sdata );
+         reduceNonAligned( operation, tid, s, n, sdata );
          n = s;
          __syncthreads();
       }
@@ -416,25 +336,25 @@ __global__ void tnlCUDAReductionKernel( const Index size,
       if( n >= 16 )
       {
          s = n / 2;
-         reduceNonAligned< Type, operation >( tid, s, n, sdata );
+         reduceNonAligned( operation, tid, s, n, sdata );
          n = s;
       }
       if( n >= 8 )
       {
          s = n / 2;
-         reduceNonAligned< Type, operation >( tid, s, n, sdata );
+         reduceNonAligned( operation, tid, s, n, sdata );
          n = s;
       }
       if( n >= 4 )
       {
          s = n / 2;
-         reduceNonAligned< Type, operation >( tid, s, n, sdata );
+         reduceNonAligned( operation, tid, s, n, sdata );
          n = s;
       }
       if( n >= 2 )
       {
          s = n / 2;
-         reduceNonAligned< Type, operation >( tid, s, n, sdata );
+         reduceNonAligned( operation, tid, s, n, sdata );
          n = s;
       }
 
@@ -468,15 +388,18 @@ __global__ void tnlCUDAReductionKernel( const Index size,
  *        allocation of this array on the device inside of this function.
  *        The size of this array should be size / 128 * sizeof( T ).
  */
-template< typename Type, typename ParameterType, typename Index, tnlTupleOperation operation >
-bool tnlCUDALongVectorReduction( const Index size,
-                                 const Type* deviceInput1,
-                                 const Type* deviceInput2,
-                                 Type& result,
-                                 const ParameterType& parameter,
-                                 Type* deviceAux = 0 )
+template< typename Operation >
+bool tnlCUDALongVectorReduction( const Operation& operation,
+                                 const typename Operation :: IndexType size,
+                                 const typename Operation :: RealType* deviceInput1,
+                                 const typename Operation :: RealType* deviceInput2,
+                                 typename Operation :: RealType& result,
+                                 typename Operation :: RealType* deviceAux = 0 )
 {
 #ifdef HAVE_CUDA
+
+   typedef typename Operation :: IndexType IndexType;
+   typedef typename Operation :: RealType RealType;
    /***
     * Set parameters:
     * @param desBlockSize is desired block size with which we get the best performance (on CUDA rach 1.0 to 1.3)
@@ -485,7 +408,7 @@ bool tnlCUDALongVectorReduction( const Index size,
    const int desBlockSize = 512;
    const int desGridSize = 2048;
 
-   Type* dbg_array1; // debuging array
+   RealType* dbg_array1; // debugging array
 
    /***
     * Allocating auxiliary device memory to store temporary reduced arrays.
@@ -494,13 +417,15 @@ bool tnlCUDALongVectorReduction( const Index size,
     * If one calls the CUDA reduction more then once then one can provide
     * auxiliary array by passing it via the parameter deviceAux.
     */
-   tnlVector< Type, tnlCuda > deviceAuxVct( "tnlCUDAOneVectorReduction:deviceAuxVct" );
+   RealType* myDeviceAux( 0 );
    if( ! deviceAux )
    {
-      int sizeAlloc = :: Max( 1, size / desBlockSize );
-      if( ! deviceAuxVct. setSize( sizeAlloc ) )
+      if( ! allocateMemoryCuda( myDeviceAux, :: Max( ( IndexType ) 1, size / desBlockSize ) ) )
+      {
+         checkCudaDevice;
          return false;
-      deviceAux = deviceAuxVct. getData();
+      }
+      deviceAux = myDeviceAux;
    }
 
    /***
@@ -509,26 +434,27 @@ bool tnlCUDALongVectorReduction( const Index size,
     * @param reductionInput tells what data we shell reduce. We start with the input if this fuction
     *                       and after the 1st reduction step we switch this pointer to deviceAux.
     */
-   int sizeReduced = size;
-   const Type* reductionInput1 = deviceInput1;
-   const Type* reductionInput2 = deviceInput2;
-   int reductionSteps( 0 );
+   IndexType sizeReduced = size;
+   const RealType* reductionInput1 = deviceInput1;
+   const RealType* reductionInput2 = deviceInput2;
+   IndexType reductionSteps( 0 );
    while( sizeReduced > maxGPUReductionDataSize )
    {
       dim3 blockSize( 0 ), gridSize( 0 );
       blockSize. x = :: Min( sizeReduced, desBlockSize );
-      gridSize. x = :: Min( ( int ) ( sizeReduced / blockSize. x + 1 ) / 2, desGridSize );
+      gridSize. x = :: Min( ( IndexType ) ( ( sizeReduced / blockSize. x + 1 ) / 2 ), ( IndexType ) desGridSize );
 
       /***
        * We align the blockSize to the power of 2.
        */
-      Index alignedBlockSize = 1;
+      IndexType alignedBlockSize = 1;
       while( alignedBlockSize < blockSize. x ) alignedBlockSize <<= 1;
       blockSize. x = alignedBlockSize;
-      Index shmem = blockSize. x * sizeof( Type );
+      IndexType shmem = blockSize. x * sizeof( RealType );
       /***
        * Depending on the blockSize we generate appropriate template instance.
        */
+#ifdef UNDEF
       if( reductionSteps > 0 &&
           ( operation == tnlParallelReductionSdot ||
             operation == tnlParallelReductionLpNorm ) )
@@ -584,43 +510,44 @@ bool tnlCUDALongVectorReduction( const Index size,
          }
       }
       else
+#endif
          switch( blockSize. x )
          {
             case 512:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, operation, 512 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
+               tnlCUDAReductionKernel< Operation, 512 >
+               <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
                break;
             case 256:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, operation, 256 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
+               tnlCUDAReductionKernel< Operation, 256 >
+               <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
                break;
             case 128:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, operation, 128 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
+               tnlCUDAReductionKernel< Operation, 128 >
+               <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
                break;
             case  64:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, operation,  64 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
+               tnlCUDAReductionKernel< Operation,  64 >
+               <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
                break;
             case  32:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, operation,  32 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
+               tnlCUDAReductionKernel< Operation,  32 >
+               <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
                break;
             case  16:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, operation,  16 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
+               tnlCUDAReductionKernel< Operation,  16 >
+               <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
                break;
             case   8:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, operation,   8 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
+               tnlCUDAReductionKernel< Operation,   8 >
+               <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
                break;
             case   4:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, operation,   4 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
+               tnlCUDAReductionKernel< Operation,   4 >
+               <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
                break;
             case   2:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, operation,   2 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
+               tnlCUDAReductionKernel< Operation,   2 >
+               <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
                break;
             case   1:
                tnlAssert( false, cerr << "blockSize should not be 1." << endl );
@@ -639,15 +566,13 @@ bool tnlCUDALongVectorReduction( const Index size,
     * If sizeReduced equals size the previous loop was not processed and we read
     * data directly from the input.
     */
-   Type result_array[ maxGPUReductionDataSize ];
-   Type result_array2[ maxGPUReductionDataSize ];
+   RealType result_array[ maxGPUReductionDataSize ];
+   RealType result_array2[ maxGPUReductionDataSize ];
    if( sizeReduced == size )
    {
-      if( cudaMemcpy( result_array, deviceInput1, sizeReduced * sizeof( Type ), cudaMemcpyDeviceToHost ) != cudaSuccess )
-      {
-         CHECK_CUDA_ERROR;
+      if( ! copyMemoryCudaToHost( result_array, deviceInput1, sizeReduced ) )
          return false;
-      }
+#ifdef UNDEF
       switch( operation )
       {
          case tnlParallelReductionLpNorm:
@@ -657,10 +582,8 @@ bool tnlCUDALongVectorReduction( const Index size,
             result = pow( result, 1.0/ parameter );
             return true;
          case tnlParallelReductionSdot:
-            if( cudaMemcpy( result_array2, deviceInput2, sizeReduced * sizeof( Type ), cudaMemcpyDeviceToHost ) != cudaSuccess )
-            {
-               CHECK_CUDA_ERROR;
-            }
+            if( ! copyMemoryCudaToHost( result_array2, deviceInput2, sizeReduced ) )
+               return false;
             else
             {
                result = 0;
@@ -669,14 +592,15 @@ bool tnlCUDALongVectorReduction( const Index size,
                return true;
             }
       }
+#endif
    }
    else
-      if( cudaMemcpy( result_array, deviceAux, sizeReduced * sizeof( Type ), cudaMemcpyDeviceToHost ) != cudaSuccess )
-      {
-         CHECK_CUDA_ERROR;
+      if( ! copyMemoryCudaToHost( result_array, deviceAux, sizeReduced ) )
          return false;
-      }
-   switch( operation )
+   result = result_array[ 0 ];
+   for( IndexType i = 1; i < sizeReduced; i ++ )
+      result = operation. reduceOnHost( result, result_array[ i ] );
+   /*switch( operation )
    {
       case tnlParallelReductionMax:
          result = result_array[ 0 ];
@@ -705,6 +629,12 @@ bool tnlCUDALongVectorReduction( const Index size,
          for( Index i = 1; i < sizeReduced; i ++ )
             result = Min( result, tnlAbs( result_array[ i ] ) );
          break;
+   }*/
+   if( myDeviceAux )
+   {
+      freeMemoryCuda( myDeviceAux );
+      if( ! checkCudaDevice )
+         return false;
    }
    return true;
 #else
@@ -759,6 +689,7 @@ __global__ void compareTwoVectorsElementwise( const Index size,
  * Result is written into a bool array. The minimum value then says if both vectors equal.
  *
  */
+// TODO: result of comparison should not be returned!!!
 template< typename Type, typename Index >
 bool tnlCUDALongVectorComparison( const Index size,
                                   const Type* deviceInput1,
@@ -770,12 +701,14 @@ bool tnlCUDALongVectorComparison( const Index size,
    tnlAssert( size > 0,
               cerr << "You try to compare two CUDA long vectors with non-positive size." << endl
                    << "The size is " << size );
-   tnlVector< bool, tnlCuda, Index > boolArray( "tnlCUDALongVectorComparison:bool_array" );
+   //tnlVector< bool, tnlCuda, Index > boolArray( "tnlCUDALongVectorComparison:bool_array" );
+   bool* myDeviceBoolAux( 0 );
    if( ! deviceBoolAux )
    {
-      if( ! boolArray. setSize( size ) )
+      //if( ! boolArray. setSize( size ) )
+      if( ! allocateMemoryCuda( myDeviceBoolAux, size ) )
          return false;
-      deviceBoolAux = boolArray. getData();
+      deviceBoolAux = myDeviceBoolAux;
    }
    dim3 blockSize( 0 ), gridSize( 0 );
    blockSize. x = 256;
@@ -785,7 +718,8 @@ bool tnlCUDALongVectorComparison( const Index size,
                                                             deviceInput1,
                                                             deviceInput2,
                                                             deviceBoolAux );
-   CHECK_CUDA_ERROR;
+   if( ! checkCudaDevice )
+      return false;
    bool result;
    if( ! tnlCUDALongVectorReduction< bool, bool, Index, tnlParallelReductionMin >( size,
                                                                                    deviceBoolAux,
diff --git a/src/implementation/core/cuda/CMakeLists.txt b/src/implementation/core/cuda/CMakeLists.txt
index 266346483cd29086b07e2d774dd0a41755095092..5e9be45d30fbf42d70a12cf53528d2dd800be122 100755
--- a/src/implementation/core/cuda/CMakeLists.txt
+++ b/src/implementation/core/cuda/CMakeLists.txt
@@ -1,7 +1,14 @@
-SET( headers reduction_impl.h )
+SET( headers reduction_impl.h
+             reduction-operations_impl.h )
 
 SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/implementation/core/cuda )
 
+IF( BUILD_CUDA )
+   set( tnl_implementation_core_cuda_CUDA__SOURCES
+        ${CURRENT_DIR}/reduction-operations_impl.cu
+        PARENT_SCOPE )
+endif()        
+
 set( tnl_implementation_core_cuda_SOURCES 
      ${CURRENT_DIR}/device-check.cpp
      PARENT_SCOPE )
diff --git a/src/implementation/core/cuda/reduction-operations_impl.cu b/src/implementation/core/cuda/reduction-operations_impl.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b86aa68fcb6b246962d6c8a2c6a24f2fe861b0ec
--- /dev/null
+++ b/src/implementation/core/cuda/reduction-operations_impl.cu
@@ -0,0 +1,16 @@
+/***************************************************************************
+                          reduction-operations_impl.cu  -  description
+                             -------------------
+    begin                : Mar 22, 2013
+    copyright            : (C) 2013 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
\ No newline at end of file
diff --git a/src/implementation/core/cuda/reduction-operations_impl.h b/src/implementation/core/cuda/reduction-operations_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..113b837a5909e9f834ed08a8dbcd6d9c8027ab93
--- /dev/null
+++ b/src/implementation/core/cuda/reduction-operations_impl.h
@@ -0,0 +1,23 @@
+/***************************************************************************
+                          reduction-operations_impl.h  -  description
+                             -------------------
+    begin                : Mar 22, 2013
+    copyright            : (C) 2013 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#ifndef REDUCTION_OPERATIONS_IMPL_H_
+#define REDUCTION_OPERATIONS_IMPL_H_
+
+
+
+#endif /* REDUCTION_OPERATIONS_IMPL_H_ */
diff --git a/src/implementation/core/memory-operations.h b/src/implementation/core/memory-operations.h
index dae87f9fa932883f76ff1d9191befc580c2e6cb7..6a645b7560310df4039764860ec946b7ac641faf 100644
--- a/src/implementation/core/memory-operations.h
+++ b/src/implementation/core/memory-operations.h
@@ -25,20 +25,23 @@
 const int tnlGPUvsCPUTransferBufferSize( 1 << 20 );
 
 template< typename Element, typename Index >
-void allocateMemoryHost( Element*& data,
+bool allocateMemoryHost( Element*& data,
                          const Index size )
 {
-   data = new Element[ size ];
+   if( ! ( data = new Element[ size ] ) )
+      return false;
+   return true;
 }
 
 template< typename Element, typename Index >
-void allocateMemoryCuda( Element*& data,
+bool allocateMemoryCuda( Element*& data,
                          const Index size )
 {
 #ifdef HAVE_CUDA
    if( cudaMalloc( ( void** ) &data,
                    ( size_t ) size * sizeof( Element ) ) != cudaSuccess )
       data = 0;
+   return checkCudaDevice;
 #endif
 }
 
@@ -54,7 +57,7 @@ bool freeMemoryCuda( Element* data )
 {
 #ifdef HAVE_CUDA
       cudaFree( data );
-      checkCudaDevice;
+      return checkCudaDevice;
 #endif
    return true;
 }
@@ -95,11 +98,11 @@ bool setMemoryCuda( Element* data,
 {
 #ifdef HAVE_CUDA
       dim3 blockSize, gridSize;
-      blockSize. x = 32;
-      int blocksNumber = ceil( ( double ) size / ( double ) blockSize. x );
-      int elementsPerThread = ceil( ( double ) blocksNumber / ( double ) maxCudaGridSize );
+      blockSize. x = 256;
+      Index blocksNumber = ceil( ( double ) size / ( double ) blockSize. x );
+      Index elementsPerThread = ceil( ( double ) blocksNumber / ( double ) maxCudaGridSize );
       gridSize. x = Min( blocksNumber, maxCudaGridSize );
-      cout << "blocksNumber = " << blocksNumber << "Grid size = " << gridSize. x << " elementsPerThread = " << elementsPerThread << endl;
+      //cout << "blocksNumber = " << blocksNumber << "Grid size = " << gridSize. x << " elementsPerThread = " << elementsPerThread << endl;
       setVectorValueCudaKernel<<< blockSize, gridSize >>>( data, size, value, elementsPerThread );
 
       return checkCudaDevice;
@@ -148,15 +151,11 @@ bool copyMemoryCudaToHost( Element* destination,
                            const Index size )
 {
 #ifdef HAVE_CUDA
-   if( cudaMemcpy( destination,
-                   source,
-                   size * sizeof( Element ),
-                   cudaMemcpyDeviceToHost ) != cudaSuccess )
-   {
-      cerr << "Transfer of data from CUDA device to host failed." << endl;
-      return false;
-   }
-   return true;
+   cudaMemcpy( destination,
+               source,
+               size * sizeof( Element ),
+               cudaMemcpyDeviceToHost );
+   return checkCudaDevice;
 #else
    cerr << "CUDA support is missing in this system." << endl;
    return false;
@@ -173,11 +172,7 @@ bool copyMemoryCudaToCuda( Element* destination,
                    source,
                    size * sizeof( Element ),
                    cudaMemcpyDeviceToDevice ) != cudaSuccess )
-   {
-      cerr << "Transfer of data from CUDA device to device failed." << endl;
-      return false;
-   }
-   return true;
+   return checkCudaDevice;
 #else
    cerr << "CUDA support is missing in this system." << endl;
    return false;
diff --git a/tests/unit-tests/CMakeLists.txt b/tests/unit-tests/CMakeLists.txt
index eeef543f8dbfc95a8325992c9f1c574d14c5cee3..367fd50bd59e964e8f63c564274fc9bafffdf4ef 100755
--- a/tests/unit-tests/CMakeLists.txt
+++ b/tests/unit-tests/CMakeLists.txt
@@ -36,6 +36,8 @@ if( BUILD_CUDA )
    ADD_TEST( core/cuda/tnl-device-check-test${mpiExt}${debugExt} ${EXECUTABLE_OUTPUT_PATH}/tnl-device-check-test${mpiExt}${debugExt} )
    ADD_TEST( core/cuda/tnl-memory-operations-test${mpiExt}${debugExt} ${EXECUTABLE_OUTPUT_PATH}/tnl-memory-operations-test${mpiExt}${debugExt} )
    SET_TESTS_PROPERTIES ( core/cuda/tnl-memory-operations-test${mpiExt}${debugExt} PROPERTIES DEPENDS core/cuda/tnl-device-check-test${mpiExt}${debugExt} )
+   ADD_TEST( core/cuda/tnl-reduction-test${mpiExt}${debugExt} ${EXECUTABLE_OUTPUT_PATH}/tnl-reduction-test${mpiExt}${debugExt} )
+   SET_TESTS_PROPERTIES ( core/cuda/tnl-reduction-test${mpiExt}${debugExt} PROPERTIES DEPENDS core/cuda/tnl-memory-operations-test${mpiExt}${debugExt} )   
 endif()
 
 ADD_TEST( tnl-unit-tests${mpiExt}${debugExt} ${EXECUTABLE_OUTPUT_PATH}/tnl-unit-tests${mpiExt}${debugExt}  )                                                                       
diff --git a/tests/unit-tests/core/cuda/CMakeLists.txt b/tests/unit-tests/core/cuda/CMakeLists.txt
index 73f852b9c516d2e23fd8b1c053118f630d459e7b..c52a01d7c043cab8cb01e64ff0f3052daef7203a 100755
--- a/tests/unit-tests/core/cuda/CMakeLists.txt
+++ b/tests/unit-tests/core/cuda/CMakeLists.txt
@@ -1,5 +1,6 @@
 set( headers tnlCudaDeviceCheckTester.h
-             tnlCudaMemoryOperationsTester.h )
+             tnlCudaMemoryOperationsTester.h
+             tnlCudaReductionTester.h )
              
 if( BUILD_CUDA )
     CUDA_ADD_EXECUTABLE( tnl-device-check-test${mpiExt}${debugExt} ${headers} device-check-test.cu )
@@ -10,11 +11,10 @@ if( BUILD_CUDA )
     TARGET_LINK_LIBRARIES( tnl-memory-operations-test${mpiExt}${debugExt} ${CPPUNIT_LIBRARIES}
                                                               tnl${mpiExt}${debugExt}-0.1 )
     
-#    CUDA_ADD_EXECUTABLE( tnl-reduction-test${mpiExt}${debugExt} ${headers} reduction-test.cu )
-#    TARGET_LINK_LIBRARIES( tnl-reduction-test${mpiExt}${debugExt} ${CPPUNIT_LIBRARIES}
-#                                                              tnl${mpiExt}${debugExt}-0.1 )                                                                
-                                                                                                                              
-                                                                                                                              
+    CUDA_ADD_EXECUTABLE( tnl-reduction-test${mpiExt}${debugExt} ${headers} reduction-test.cu )
+    TARGET_LINK_LIBRARIES( tnl-reduction-test${mpiExt}${debugExt} ${CPPUNIT_LIBRARIES}
+                                                              tnl${mpiExt}${debugExt}-0.1 )                                                                
+                                                                                                                                                                                                                                                            
 endif()
 
                                                                        
diff --git a/tests/unit-tests/core/cuda/reduction-test.cu b/tests/unit-tests/core/cuda/reduction-test.cu
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..e980cf5b9cb70bf44319b159716245be79fa974f 100644
--- a/tests/unit-tests/core/cuda/reduction-test.cu
+++ b/tests/unit-tests/core/cuda/reduction-test.cu
@@ -0,0 +1,34 @@
+/***************************************************************************
+                          reduction-test.cu  -  description
+                             -------------------
+    begin                : Mar 20, 2013
+    copyright            : (C) 2013 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#include <tnlConfig.h>
+ 
+#ifdef HAVE_CPPUNIT
+
+#include <cppunit/ui/text/TestRunner.h>
+#include "tnlCudaReductionTester.h"
+ 
+int main( int argc, char* argv[] )
+{
+   CppUnit :: TextTestRunner runner;
+   runner. addTest( tnlCudaReductionTester :: suite() );
+   if( ! runner.run() )
+      return EXIT_FAILURE;
+   return EXIT_SUCCESS;
+}
+
+#endif
diff --git a/tests/unit-tests/core/cuda/tnlCudaMemoryOperationsTester.h b/tests/unit-tests/core/cuda/tnlCudaMemoryOperationsTester.h
index b65a8091a588a1367e3955d60bd5c0814e597888..21183a9d2ea941c5c39a424a7d2027397a66e070 100644
--- a/tests/unit-tests/core/cuda/tnlCudaMemoryOperationsTester.h
+++ b/tests/unit-tests/core/cuda/tnlCudaMemoryOperationsTester.h
@@ -51,10 +51,10 @@ class tnlCudaMemoryOperationsTester : public CppUnit :: TestCase
                                 "smallMemorySetTest",
                                 &tnlCudaMemoryOperationsTester :: smallMemorySetTest )
                                );
-      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaMemoryOperationsTester >(
+      /*suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaMemoryOperationsTester >(
                                 "bigMemorySetTest",
                                 &tnlCudaMemoryOperationsTester :: bigMemorySetTest )
-                               );
+                               );*/
 
       return suiteOfTests;
    };
@@ -71,7 +71,7 @@ class tnlCudaMemoryOperationsTester : public CppUnit :: TestCase
 
    void copyTest()
    {
-      const int size( 1 << 20 );
+      const int size( 1 << 22 );
       int *hostData1, *hostData2, *deviceData;
       allocateMemoryHost( hostData1, size );
       allocateMemoryHost( hostData2, size );
diff --git a/tests/unit-tests/core/cuda/tnlCudaReductionTester.h b/tests/unit-tests/core/cuda/tnlCudaReductionTester.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a0519885c4b7e1bc7b94e21b0f71b5cc573de5e
--- /dev/null
+++ b/tests/unit-tests/core/cuda/tnlCudaReductionTester.h
@@ -0,0 +1,122 @@
+/***************************************************************************
+                          tnlCudaReductionTester.h  -  description
+                             -------------------
+    begin                : Mar 22, 2013
+    copyright            : (C) 2013 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+
+#ifndef TNLCUDAREDUCTIONTESTER_H_
+#define TNLCUDAREDUCTIONTESTER_H_
+
+#include <cppunit/TestSuite.h>
+#include <cppunit/TestResult.h>
+#include <cppunit/TestCaller.h>
+#include <cppunit/TestCase.h>
+#include <cppunit/Message.h>
+#include <core/cuda/device-check.h>
+#include <implementation/core/cuda-long-vector-kernels.h>
+
+class tnlCudaReductionTester : public CppUnit :: TestCase
+{
+   public:
+   tnlCudaReductionTester(){};
+
+   virtual
+   ~tnlCudaReductionTester(){};
+
+   static CppUnit :: Test* suite()
+   {
+      CppUnit :: TestSuite* suiteOfTests = new CppUnit :: TestSuite( "tnlCudaReductionTester" );
+      CppUnit :: TestResult result;
+
+      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >(
+                                "constantSequenceTest",
+                                &tnlCudaReductionTester :: constantSequenceTest< float > )
+                               );
+
+      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >(
+                                "linearSequenceTest",
+                                &tnlCudaReductionTester :: linearSequenceTest< float > )
+                               );
+
+
+      return suiteOfTests;
+   }
+
+   template< typename RealType >
+   void constantSequenceTest()
+   {
+      const int size( 1024 );
+      RealType *hostData, *deviceData;
+      allocateMemoryHost( hostData, size );
+      allocateMemoryCuda( deviceData, size );
+      CPPUNIT_ASSERT( checkCudaDevice );
+      for( int i = 0; i < size; i ++ )
+         hostData[ i ] = 1;
+      copyMemoryHostToCuda( deviceData, hostData, size );
+      CPPUNIT_ASSERT( checkCudaDevice );
+      RealType result;
+      tnlParallelReductionSum< RealType, int > sumOperation;
+      CPPUNIT_ASSERT( 
+         ( tnlCUDALongVectorReduction( sumOperation, size, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == size );
+
+      tnlParallelReductionMin< RealType, int > minOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( minOperation, size, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 1 );
+
+
+      freeMemoryHost( hostData );
+      freeMemoryCuda( deviceData );
+      CPPUNIT_ASSERT( checkCudaDevice );
+   };
+
+   template< typename RealType >
+   void linearSequenceTest()
+   {
+      const int size( 1024 );
+      RealType *hostData, *deviceData;
+      allocateMemoryHost( hostData, size );
+      allocateMemoryCuda( deviceData, size );
+      CPPUNIT_ASSERT( checkCudaDevice );
+
+      RealType sum( 0.0 );
+      for( int i = 0; i < size; i ++ )
+      {
+         hostData[ i ] = i + 1;
+         sum += hostData[ i ];
+      }
+      copyMemoryHostToCuda( deviceData, hostData, size );
+      CPPUNIT_ASSERT( checkCudaDevice );
+      tnlParallelReductionSum< RealType, int > sumOperation;
+      RealType result;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( sumOperation, size, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == sum );
+      tnlParallelReductionMin< RealType, int > minOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( minOperation, size, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 1 );
+
+
+      freeMemoryHost( hostData );
+      freeMemoryCuda( deviceData );
+      CPPUNIT_ASSERT( checkCudaDevice );
+   };
+
+};
+
+
+#endif /* TNLCUDAREDUCTIONTESTER_H_ */