From 8b4397ceff15498a75d2457f2904f9fd8c788d1d Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Sat, 23 Mar 2013 15:57:42 +0100
Subject: [PATCH] Refactoring the reduction on the CUDA device.

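Each reduction operation (sum, min, max, abs-sum, abs-min, abs-max) now
lives in its own class (tnlParallelReductionSum, tnlParallelReductionMin,
tnlParallelReductionMax, tnlParallelReductionAbsSum,
tnlParallelReductionAbsMin, tnlParallelReductionAbsMax) instead of being
selected through the tnlTupleOperation enum. Each class provides
initialValueOnHost/reduceOnHost for the final reduction on the host and
initialValueOnDevice/firstReductionOnDevice/commonReductionOnDevice for
the CUDA kernel. The kernel launch itself is factored out of
tnlCUDALongVectorReduction into reduceOnCudaDevice, which is called
repeatedly, swapping two auxiliary device buffers, until the partial
result fits into maxGPUReductionDataSize and is finished on the host.

A minimal usage sketch, mirroring the new unit tests (the helper
sumOnDevice, the include paths and the bare cerr/endl are illustrative
assumptions, not part of this patch):

    #include <core/cuda/reduction-operations.h>
    #include <implementation/core/cuda-long-vector-kernels.h>

    // Sum a vector that already resides on the CUDA device.
    float sumOnDevice( const float* deviceData, const int size )
    {
       tnlParallelReductionSum< float, int > sumOperation;
       float result( 0.0 );
       if( ! tnlCUDALongVectorReduction( sumOperation, size, deviceData,
                                         ( float* ) 0, result ) )
          cerr << "Reduction on the CUDA device failed." << endl;
       return result;
    }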
---
 buildAll                                      |   4 +-
 src/config/CMakeLists.txt                     |   2 +-
 src/core/CMakeLists.txt                       |   3 +-
 src/core/cuda/CMakeLists.txt                  |   5 +-
 src/core/cuda/reduction-operations.h          | 313 +++++++++++-
 src/implementation/core/CMakeLists.txt        |   2 +-
 .../core/cuda-long-vector-kernels.h           | 482 ++++++------------
 src/implementation/core/cuda/CMakeLists.txt   |   2 +-
 src/implementation/core/memory-operations.h   |  18 +-
 src/legacy/core/CMakeLists.txt                |  13 +-
 .../core/cuda/tnlCudaReductionTester.h        | 129 ++++-
 11 files changed, 597 insertions(+), 376 deletions(-)

diff --git a/buildAll b/buildAll
index 3318eab029..d553862bd5 100755
--- a/buildAll
+++ b/buildAll
@@ -25,10 +25,10 @@ cd Debug
 ${CMAKE} .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_PREFIX=${HOME}/local -DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} -DWITH_CUDA=${WITH_CUDA} -DWITH_CUSPARSE=${WITH_CUSPARSE} -DPETSC_DIR=${PETSC_DIR}
 make -j${CPUS} #VERBOSE=1
 make -j${CPUS} test
-#make -j${CPUS} install
+make -j${CPUS} install
 
 cd ../Release
 ${CMAKE} .. -DCMAKE_INSTALL_PREFIX=${HOME}/local -DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} -DWITH_CUDA=${WITH_CUDA} -DWITH_CUSPARSE=${WITH_CUSPARSE} -DPETSC_DIR=${PETSC_DIR}
 make -j${CPUS} #VERBOSE=1
 make -j${CPUS} test
-#make -j${CPUS} install
+make -j${CPUS} install
diff --git a/src/config/CMakeLists.txt b/src/config/CMakeLists.txt
index f745e98a4c..a1fdbf9582 100755
--- a/src/config/CMakeLists.txt
+++ b/src/config/CMakeLists.txt
@@ -1,5 +1,5 @@
 SET( EXTRA_DIST tnlConfigDescriptionParser.y 
-	            tnlConfigDescriptionScanner.l 	            
+	             tnlConfigDescriptionScanner.l 	            
     )
 
 SET( headers tnlConfigDescription.h     	    
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index badb8ca45a..04f786f263 100755
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -23,8 +23,7 @@ set (headers tnlArray.h
    		    tnlReal.h
    		    tnlTimerCPU.h  
    		    tnlTimerRT.h  
-   		    tnlTuple.h 
-   	       tnlCudaSupport.h 
+   		    tnlTuple.h  
    		    mfilename.h 
    		    mfuncs.h 
    		    mpi-supp.h 
diff --git a/src/core/cuda/CMakeLists.txt b/src/core/cuda/CMakeLists.txt
index af2301ff76..3290b863cf 100755
--- a/src/core/cuda/CMakeLists.txt
+++ b/src/core/cuda/CMakeLists.txt
@@ -1,2 +1,5 @@
 set( headers device-check.h
-    )
+             reduction.h
+             reduction-operations.h )
+
+INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/core/cuda )
\ No newline at end of file
diff --git a/src/core/cuda/reduction-operations.h b/src/core/cuda/reduction-operations.h
index de7488f586..9a6d8a0c4a 100644
--- a/src/core/cuda/reduction-operations.h
+++ b/src/core/cuda/reduction-operations.h
@@ -22,11 +22,7 @@
 #include <cuda.h>
 #include <core/mfuncs.h>
 
-enum tnlTupleOperation {  tnlParallelReductionMax,
-                          tnlParallelReductionAbsMin,
-                          tnlParallelReductionAbsMax,
-                          tnlParallelReductionAbsSum,
-                          tnlParallelReductionLpNorm,
+enum tnlTupleOperation {  tnlParallelReductionLpNorm,
                           tnlParallelReductionSdot };
 
 
@@ -109,20 +105,159 @@ class tnlParallelReductionSum
 
    typedef Real RealType;
    typedef Index IndexType;
+   typedef Real ResultType;
 
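+   /****
+    * initialValueOnHost() and reduceOnHost() perform the final reduction
+    * of the partial results on the host. The *OnDevice() methods are
+    * called from the CUDA kernel: initialValueOnDevice() reads one or two
+    * input elements, firstReductionOnDevice() combines further input
+    * elements with data already stored in the shared memory and
+    * commonReductionOnDevice() merges two entries of the shared memory.
+    */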
+   ResultType initialValueOnHost( const IndexType idx,
+                                  const RealType* data1,
+                                  const RealType* data2 ) const
+   {
+      return data1[ idx ];
+   };
+
+   ResultType reduceOnHost( const IndexType idx,
+                            const ResultType& current,
+                            const RealType* data1,
+                            const RealType* data2 ) const
+   {
+      return current + data1[ idx ];
+   };
+
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
+   {
+      return data1[ idx1 ] + data1[ idx2 ];
+   };
+
+   __device__ ResultType initialValueOnDevice( const IndexType idx1,
+                                               const RealType* data1,
+                                               const RealType* data2 ) const
+   {
+      return data1[ idx1 ];
+   };
+
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const IndexType idx3,
+                                                 const RealType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
+   {
+      return data1[ idx1 ] + data2[ idx2 ] + data2[ idx3 ];
+   };
+
+   __device__ ResultType firstReductionOnDevice( const IndexType idx1,
+                                                 const IndexType idx2,
+                                                 const RealType* data1,
+                                                 const RealType* data2,
+                                                 const RealType* data3 ) const
+   {
+      return data1[ idx1 ] + data2[ idx2 ];
+   };
+
+   __device__ ResultType commonReductionOnDevice( const IndexType idx1,
+                                                  const IndexType idx2,
+                                                  const ResultType* data ) const
+   {
+      return data[ idx1 ] + data[ idx2 ];
+   };
+};
+
+template< typename Real, typename Index >
+class tnlParallelReductionMin
+{
+   public:
+
+   typedef Real RealType;
+   typedef Index IndexType;
+
+   RealType initialValueOnHost( const IndexType idx,
+                                const RealType* data1,
+                                const RealType* data2 ) const
+   {
+      return data1[ idx ];
+   };
 
-   RealType reduceOnHost( const RealType& data1,
-                          const RealType& data2 ) const
+   RealType reduceOnHost( const IndexType idx,
+                          const RealType& current,
+                          const RealType* data1,
+                          const RealType* data2 ) const
    {
-      return data1 + data2;
+      return Min( current, data1[ idx ] );
+   };
+
+   __device__ RealType initialValueOnDevice( const IndexType idx1,
+                                             const IndexType idx2,
+                                             const RealType* data1,
+                                             const RealType* data2 ) const
+   {
+      return tnlCudaMin( data1[ idx1 ], data1[ idx2 ] );
    }
 
+   __device__ RealType initialValueOnDevice( const IndexType idx1,
+                                             const RealType* data1,
+                                             const RealType* data2 ) const
+   {
+      return data1[ idx1 ];
+   };
+
+   __device__ RealType firstReductionOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const IndexType idx3,
+                                               const RealType* data1,
+                                               const RealType* data2,
+                                               const RealType* data3 ) const
+   {
+      return tnlCudaMin( data1[ idx1 ], tnlCudaMin( data2[ idx2 ], data2[ idx3 ] ) );
+   };
+
+   __device__ RealType firstReductionOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const RealType* data1,
+                                               const RealType* data2,
+                                               const RealType* data3 ) const
+   {
+      return tnlCudaMin( data1[ idx1 ], data2[ idx2 ] );
+   };
+
+   __device__ RealType commonReductionOnDevice( const IndexType idx1,
+                                                const IndexType idx2,
+                                                const RealType* data ) const
+   {
+      return tnlCudaMin( data[ idx1 ], data[ idx2 ] );
+   };
+};
+
+template< typename Real, typename Index >
+class tnlParallelReductionMax
+{
+   public:
+
+   typedef Real RealType;
+   typedef Index IndexType;
+
+   RealType initialValueOnHost( const IndexType idx,
+                                const RealType* data1,
+                                const RealType* data2 ) const
+   {
+      return data1[ idx ];
+   };
+
+   RealType reduceOnHost( const IndexType idx,
+                          const RealType& current,
+                          const RealType* data1,
+                          const RealType* data2 ) const
+   {
+      return Max( current, data1[ idx ] );
+   };
+
    __device__ RealType initialValueOnDevice( const IndexType idx1,
                                              const IndexType idx2,
                                              const RealType* data1,
                                              const RealType* data2 ) const
    {
-      return data1[ idx1 ] + data1[ idx2 ];
+      return tnlCudaMax( data1[ idx1 ], data1[ idx2 ] );
    }
 
    __device__ RealType initialValueOnDevice( const IndexType idx1,
@@ -139,7 +274,7 @@ class tnlParallelReductionSum
                                                const RealType* data2,
                                                const RealType* data3 ) const
    {
-      return data1[ idx1 ] + data2[ idx2 ] + data2[ idx3 ];
+      return tnlCudaMax( data1[ idx1 ], tnlCudaMax( data2[ idx2 ], data2[ idx3 ] ) );
    };
 
    __device__ RealType firstReductionOnDevice( const IndexType idx1,
@@ -148,7 +283,72 @@ class tnlParallelReductionSum
                                                const RealType* data2,
                                                const RealType* data3 ) const
    {
-      return data1[ idx1 ] + data2[ idx2 ];
+      return tnlCudaMax( data1[ idx1 ], data2[ idx2 ] );
+   };
+
+   __device__ RealType commonReductionOnDevice( const IndexType idx1,
+                                                const IndexType idx2,
+                                                const RealType* data ) const
+   {
+      return tnlCudaMax( data[ idx1 ], data[ idx2 ] );
+   };
+};
+
+template< typename Real, typename Index >
+class tnlParallelReductionAbsSum
+{
+   public:
+
+   typedef Real RealType;
+   typedef Index IndexType;
+
+   RealType initialValueOnHost( const IndexType idx,
+                                const RealType* data1,
+                                const RealType* data2 ) const
+   {
+      return tnlAbs( data1[ idx ] );
+   };
+
+   RealType reduceOnHost( const IndexType idx,
+                          const RealType& current,
+                          const RealType* data1,
+                          const RealType* data2 ) const
+   {
+      return current + tnlAbs( data1[ idx ] );
+   };
+
+   __device__ RealType initialValueOnDevice( const IndexType idx1,
+                                             const IndexType idx2,
+                                             const RealType* data1,
+                                             const RealType* data2 ) const
+   {
+      return tnlCudaAbs( data1[ idx1 ] ) + tnlCudaAbs( data1[ idx2 ] );
+   };
+
+   __device__ RealType initialValueOnDevice( const IndexType idx1,
+                                             const RealType* data1,
+                                             const RealType* data2 ) const
+   {
+      return tnlCudaAbs( data1[ idx1 ] );
+   };
+
+   __device__ RealType firstReductionOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const IndexType idx3,
+                                               const RealType* data1,
+                                               const RealType* data2,
+                                               const RealType* data3 ) const
+   {
+      return data1[ idx1 ] + tnlCudaAbs( data2[ idx2 ] ) + tnlCudaAbs( data2[ idx3 ] );
+   };
+
+   __device__ RealType firstReductionOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const RealType* data1,
+                                               const RealType* data2,
+                                               const RealType* data3 ) const
+   {
+      return data1[ idx1 ] + tnlCudaAbs( data2[ idx2 ] );
    };
 
    __device__ RealType commonReductionOnDevice( const IndexType idx1,
@@ -160,33 +360,41 @@ class tnlParallelReductionSum
 };
 
 template< typename Real, typename Index >
-class tnlParallelReductionMin
+class tnlParallelReductionAbsMin
 {
    public:
 
    typedef Real RealType;
    typedef Index IndexType;
 
+   RealType initialValueOnHost( const IndexType idx,
+                                const RealType* data1,
+                                const RealType* data2 ) const
+   {
+      return tnlAbs( data1[ idx ] );
+   };
 
-   RealType reduceOnHost( const RealType& data1,
-                          const RealType& data2 ) const
+   RealType reduceOnHost( const IndexType idx,
+                          const RealType& current,
+                          const RealType* data1,
+                          const RealType* data2 ) const
    {
-      return Min( data1, data2 );
-   }
+      return Min( current, tnlAbs( data1[ idx ] ) );
+   };
 
    __device__ RealType initialValueOnDevice( const IndexType idx1,
                                              const IndexType idx2,
                                              const RealType* data1,
                                              const RealType* data2 ) const
    {
-      return tnlCudaMin( data1[ idx1 ], data1[ idx2 ] );
+      return tnlCudaMin( tnlCudaAbs( data1[ idx1 ] ), tnlCudaAbs( data1[ idx2 ] ) );
    }
 
    __device__ RealType initialValueOnDevice( const IndexType idx1,
                                              const RealType* data1,
                                              const RealType* data2 ) const
    {
-      return data1[ idx1 ];
+      return tnlCudaAbs( data1[ idx1 ] );
    };
 
    __device__ RealType firstReductionOnDevice( const IndexType idx1,
@@ -196,7 +404,7 @@ class tnlParallelReductionMin
                                                const RealType* data2,
                                                const RealType* data3 ) const
    {
-      return tnlCudaMin( data1[ idx1 ], tnlCudaMin(  data2[ idx2 ],  data2[ idx3 ] ) );
+      return tnlCudaMin( data1[ idx1 ], tnlCudaMin( tnlCudaAbs( data2[ idx2 ] ), tnlCudaAbs( data2[ idx3 ] ) ) );
    };
 
    __device__ RealType firstReductionOnDevice( const IndexType idx1,
@@ -205,18 +413,81 @@ class tnlParallelReductionMin
                                                const RealType* data2,
                                                const RealType* data3 ) const
    {
-      return tnlCudaMin( data1[ idx1 ], data2[ idx2 ] );
+      return tnlCudaMin( data1[ idx1 ], tnlCudaAbs( data2[ idx2 ] ) );
    };
 
    __device__ RealType commonReductionOnDevice( const IndexType idx1,
                                                 const IndexType idx2,
                                                 const RealType* data ) const
    {
-      return tnlCudaMin( data[ idx1 ], data[ idx2 ] );
+      return tnlCudaMin( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
    };
 };
 
+template< typename Real, typename Index >
+class tnlParallelReductionAbsMax
+{
+   public:
+
+   typedef Real RealType;
+   typedef Index IndexType;
+
+   RealType initialValueOnHost( const IndexType idx,
+                                const RealType* data1,
+                                const RealType* data2 ) const
+   {
+      return tnlAbs( data1[ idx ] );
+   };
+
+   RealType reduceOnHost( const IndexType idx,
+                          const RealType& current,
+                          const RealType* data1,
+                          const RealType* data2 ) const
+   {
+      return Max( current, tnlAbs( data1[ idx ] ) );
+   };
 
+   __device__ RealType initialValueOnDevice( const IndexType idx1,
+                                             const IndexType idx2,
+                                             const RealType* data1,
+                                             const RealType* data2 ) const
+   {
+      return tnlCudaMax( tnlCudaAbs( data1[ idx1 ] ), tnlCudaAbs( data1[ idx2 ] ) );
+   }
+
+   __device__ RealType initialValueOnDevice( const IndexType idx1,
+                                             const RealType* data1,
+                                             const RealType* data2 ) const
+   {
+      return tnlCudaAbs( data1[ idx1 ] );
+   };
+
+   __device__ RealType firstReductionOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const IndexType idx3,
+                                               const RealType* data1,
+                                               const RealType* data2,
+                                               const RealType* data3 ) const
+   {
+      return tnlCudaMax( data1[ idx1 ], tnlCudaMax( tnlCudaAbs( data2[ idx2 ] ), tnlCudaAbs( data2[ idx3 ] ) ) );
+   };
+
+   __device__ RealType firstReductionOnDevice( const IndexType idx1,
+                                               const IndexType idx2,
+                                               const RealType* data1,
+                                               const RealType* data2,
+                                               const RealType* data3 ) const
+   {
+      return tnlCudaMax( data1[ idx1 ], tnlCudaAbs( data2[ idx2 ] ) );
+   };
+
+   __device__ RealType commonReductionOnDevice( const IndexType idx1,
+                                                const IndexType idx2,
+                                                const RealType* data ) const
+   {
+      return tnlCudaMax( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
+   };
+};
 
 
 #include <implementation/core/cuda/reduction-operations_impl.h>
diff --git a/src/implementation/core/CMakeLists.txt b/src/implementation/core/CMakeLists.txt
index 10e9934615..d137f42ade 100755
--- a/src/implementation/core/CMakeLists.txt
+++ b/src/implementation/core/CMakeLists.txt
@@ -2,7 +2,7 @@ ADD_SUBDIRECTORY( cuda )
 
 SET( headers cuda-long-vector-kernels.h
              vector-operations.h
-             memory-functions.h
+             memory-operations.h
              tnlArray_impl.h
              tnlHost_impl.h
              tnlLogger_impl.h
diff --git a/src/implementation/core/cuda-long-vector-kernels.h b/src/implementation/core/cuda-long-vector-kernels.h
index c9f49847ff..41c2cc8039 100644
--- a/src/implementation/core/cuda-long-vector-kernels.h
+++ b/src/implementation/core/cuda-long-vector-kernels.h
@@ -52,13 +52,7 @@ __device__ void reduceAligned( const Operation& operation,
    if( tid < s )
    {
       sdata[ tid ] = operation. commonReductionOnDevice( tid, tid + s, sdata );
-      /*if( operation == tnlParallelReductionMin )
-         sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
-      if( operation == tnlParallelReductionMax )
-         sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
-      if( operation == tnlParallelReductionSum )
-         sdata[ tid ] += sdata[ tid + s ];
-      if( operation == tnlParallelReductionAbsMin )
+      /*if( operation == tnlParallelReductionAbsMin )
          sdata[ tid ] = tnlCudaMin( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) );
       if( operation == tnlParallelReductionAbsMax )
          sdata[ tid ] = tnlCudaMax( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) );
@@ -84,13 +78,7 @@ __device__ void reduceNonAligned( const Operation& operation,
    if( tid < s )
    {
       sdata[ tid ] = operation. commonReductionOnDevice( tid, tid + s, sdata );
-      /*if( operation == tnlParallelReductionMin )
-         sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
-      if( operation == tnlParallelReductionMax )
-         sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
-      if( operation == tnlParallelReductionSum )
-         sdata[ tid ] += sdata[ tid + s ];
-      if( operation == tnlParallelReductionAbsMin )
+      /*if( operation == tnlParallelReductionAbsMin )
          sdata[ tid ] = tnlCudaMin( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) );
       if( operation == tnlParallelReductionAbsMax )
          sdata[ tid ] = tnlCudaMax( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) );
@@ -106,13 +94,7 @@ __device__ void reduceNonAligned( const Operation& operation,
    if( 2 * s < n && tid == n - 1 )
    {
       sdata[ 0 ] = operation. commonReductionOnDevice( 0, tid, sdata );
-      /*if( operation == tnlParallelReductionMin )
-         sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] );
-      if( operation == tnlParallelReductionMax )
-         sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
-      if( operation == tnlParallelReductionSum )
-         sdata[ 0 ] += sdata[ tid ];
-      if( operation == tnlParallelReductionAbsMin )
+      /*if( operation == tnlParallelReductionAbsMin )
          sdata[ 0 ] = tnlCudaMin( tnlCudaAbs( sdata[ 0] ), tnlCudaAbs( sdata[ tid + s ] ) );
       if( operation == tnlParallelReductionAbsMax )
          sdata[ 0 ] = tnlCudaMax( tnlCudaAbs( sdata[ 0 ] ), tnlCudaAbs( sdata[ tid + s ] ) );
@@ -357,7 +339,6 @@ __global__ void tnlCUDAReductionKernel( const Operation operation,
          reduceNonAligned( operation, tid, s, n, sdata );
          n = s;
       }
-
    }
 
    /***
@@ -367,275 +348,160 @@ __global__ void tnlCUDAReductionKernel( const Operation operation,
       deviceOutput[ blockIdx. x ] = sdata[ 0 ];
 }
 
-#endif
-/***
- * The template calling the final CUDA kernel for the single vector reduction.
- * The template parameters are:
- * @param T is the type of data we want to reduce
- * @param operation is the operation reducing the data.
- *        It can be tnlParallelReductionSum, tnlParallelReductionMin or tnlParallelReductionMax.
- * The function parameters:
- * @param size tells number of elements in the data array.
- * @param deviceInput1 is the pointer to an array storing the data we want
- *        to reduce. This array must stay on the device!.
- * @param deviceInput2 is the pointer to an array storing the coupling data for example
- *        the second vector for the SDOT operation. This array must stay on the device!.
- * @param result will contain the result of the reduction if everything was ok
- *        and the return code is true.
- * @param parameter can be used for example for the passing the parameter p of Lp norm.
- * @param deviceAux is auxiliary array used to store temporary data during the reduction.
- *        If one calls this function more then once one might provide this array to avoid repetetive
- *        allocation of this array on the device inside of this function.
- *        The size of this array should be size / 128 * sizeof( T ).
- */
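+/***
+ * Performs one pass of the parallel reduction on the CUDA device.
+ * If the output array is not allocated yet ( output == 0 ), it is
+ * allocated on the device. The function returns the number of partial
+ * results written into the output array, i.e. the grid size of the
+ * launched kernel.
+ */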
 template< typename Operation >
-bool tnlCUDALongVectorReduction( const Operation& operation,
-                                 const typename Operation :: IndexType size,
-                                 const typename Operation :: RealType* deviceInput1,
-                                 const typename Operation :: RealType* deviceInput2,
-                                 typename Operation :: RealType& result,
-                                 typename Operation :: RealType* deviceAux = 0 )
+typename Operation :: IndexType reduceOnCudaDevice( const Operation& operation,
+                                                    const typename Operation :: IndexType size,
+                                                    const typename Operation :: RealType* input1,
+                                                    const typename Operation :: RealType* input2,
+                                                    typename Operation :: RealType*& output )
 {
-#ifdef HAVE_CUDA
-
    typedef typename Operation :: IndexType IndexType;
    typedef typename Operation :: RealType RealType;
-   /***
-    * Set parameters:
-    * @param desBlockSize is desired block size with which we get the best performance (on CUDA rach 1.0 to 1.3)
-    * @param desGridSize is desired grid size
-    */
+
    const int desBlockSize = 512;
    const int desGridSize = 2048;
-
-   RealType* dbg_array1; // debuging array
+   dim3 blockSize( 0 ), gridSize( 0 );
 
    /***
-    * Allocating auxiliary device memory to store temporary reduced arrays.
-    * For example in the first iteration we reduce the number of elements
-    * from size to size / 2. We store this new data in deviceAux array.
-    * If one calls the CUDA reduction more then once then one can provide
-    * auxiliary array by passing it via the parameter deviceAux.
+    * Compute the CUDA block size aligned to the power of two.
     */
-   RealType* myDeviceAux( 0 );
-   if( ! deviceAux )
-   {
-      if( ! allocateMemoryCuda( myDeviceAux, :: Max( 1, size / desBlockSize ) ) )
-      {
-         checkCudaDevice;
+   blockSize. x = :: Min( size, desBlockSize );
+   IndexType alignedBlockSize = 1;
+   while( alignedBlockSize < blockSize. x ) alignedBlockSize <<= 1;
+   blockSize. x = alignedBlockSize;
+
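+   /****
+    * Each thread reads two input elements in the first step of the
+    * reduction, therefore the grid needs to cover only half of the input.
+    */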
+   gridSize. x = :: Min( ( IndexType ) ( size / blockSize. x + 1 ) / 2, desGridSize );
+
+   if( ! output &&
+       ! allocateMemoryCuda( output, :: Max( 1, size / desBlockSize ) ) )
          return false;
-      }
-      deviceAux = myDeviceAux;
-   }
 
+   IndexType shmem = blockSize. x * sizeof( RealType );
    /***
-    * Setup parameters of the kernel:
-    * @param sizeReduced is the size of reduced data after each step of parallel reduction
-    * @param reductionInput tells what data we shell reduce. We start with the input if this fuction
-    *                       and after the 1st reduction step we switch this pointer to deviceAux.
+    * Depending on the blockSize we generate appropriate template instance.
     */
-   IndexType sizeReduced = size;
-   const RealType* reductionInput1 = deviceInput1;
-   const RealType* reductionInput2 = deviceInput2;
-   IndexType reductionSteps( 0 );
-   while( sizeReduced > maxGPUReductionDataSize )
-   {
-      dim3 blockSize( 0 ), gridSize( 0 );
-      blockSize. x = :: Min( sizeReduced, desBlockSize );
-      gridSize. x = :: Min( ( IndexType ) ( sizeReduced / blockSize. x + 1 ) / 2, desGridSize );
-
-      /***
-       * We align the blockSize to the power of 2.
-       */
-      IndexType alignedBlockSize = 1;
-      while( alignedBlockSize < blockSize. x ) alignedBlockSize <<= 1;
-      blockSize. x = alignedBlockSize;
-      IndexType shmem = blockSize. x * sizeof( RealType );
-      /***
-       * Depending on the blockSize we generate appropriate template instance.
-       */
-#ifdef UNDEF
-      if( reductionSteps > 0 &&
-          ( operation == tnlParallelReductionSdot ||
-            operation == tnlParallelReductionLpNorm ) )
+      switch( blockSize. x )
       {
-         /***
-          * For operations like SDOT or LpNorm we need to switch to tnlParallelReductionSum after the
-          * first reduction step.
-          */
-         switch( blockSize. x )
-         {
-            case 512:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum, 512 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case 256:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum, 256 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case 128:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum, 128 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case  64:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum,  64 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case  32:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum,  32 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case  16:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum,  16 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case   8:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum,   8 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case   4:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum,   4 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case   2:
-               tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum,   2 >
-               <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 );
-               break;
-            case   1:
-               tnlAssert( false, cerr << "blockSize should not be 1." << endl );
-               break;
-            default:
-               tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
-               break;
-         }
+         case 512:
+            tnlCUDAReductionKernel< Operation, 512 >
+            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
+            break;
+         case 256:
+            tnlCUDAReductionKernel< Operation, 256 >
+            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
+            break;
+         case 128:
+            tnlCUDAReductionKernel< Operation, 128 >
+            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
+            break;
+         case  64:
+            tnlCUDAReductionKernel< Operation,  64 >
+            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
+            break;
+         case  32:
+            tnlCUDAReductionKernel< Operation,  32 >
+            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
+            break;
+         case  16:
+            tnlCUDAReductionKernel< Operation,  16 >
+            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
+            break;
+         case   8:
+            tnlCUDAReductionKernel< Operation,   8 >
+            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
+            break;
+         case   4:
+            tnlCUDAReductionKernel< Operation,   4 >
+            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
+            break;
+         case   2:
+            tnlCUDAReductionKernel< Operation,   2 >
+            <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
+            break;
+         case   1:
+            tnlAssert( false, cerr << "blockSize should not be 1." << endl );
+            break;
+         default:
+            tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
+            break;
       }
-      else
+   return gridSize. x;
+}
 #endif
-         switch( blockSize. x )
-         {
-            case 512:
-               tnlCUDAReductionKernel< Operation, 512 >
-               <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
-               break;
-            case 256:
-               tnlCUDAReductionKernel< Operation, 256 >
-               <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
-               break;
-            case 128:
-               tnlCUDAReductionKernel< Operation, 128 >
-               <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
-               break;
-            case  64:
-               tnlCUDAReductionKernel< Operation,  64 >
-               <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
-               break;
-            case  32:
-               tnlCUDAReductionKernel< Operation,  32 >
-               <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
-               break;
-            case  16:
-               tnlCUDAReductionKernel< Operation,  16 >
-               <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
-               break;
-            case   8:
-               tnlCUDAReductionKernel< Operation,   8 >
-               <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
-               break;
-            case   4:
-               tnlCUDAReductionKernel< Operation,   4 >
-               <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
-               break;
-            case   2:
-               tnlCUDAReductionKernel< Operation,   2 >
-               <<< gridSize, blockSize, shmem >>>( operation, sizeReduced, reductionInput1, reductionInput2, deviceAux );
-               break;
-            case   1:
-               tnlAssert( false, cerr << "blockSize should not be 1." << endl );
-               break;
-            default:
-               tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
-               break;
-         }
-      sizeReduced = gridSize. x;
-      reductionInput1 = deviceAux;
-      reductionSteps ++;
-   }
 
-   /***
-    * We transfer reduced data from device to host.
-    * If sizeReduced equals size the previous loop was not processed and we read
-    * data directly from the input.
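+/***
+ * Reduces the input array(s) on the CUDA device using the given reduction
+ * operation and returns the result of the reduction to the host. Small
+ * inputs are copied to the host and reduced there directly. Returns false
+ * if the CUDA support is missing or if a device operation fails.
+ */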
+template< typename Operation >
+bool tnlCUDALongVectorReduction( const Operation& operation,
+                                 const typename Operation :: IndexType size,
+                                 const typename Operation :: RealType* deviceInput1,
+                                 const typename Operation :: RealType* deviceInput2,
+                                 typename Operation :: RealType& result )
+{
+#ifdef HAVE_CUDA
+
+   typedef typename Operation :: IndexType IndexType;
+   typedef typename Operation :: RealType RealType;
+
+   /****
+    * First check whether the input array(s) are large enough to be worth
+    * reducing on the GPU. If not, copy them to the host and reduce on the CPU.
     */
-   RealType result_array[ maxGPUReductionDataSize ];
-   RealType result_array2[ maxGPUReductionDataSize ];
-   if( sizeReduced == size )
+   RealType hostArray1[ maxGPUReductionDataSize ];
+   RealType hostArray2[ maxGPUReductionDataSize ];
+   if( size <= maxGPUReductionDataSize )
    {
-      if( ! copyMemoryCudaToHost( result_array, deviceInput1, sizeReduced ) )
+      if( ! copyMemoryCudaToHost( hostArray1, deviceInput1, size ) )
          return false;
-#ifdef UNDEF
-      switch( operation )
-      {
-         case tnlParallelReductionLpNorm:
-            result = pow( tnlAbs( result_array[ 0 ] ), parameter );
-            for( Index i = 1; i < sizeReduced; i ++ )
-               result += pow( tnlAbs( result_array[ i ] ), parameter );
-            result = pow( result, 1.0/ parameter );
-            return true;
-         case tnlParallelReductionSdot:
-            if( ! copyMemoryCudaToHost( result_array2, deviceInput2, sizeReduced ) )
-               return false;
-            else
-            {
-               result = 0;
-               for( Index i = 0; i < sizeReduced; i ++ )
-                  result += result_array[ i ] * result_array2[ i ] ;
-               return true;
-            }
-      }
-#endif
-   }
-   else
-      if( ! copyMemoryCudaToHost( result_array, deviceAux, sizeReduced ) )
+      if( deviceInput2 && ! copyMemoryCudaToHost( hostArray2, deviceInput2, size ) )
          return false;
-   result = result_array[ 0 ];
-   for( IndexType i = 1; i < sizeReduced; i ++ )
-      result = operation. reduceOnHost( result, result_array[ i ] );
-   /*switch( operation )
-   {
-      case tnlParallelReductionMax:
-         result = result_array[ 0 ];
-         for( Index i = 1; i < sizeReduced; i ++ )
-            result = Max( result, result_array[ i ] );
-         break;
-      case tnlParallelReductionMin:
-         result = result_array[ 0 ];
-         for( Index i = 1; i < sizeReduced; i ++ )
-            result = Min( result, result_array[ i ] );
-         break;
-      case tnlParallelReductionSum:
-      case tnlParallelReductionLpNorm:
-      case tnlParallelReductionSdot:
-         result = result_array[ 0 ];
-         for( Index i = 1; i < sizeReduced; i ++ )
-            result += result_array[ i ];
-         break;
-      case tnlParallelReductionAbsMax:
-         result = tnlAbs( result_array[ 0 ] );
-         for( Index i = 1; i < sizeReduced; i ++ )
-            result = Max( result, tnlAbs( result_array[ i ] ) );
-         break;
-      case tnlParallelReductionAbsMin:
-         result = tnlAbs( result_array[ 0 ] );
-         for( Index i = 1; i < sizeReduced; i ++ )
-            result = Min( result, tnlAbs( result_array[ i ] ) );
-         break;
-   }*/
-   if( myDeviceAux )
+      result = operation. initialValueOnHost( 0, hostArray1, hostArray2 );
+      for( IndexType i = 1; i < size; i ++ )
+         result = operation. reduceOnHost( i, result, hostArray1, hostArray2 );
+      return true;
+   }
+
+   /****
+    * Reduce the data on the CUDA device.
+    */
+   RealType *deviceAux1( 0 ), *deviceAux2( 0 );
+   IndexType reducedSize = reduceOnCudaDevice( operation,
+                                               size,
+                                               deviceInput1,
+                                               deviceInput2,
+                                               deviceAux1 );
+
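+   /****
+    * Iterate the reduction on the device while the partial result is still
+    * too large for the host, alternating between the two auxiliary buffers.
+    */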
+   while( reducedSize > maxGPUReductionDataSize )
    {
-      freeMemoryCuda( myDeviceAux );
-      if( ! checkCudaDevice )
-         return false;
+      reducedSize = reduceOnCudaDevice( operation,
+                                        reducedSize,
+                                        deviceAux1,
+                                        ( RealType* ) 0,
+                                        deviceAux2 );
+      Swap( deviceAux1, deviceAux2 );
    }
+
+   /***
+    * Transfer the reduced data from device to host.
+    */
+   RealType resultArray[ maxGPUReductionDataSize ];
+   if( ! copyMemoryCudaToHost( resultArray, deviceAux1, reducedSize ) )
+      return false;
+
+   /***
+    * Reduce the data on the host system.
+    */
+   result = operation. initialValueOnHost( 0, resultArray, ( RealType* ) 0 );
+   for( IndexType i = 1; i < reducedSize; i ++ )
+      result = operation. reduceOnHost( i, result, resultArray, ( RealType* ) 0 );
+
+   /****
+    * Free the memory allocated on the device.
+    */
+   if( deviceAux1 && ! freeMemoryCuda( deviceAux1 ) )
+      return false;
+   if( deviceAux2 && ! freeMemoryCuda( deviceAux2 ) )
+      return false;
+
    return true;
 #else
    cerr << "I am sorry but CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." << endl;
@@ -643,52 +509,34 @@ bool tnlCUDALongVectorReduction( const Operation& operation,
 #endif
 };
 
-#ifdef HAVE_CUDA
-/***
- * This kernel just compares two vectors element by element. It writes
- * the result of the comparison into array result. This array must be
- * then reduced.
- */
-template< typename Real, typename Index >
-__global__ void compareTwoVectorsElementwise( const Index size,
-                                              const Real* vector1,
-                                              const Real* vector2,
-                                              bool* result )
-{
-   Index gid = blockDim. x * blockIdx. x + threadIdx. x;
-   if( gid < size )
-   {
-      if( vector1[ gid ] == vector2[ gid ] )
-         result[ gid ] = true;
-      else
-         result[ gid ] = false;
-   }
-}
-#endif
 
-/***
- * The template for comparison of two long vectors on the CUDA device.
- * The template parameters are:
- * @param T is the type of data we want to reduce
- * @param operation is the operation reducing the data.
- *        It can be tnlParallelReductionSum, tnlParallelReductionMin or tnlParallelReductionMax.
- * The function parameters:
- * @param size tells number of elements in the data array.
- * @param deviceInput1 is the pointer to an array storing the data we want
- *        to reduce. This array must stay on the device!.
- * @param deviceInput2 is the pointer to an array storing the coupling data for example
- *        the second vector for the SDOT operation. This array most stay on the device!.
- * @param result will contain the result of the reduction if everything was ok
- *        and the return code is true.
- * @param deviceAux is auxiliary array used to store temporary data during the reduction.
- *        If one calls this function more then once one might provide this array to avoid repetetive
- *        allocation of this array on the device inside of this function.
- *        The size of this array should be size / 128 * sizeof( T ).
- *
- * This function first calls kernel which compares each couples of elements from both vectors.
- * Result is written into a bool array. The minimum value then says if both vectors equal.
- *
- */
+
+
 // TODO: result of comparison should not be returned!!!
 template< typename Type, typename Index >
 bool tnlCUDALongVectorComparison( const Index size,
@@ -714,10 +562,10 @@ bool tnlCUDALongVectorComparison( const Index size,
    blockSize. x = 256;
    gridSize. x = size / blockSize. x + 1;
 
-   compareTwoVectorsElementwise<<< gridSize, blockSize >>>( size,
-                                                            deviceInput1,
-                                                            deviceInput2,
-                                                            deviceBoolAux );
+   //compareTwoVectorsElementwise<<< gridSize, blockSize >>>( size,
+   //                                                         deviceInput1,
+   //                                                         deviceInput2,
+   //                                                         deviceBoolAux );
    if( ! checkCudaDevice )
       return false;
    bool result;
diff --git a/src/implementation/core/cuda/CMakeLists.txt b/src/implementation/core/cuda/CMakeLists.txt
index 5e9be45d30..6d119a124f 100755
--- a/src/implementation/core/cuda/CMakeLists.txt
+++ b/src/implementation/core/cuda/CMakeLists.txt
@@ -13,5 +13,5 @@ set( tnl_implementation_core_cuda_SOURCES
      ${CURRENT_DIR}/device-check.cpp
      PARENT_SCOPE )
 
-INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/implementation )
+INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/implementation/core/cuda )
 
diff --git a/src/implementation/core/memory-operations.h b/src/implementation/core/memory-operations.h
index 6a645b7560..759ca067c4 100644
--- a/src/implementation/core/memory-operations.h
+++ b/src/implementation/core/memory-operations.h
@@ -129,15 +129,16 @@ bool copyMemoryHostToCuda( Element* destination,
                            const Index size )
 {
 #ifdef HAVE_CUDA
-   if( cudaMemcpy( destination,
-                   source,
-                   size * sizeof( Element ),
-                   cudaMemcpyHostToDevice ) != cudaSuccess )
+   cudaMemcpy( destination,
+               source,
+               size * sizeof( Element ),
+               cudaMemcpyHostToDevice );
+   if( ! checkCudaDevice )
    {
       cerr << "Transfer of data from host to CUDA device failed." << endl;
       return false;
    }
-   return checkCudaDevice;
+   return true;
 #else
    cerr << "CUDA support is missing in this system." << endl;
    return false;
@@ -155,7 +156,12 @@ bool copyMemoryCudaToHost( Element* destination,
                source,
                size * sizeof( Element ),
                cudaMemcpyDeviceToHost );
-   return checkCudaDevice;
+   if( ! checkCudaDevice )
+   {
+      cerr << "Transfer of data from CUDA device to host failed." << endl;
+      return false;
+   }
+   return true;
 #else
    cerr << "CUDA support is missing in this system." << endl;
    return false;
diff --git a/src/legacy/core/CMakeLists.txt b/src/legacy/core/CMakeLists.txt
index 3a8d36c743..247d104fad 100755
--- a/src/legacy/core/CMakeLists.txt
+++ b/src/legacy/core/CMakeLists.txt
@@ -1,13 +1,4 @@
-SET( headers tnlBICGSolver.h
-             tnlBICGStabSolver.h
-             tnlCGSolver.h
-             tnlGMRESSolverOld.h
-             tnlILUPreconditioner.h
-             tnlMatrixSolver.h             
-             tnlPETSCPreconditioner.h
-             tnlPETSCSolver.h
-             tnlPreconditioner.h
-             tnlSORSolver.h
+SET( headers tnlCudaSupport.h
    )
    
-INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/legacy/solvers )
+INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/legacy/core )
diff --git a/tests/unit-tests/core/cuda/tnlCudaReductionTester.h b/tests/unit-tests/core/cuda/tnlCudaReductionTester.h
index 8a0519885c..ee8dbd83a5 100644
--- a/tests/unit-tests/core/cuda/tnlCudaReductionTester.h
+++ b/tests/unit-tests/core/cuda/tnlCudaReductionTester.h
@@ -41,10 +41,16 @@ class tnlCudaReductionTester : public CppUnit :: TestCase
       CppUnit :: TestResult result;
 
       suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >(
-                                "constantSequenceTest",
-                                &tnlCudaReductionTester :: constantSequenceTest< float > )
+                                "shortConstantSequenceTest",
+                                &tnlCudaReductionTester :: shortConstantSequenceTest< float > )
                                );
 
+      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >(
+                                "longConstantSequenceTest",
+                                &tnlCudaReductionTester :: longConstantSequenceTest< float > )
+                               );
+
       suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >(
                                 "linearSequenceTest",
                                 &tnlCudaReductionTester :: linearSequenceTest< float > )
@@ -55,28 +61,106 @@ class tnlCudaReductionTester : public CppUnit :: TestCase
    }
 
    template< typename RealType >
-   void constantSequenceTest()
+   void setConstantSequence( const int size,
+                             const RealType& value,
+                             RealType*& hostData,
+                             RealType*& deviceData )
    {
-      const int size( 1024 );
-      RealType *hostData, *deviceData;
-      allocateMemoryHost( hostData, size );
-      allocateMemoryCuda( deviceData, size );
-      CPPUNIT_ASSERT( checkCudaDevice );
       for( int i = 0; i < size; i ++ )
-         hostData[ i ] = 1;
+         hostData[ i ] = value;
       copyMemoryHostToCuda( deviceData, hostData, size );
       CPPUNIT_ASSERT( checkCudaDevice );
+   }
+
+   template< typename RealType >
+   void shortConstantSequenceTest()
+   {
+      const int shortSequence( 128 );
+      RealType *hostData, *deviceData;
+      allocateMemoryHost( hostData, shortSequence );
+      allocateMemoryCuda( deviceData, shortSequence );
+      CPPUNIT_ASSERT( checkCudaDevice );
+
+      setConstantSequence( shortSequence, ( RealType ) -1, hostData, deviceData );
+      RealType result;
+
+      tnlParallelReductionSum< RealType, int > sumOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( sumOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == -shortSequence );
+
+      tnlParallelReductionMin< RealType, int > minOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( minOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == -1 );
+
+      tnlParallelReductionMax< RealType, int > maxOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( maxOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == -1 );
+
+      tnlParallelReductionAbsSum< RealType, int > absSumOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( absSumOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == shortSequence );
+
+      tnlParallelReductionAbsMin< RealType, int > absMinOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( absMinOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 1 );
+
+      tnlParallelReductionAbsMax< RealType, int > absMaxOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( absMaxOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 1 );
+
+      freeMemoryHost( hostData );
+      freeMemoryCuda( deviceData );
+      CPPUNIT_ASSERT( checkCudaDevice );
+   };
+
+   template< typename RealType >
+   void longConstantSequenceTest()
+   {
+      const int longSequence( 8192 );
+      RealType *hostData, *deviceData;
+      allocateMemoryHost( hostData, longSequence );
+      allocateMemoryCuda( deviceData, longSequence );
+      CPPUNIT_ASSERT( checkCudaDevice );
+
+      setConstantSequence( longSequence, ( RealType ) -1, hostData, deviceData );
       RealType result;
+
       tnlParallelReductionSum< RealType, int > sumOperation;
       CPPUNIT_ASSERT( 
-         ( tnlCUDALongVectorReduction( sumOperation, size, deviceData, ( RealType* ) 0, result ) ) );
-      CPPUNIT_ASSERT( result == size );
+         ( tnlCUDALongVectorReduction( sumOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == -longSequence );
 
       tnlParallelReductionMin< RealType, int > minOperation;
       CPPUNIT_ASSERT(
-         ( tnlCUDALongVectorReduction( minOperation, size, deviceData, ( RealType* ) 0, result ) ) );
+         ( tnlCUDALongVectorReduction( minOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == -1 );
+
+      tnlParallelReductionMax< RealType, int > maxOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( maxOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == -1 );
+
+      tnlParallelReductionAbsSum< RealType, int > absSumOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( absSumOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == longSequence );
+
+      tnlParallelReductionAbsMin< RealType, int > absMinOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( absMinOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
       CPPUNIT_ASSERT( result == 1 );
 
+      tnlParallelReductionAbsMax< RealType, int > absMaxOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( absMaxOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == 1 );
 
       freeMemoryHost( hostData );
       freeMemoryCuda( deviceData );
@@ -95,7 +179,7 @@ class tnlCudaReductionTester : public CppUnit :: TestCase
       RealType sum( 0.0 );
       for( int i = 0; i < size; i ++ )
       {
-         hostData[ i ] = i + 1;
+         hostData[ i ] = -i - 1;
          sum += hostData[ i ];
       }
       copyMemoryHostToCuda( deviceData, hostData, size );
@@ -108,8 +192,27 @@ class tnlCudaReductionTester : public CppUnit :: TestCase
       tnlParallelReductionMin< RealType, int > minOperation;
       CPPUNIT_ASSERT(
          ( tnlCUDALongVectorReduction( minOperation, size, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == -size );
+
+      tnlParallelReductionMax< RealType, int > maxOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( maxOperation, size, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == -1 );
+
+      tnlParallelReductionAbsSum< RealType, int > absSumOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( absSumOperation, size, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == tnlAbs( sum ) );
+
+      tnlParallelReductionAbsMin< RealType, int > absMinOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( absMinOperation, size, deviceData, ( RealType* ) 0, result ) ) );
       CPPUNIT_ASSERT( result == 1 );
 
+      tnlParallelReductionAbsMax< RealType, int > absMaxOperation;
+      CPPUNIT_ASSERT(
+         ( tnlCUDALongVectorReduction( absMaxOperation, size, deviceData, ( RealType* ) 0, result ) ) );
+      CPPUNIT_ASSERT( result == size );
 
       freeMemoryHost( hostData );
       freeMemoryCuda( deviceData );
-- 
GitLab