From 9a01097dcea5c06ac13ebc72b9142c13b63c3077 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Tue, 19 Jan 2010 15:30:58 +0000
Subject: [PATCH] Implementing CUDA parallel reduction.

---
 TODO.txt                        |    7 +-
 configure.ac                    |   23 +-
 src/core/tnl-cuda-kernels.cu    |  496 +++++++++++----
 src/core/tnl-cuda-kernels.h     | 1051 ++++++++++++++++++++++++++-----
 src/core/tnlCUDAKernelsTester.h |  294 +++++++--
 src/tnl-unit-tests.cpp          |    4 +-
 6 files changed, 1533 insertions(+), 342 deletions(-)

diff --git a/TODO.txt b/TODO.txt
index 76c317a4e0..b489c016c0 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -1,5 +1,8 @@
 TODO: implementovat tridu tnlFileName pro generovani jmen souboru
 
+TODO: metodu pro tnlString pro nahrazeni napr. podretezce XXXXX indexem 00001 tj. uXXXXX.bin -> u00001.bin
+      to by melo byt robustnejsi, nez doposavadni pristup 
+
 TODO: implementovat tridu tnlParabolicSolver pro odvozovani resicu k casove promennym uloham
 
 TODO: Nahradit mGrid2D, mGrid3D za mGrid obecne dimenze
@@ -8,6 +11,4 @@ TODO: zavets iteratory pres uzle site misto for cyklu
 
 TODO: implementovat Mersona v CUDA
 
-TODO: metoda Test do tnlObject
-
-TODO: trida tnlTester pro rizeni testu   
\ No newline at end of file
+TODO: objekt pro osetreni chyb - zavedeni funkce tnlGetError   
\ No newline at end of file
diff --git a/configure.ac b/configure.ac
index d88c476315..ac6ad487dc 100644
--- a/configure.ac
+++ b/configure.ac
@@ -37,6 +37,12 @@ AC_ARG_WITH(cuda_libdir,
             AS_HELP_STRING([--with-cuda-libdir],
                            [says where the CUDA libraries can be found, default is /usr/local/cuda/lib]),
             CUDA_LIBS=$withval)
+AC_ARG_WITH(cuda_arch,
+            AS_HELP_STRING([--with-cuda-arch],
+                           [specifies the CUDA architecture, can be 1.0, 1.1, 1.2 or 1.3 - default is 1.3]),
+            CUDA_ARCH=$withval,
+            CUDA_ARCH="1.3")
+
 working_nvcc="no"  
 if test x$with_cuda = xyes;
 then
@@ -89,6 +95,21 @@ then
       CUDA_LDFLAGS="$CUDA_LDFLAGS -lcudart"
       CC="nvcc"
       CXX="nvcc"
+      case "$CUDA_ARCH"  in
+         1.0 )
+            CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_10"
+         ;;
+         1.1 )
+            CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_11"
+         ;;
+         1.2 )
+            CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_12"
+         ;;
+         1.3 )
+            CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_13"
+         ;;
+      esac  
+      DBGCXXFLAGS="$DBGCXXFLAGS -deviceemu"    
    else
       CUDA_LDFLAGS=""
       CUDA_CXXFLAGS=""
@@ -121,7 +142,7 @@ dnl ----------- check for debug--------------
 dnl -----------------------------------------
 AC_ARG_ENABLE(debug,[  --enable-debug=[no/yes]	turn on debugging [default=no]] )
 if test x"$enable_debug" = xyes; then
-   DBGCXXFLAGS="-O0 -DDEBUG"
+   DBGCXXFLAGS="$DBGCXXFLAGS -O0 -DDEBUG"
    if test x$CXX != xnvcc;
    then
       DBGCXXFLAGS="$DBGCXXFLAGS -g3 -Wall -W -ansi -Wno-unused"
diff --git a/src/core/tnl-cuda-kernels.cu b/src/core/tnl-cuda-kernels.cu
index 8d043e78e2..b36083a498 100644
--- a/src/core/tnl-cuda-kernels.cu
+++ b/src/core/tnl-cuda-kernels.cu
@@ -95,133 +95,409 @@ double tnlCUDAReductionSum( const int size,
 }
 
 /*
- * Simple redcution 1
+ * Simple reduction 5
  */
 
-int tnlCUDASimpleReduction1Min( const int size,
-                                const int block_size,
-                                const int grid_size,
-                                const int* input,
-                                int* output )
-{
-
-}
-
-int tnlCUDASimpleReduction1Max( const int size,
-                                const int block_size,
-                                const int grid_size,
-                                const int* input,
-                                int* output )
-{
-   //Calculate necessary block/grid dimensions
-   const int cpuThreshold = 1;
-   const int desBlockSize = 128;    //Desired block size   
-   dim3 blockSize = :: Min( size, desBlockSize );
-   dim3 gridSize = size / blockSize. x;
-   unsigned int shmem = blockSize. x * sizeof( int );
-   cout << "Grid size: " << gridSize. x << endl 
-        << "Block size: " << blockSize. x << endl
-        << "Shmem: " << shmem << endl;
-   tnlCUDASimpleReductionKernel1< int, tnlMax ><<< gridSize, blockSize, shmem >>>( size, input, output );
-   int sizeReduced = gridSize. x;
-   while( sizeReduced > cpuThreshold )
-   {
-      cout << "Reducing with size reduced = " << sizeReduced << endl;
-      blockSize. x = :: Min( sizeReduced, desBlockSize );
-      gridSize. x = sizeReduced / blockSize. x;
-      shmem = blockSize. x * sizeof(int);
-      tnlCUDASimpleReductionKernel1< int, tnlMax ><<< gridSize, blockSize, shmem >>>( size, input, output );
-      sizeReduced = gridSize. x;
-   }
-   int* host_output = new int[ sizeReduced ];
-   cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost );
-   int result = host_output[ 0 ];
-   for( int i = 1;i < sizeReduced; i++ )
-        result = :: Max( result, host_output[ i ] );
-   delete[] host_output;
-   return result;
+bool tnlCUDASimpleReduction5Min( const int size,
+                                 const int* device_input,
+                                 int& result )
+{
+   return tnlCUDASimpleReduction5< int, tnlMin >( size,
+                                                  device_input,
+                                                  result );
 }
-                         
-int tnlCUDASimpleReduction1Sum( const int size,
-                                const int block_size,
-                                const int grid_size,
-                                const int* input,
-                                int* output )
-{
-   //Calculate necessary block/grid dimensions
-   const int cpuThreshold = 1;
-   const int desBlockSize = 128;    //Desired block size   
-   dim3 blockSize = :: Min( size, desBlockSize );
-   dim3 gridSize = size / blockSize. x;
-   unsigned int shmem = blockSize. x * sizeof( int );
-   cout << "Grid size: " << gridSize. x << endl 
-        << "Block size: " << blockSize. x << endl
-        << "Shmem: " << shmem << endl;
-   tnlCUDASimpleReductionKernel1< int, tnlSum ><<< gridSize, blockSize, shmem >>>( size, input, output );
-   int sizeReduced = gridSize. x;
-   while( sizeReduced > cpuThreshold )
-   {
-      cout << "Reducing with size reduced = " << sizeReduced << endl;
-      blockSize. x = :: Min( sizeReduced, desBlockSize );
-      gridSize. x = sizeReduced / blockSize. x;
-      shmem = blockSize. x * sizeof(int);
-      tnlCUDASimpleReductionKernel1< int, tnlSum ><<< gridSize, blockSize, shmem >>>( size, input, output );
-      sizeReduced = gridSize. x;
-   }
-   int* host_output = new int[ sizeReduced ];
-   cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost );
-   int result = host_output[ 0 ];
-   for( int i = 1;i < sizeReduced; i++ )
-        result += host_output[ i ];
-   delete[] host_output;
-   return result;  
+
+bool tnlCUDASimpleReduction5Max( const int size,
+                                 const int* device_input,
+                                 int& result )
+{
+   return tnlCUDASimpleReduction5< int, tnlMax >( size,
+                                                  device_input,
+                                                  result );
+}
+bool tnlCUDASimpleReduction5Sum( const int size,
+                                 const int* device_input,
+                                 int& result )
+{
+   return tnlCUDASimpleReduction5< int, tnlSum >( size,
+                                                  device_input,
+                                                  result );
 }
 
+bool tnlCUDASimpleReduction5Min( const int size,
+                                 const float* device_input,
+                                 float& result )
+{
+   return tnlCUDASimpleReduction5< float, tnlMin >( size,
+                                                    device_input,
+                                                    result );
+}
+
+bool tnlCUDASimpleReduction5Max( const int size,
+                                 const float* device_input,
+                                 float& result )
+{
+   return tnlCUDASimpleReduction5< float, tnlMax >( size,
+                                                    device_input,
+                                                    result );
+}
+bool tnlCUDASimpleReduction5Sum( const int size,
+                                 const float* device_input,
+                                 float& result )
+{
+   return tnlCUDASimpleReduction5< float, tnlSum >( size,
+                                                    device_input,
+                                                    result );
+}
+bool tnlCUDASimpleReduction5Min( const int size,
+                                 const double* device_input,
+                                 double& result )
+{
+   return tnlCUDASimpleReduction5< double, tnlMin >( size,
+                                                     device_input,
+                                                     result );
+}
+
+bool tnlCUDASimpleReduction5Max( const int size,
+                                 const double* device_input,
+                                 double& result )
+{
+   return tnlCUDASimpleReduction5< double, tnlMax >( size,
+                                                     device_input,
+                                                     result );
+}
+bool tnlCUDASimpleReduction5Sum( const int size,
+                                 const double* device_input,
+                                 double& result )
+{
+   return tnlCUDASimpleReduction5< double, tnlSum >( size,
+                                                     device_input,
+                                                     result );
+}
+
+
 /*
-float tnlCUDASimpleReduction1Min( const int size,
-                                  const int block_size,
-                                  const int grid_size,
-                                  const float* input )
+ * Simple reduction 4
+ */
+
+bool tnlCUDASimpleReduction4Min( const int size,
+                                 const int* device_input,
+                                 int& result )
 {
-   return tnlCUDASimpleReduction1< float, tnlMin >( size, block_size, grid_size, input );
+   return tnlCUDASimpleReduction4< int, tnlMin >( size,
+                                                  device_input,
+                                                  result );
 }
 
-float tnlCUDASimpleReduction1Max( const int size,
-                                  const int block_size,
-                                  const int grid_size,
-                                  const float* input )
+bool tnlCUDASimpleReduction4Max( const int size,
+                                 const int* device_input,
+                                 int& result )
 {
-   return tnlCUDASimpleReduction1< float, tnlMax >( size, block_size, grid_size, input );
+   return tnlCUDASimpleReduction4< int, tnlMax >( size,
+                                                  device_input,
+                                                  result );
 }
-                         
-float tnlCUDASimpleReduction1Sum( const int size,
-                                  const int block_size,
-                                  const int grid_size,
-                                  const float* input )
+bool tnlCUDASimpleReduction4Sum( const int size,
+                                 const int* device_input,
+                                 int& result )
 {
-   return tnlCUDASimpleReduction1< float, tnlSum >( size, block_size, grid_size, input );
+   return tnlCUDASimpleReduction4< int, tnlSum >( size,
+                                                  device_input,
+                                                  result );
 }
 
-double tnlCUDASimpleReduction1Min( const int size,
-                                   const int block_size,
-                                   const int grid_size,
-                                   const double* input )
+bool tnlCUDASimpleReduction4Min( const int size,
+                                 const float* device_input,
+                                 float& result )
 {
-   return tnlCUDASimpleReduction1< double, tnlMin >( size, block_size, grid_size, input );
+   return tnlCUDASimpleReduction4< float, tnlMin >( size,
+                                                    device_input,
+                                                    result );
 }
 
-double tnlCUDASimpleReduction1Max( const int size,
-                                   const int block_size,
-                                   const int grid_size,
-                                   const double* input )
+bool tnlCUDASimpleReduction4Max( const int size,
+                                 const float* device_input,
+                                 float& result )
 {
-   return tnlCUDASimpleReduction1< double, tnlMax >( size, block_size, grid_size, input );
+   return tnlCUDASimpleReduction4< float, tnlMax >( size,
+                                                    device_input,
+                                                    result );
 }
-                         
-double tnlCUDASimpleReduction1Sum( const int size,
-                                   const int block_size,
-                                   const int grid_size,
-                                   const double* input )
+bool tnlCUDASimpleReduction4Sum( const int size,
+                                 const float* device_input,
+                                 float& result )
+{
+   return tnlCUDASimpleReduction4< float, tnlSum >( size,
+                                                    device_input,
+                                                    result );
+}
+bool tnlCUDASimpleReduction4Min( const int size,
+                                 const double* device_input,
+                                 double& result )
+{
+   return tnlCUDASimpleReduction4< double, tnlMin >( size,
+                                                     device_input,
+                                                     result );
+}
+
+bool tnlCUDASimpleReduction4Max( const int size,
+                                 const double* device_input,
+                                 double& result )
+{
+   return tnlCUDASimpleReduction4< double, tnlMax >( size,
+                                                     device_input,
+                                                     result );
+}
+bool tnlCUDASimpleReduction4Sum( const int size,
+                                 const double* device_input,
+                                 double& result )
+{
+   return tnlCUDASimpleReduction4< double, tnlSum >( size,
+                                                     device_input,
+                                                     result );
+}
+
+/*
+ * Simple reduction 3
+ */
+
+bool tnlCUDASimpleReduction3Min( const int size,
+                                 const int* device_input,
+                                 int& result )
+{
+   return tnlCUDASimpleReduction3< int, tnlMin >( size,
+                                                  device_input,
+                                                  result );
+}
+
+bool tnlCUDASimpleReduction3Max( const int size,
+                                 const int* device_input,
+                                 int& result )
+{
+   return tnlCUDASimpleReduction3< int, tnlMax >( size,
+                                                  device_input,
+                                                  result );
+}
+bool tnlCUDASimpleReduction3Sum( const int size,
+                                 const int* device_input,
+                                 int& result )
+{
+   return tnlCUDASimpleReduction3< int, tnlSum >( size,
+                                                  device_input,
+                                                  result );
+}
+
+bool tnlCUDASimpleReduction3Min( const int size,
+                                 const float* device_input,
+                                 float& result )
+{
+   return tnlCUDASimpleReduction3< float, tnlMin >( size,
+                                                    device_input,
+                                                    result );
+}
+
+bool tnlCUDASimpleReduction3Max( const int size,
+                                 const float* device_input,
+                                 float& result )
+{
+   return tnlCUDASimpleReduction3< float, tnlMax >( size,
+                                                    device_input,
+                                                    result );
+}
+bool tnlCUDASimpleReduction3Sum( const int size,
+                                 const float* device_input,
+                                 float& result )
+{
+   return tnlCUDASimpleReduction3< float, tnlSum >( size,
+                                                    device_input,
+                                                    result );
+}
+bool tnlCUDASimpleReduction3Min( const int size,
+                                 const double* device_input,
+                                 double& result )
+{
+   return tnlCUDASimpleReduction3< double, tnlMin >( size,
+                                                     device_input,
+                                                     result );
+}
+
+bool tnlCUDASimpleReduction3Max( const int size,
+                                 const double* device_input,
+                                 double& result )
 {
-   return tnlCUDASimpleReduction1< double, tnlSum >( size, block_size, grid_size, input );
-}*/
\ No newline at end of file
+   return tnlCUDASimpleReduction3< double, tnlMax >( size,
+                                                     device_input,
+                                                     result );
+}
+bool tnlCUDASimpleReduction3Sum( const int size,
+                                 const double* device_input,
+                                 double& result )
+{
+   return tnlCUDASimpleReduction3< double, tnlSum >( size,
+                                                     device_input,
+                                                     result );
+}
+
+/*
+ * Simple reduction 2
+ */
+
+bool tnlCUDASimpleReduction2Min( const int size,
+                                 const int* device_input,
+                                 int& result )
+{
+   return tnlCUDASimpleReduction2< int, tnlMin >( size,
+                                                  device_input,
+                                                  result );
+}
+
+bool tnlCUDASimpleReduction2Max( const int size,
+                                 const int* device_input,
+                                 int& result )
+{
+   return tnlCUDASimpleReduction2< int, tnlMax >( size,
+                                                  device_input,
+                                                  result );
+}
+bool tnlCUDASimpleReduction2Sum( const int size,
+                                 const int* device_input,
+                                 int& result )
+{
+   return tnlCUDASimpleReduction2< int, tnlSum >( size,
+                                                  device_input,
+                                                  result );
+}
+
+bool tnlCUDASimpleReduction2Min( const int size,
+                                 const float* device_input,
+                                 float& result )
+{
+   return tnlCUDASimpleReduction2< float, tnlMin >( size,
+                                                    device_input,
+                                                    result );
+}
+
+bool tnlCUDASimpleReduction2Max( const int size,
+                                 const float* device_input,
+                                 float& result )
+{
+   return tnlCUDASimpleReduction2< float, tnlMax >( size,
+                                                    device_input,
+                                                    result );
+}
+bool tnlCUDASimpleReduction2Sum( const int size,
+                                 const float* device_input,
+                                 float& result )
+{
+   return tnlCUDASimpleReduction2< float, tnlSum >( size,
+                                                    device_input,
+                                                    result );
+}
+bool tnlCUDASimpleReduction2Min( const int size,
+                                 const double* device_input,
+                                 double& result )
+{
+   return tnlCUDASimpleReduction2< double, tnlMin >( size,
+                                                     device_input,
+                                                     result );
+}
+
+bool tnlCUDASimpleReduction2Max( const int size,
+                                 const double* device_input,
+                                 double& result )
+{
+   return tnlCUDASimpleReduction2< double, tnlMax >( size,
+                                                     device_input,
+                                                     result );
+}
+bool tnlCUDASimpleReduction2Sum( const int size,
+                                 const double* device_input,
+                                 double& result )
+{
+   return tnlCUDASimpleReduction2< double, tnlSum >( size,
+                                                     device_input,
+                                                     result );
+}
+
+
+/*
+ * Simple reduction 1
+ */
+
+bool tnlCUDASimpleReduction1Min( const int size,
+                                 const int* device_input,
+                                 int& result )
+{
+   return tnlCUDASimpleReduction1< int, tnlMin >( size,
+                                                  device_input,
+                                                  result );
+}
+
+bool tnlCUDASimpleReduction1Max( const int size,
+                                 const int* device_input,
+                                 int& result )
+{
+   return tnlCUDASimpleReduction1< int, tnlMax >( size,
+                                   device_input,
+                                   result );
+}
+bool tnlCUDASimpleReduction1Sum( const int size,
+                                 const int* device_input,
+                                 int& result )
+{
+   return tnlCUDASimpleReduction1< int, tnlSum >( size,
+                                   device_input,
+                                   result );
+}
+
+bool tnlCUDASimpleReduction1Min( const int size,
+                                 const float* device_input,
+                                 float& result )
+{
+   return tnlCUDASimpleReduction1< float, tnlMin >( size,
+                                   device_input,
+                                   result );
+}
+
+bool tnlCUDASimpleReduction1Max( const int size,
+                                 const float* device_input,
+                                 float& result )
+{
+   return tnlCUDASimpleReduction1< float, tnlMax >( size,
+                                   device_input,
+                                   result );
+}
+bool tnlCUDASimpleReduction1Sum( const int size,
+                                 const float* device_input,
+                                 float& result )
+{
+   return tnlCUDASimpleReduction1< float, tnlSum >( size,
+                                   device_input,
+                                   result );
+}
+bool tnlCUDASimpleReduction1Min( const int size,
+                                 const double* device_input,
+                                 double& result )
+{
+   return tnlCUDASimpleReduction1< double, tnlMin >( size,
+                                   device_input,
+                                   result );
+}
+
+bool tnlCUDASimpleReduction1Max( const int size,
+                                 const double* device_input,
+                                 double& result )
+{
+   return tnlCUDASimpleReduction1< double, tnlMax >( size,
+                                   device_input,
+                                   result );
+}
+bool tnlCUDASimpleReduction1Sum( const int size,
+                                 const double* device_input,
+                                 double& result )
+{
+   return tnlCUDASimpleReduction1< double, tnlSum >( size,
+                                   device_input,
+                                   result );
+}
+
diff --git a/src/core/tnl-cuda-kernels.h b/src/core/tnl-cuda-kernels.h
index dc1e1c7656..c56306f051 100644
--- a/src/core/tnl-cuda-kernels.h
+++ b/src/core/tnl-cuda-kernels.h
@@ -45,208 +45,906 @@ template< class T > __device__ T tnlCudaMax( const T& a,
  *
  * Call this kernel with grid size divided by 2.
  * Maximum block size is 512.
+ * We also limit the grid size to 2048 (desGridSize).
  *
  */
 
 template < class T, tnlOperation operation, int blockSize >
 __global__ void tnlCUDAReductionKernel( const int size,
-		                                const T* d_input,
-		                                T* d_output )
+										const int grid_size,
+	                                    const T* d_input,
+	                                    T* d_output,
+	                                    T* dbg_array1 = 0  )
 {
-	extern __shared__ __align__( 8 ) T sdata[];
-	// Read the data into shared memory
-	int tid = threadIdx.x;
-	int gid = blockIdx. x * blockSize * 2 + threadIdx. x;
-	int gridSize = blockSize * 2 * gridDim. x;
-	if( operation == tnlMin ||
-		operation == tnlMax )
-		sdata[ tid ] = d_input[ gid ];
+	extern __shared__ T sdata[];
+
+	// Read data into the shared memory
+	int tid = threadIdx. x;
+	int gid = 2 * blockIdx. x * blockDim. x + threadIdx. x;
+	// Last thread ID which manipulates meaningful data
+
+	int gridSize = 2 * blockSize * gridDim. x;
+	if( gid + blockSize < size )
+	{
+		if( operation == tnlMin ) sdata[ tid ] = :: Min( d_input[ gid ], d_input[ gid + blockSize ] );
+		if( operation == tnlMax ) sdata[ tid ] = :: Max( d_input[ gid ], d_input[ gid + blockSize ] );
+		if( operation == tnlSum ) sdata[ tid ] = d_input[ gid ] + d_input[ gid + blockSize ];
+	}
 	else
+	{
-		sdata[ tid ] = 0;
-	while( gid < size )
+		sdata[ tid ] = d_input[ gid ];
+	}
+	gid += gridSize;
+
+	while (gid < size)
+	{
+		if( operation == tnlMin ) sdata[ tid ] = :: Min( sdata[ tid ], :: Min( d_input[gid], d_input[gid+blockSize] ) );
+		if( operation == tnlMax ) sdata[ tid ] = :: Max( sdata[ tid ], :: Max( d_input[gid], d_input[gid+blockSize] ) );
+		if( operation == tnlSum ) sdata[ tid ] += d_input[gid] + d_input[ gid + blockSize ];
+		gid += gridSize;
+	}
+	}
+	__syncthreads();
+
+	if( gid + blockDim. x < size )
 	{
 		if( operation == tnlMin )
-			sdata[ tid ] = tnlCudaMin( d_input[ gid ], d_input[ tnlCudaMin( gid + blockSize, size ) ] );
+			sdata[ tid ] = tnlCudaMin( d_input[ gid ], d_input[ gid + blockDim. x ] );
 		if( operation == tnlMax )
-			sdata[ tid ] = tnlCudaMax( d_input[ gid ], d_input[ tnlCudaMin( gid + blockSize, size ) ] );
+			sdata[ tid ] = tnlCudaMax( d_input[ gid ], d_input[ gid + blockDim. x ] );
 		if( operation == tnlSum )
-			sdata[ tid ] += d_input[gid] + d_input[gid+blockSize];
-		gid += gridSize;
+			sdata[ tid ] = d_input[ gid ] + d_input[ gid + blockDim. x ];
+	}
+	else if( gid < size )
+	{
+		sdata[ tid ] = d_input[ gid ];
 	}
 	__syncthreads();
 
 	// Parallel reduction
-	if( blockSize == 512 )
-	{
-		if( tid < 256 )
+		if( blockSize == 512 )
 		{
-			if( operation == tnlMin )
-				sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 256 ] );
-			if( operation == tnlMax )
-				sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 256 ] );
-			if( operation == tnlSum )
-				sdata[ tid ] += sdata[ tid + 256 ];
+			if( tid < 256 )
+			{
+				if( operation == tnlMin )
+					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 256 ] );
+				if( operation == tnlMax )
+					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 256 ] );
+				if( operation == tnlSum )
+					sdata[ tid ] += sdata[ tid + 256 ];
+			}
+			__syncthreads();
 		}
-		__syncthreads();
-	}
-	if( blockSize >= 256 )
-	{
-		if( tid < 128 )
+		if( blockSize >= 256 )
 		{
-			if( operation == tnlMin )
-				sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 128 ] );
-			if( operation == tnlMax )
-				sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 128 ] );
-			if( operation == tnlSum )
-				sdata[ tid ] += sdata[ tid + 128 ];
+			if( tid < 128 )
+			{
+				if( operation == tnlMin )
+					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 128 ] );
+				if( operation == tnlMax )
+					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 128 ] );
+				if( operation == tnlSum )
+					sdata[ tid ] += sdata[ tid + 128 ];
+			}
+			__syncthreads();
 		}
-		__syncthreads();
-	}
-	if( blockSize >= 128 )
-	{
-		if (tid< 64)
+		if( blockSize >= 128 )
 		{
-			if( operation == tnlMin )
-				sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 64 ] );
-			if( operation == tnlMax )
-				sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 64 ] );
-			if( operation == tnlSum )
-				sdata[ tid ] += sdata[ tid + 64 ];
+			if (tid< 64)
+			{
+				if( operation == tnlMin )
+					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 64 ] );
+				if( operation == tnlMax )
+					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 64 ] );
+				if( operation == tnlSum )
+					sdata[ tid ] += sdata[ tid + 64 ];
+			}
+			__syncthreads();
 		}
-		__syncthreads();
+		/*
+		 * What follows runs in warp so it does not need to be synchronised.
+		 */
+		if( tid < 32 )
+		{
+			if( blockSize >= 64 )
+			{
+				if( operation == tnlMin )
+					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 32 ] );
+				if( operation == tnlMax )
+					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 32 ] );
+				if( operation == tnlSum )
+					sdata[ tid ] += sdata[ tid + 32 ];
+			}
+			if( blockSize >= 32 )
+			{
+				if( operation == tnlMin )
+					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 16 ] );
+				if( operation == tnlMax )
+					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 16 ] );
+				if( operation == tnlSum )
+					sdata[ tid ] += sdata[ tid + 16 ];
+			}
+			if( blockSize >= 16 )
+			{
+				if( operation == tnlMin )
+					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 8 ] );
+				if( operation == tnlMax )
+					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 8 ] );
+				if( operation == tnlSum )
+					sdata[ tid ] += sdata[ tid + 8 ];
+			}
+		    if( blockSize >= 8 )
+		    {
+		    	if( operation == tnlMin )
+		    		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 4 ] );
+		    	if( operation == tnlMax )
+		    		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 4 ] );
+		    	if( operation == tnlSum )
+		    		sdata[ tid ] += sdata[ tid + 4 ];
+		    }
+			if( blockSize >= 4 )
+			{
+				if( operation == tnlMin )
+					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 2 ] );
+				if( operation == tnlMax )
+					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 2 ] );
+				if( operation == tnlSum )
+					sdata[ tid ] += sdata[ tid + 2 ];
+			}
+			if( blockSize >= 2 )
+			{
+				if( operation == tnlMin )
+					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 1 ] );
+				if( operation == tnlMax )
+					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 1 ] );
+				if( operation == tnlSum )
+					sdata[ tid ] += sdata[ tid + 1 ];
+			}
+		}
+
+	// Store the result back in global memory
+	if( tid == 0 )
+		d_output[ blockIdx. x ] = sdata[ 0 ];
+}
+
+/*
+ * CUDA reduction kernel caller.
+ * block_size can be only some of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512.
+ * d_input must reside on the device.
+ */
+template< class T, tnlOperation operation >
+bool tnlCUDAReduction( const int size,
+	                   const T* device_input,
+	                   T& result,
+	                   T* device_output = 0 )
+{
+   // Reduces 'size' elements of device_input (Min/Max/Sum chosen by the
+   // template parameter 'operation') by repeated kernel launches until at
+   // most cpuThreshold partial results remain; those are reduced on the
+   // host and stored in 'result'. If 'device_output' is not given, an
+   // auxiliary device buffer is allocated and freed internally.
+   // Returns false on allocation failure, launch failure or empty input.
+   if( size <= 0 )
+   {
+      cerr << "Unable to reduce an array of non-positive size " << size << "." << endl;
+      return false;
+   }
+   const int cpuThreshold = 1;
+   const int desBlockSize = 16;    // Desired block size - must be a power of 2.
+
+   // Allocate the auxiliary buffer for partial results unless the caller
+   // supplied one.
+   bool device_output_allocated( false );
+   if( ! device_output )
+   {
+      int bytes_alloc = :: Max( 1, size / desBlockSize ) * sizeof( T );
+      cudaMalloc( ( void** ) &device_output, bytes_alloc );
+      if( cudaGetLastError() != cudaSuccess )
+      {
+         cerr << "Unable to allocate device memory with size " << bytes_alloc << "." << endl;
+         return false;
+      }
+      device_output_allocated = true;
+   }
+   dim3 block_size( 0 ), grid_size( 0 );
+   int shmem;
+   int size_reduced = size;
+   const T* reduction_input = device_input;
+   while( size_reduced > cpuThreshold )
+   {
+      // NOTE(review): when size_reduced is smaller than desBlockSize and not
+      // a power of 2, block_size. x matches no case below and no kernel is
+      // launched for this pass - verify callers only use suitable sizes.
+      block_size. x = :: Min( size_reduced, desBlockSize );
+      // NOTE(review): desGridSize is assumed to be a constant defined
+      // elsewhere in this file - confirm.
+      grid_size. x = :: Min( ( size_reduced / block_size. x + 1 ) / 2, desGridSize );
+      shmem = block_size. x * sizeof( T );
+      tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." );
+      switch( block_size. x )
+      {
+         case 512:
+            tnlCUDASimpleReductionKernel5< T, operation, 512 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
+            break;
+         case 256:
+            tnlCUDASimpleReductionKernel5< T, operation, 256 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
+            break;
+         case 128:
+            tnlCUDASimpleReductionKernel5< T, operation, 128 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
+            break;
+         case  64:
+            tnlCUDASimpleReductionKernel5< T, operation,  64 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
+            break;
+         case  32:
+            tnlCUDASimpleReductionKernel5< T, operation,  32 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
+            break;
+         case  16:
+            tnlCUDASimpleReductionKernel5< T, operation,  16 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
+            break;
+         case   8:
+            tnlCUDASimpleReductionKernel5< T, operation,   8 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
+            break;
+         case   4:
+            tnlCUDASimpleReductionKernel5< T, operation,   4 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
+            break;
+         case   2:
+            tnlCUDASimpleReductionKernel5< T, operation,   2 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
+            break;
+         case   1:
+            tnlAssert( false, cerr << "blockSize should not be 1." << endl );
+            break;
+         default:
+            tnlAssert( false, cerr << "Block size is " << block_size. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
+            break;
+      }
+      // Kernel launches do not report errors directly - query the runtime.
+      if( cudaGetLastError() != cudaSuccess )
+      {
+         cerr << "The CUDA reduction kernel launch ended with an error." << endl;
+         if( device_output_allocated )
+            cudaFree( device_output );
+         return false;
+      }
+      size_reduced = grid_size. x;
+      reduction_input = device_output;
+   }
+   // Reduce the few remaining partial results sequentially on the host.
+   T* host_output = new T[ size_reduced ];
+   cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost );
+   result = host_output[ 0 ];
+   for( int i = 1; i < size_reduced; i ++ )
+   {
+      if( operation == tnlMin )
+         result = :: Min( result, host_output[ i ] );
+      if( operation == tnlMax )
+         result = :: Max( result, host_output[ i ] );
+      if( operation == tnlSum )
+         result += host_output[ i ];
+   }
+   delete[] host_output;
+   if( device_output_allocated )
+      cudaFree( device_output );
+   return true;
+}
+
+
+/*
+ * Modified parallel reduction - version 5.
+ * We have replaced the for cycle with switch
+ * and template parameter blockSize
+ */
+
+template < class T, tnlOperation operation, int blockSize >
+__global__ void tnlCUDASimpleReductionKernel5( const int size,
+	                                           const T* d_input,
+	                                           T* d_output,
+	                   	                       T* dbg_array1 = 0  )
+{
+	// One partial result per block is written to d_output[ blockIdx. x ].
+	// Each block loads 2 * blockDim. x input elements, combining a pair
+	// during the load. blockSize must equal the launch-time blockDim. x.
+	// dbg_array1 is accepted for debugging only - it is never used here.
+	extern __shared__ T sdata[];
+
+	// Read data into the shared memory
+	int tid = threadIdx. x;
+	int gid = 2 * blockIdx. x * blockDim. x + threadIdx. x;
+	// Last thread ID which manipulates meaningful data
+	//int last_tid = size - 2 * blockIdx. x * blockDim. x;
+	if( gid + blockDim. x < size )
+	{
+		if( operation == tnlMin )
+			sdata[ tid ] = tnlCudaMin( d_input[ gid ], d_input[ gid + blockDim. x ] );
+		if( operation == tnlMax )
+			sdata[ tid ] = tnlCudaMax( d_input[ gid ], d_input[ gid + blockDim. x ] );
+		if( operation == tnlSum )
+			sdata[ tid ] = d_input[ gid ] + d_input[ gid + blockDim. x ];
 	}
-	/*
-	 * What follows runs in warp so it does not need to be synchronised.
-	 */
-	if( tid < 32 )
+	else if( gid < size )
 	{
-		if( blockSize >= 64 )
+		sdata[ tid ] = d_input[ gid ];
+	}
+	__syncthreads();
+
+	// NOTE(review): threads with gid >= size leave sdata[ tid ] uninitialized,
+	// and the reduction below reads shared memory without a last_tid guard -
+	// partially filled blocks can produce wrong results. Verify that callers
+	// only launch completely filled blocks.
+	// Parallel reduction
+		if( blockSize == 512 )
 		{
-			if( operation == tnlMin )
-				sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 32 ] );
-			if( operation == tnlMax )
-				sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 32 ] );
-			if( operation == tnlSum )
-				sdata[ tid ] += sdata[ tid + 32 ];
+			if( tid < 256 )
+			{
+				if( operation == tnlMin )
+					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 256 ] );
+				if( operation == tnlMax )
+					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 256 ] );
+				if( operation == tnlSum )
+					sdata[ tid ] += sdata[ tid + 256 ];
+			}
+			__syncthreads();
 		}
-		if( blockSize >= 32 )
+		if( blockSize >= 256 )
 		{
-			if( operation == tnlMin )
-				sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 16 ] );
-			if( operation == tnlMax )
-				sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 16 ] );
-			if( operation == tnlSum )
-				sdata[ tid ] += sdata[ tid + 16 ];
+			if( tid < 128 )
+			{
+				if( operation == tnlMin )
+					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 128 ] );
+				if( operation == tnlMax )
+					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 128 ] );
+				if( operation == tnlSum )
+					sdata[ tid ] += sdata[ tid + 128 ];
+			}
+			__syncthreads();
 		}
-		if( blockSize >= 16 )
+		if( blockSize >= 128 )
 		{
-			if( operation == tnlMin )
-				sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 8 ] );
-			if( operation == tnlMax )
-				sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 8 ] );
-			if( operation == tnlSum )
-				sdata[ tid ] += sdata[ tid + 8 ];
+			if (tid< 64)
+			{
+				if( operation == tnlMin )
+					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 64 ] );
+				if( operation == tnlMax )
+					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 64 ] );
+				if( operation == tnlSum )
+					sdata[ tid ] += sdata[ tid + 64 ];
+			}
+			__syncthreads();
 		}
-	    if( blockSize >= 8 )
-	    {
-	    	if( operation == tnlMin )
-	    		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 4 ] );
-	    	if( operation == tnlMax )
-	    		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 4 ] );
-	    	if( operation == tnlSum )
-	    		sdata[ tid ] += sdata[ tid + 4 ];
-	    }
-		if( blockSize >= 4 )
+		/*
+		 * What follows runs in warp so it does not need to be synchronised.
+		 */
+		// NOTE(review): sdata is not accessed through a volatile pointer in
+		// this warp-synchronous phase - the compiler may keep values in
+		// registers; classic reduction implementations re-read sdata via a
+		// volatile pointer for the last 32 threads. Verify generated code.
+		// NOTE(review): for blockSize < 64 the steps below read
+		// sdata[ tid + offset ] past blockDim. x - confirm the shared memory
+		// allocation covers this or that such block sizes are never launched.
+		if( tid < 32 )
+		{
+			if( blockSize >= 64 )
+			{
+				if( operation == tnlMin )
+					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 32 ] );
+				if( operation == tnlMax )
+					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 32 ] );
+				if( operation == tnlSum )
+					sdata[ tid ] += sdata[ tid + 32 ];
+			}
+			if( blockSize >= 32 )
+			{
+				if( operation == tnlMin )
+					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 16 ] );
+				if( operation == tnlMax )
+					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 16 ] );
+				if( operation == tnlSum )
+					sdata[ tid ] += sdata[ tid + 16 ];
+			}
+			if( blockSize >= 16 )
+			{
+				if( operation == tnlMin )
+					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 8 ] );
+				if( operation == tnlMax )
+					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 8 ] );
+				if( operation == tnlSum )
+					sdata[ tid ] += sdata[ tid + 8 ];
+			}
+		    if( blockSize >= 8 )
+		    {
+		    	if( operation == tnlMin )
+		    		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 4 ] );
+		    	if( operation == tnlMax )
+		    		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 4 ] );
+		    	if( operation == tnlSum )
+		    		sdata[ tid ] += sdata[ tid + 4 ];
+		    }
+			if( blockSize >= 4 )
+			{
+				if( operation == tnlMin )
+					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 2 ] );
+				if( operation == tnlMax )
+					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 2 ] );
+				if( operation == tnlSum )
+					sdata[ tid ] += sdata[ tid + 2 ];
+			}
+			if( blockSize >= 2 )
+			{
+				if( operation == tnlMin )
+					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 1 ] );
+				if( operation == tnlMax )
+					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 1 ] );
+				if( operation == tnlSum )
+					sdata[ tid ] += sdata[ tid + 1 ];
+			}
+		}
+
+	// Store the result back in global memory
+	if( tid == 0 )
+		d_output[ blockIdx. x ] = sdata[ 0 ];
+}
+
+template< class T, tnlOperation operation >
+bool tnlCUDASimpleReduction5( const int size,
+	                          const T* device_input,
+	                          T& result,
+	                          T* device_output = 0 )
+{
+   // Reduces 'size' elements of device_input with the version-5 kernel
+   // (unrolled, blockSize as a template parameter). The last partial
+   // results are reduced on the host and stored in 'result'. If no
+   // 'device_output' buffer is given, one is allocated and freed here.
+   // Returns false on allocation failure, launch failure or empty input.
+   if( size <= 0 )
+   {
+      cerr << "Unable to reduce an array of non-positive size " << size << "." << endl;
+      return false;
+   }
+   const int cpuThreshold = 1;
+   const int desBlockSize = 16;    // Desired block size - must be a power of 2.
+
+   bool device_output_allocated( false );
+   if( ! device_output )
+   {
+      int bytes_alloc = :: Max( 1, size / desBlockSize ) * sizeof( T );
+      cudaMalloc( ( void** ) &device_output, bytes_alloc );
+      if( cudaGetLastError() != cudaSuccess )
+      {
+         cerr << "Unable to allocate device memory with size " << bytes_alloc << "." << endl;
+         return false;
+      }
+      device_output_allocated = true;
+   }
+   dim3 block_size( 0 ), grid_size( 0 );
+   int shmem;
+   int size_reduced = size;
+   const T* reduction_input = device_input;
+   while( size_reduced > cpuThreshold )
+   {
+      // NOTE(review): the version-5 kernel has no last_tid guard, so a
+      // partially filled last block reads uninitialized shared memory;
+      // in addition this grid formula can drop tail elements when
+      // 2 * block_size * grid_size < size_reduced. Verify the sizes
+      // used by callers, or guard the kernel before relying on this path.
+      block_size. x = :: Min( size_reduced, desBlockSize );
+      grid_size. x = ( size_reduced / block_size. x + 1 ) / 2;
+      shmem = block_size. x * sizeof( T );
+      tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." );
+      switch( block_size. x )
+      {
+         case 512:
+            tnlCUDASimpleReductionKernel5< T, operation, 512 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
+            break;
+         case 256:
+            tnlCUDASimpleReductionKernel5< T, operation, 256 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
+            break;
+         case 128:
+            tnlCUDASimpleReductionKernel5< T, operation, 128 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
+            break;
+         case  64:
+            tnlCUDASimpleReductionKernel5< T, operation,  64 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
+            break;
+         case  32:
+            tnlCUDASimpleReductionKernel5< T, operation,  32 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
+            break;
+         case  16:
+            tnlCUDASimpleReductionKernel5< T, operation,  16 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
+            break;
+         case   8:
+            tnlCUDASimpleReductionKernel5< T, operation,   8 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
+            break;
+         case   4:
+            tnlCUDASimpleReductionKernel5< T, operation,   4 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
+            break;
+         case   2:
+            tnlCUDASimpleReductionKernel5< T, operation,   2 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
+            break;
+         case   1:
+            tnlAssert( false, cerr << "blockSize should not be 1." << endl );
+            break;
+         default:
+            tnlAssert( false, cerr << "Block size is " << block_size. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
+            break;
+      }
+      // Kernel launches do not report errors directly - query the runtime.
+      if( cudaGetLastError() != cudaSuccess )
+      {
+         cerr << "The CUDA reduction kernel launch ended with an error." << endl;
+         if( device_output_allocated )
+            cudaFree( device_output );
+         return false;
+      }
+      size_reduced = grid_size. x;
+      reduction_input = device_output;
+   }
+   // Reduce the few remaining partial results sequentially on the host.
+   T* host_output = new T[ size_reduced ];
+   cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost );
+   result = host_output[ 0 ];
+   for( int i = 1; i < size_reduced; i ++ )
+   {
+      if( operation == tnlMin )
+         result = :: Min( result, host_output[ i ] );
+      if( operation == tnlMax )
+         result = :: Max( result, host_output[ i ] );
+      if( operation == tnlSum )
+         result += host_output[ i ];
+   }
+   delete[] host_output;
+   if( device_output_allocated )
+      cudaFree( device_output );
+   return true;
+}
+
+
+/*
+ * Modified parallel reduction - version 4.
+ * We have reduced the grid size to one half
+ * to avoid inactive threads.
+ */
+
+
+template < class T, tnlOperation operation >
+__global__ void tnlCUDASimpleReductionKernel4( const int size,
+	                                           const T* d_input,
+	                                           T* d_output,
+	                   	                       T* dbg_array1 = 0  )
+{
+	// One partial result per block is written to d_output[ blockIdx. x ].
+	// Each block covers 2 * blockDim. x input elements: the first combining
+	// step happens already during the load from global memory.
+	// dbg_array1 is accepted for debugging only - it is never used here.
+	extern __shared__ T sdata[];
+
+	// Read data into the shared memory
+	int tid = threadIdx. x;
+	int gid = 2 * blockIdx. x * blockDim. x + threadIdx. x;
+	// Last thread ID which manipulates meaningful data
+	int last_tid = size - 2 * blockIdx. x * blockDim. x;
+	if( gid + blockDim. x < size )
+	{
+		if( operation == tnlMin )
+			sdata[ tid ] = tnlCudaMin( d_input[ gid ], d_input[ gid + blockDim. x ] );
+		if( operation == tnlMax )
+			sdata[ tid ] = tnlCudaMax( d_input[ gid ], d_input[ gid + blockDim. x ] );
+		if( operation == tnlSum )
+			sdata[ tid ] = d_input[ gid ] + d_input[ gid + blockDim. x ];
+	}
+	else if( gid < size )
+	{
+		sdata[ tid ] = d_input[ gid ];
+	}
+	__syncthreads();
+
+	// Parallel reduction
+	// The guard tid + s < last_tid also prevents reading sdata entries that
+	// were never written (threads whose gid >= size store nothing above).
+	for( int s = blockDim. x / 2; s > 0; s >>= 1 )
+	{
+		if( tid < s && tid + s < last_tid )
 		{
 			if( operation == tnlMin )
-				sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 2 ] );
+				sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
 			if( operation == tnlMax )
-				sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 2 ] );
+				sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
 			if( operation == tnlSum )
-				sdata[ tid ] += sdata[ tid + 2 ];
+				sdata[ tid ] += sdata[ tid + s ];
 		}
-		if( blockSize >= 2 )
+		__syncthreads();
+	}
+
+	// Store the result back in global memory
+	if( tid == 0 )
+		d_output[ blockIdx. x ] = sdata[ 0 ];
+}
+
+template< class T, tnlOperation operation >
+bool tnlCUDASimpleReduction4( const int size,
+	                          const T* device_input,
+	                          T& result,
+	                          T* device_output = 0 )
+{
+   // Reduces 'size' elements of device_input with the version-4 kernel
+   // (each block consumes 2 * blockDim elements). The last partial results
+   // are reduced on the host and stored in 'result'. If no 'device_output'
+   // buffer is given, one is allocated and freed here.
+   // Returns false on allocation failure, launch failure or empty input.
+   if( size <= 0 )
+   {
+      cerr << "Unable to reduce an array of non-positive size " << size << "." << endl;
+      return false;
+   }
+   const int cpuThreshold = 1;
+   const int desBlockSize = 16;    // Desired block size - must be a power of 2.
+
+   bool device_output_allocated( false );
+   if( ! device_output )
+   {
+      int bytes_alloc = :: Max( 1, size / desBlockSize ) * sizeof( T );
+      cudaMalloc( ( void** ) &device_output, bytes_alloc );
+      if( cudaGetLastError() != cudaSuccess )
+      {
+         cerr << "Unable to allocate device memory with size " << bytes_alloc << "." << endl;
+         return false;
+      }
+      device_output_allocated = true;
+   }
+   dim3 block_size( 0 ), grid_size( 0 );
+   int shmem;
+   int size_reduced = size;
+   const T* reduction_input = device_input;
+   while( size_reduced > cpuThreshold )
+   {
+      // Each block reduces 2 * block_size elements. The block size must be
+      // a power of 2 for the halving loop inside the kernel, and the grid
+      // is rounded up so that no tail elements are dropped.
+      int block = 1;
+      while( 2 * block < size_reduced && block < desBlockSize )
+         block *= 2;
+      block_size. x = block;
+      grid_size. x = ( size_reduced + 2 * block - 1 ) / ( 2 * block );
+      shmem = block_size. x * sizeof( T );
+      tnlCUDASimpleReductionKernel4< T, operation ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
+      // Kernel launches do not report errors directly - query the runtime.
+      if( cudaGetLastError() != cudaSuccess )
+      {
+         cerr << "The CUDA reduction kernel launch ended with an error." << endl;
+         if( device_output_allocated )
+            cudaFree( device_output );
+         return false;
+      }
+      size_reduced = grid_size. x;
+      reduction_input = device_output;
+   }
+   // Reduce the few remaining partial results sequentially on the host.
+   T* host_output = new T[ size_reduced ];
+   cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost );
+   result = host_output[ 0 ];
+   for( int i = 1; i < size_reduced; i ++ )
+   {
+      if( operation == tnlMin )
+         result = :: Min( result, host_output[ i ] );
+      if( operation == tnlMax )
+         result = :: Max( result, host_output[ i ] );
+      if( operation == tnlSum )
+         result += host_output[ i ];
+   }
+   delete[] host_output;
+   if( device_output_allocated )
+      cudaFree( device_output );
+   return true;
+}
+
+/*
+ * Modified parallel reduction - version 3.
+ * We have avoided conflicting memory accesses.
+ */
+
+template < class T, tnlOperation operation >
+__global__ void tnlCUDASimpleReductionKernel3( const int size,
+		                                       const T* d_input,
+		                                       T* d_output )
+{
+	// One partial result per block is written to d_output[ blockIdx. x ].
+	extern __shared__ T sdata[];
+
+	// Read data into the shared memory
+	int tid = threadIdx. x;
+	int gid = blockIdx. x * blockDim. x + threadIdx. x;
+	// Last thread ID which manipulates meaningful data
+	int last_tid = size - blockIdx. x * blockDim. x;
+	if( gid < size )
+		sdata[ tid ] = d_input[gid];
+	__syncthreads();
+
+	// Parallel reduction
+	// Sequential addressing: thread tid combines sdata[ tid ] with
+	// sdata[ tid + s ]; the guard tid + s < last_tid also avoids reading
+	// sdata entries that were never written (threads with gid >= size).
+	for( int s = blockDim. x / 2; s > 0; s >>= 1 )
+	{
+		if( tid < s && tid + s < last_tid )
 		{
 			if( operation == tnlMin )
-				sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 1 ] );
+				sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
 			if( operation == tnlMax )
-				sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 1 ] );
+				sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
 			if( operation == tnlSum )
-				sdata[ tid ] += sdata[ tid + 1 ];
+				sdata[ tid ] += sdata[ tid + s ];
 		}
+		__syncthreads();
 	}
-	// Store the result back to the global memory of the device
-	if( tid == 0 ) d_output[ blockIdx. x ] = sdata[ 0 ];
+
+	// Store the result back in global memory
+	if( tid == 0 )
+		d_output[ blockIdx. x ] = sdata[ 0 ];
+}
+
+template< class T, tnlOperation operation >
+bool tnlCUDASimpleReduction3( const int size,
+	                          const T* device_input,
+	                          T& result,
+	                          T* device_output = 0 )
+{
+   // Reduces 'size' elements of device_input with the version-3 kernel.
+   // The last partial results are reduced on the host and stored in
+   // 'result'. If no 'device_output' buffer is given, one is allocated
+   // and freed here. Returns false on allocation failure, launch failure
+   // or empty input.
+   if( size <= 0 )
+   {
+      cerr << "Unable to reduce an array of non-positive size " << size << "." << endl;
+      return false;
+   }
+   const int cpuThreshold = 1;
+   const int desBlockSize = 256;    // Desired block size - must be a power of 2.
+
+   bool device_output_allocated( false );
+   if( ! device_output )
+   {
+      // Round up: the first pass can produce ceil( size / desBlockSize )
+      // partial results, so a plain size / desBlockSize would be too small.
+      int bytes_alloc = :: Max( 1, ( size + desBlockSize - 1 ) / desBlockSize ) * sizeof( T );
+      cudaMalloc( ( void** ) &device_output, bytes_alloc );
+      if( cudaGetLastError() != cudaSuccess )
+      {
+         cerr << "Unable to allocate device memory with size " << bytes_alloc << "." << endl;
+         return false;
+      }
+      device_output_allocated = true;
+   }
+   dim3 block_size( 0 ), grid_size( 0 );
+   int shmem;
+   int size_reduced = size;
+   const T* reduction_input = device_input;
+   while( size_reduced > cpuThreshold )
+   {
+      // The block size must be a power of 2 for the halving loop inside the
+      // kernel, and the grid is rounded up so that the tail elements are not
+      // dropped when size_reduced is not a multiple of the block size.
+      int block = 1;
+      while( block < size_reduced && block < desBlockSize )
+         block *= 2;
+      block_size. x = block;
+      grid_size. x = ( size_reduced + block - 1 ) / block;
+      shmem = block_size. x * sizeof( T );
+      tnlCUDASimpleReductionKernel3< T, operation ><<< grid_size, block_size, shmem >>>( size_reduced, reduction_input, device_output );
+      // Kernel launches do not report errors directly - query the runtime.
+      if( cudaGetLastError() != cudaSuccess )
+      {
+         cerr << "The CUDA reduction kernel launch ended with an error." << endl;
+         if( device_output_allocated )
+            cudaFree( device_output );
+         return false;
+      }
+      size_reduced = grid_size. x;
+      reduction_input = device_output;
+   }
+   // Reduce the few remaining partial results sequentially on the host.
+   T* host_output = new T[ size_reduced ];
+   cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost );
+   result = host_output[ 0 ];
+   for( int i = 1; i < size_reduced; i ++ )
+   {
+      if( operation == tnlMin )
+         result = :: Min( result, host_output[ i ] );
+      if( operation == tnlMax )
+         result = :: Max( result, host_output[ i ] );
+      if( operation == tnlSum )
+         result += host_output[ i ];
+   }
+   delete[] host_output;
+   if( device_output_allocated )
+      cudaFree( device_output );
+   return true;
 }
 
 /*
- * CUDA reduction kernel caller.
- * block_size can be only some of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512.
- * d_input must reside on the device.
+ * Modified parallel reduction - version 2.
+ * We have avoided operation modulo.
  */
-template< class T, tnlOperation operation >
-T tnlCUDAReduction( const int size,
-					const int block_size,
-					const int grid_size,
-	                const T* d_input )
+
+template < class T, tnlOperation operation >
+__global__ void tnlCUDASimpleReductionKernel2( const int size,
+		                                       const T* d_input,
+		                                       T* d_output )
 {
-	T result;
+	// One partial result per block is written to d_output[ blockIdx. x ].
+	extern __shared__ T sdata[];
 
-	dim3 blockSize( block_size );
-	dim3 gridSize( grid_size );
-	int shmem = 512 * sizeof( T );
-	tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." );
-	switch( block_size )
+	// Read data into the shared memory
+	int tid = threadIdx. x;
+	int gid = blockIdx. x * blockDim. x + threadIdx. x;
+	// Last thread ID which manipulates meaningful data
+	int last_tid = size - blockIdx. x * blockDim. x;
+	if( gid < size )
+		sdata[tid] = d_input[gid];
+	__syncthreads();
+
+	// Parallel reduction
+	// The mapping inds = 2 * s * tid keeps the active threads packed at the
+	// low thread IDs instead of testing tid % ( 2 * s ) == 0 - this is the
+	// "avoided modulo" this version is about.
+	for( int s = 1; s < blockDim. x; s *= 2 )
 	{
-		case 512:
-	        tnlCUDAReductionKernel< T, operation, 512 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
-	        break;
-	    case 256:
-	    	tnlCUDAReductionKernel< T, operation, 256 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
-	    	break;
-	    case 128:
-	    	tnlCUDAReductionKernel< T, operation, 128 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
-	    	break;
-	    case  64:
-	    	tnlCUDAReductionKernel< T, operation,  64 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
-	    	break;
-	    case  32:
-	    	tnlCUDAReductionKernel< T, operation,  32 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
-	    	break;
-	    case  16:
-	    	tnlCUDAReductionKernel< T, operation,  16 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
-	    	break;
-	    case   8:
-	    	tnlCUDAReductionKernel< T, operation,   8 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
-	    	break;
-	    case   4:
-	    	tnlCUDAReductionKernel< T, operation,   4 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
-	    	break;
-	    case   2:
-	    	tnlCUDAReductionKernel< T, operation,   2 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
-	    	break;
-	    case   1:
-	    	tnlCUDAReductionKernel< T, operation,   1 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
-	    	break;
-	    default:
-	    	tnlAssert( false, cerr << "Block size is " << block_size << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
-	    	break;
+		int inds = 2 * s * tid;
+		if( inds < blockDim. x && inds + s < last_tid )
+		{
+			if( operation == tnlMin )
+				sdata[ inds ] = tnlCudaMin( sdata[ inds ], sdata[ inds + s ] );
+			if( operation == tnlMax )
+				sdata[ inds ] = tnlCudaMax( sdata[ inds ], sdata[ inds + s ] );
+			if( operation == tnlSum )
+				sdata[ inds ] += sdata[ inds + s ];
+		}
+		__syncthreads();
 	}
-	return result;
+
+	// Store the result back in global memory
+	if( tid == 0 )
+		d_output[ blockIdx. x ] = sdata[ 0 ];
+}
+
+template< class T, tnlOperation operation >
+bool tnlCUDASimpleReduction2( const int size,
+	                          const T* device_input,
+	                          T& result,
+	                          T* device_output = 0 )
+{
+   // Reduces 'size' elements of device_input with the version-2 kernel.
+   // The last partial results are reduced on the host and stored in
+   // 'result'. If no 'device_output' buffer is given, one is allocated
+   // and freed here. Returns false on allocation failure, launch failure
+   // or empty input.
+   if( size <= 0 )
+   {
+      cerr << "Unable to reduce an array of non-positive size " << size << "." << endl;
+      return false;
+   }
+   const int cpuThreshold = 1;
+   const int desBlockSize = 256;    // Desired block size - must be a power of 2.
+
+   bool device_output_allocated( false );
+   if( ! device_output )
+   {
+      // Round up: the first pass can produce ceil( size / desBlockSize )
+      // partial results, so a plain size / desBlockSize would be too small.
+      int bytes_alloc = :: Max( 1, ( size + desBlockSize - 1 ) / desBlockSize ) * sizeof( T );
+      cudaMalloc( ( void** ) &device_output, bytes_alloc );
+      if( cudaGetLastError() != cudaSuccess )
+      {
+         cerr << "Unable to allocate device memory with size " << bytes_alloc << "." << endl;
+         return false;
+      }
+      device_output_allocated = true;
+   }
+   dim3 block_size( 0 ), grid_size( 0 );
+   int shmem;
+   int size_reduced = size;
+   const T* reduction_input = device_input;
+   while( size_reduced > cpuThreshold )
+   {
+      // The block size must be a power of 2 for the doubling-stride loop
+      // inside the kernel, and the grid is rounded up so that the tail
+      // elements are not dropped when size_reduced is not a multiple of
+      // the block size.
+      int block = 1;
+      while( block < size_reduced && block < desBlockSize )
+         block *= 2;
+      block_size. x = block;
+      grid_size. x = ( size_reduced + block - 1 ) / block;
+      shmem = block_size. x * sizeof( T );
+      tnlCUDASimpleReductionKernel2< T, operation ><<< grid_size, block_size, shmem >>>( size_reduced, reduction_input, device_output );
+      // Kernel launches do not report errors directly - query the runtime.
+      if( cudaGetLastError() != cudaSuccess )
+      {
+         cerr << "The CUDA reduction kernel launch ended with an error." << endl;
+         if( device_output_allocated )
+            cudaFree( device_output );
+         return false;
+      }
+      size_reduced = grid_size. x;
+      reduction_input = device_output;
+   }
+   // Reduce the few remaining partial results sequentially on the host.
+   T* host_output = new T[ size_reduced ];
+   cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost );
+   result = host_output[ 0 ];
+   for( int i = 1; i < size_reduced; i ++ )
+   {
+      if( operation == tnlMin )
+         result = :: Min( result, host_output[ i ] );
+      if( operation == tnlMax )
+         result = :: Max( result, host_output[ i ] );
+      if( operation == tnlSum )
+         result += host_output[ i ];
+   }
+   delete[] host_output;
+   if( device_output_allocated )
+      cudaFree( device_output );
+   return true;
 }
 
+/*
+ * The simplest and very slow parallel reduction - version 1.
+ */
+
 template < class T, tnlOperation operation >
 __global__ void tnlCUDASimpleReductionKernel1( const int size,
 		                                       const T* d_input,
 		                                       T* d_output )
 {
-	extern __shared__ int sdata[];
+	extern __shared__ T sdata[];
 
 	// Read data into the shared memory
 	int tid = threadIdx. x;
 	int gid = blockIdx. x * blockDim. x + threadIdx. x;
 	// Last thread ID which manipulates meaningful data
 	int last_tid = size - blockIdx. x * blockDim. x;
+
 	if( gid < size )
 		sdata[tid] = d_input[gid];
 	__syncthreads();
@@ -272,50 +970,69 @@ __global__ void tnlCUDASimpleReductionKernel1( const int size,
 }
 
 template< class T, tnlOperation operation >
-T tnlCUDASimpleReduction1( const int size,
-	                   const T* d_input,
-	                   T* output = 0 )
+bool tnlCUDASimpleReduction1( const int size,
+	                          const T* device_input,
+	                          T& result,
+	                          T* device_output = 0 )
 {
    //Calculate necessary block/grid dimensions
    const int cpuThreshold = 1;
-   const int desBlockSize = 128;    //Desired block size
-   dim3 block_size = :: Min( size, desBlockSize );
-   dim3 grid_size = size / blockSize. x;
-   unsigned int shmem = blockSize. x * sizeof( int );
-   cout << "Grid size: " << grid_size. x << endl
-        << "Block size: " << block_size. x << endl
-        << "Shmem: " << shmem << endl;
-   if( ! output )
+   const int desBlockSize = 256;    //Desired block size
+
+   bool device_output_allocated( false );
+   if( ! device_output )
    {
-      cudaMalloc( ( void** ) &output, grid_size * sizeof( T ) );
-      if( cuda // TODO: add allocation check
+	   int bytes_alloc = :: Max( 1, size / desBlockSize ) * sizeof( T );
+       cudaMalloc( ( void** ) &device_output, bytes_alloc );
+       if( cudaGetLastError() != cudaSuccess )
+       {
+    	   cerr << "Unable to allocate device memory with size " << bytes_alloc << "." << endl;
+    	   return false;
+       }
+       device_output_allocated = true;
    }
-   tnlCUDASimpleReductionKernel1< T, operation ><<< grid_size, block_size, shmem >>>( size, input, output );
-   int size_reduced = grid_size. x;
-   while( sizeReduced > cpuThreshold )
+   dim3 block_size( 0 ), grid_size( 0 );
+   int shmem;
+   int size_reduced = size;
+   const T* reduction_input = device_input;
+   while( size_reduced > cpuThreshold )
    {
-      cout << "Reducing with size reduced = " << size_reduced << endl;
       block_size. x = :: Min( size_reduced, desBlockSize );
       grid_size. x = size_reduced / block_size. x;
-      shmem = block_size. x * sizeof(int);
-      tnlCUDASimpleReductionKernel1< T, operation ><<< grid_size, block_size, shmem >>>( size, input, output );
+      shmem = block_size. x * sizeof( T );
+      /*cout << "Size: " << size_reduced
+           << " Grid size: " << grid_size. x
+           << " Block size: " << block_size. x
+           << " Shmem: " << shmem << endl;*/
+      tnlCUDASimpleReductionKernel1< T, operation ><<< grid_size, block_size, shmem >>>( size_reduced, reduction_input, device_output );
       size_reduced = grid_size. x;
+      reduction_input = device_output;
+
+      // debugging part
+      /*T* output = new T[ size_reduced ];
+      cudaMemcpy( output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost );
+      cout << endl;
+      for( int i = 0; i < size_reduced; i ++ )
+    	  cout << output[ i ] << "   ";
+      cout << endl;
+      delete[] output;*/
    }
-   int* host_output = new int[ size_reduced ];
-   cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost );
-   int result = host_output[ 0 ];
-   for( int i = 1;i < size_reduced; i++ )
+   T* host_output = new T[ size_reduced ];
+   cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost );
+   result = host_output[ 0 ];
+   for( int i = 1; i < size_reduced; i++ )
    {
       if( operation == tnlMin)
          result = :: Min( result, host_output[ i ] );
       if( operation == tnlMax )
          result = :: Max( result, host_output[ i ] );
       if( operation == tnlSum )
-         result += host_ouput[ i ];
+         result += host_output[ i ];
    }
    delete[] host_output;
-
-   return result;
+   if( device_output_allocated )
+   	   cudaFree( device_output );
+   return true;
 }
 
 #endif /* HAVE_CUDA */
diff --git a/src/core/tnlCUDAKernelsTester.h b/src/core/tnlCUDAKernelsTester.h
index 6d0ba3e330..aa14fddc11 100644
--- a/src/core/tnlCUDAKernelsTester.h
+++ b/src/core/tnlCUDAKernelsTester.h
@@ -66,49 +66,161 @@ double tnlCUDAReductionSum( const int size,
                             const int block_size,
                             const int grid_size,
                             const double* input );
+/*
+ * Simple reduction 5
+ */
+bool tnlCUDASimpleReduction5Min( const int size,
+                                 const int* input,
+                                 int& result );
+bool tnlCUDASimpleReduction5Max( const int size,
+                                 const int* input,
+                                 int& result );
+bool tnlCUDASimpleReduction5Sum( const int size,
+                                 const int* input,
+                                 int& result );
+bool tnlCUDASimpleReduction5Min( const int size,
+                                 const float* input,
+                                 float& result);
+bool tnlCUDASimpleReduction5Max( const int size,
+                                 const float* input,
+                                 float& result);
+bool tnlCUDASimpleReduction5Sum( const int size,
+                                 const float* input,
+                                 float& result);
+bool tnlCUDASimpleReduction5Min( const int size,
+                                 const double* input,
+                                 double& result);
+bool tnlCUDASimpleReduction5Max( const int size,
+                                 const double* input,
+                                 double& result );
+bool tnlCUDASimpleReduction5Sum( const int size,
+                                 const double* input,
+                                 double& result );
+
+
+/*
+ * Simple reduction 4
+ */
+bool tnlCUDASimpleReduction4Min( const int size,
+                          const int* input,
+                          int& result );
+bool tnlCUDASimpleReduction4Max( const int size,
+                          const int* input,
+                          int& result );
+bool tnlCUDASimpleReduction4Sum( const int size,
+                          const int* input,
+                          int& result );
+bool tnlCUDASimpleReduction4Min( const int size,
+                            const float* input,
+                            float& result);
+bool tnlCUDASimpleReduction4Max( const int size,
+                            const float* input,
+                            float& result);
+bool tnlCUDASimpleReduction4Sum( const int size,
+                            const float* input,
+                            float& result);
+bool tnlCUDASimpleReduction4Min( const int size,
+                             const double* input,
+                             double& result);
+bool tnlCUDASimpleReduction4Max( const int size,
+                             const double* input,
+                             double& result );
+bool tnlCUDASimpleReduction4Sum( const int size,
+                             const double* input,
+                             double& result );
+
+/*
+ * Simple reduction 3
+ */
+bool tnlCUDASimpleReduction3Min( const int size,
+                          const int* input,
+                          int& result );
+bool tnlCUDASimpleReduction3Max( const int size,
+                          const int* input,
+                          int& result );
+bool tnlCUDASimpleReduction3Sum( const int size,
+                          const int* input,
+                          int& result );
+bool tnlCUDASimpleReduction3Min( const int size,
+                            const float* input,
+                            float& result);
+bool tnlCUDASimpleReduction3Max( const int size,
+                            const float* input,
+                            float& result);
+bool tnlCUDASimpleReduction3Sum( const int size,
+                            const float* input,
+                            float& result);
+bool tnlCUDASimpleReduction3Min( const int size,
+                             const double* input,
+                             double& result);
+bool tnlCUDASimpleReduction3Max( const int size,
+                             const double* input,
+                             double& result );
+bool tnlCUDASimpleReduction3Sum( const int size,
+                             const double* input,
+                             double& result );
+
+/*
+ * Simple reduction 2
+ */
+bool tnlCUDASimpleReduction2Min( const int size,
+                          const int* input,
+                          int& result );
+bool tnlCUDASimpleReduction2Max( const int size,
+                          const int* input,
+                          int& result );
+bool tnlCUDASimpleReduction2Sum( const int size,
+                          const int* input,
+                          int& result );
+bool tnlCUDASimpleReduction2Min( const int size,
+                            const float* input,
+                            float& result);
+bool tnlCUDASimpleReduction2Max( const int size,
+                            const float* input,
+                            float& result);
+bool tnlCUDASimpleReduction2Sum( const int size,
+                            const float* input,
+                            float& result);
+bool tnlCUDASimpleReduction2Min( const int size,
+                             const double* input,
+                             double& result);
+bool tnlCUDASimpleReduction2Max( const int size,
+                             const double* input,
+                             double& result );
+bool tnlCUDASimpleReduction2Sum( const int size,
+                             const double* input,
+                             double& result );
 
 /*
  * Simple reduction 1
  */
-int tnlCUDASimpleReduction1Min( const int size,
-                          const int block_size,
-                          const int grid_size,
+bool tnlCUDASimpleReduction1Min( const int size,
                           const int* input,
-                          int* output );
-int tnlCUDASimpleReduction1Max( const int size,
-                          const int block_size,
-                          const int grid_size,
+                          int& result );
+bool tnlCUDASimpleReduction1Max( const int size,
                           const int* input,
-                          int* output );
-int tnlCUDASimpleReduction1Sum( const int size,
-                          const int block_size,
-                          const int grid_size,
+                          int& result );
+bool tnlCUDASimpleReduction1Sum( const int size,
                           const int* input,
-                          int* output );
-float tnlCUDASimpleReduction1Min( const int size,
-                            const int block_size,
-                            const int grid_size,
-                            const float* input );
-float tnlCUDASimpleReduction1Max( const int size,
-                            const int block_size,
-                            const int grid_size,
-                            const float* input );
-float tnlCUDASimpleReduction1Sum( const int size,
-                            const int block_size,
-                            const int grid_size,
-                            const float* input );
-double tnlCUDASimpleReduction1Min( const int size,
-                             const int block_size,
-                             const int grid_size,
-                             const double* input );
-double tnlCUDASimpleReduction1Max( const int size,
-                             const int block_size,
-                             const int grid_size,
-                             const double* input );
-double tnlCUDASimpleReduction1Sum( const int size,
-                             const int block_size,
-                             const int grid_size,
-                             const double* input );
+                          int& result );
+bool tnlCUDASimpleReduction1Min( const int size,
+                            const float* input,
+                            float& result);
+bool tnlCUDASimpleReduction1Max( const int size,
+                            const float* input,
+                            float& result);
+bool tnlCUDASimpleReduction1Sum( const int size,
+                            const float* input,
+                            float& result);
+bool tnlCUDASimpleReduction1Min( const int size,
+                             const double* input,
+                             double& result);
+bool tnlCUDASimpleReduction1Max( const int size,
+                             const double* input,
+                             double& result );
+bool tnlCUDASimpleReduction1Sum( const int size,
+                             const double* input,
+                             double& result );
 
 #endif
 
@@ -125,57 +237,71 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
    {
       CppUnit :: TestSuite* suiteOfTests = new CppUnit :: TestSuite( "tnlCUDAKernelsTester" );
       CppUnit :: TestResult result;
+
       suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
     		                   "testSimpleReduction1",
                                & tnlCUDAKernelsTester< T > :: testSimpleReduction1 )
                              );
       suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
+        		               "testSimpleReduction2",
+                               & tnlCUDAKernelsTester< T > :: testSimpleReduction2 )
+                              );
+      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
+                      		   "testSimpleReduction3",
+                               & tnlCUDAKernelsTester< T > :: testSimpleReduction3 )
+                              );
+      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
+                               "testSimpleReduction4",
+                               & tnlCUDAKernelsTester< T > :: testSimpleReduction4 )
+                              );
+      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
+                               "testSimpleReduction5",
+                               & tnlCUDAKernelsTester< T > :: testSimpleReduction5 )
+                              );
+      /*suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
                                "testReduction",
                                & tnlCUDAKernelsTester< T > :: testFastReduction )
-                             );
+                             );*/
 
       return suiteOfTests;
    };
 
    bool testSetup( tnlLongVector< T >& host_input,
-		           tnlLongVector< T >& host_output,
 		           tnlLongVectorCUDA< T >& device_input,
-		           tnlLongVectorCUDA< T >& device_output,
 		           int size )
    {
 	   if( ! host_input. SetNewSize( size ) )
 		   return false;
-	   if( ! host_output. SetNewSize( size ) )
-		   return false;
 	   if( ! device_input. SetNewSize( size ) )
 		   return false;
-	   if( ! device_output. SetNewSize( size ) )
-		   return false;
 
 	   for( int i=0; i < size; i ++ )
-	   {
 		   host_input[ i ] = i + 1;
-		   host_output[ i ] = 0;
-	   }
+
 	   device_input. copyFrom( host_input );
 	   return true;
    }
 
    void testReduction( int algorithm_efficiency = 0 )
    {
-	   int size = 1<<16;
+	   int size = 1<<10;
 	   int desBlockSize = 128;    //Desired block size
 	   int desGridSize = 2048;    //Impose limitation on grid size so that threads could perform sequential work
 
-	   tnlLongVector< T > host_input, host_output;
-	   tnlLongVectorCUDA< T > device_input, device_output;
+	   tnlLongVector< T > host_input;
+	   tnlLongVectorCUDA< T > device_input;
 	   CPPUNIT_ASSERT( testSetup( host_input,
-		         	   host_output,
 		         	   device_input,
-		         	   device_output,
 		         	   size )  );
-
-
+	   T seq_min( host_input[ 0 ] ),
+	     seq_max( host_input[ 0 ] ),
+	     seq_sum( host_input[ 0 ] );
+	   for( int i = 1; i < size; i ++ )
+	   {
+		   seq_min = :: Min( seq_min, host_input[ i ] );
+		   seq_max = :: Max( seq_max, host_input[ i ] );
+		   seq_sum += host_input[ i ];
+	   }
 
 	   //Calculate necessary block/grid dimensions
 	   int block_size = :: Min( size/2, desBlockSize );
@@ -186,9 +312,29 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
 	   switch( algorithm_efficiency )
 	   {
 		   case 1:
-			   min = tnlCUDASimpleReduction1Min( size, block_size, grid_size, device_input. Data(), device_output. Data() );
-			   max = tnlCUDASimpleReduction1Max( size, block_size, grid_size, device_input. Data(), device_output. Data() );
-			   sum = tnlCUDASimpleReduction1Sum( size, block_size, grid_size, device_input. Data(), device_output. Data() );
+			   tnlCUDASimpleReduction1Min( size, device_input. Data(), min );
+			   tnlCUDASimpleReduction1Max( size, device_input. Data(), max );
+			   tnlCUDASimpleReduction1Sum( size, device_input. Data(), sum );
+			   break;
+		   case 2:
+			   tnlCUDASimpleReduction2Min( size, device_input. Data(), min );
+			   tnlCUDASimpleReduction2Max( size, device_input. Data(), max );
+			   tnlCUDASimpleReduction2Sum( size, device_input. Data(), sum );
+			   break;
+		   case 3:
+			   tnlCUDASimpleReduction3Min( size, device_input. Data(), min );
+			   tnlCUDASimpleReduction3Max( size, device_input. Data(), max );
+			   tnlCUDASimpleReduction3Sum( size, device_input. Data(), sum );
+			   break;
+		   case 4:
+			   tnlCUDASimpleReduction4Min( size, device_input. Data(), min );
+			   tnlCUDASimpleReduction4Max( size, device_input. Data(), max );
+			   tnlCUDASimpleReduction4Sum( size, device_input. Data(), sum );
+			   break;
+		   case 5:
+			   tnlCUDASimpleReduction5Min( size, device_input. Data(), min );
+			   tnlCUDASimpleReduction5Max( size, device_input. Data(), max );
+			   tnlCUDASimpleReduction5Sum( size, device_input. Data(), sum );
 			   break;
 		   default:
 			   min = tnlCUDAReductionMin( size, block_size, grid_size, device_input. Data() );
@@ -196,14 +342,44 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
 			   sum = tnlCUDAReductionSum( size, block_size, grid_size, device_input. Data() );
 	   }
 
-	   cout << "Min: " << min << endl
-			<< "Max: " << max << endl
-			<< "Sum: " << sum << endl;
 
+	   cout << "Min: " << min << " Seq. min: " << seq_min << endl
+			<< "Max: " << max << " Seq. max: " << seq_max << endl
+			<< "Sum: " << sum << " Seq. sum: " << seq_sum << endl;
+
+	   CPPUNIT_ASSERT( min == seq_min );
+	   CPPUNIT_ASSERT( max == seq_max );
+	   CPPUNIT_ASSERT( sum == seq_sum );
+
+   };
+
+   void testSimpleReduction5()
+   {
+   	   cout << "Test reduction 5" << endl;
+   	   testReduction( 5 );
+   };
+
+   void testSimpleReduction4()
+   {
+	   cout << "Test reduction 4" << endl;
+	   testReduction( 4 );
+   };
+
+   void testSimpleReduction3()
+   {
+   	   cout << "Test reduction 3" << endl;
+     	   testReduction( 3 );
+   };
+
+   void testSimpleReduction2()
+   {
+	   cout << "Test reduction 2" << endl;
+  	   testReduction( 2 );
    };
 
    void testSimpleReduction1()
    {
+	   cout << "Test reduction 1" << endl;
 	   testReduction( 1 );
    };
 
diff --git a/src/tnl-unit-tests.cpp b/src/tnl-unit-tests.cpp
index 0e4be63e70..e75f2578d8 100644
--- a/src/tnl-unit-tests.cpp
+++ b/src/tnl-unit-tests.cpp
@@ -44,8 +44,8 @@ int main( int argc, char* argv[] )
    runner.addTest( tnlGridCUDA2DTester< double > :: suite() );
    
    runner.addTest( tnlCUDAKernelsTester< int > :: suite() );
-   //runner.addTest( tnlCUDAKernelsTester< float > :: suite() );
-   //runner.addTest( tnlCUDAKernelsTester< double > :: suite() );
+   runner.addTest( tnlCUDAKernelsTester< float > :: suite() );
+   runner.addTest( tnlCUDAKernelsTester< double > :: suite() );
    
    runner.run();
    return 0;
-- 
GitLab