From 0076d54f55291d33fe43fe8200fba4f29c2e94ee Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Tue, 19 Jan 2010 20:34:05 +0000
Subject: [PATCH] Implementing CUDA parallel reduction.

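Replace the block_size and grid_size arguments of the reduction entry
points: the kernel launch configuration is now computed inside
tnlCUDAReduction, the reduced value is returned through a reference,
and an optional pre-allocated auxiliary device array can be passed in.
The dispatch now instantiates tnlCUDAReductionKernel for the actual
block size, and the declarations shared between tnl-cuda-kernels.cu and
the tester move to the new header tnl-cuda-kernels.cu.h.

A minimal sketch of the new calling convention (device_input stands for
a device vector of floats and is assumed here only for illustration):

   float min, max, sum;
   tnlCUDAReductionMin( size, device_input. Data(), min );
   tnlCUDAReductionMax( size, device_input. Data(), max );
   tnlCUDAReductionSum( size, device_input. Data(), sum );

The return value reports whether the reduction succeeded.
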
---
 src/core/Makefile.am            |   1 +
 src/core/tnl-cuda-kernels.cu    |  92 +++++----
 src/core/tnl-cuda-kernels.cu.h  | 222 +++++++++++++++++++++
 src/core/tnl-cuda-kernels.h     | 333 ++++++++++++++++----------------
 src/core/tnlCUDAKernelsTester.h | 215 ++-------------------
 src/tnl-unit-tests.cpp          |   2 +-
 6 files changed, 448 insertions(+), 417 deletions(-)
 create mode 100644 src/core/tnl-cuda-kernels.cu.h

diff --git a/src/core/Makefile.am b/src/core/Makefile.am
index 23c69744c3..097fe1d20e 100644
--- a/src/core/Makefile.am
+++ b/src/core/Makefile.am
@@ -38,6 +38,7 @@ headers = tnlAssert.h \
 		    tnlTester.h \
 		    tnlVector.h \
 		    tnl-cuda-kernels.h \
+		    tnl-cuda-kernels.cu.h \
 		    tnlCUDAKernelsTester.h \
 		    compress-file.h \
 		    mfilename.h \
diff --git a/src/core/tnl-cuda-kernels.cu b/src/core/tnl-cuda-kernels.cu
index b36083a498..01a739d057 100644
--- a/src/core/tnl-cuda-kernels.cu
+++ b/src/core/tnl-cuda-kernels.cu
@@ -22,76 +22,74 @@
 using namespace std;
 
 int tnlCUDAReductionMin( const int size,
-                         const int block_size,
-                         const int grid_size,
-                         const int* input )
+                         const int* input,
+                         int& result,
+                         int* device_aux_array = 0 )
 {
-   return tnlCUDAReduction< int, tnlMin >( size, block_size, grid_size, input );
+   return tnlCUDAReduction< int, tnlMin >( size, input, result, device_aux_array );
 }
 
 int tnlCUDAReductionMax( const int size,
-                         const int block_size,
-                         const int grid_size,
-                         const int* input )
+                         const int* input,
+                         int& result,
+                         int* device_aux_array = 0 )
 {
-   return tnlCUDAReduction< int, tnlMax >( size, block_size, grid_size, input );
+   return tnlCUDAReduction< int, tnlMax >( size, input, result, device_aux_array );
 }
-                         
+
 int tnlCUDAReductionSum( const int size,
-                         const int block_size,
-                         const int grid_size,
-                         const int* input )
+                         const int* input,
+                         int& result,
+                         int* device_aux_array = 0 )
 {
-   return tnlCUDAReduction< int, tnlSum >( size, block_size, grid_size, input );
+   return tnlCUDAReduction< int, tnlSum >( size, input, result, device_aux_array );
 }
 
-
-float tnlCUDAReductionMin( const int size,
-                           const int block_size,
-                           const int grid_size,
-                           const float* input )
+bool tnlCUDAReductionMin( const int size,
+                          const float* input,
+                          float& result,
+                          float* device_aux_array = 0 )
 {
-   return tnlCUDAReduction< float, tnlMin >( size, block_size, grid_size, input );
+   return tnlCUDAReduction< float, tnlMin >( size, input, result, device_aux_array );
 }
 
-float tnlCUDAReductionMax( const int size,
-                           const int block_size,
-                           const int grid_size,
-                           const float* input )
+bool tnlCUDAReductionMax( const int size,
+                          const float* input,
+                          float& result,
+                          float* device_aux_array = 0 )
 {
-   return tnlCUDAReduction< float, tnlMax >( size, block_size, grid_size, input );
+   return tnlCUDAReduction< float, tnlMax >( size, input, result, device_aux_array );
 }
-                         
-float tnlCUDAReductionSum( const int size,
-                           const int block_size,
-                           const int grid_size,
-                           const float* input )
+
+bool tnlCUDAReductionSum( const int size,
+                          const float* input,
+                          float& result,
+                          float* device_aux_array = 0 )
 {
-   return tnlCUDAReduction< float, tnlSum >( size, block_size, grid_size, input );
+   return tnlCUDAReduction< float, tnlSum >( size, input, result, device_aux_array );
 }
-
-double tnlCUDAReductionMin( const int size,
-                            const int block_size,
-                            const int grid_size,
-                            const double* input )
+bool tnlCUDAReductionMin( const int size,
+                          const double* input,
+                          double& result,
+                          double* device_aux_array = 0 )
 {
-   return tnlCUDAReduction< double, tnlMin >( size, block_size, grid_size, input );
+   return tnlCUDAReduction< double, tnlMin >( size, input, result, device_aux_array );
 }
 
-double tnlCUDAReductionMax( const int size,
-                            const int block_size,
-                            const int grid_size,
-                            const double* input )
+bool tnlCUDAReductionMax( const int size,
+                          const double* input,
+                          double& result,
+                          double* device_aux_array = 0 )
 {
-   return tnlCUDAReduction< double, tnlMax >( size, block_size, grid_size, input );
+   return tnlCUDAReduction< double, tnlMax >( size, input, result, device_aux_array );
 }
-                         
-double tnlCUDAReductionSum( const int size,
-                            const int block_size,
-                            const int grid_size,
-                            const double* input )
+
+bool tnlCUDAReductionSum( const int size,
+                          const double* input,
+                          double& result,
+                          double* device_aux_array = 0 )
 {
-   return tnlCUDAReduction< double, tnlSum >( size, block_size, grid_size, input );
+   return tnlCUDAReduction< double, tnlSum >( size, input, result, device_aux_array );
 }
 
 /*
diff --git a/src/core/tnl-cuda-kernels.cu.h b/src/core/tnl-cuda-kernels.cu.h
new file mode 100644
index 0000000000..6a0da10bfc
--- /dev/null
+++ b/src/core/tnl-cuda-kernels.cu.h
@@ -0,0 +1,222 @@
+/***************************************************************************
+                          tnl-cuda-kernels.cu.h  -  description
+                             -------------------
+    begin                : Jan 19, 2010
+    copyright            : (C) 2010 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#ifndef TNLCUDAKERNELS_CU_H_
+#define TNLCUDAKERNELS_CU_H_
+
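+/*
+ * Parallel reduction on the CUDA device. The reduced value is returned
+ * through 'result' and the return value reports whether the reduction
+ * succeeded. 'device_aux_array' may point to a pre-allocated device
+ * buffer for the per-block partial results; when it is 0 the auxiliary
+ * buffer is allocated internally.
+ */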
+int tnlCUDAReductionMin( const int size,
+                         const int* input,
+                         int& result,
+                         int* device_aux_array = 0 );
+
+int tnlCUDAReductionMax( const int size,
+                         const int* input,
+                         int& result,
+                         int* device_aux_array = 0 );
+
+int tnlCUDAReductionSum( const int size,
+                         const int* input,
+                         int& result,
+                         int* device_aux_array = 0 );
+
+bool tnlCUDAReductionMin( const int size,
+                          const float* input,
+                          float& result,
+                          float* device_aux_array = 0 );
+
+bool tnlCUDAReductionMax( const int size,
+                          const float* input,
+                          float& result,
+                          float* device_aux_array = 0 );
+
+bool tnlCUDAReductionSum( const int size,
+                          const float* input,
+                          float& result,
+                          float* device_aux_array = 0 );
+
+bool tnlCUDAReductionMin( const int size,
+                          const double* input,
+                          double& result,
+                          double* device_aux_array = 0 );
+
+bool tnlCUDAReductionMax( const int size,
+                          const double* input,
+                          double& result,
+                          double* device_aux_array = 0 );
+
+bool tnlCUDAReductionSum( const int size,
+                          const double* input,
+                          double& result,
+                          double* device_aux_array = 0 );
+
+/*
+ * Simple reduction 5
+ */
+bool tnlCUDASimpleReduction5Min( const int size,
+                                 const int* input,
+                                 int& result );
+bool tnlCUDASimpleReduction5Max( const int size,
+                                 const int* input,
+                                 int& result );
+bool tnlCUDASimpleReduction5Sum( const int size,
+                                 const int* input,
+                                 int& result );
+bool tnlCUDASimpleReduction5Min( const int size,
+                                 const float* input,
+                                 float& result);
+bool tnlCUDASimpleReduction5Max( const int size,
+                                 const float* input,
+                                 float& result);
+bool tnlCUDASimpleReduction5Sum( const int size,
+                                 const float* input,
+                                 float& result);
+bool tnlCUDASimpleReduction5Min( const int size,
+                                 const double* input,
+                                 double& result);
+bool tnlCUDASimpleReduction5Max( const int size,
+                                 const double* input,
+                                 double& result );
+bool tnlCUDASimpleReduction5Sum( const int size,
+                                 const double* input,
+                                 double& result );
+
+/*
+ * Simple reduction 4
+ */
+bool tnlCUDASimpleReduction4Min( const int size,
+                                 const int* input,
+                                 int& result );
+bool tnlCUDASimpleReduction4Max( const int size,
+                                 const int* input,
+                                 int& result );
+bool tnlCUDASimpleReduction4Sum( const int size,
+                                 const int* input,
+                                 int& result );
+bool tnlCUDASimpleReduction4Min( const int size,
+                                 const float* input,
+                                 float& result);
+bool tnlCUDASimpleReduction4Max( const int size,
+                                 const float* input,
+                                 float& result);
+bool tnlCUDASimpleReduction4Sum( const int size,
+                                 const float* input,
+                                 float& result);
+bool tnlCUDASimpleReduction4Min( const int size,
+                                 const double* input,
+                                 double& result);
+bool tnlCUDASimpleReduction4Max( const int size,
+                                 const double* input,
+                                 double& result );
+bool tnlCUDASimpleReduction4Sum( const int size,
+                                 const double* input,
+                                 double& result );
+
+/*
+ * Simple reduction 3
+ */
+bool tnlCUDASimpleReduction3Min( const int size,
+                                 const int* input,
+                                 int& result );
+bool tnlCUDASimpleReduction3Max( const int size,
+                                 const int* input,
+                                 int& result );
+bool tnlCUDASimpleReduction3Sum( const int size,
+                                 const int* input,
+                                 int& result );
+bool tnlCUDASimpleReduction3Min( const int size,
+                                 const float* input,
+                                 float& result );
+bool tnlCUDASimpleReduction3Max( const int size,
+                                 const float* input,
+                                 float& result );
+bool tnlCUDASimpleReduction3Sum( const int size,
+                                 const float* input,
+                                 float& result );
+bool tnlCUDASimpleReduction3Min( const int size,
+                                 const double* input,
+                                 double& result );
+bool tnlCUDASimpleReduction3Max( const int size,
+                                 const double* input,
+                                 double& result );
+bool tnlCUDASimpleReduction3Sum( const int size,
+                                 const double* input,
+                                 double& result );
+
+/*
+ * Simple reduction 2
+ */
+bool tnlCUDASimpleReduction2Min( const int size,
+                                 const int* input,
+                                 int& result );
+bool tnlCUDASimpleReduction2Max( const int size,
+                                 const int* input,
+                                 int& result );
+bool tnlCUDASimpleReduction2Sum( const int size,
+                                 const int* input,
+                                 int& result );
+bool tnlCUDASimpleReduction2Min( const int size,
+                                 const float* input,
+                                 float& result );
+bool tnlCUDASimpleReduction2Max( const int size,
+                                 const float* input,
+                                 float& result );
+bool tnlCUDASimpleReduction2Sum( const int size,
+                                 const float* input,
+                                 float& result );
+bool tnlCUDASimpleReduction2Min( const int size,
+                                 const double* input,
+                                 double& result );
+bool tnlCUDASimpleReduction2Max( const int size,
+                                 const double* input,
+                                 double& result );
+bool tnlCUDASimpleReduction2Sum( const int size,
+                                 const double* input,
+                                 double& result );
+
+/*
+ * Simple reduction 1
+ */
+bool tnlCUDASimpleReduction1Min( const int size,
+                                 const int* input,
+                                 int& result );
+bool tnlCUDASimpleReduction1Max( const int size,
+                                 const int* input,
+                                 int& result );
+bool tnlCUDASimpleReduction1Sum( const int size,
+                                 const int* input,
+                                 int& result );
+bool tnlCUDASimpleReduction1Min( const int size,
+                                 const float* input,
+                                 float& result );
+bool tnlCUDASimpleReduction1Max( const int size,
+                                 const float* input,
+                                 float& result );
+bool tnlCUDASimpleReduction1Sum( const int size,
+                                 const float* input,
+                                 float& result );
+bool tnlCUDASimpleReduction1Min( const int size,
+                                 const double* input,
+                                 double& result );
+bool tnlCUDASimpleReduction1Max( const int size,
+                                 const double* input,
+                                 double& result );
+bool tnlCUDASimpleReduction1Sum( const int size,
+                                 const double* input,
+                                 double& result );
+
+
+#endif /* TNLCUDAKERNELS_CU_H_ */
diff --git a/src/core/tnl-cuda-kernels.h b/src/core/tnl-cuda-kernels.h
index c56306f051..ba9bc974b2 100644
--- a/src/core/tnl-cuda-kernels.h
+++ b/src/core/tnl-cuda-kernels.h
@@ -27,15 +27,15 @@ enum tnlOperation { tnlMin, tnlMax, tnlSum };
 #ifdef HAVE_CUDA
 
 template< class T > __device__ T tnlCudaMin( const T& a,
-		                                     const T& b )
+                                             const T& b )
 {
-	return a < b ? a : b;
+   return a < b ? a : b;
 }
 
 template< class T > __device__ T tnlCudaMax( const T& a,
-		                                     const T& b )
+                                             const T& b )
 {
-	return a > b ? a : b;
+   return a > b ? a : b;
 }
 
 /*
@@ -51,159 +51,159 @@ template< class T > __device__ T tnlCudaMax( const T& a,
 
 template < class T, tnlOperation operation, int blockSize >
 __global__ void tnlCUDAReductionKernel( const int size,
-										const int grid_size,
-	                                    const T* d_input,
-	                                    T* d_output,
-	                                    T* dbg_array1 = 0  )
+                                        const int grid_size,
+                                        const T* d_input,
+                                        T* d_output,
+                                        T* dbg_array1 = 0 )
 {
-	extern __shared__ T sdata[];
+   extern __shared__ __align__ ( 8 ) T sdata[];
 
-	// Read data into the shared memory
-	int tid = threadIdx. x;
-	int gid = 2 * blockIdx. x * blockDim. x + threadIdx. x;
-	// Last thread ID which manipulates meaningful data
+   // Read data into the shared memory
+   int tid = threadIdx. x;
+   int gid = 2 * blockIdx. x * blockDim. x + threadIdx. x;
+   // Last thread ID which manipulates meaningful data
 
-	int grid_size = 2 * blockSize * gridDim. x;
-	if( gid + blockSize < size )
-	{
-		if( operation == tnlMin ) sdata[ tid ] = :: Min( d_input[ gid ], d_input[ gid + blockSize ] );
-		if( operation == tnlMax ) sdata[ tid ] = :: Max( d_input[ gid ], d_input[ gid + blockSize ] );
-		if( operation == tnlSum ) sdata[ tid ] = d_input[ gid ] + d_input[ gid + blockSize ];
-	}
-	else
-	{
-		sdata[ tid ] = d_input[ gid ];
-	}
-	gid += grid_size;
+   //int grid_size = 2 * blockSize * gridDim. x;
+   if( gid + blockSize < size )
+   {
+      if( operation == tnlMin ) sdata[ tid ] = :: tnlCudaMin( d_input[ gid ], d_input[ gid + blockSize ] );
+      if( operation == tnlMax ) sdata[ tid ] = :: tnlCudaMax( d_input[ gid ], d_input[ gid + blockSize ] );
+      if( operation == tnlSum ) sdata[ tid ] = d_input[ gid ] + d_input[ gid + blockSize ];
+   }
+   else
+   {
+      sdata[ tid ] = d_input[ gid ];
+   }
+   gid += grid_size;
 
-	while (gid < size)
-	{
-		if( operation == tnlMin ) sdata[ tid ] = :: Min( sdata[ tid ], :: Min( d_input[gid], d_input[gid+blockSize] );
-		if( operation == tnlMax ) sdata[ tid ] = :: Max( sdata[ tid ], :: Max( d_input[gid], d_input[gid+blockSize] );
-		if( operation == tnlSum ) sdata[ tid ] += d_input[gid] + d_input[ gid + blockSize ];
-		gid += gridSize;
-	}
-	__syncthreads();
+   while( gid < size )
+   {
+      if( operation == tnlMin ) sdata[ tid ] = :: tnlCudaMin( sdata[ tid ], :: tnlCudaMin( d_input[gid], d_input[gid+blockSize] ) );
+      if( operation == tnlMax ) sdata[ tid ] = :: tnlCudaMax( sdata[ tid ], :: tnlCudaMax( d_input[gid], d_input[gid+blockSize] ) );
+      if( operation == tnlSum ) sdata[ tid ] += d_input[gid] + d_input[ gid + blockSize ];
+      gid += grid_size;
+   }
+   __syncthreads();
 
-	if( gid + blockDim. x < size )
-	{
-		if( operation == tnlMin )
-			sdata[ tid ] = tnlCudaMin( d_input[ gid ], d_input[ gid + blockDim. x ] );
-		if( operation == tnlMax )
-			sdata[ tid ] = tnlCudaMax( d_input[ gid ], d_input[ gid + blockDim. x ] );
-		if( operation == tnlSum )
-			sdata[ tid ] = d_input[ gid ] + d_input[ gid + blockDim. x ];
-	}
-	else if( gid < size )
-	{
-		sdata[ tid ] = d_input[ gid ];
-	}
-	__syncthreads();
+   if( gid + blockDim. x < size )
+   {
+      if( operation == tnlMin )
+         sdata[ tid ] = tnlCudaMin( d_input[ gid ], d_input[ gid + blockDim. x ] );
+      if( operation == tnlMax )
+         sdata[ tid ] = tnlCudaMax( d_input[ gid ], d_input[ gid + blockDim. x ] );
+      if( operation == tnlSum )
+         sdata[ tid ] = d_input[ gid ] + d_input[ gid + blockDim. x ];
+   }
+   else if( gid < size )
+   {
+      sdata[ tid ] = d_input[ gid ];
+   }
+   __syncthreads();
 
-	// Parallel reduction
-		if( blockSize == 512 )
-		{
-			if( tid < 256 )
-			{
-				if( operation == tnlMin )
-					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 256 ] );
-				if( operation == tnlMax )
-					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 256 ] );
-				if( operation == tnlSum )
-					sdata[ tid ] += sdata[ tid + 256 ];
-			}
-			__syncthreads();
-		}
-		if( blockSize >= 256 )
-		{
-			if( tid < 128 )
-			{
-				if( operation == tnlMin )
-					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 128 ] );
-				if( operation == tnlMax )
-					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 128 ] );
-				if( operation == tnlSum )
-					sdata[ tid ] += sdata[ tid + 128 ];
-			}
-			__syncthreads();
-		}
-		if( blockSize >= 128 )
-		{
-			if (tid< 64)
-			{
-				if( operation == tnlMin )
-					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 64 ] );
-				if( operation == tnlMax )
-					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 64 ] );
-				if( operation == tnlSum )
-					sdata[ tid ] += sdata[ tid + 64 ];
-			}
-			__syncthreads();
-		}
-		/*
-		 * What follows runs in warp so it does not need to be synchronised.
-		 */
-		if( tid < 32 )
-		{
-			if( blockSize >= 64 )
-			{
-				if( operation == tnlMin )
-					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 32 ] );
-				if( operation == tnlMax )
-					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 32 ] );
-				if( operation == tnlSum )
-					sdata[ tid ] += sdata[ tid + 32 ];
-			}
-			if( blockSize >= 32 )
-			{
-				if( operation == tnlMin )
-					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 16 ] );
-				if( operation == tnlMax )
-					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 16 ] );
-				if( operation == tnlSum )
-					sdata[ tid ] += sdata[ tid + 16 ];
-			}
-			if( blockSize >= 16 )
-			{
-				if( operation == tnlMin )
-					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 8 ] );
-				if( operation == tnlMax )
-					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 8 ] );
-				if( operation == tnlSum )
-					sdata[ tid ] += sdata[ tid + 8 ];
-			}
-		    if( blockSize >= 8 )
-		    {
-		    	if( operation == tnlMin )
-		    		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 4 ] );
-		    	if( operation == tnlMax )
-		    		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 4 ] );
-		    	if( operation == tnlSum )
-		    		sdata[ tid ] += sdata[ tid + 4 ];
-		    }
-			if( blockSize >= 4 )
-			{
-				if( operation == tnlMin )
-					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 2 ] );
-				if( operation == tnlMax )
-					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 2 ] );
-				if( operation == tnlSum )
-					sdata[ tid ] += sdata[ tid + 2 ];
-			}
-			if( blockSize >= 2 )
-			{
-				if( operation == tnlMin )
-					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 1 ] );
-				if( operation == tnlMax )
-					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 1 ] );
-				if( operation == tnlSum )
-					sdata[ tid ] += sdata[ tid + 1 ];
-			}
-		}
+   // Parallel reduction
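+   // Tree reduction in shared memory: at each step the number of active
+   // threads is halved and each active thread combines its value with the
+   // value blockSize/2, blockSize/4, ... positions away. Since blockSize is
+   // a template parameter, the branches below can be resolved at compile time.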
+   if( blockSize == 512 )
+   {
+      if( tid < 256 )
+      {
+         if( operation == tnlMin )
+            sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 256 ] );
+         if( operation == tnlMax )
+            sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 256 ] );
+         if( operation == tnlSum )
+            sdata[ tid ] += sdata[ tid + 256 ];
+      }
+      __syncthreads();
+   }
+   if( blockSize >= 256 )
+   {
+      if( tid < 128 )
+      {
+         if( operation == tnlMin )
+            sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 128 ] );
+         if( operation == tnlMax )
+            sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 128 ] );
+         if( operation == tnlSum )
+            sdata[ tid ] += sdata[ tid + 128 ];
+      }
+      __syncthreads();
+   }
+   if( blockSize >= 128 )
+   {
+      if( tid < 64 )
+      {
+         if( operation == tnlMin )
+            sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 64 ] );
+         if( operation == tnlMax )
+            sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 64 ] );
+         if( operation == tnlSum )
+            sdata[ tid ] += sdata[ tid + 64 ];
+      }
+      __syncthreads();
+   }
+   /*
+    * What follows runs within a single warp, so it does not need to be synchronised.
+    */
+   if( tid < 32 )
+   {
+      if( blockSize >= 64 )
+      {
+         if( operation == tnlMin )
+            sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 32 ] );
+         if( operation == tnlMax )
+            sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 32 ] );
+         if( operation == tnlSum )
+            sdata[ tid ] += sdata[ tid + 32 ];
+      }
+      if( blockSize >= 32 )
+      {
+         if( operation == tnlMin )
+            sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 16 ] );
+         if( operation == tnlMax )
+            sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 16 ] );
+         if( operation == tnlSum )
+            sdata[ tid ] += sdata[ tid + 16 ];
+      }
+      if( blockSize >= 16 )
+      {
+         if( operation == tnlMin )
+            sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 8 ] );
+         if( operation == tnlMax )
+            sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 8 ] );
+         if( operation == tnlSum )
+            sdata[ tid ] += sdata[ tid + 8 ];
+      }
+      if( blockSize >= 8 )
+      {
+         if( operation == tnlMin )
+            sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 4 ] );
+         if( operation == tnlMax )
+            sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 4 ] );
+         if( operation == tnlSum )
+            sdata[ tid ] += sdata[ tid + 4 ];
+      }
+      if( blockSize >= 4 )
+      {
+         if( operation == tnlMin )
+            sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 2 ] );
+         if( operation == tnlMax )
+            sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 2 ] );
+         if( operation == tnlSum )
+            sdata[ tid ] += sdata[ tid + 2 ];
+      }
+      if( blockSize >= 2 )
+      {
+         if( operation == tnlMin )
+            sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 1 ] );
+         if( operation == tnlMax )
+            sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 1 ] );
+         if( operation == tnlSum )
+            sdata[ tid ] += sdata[ tid + 1 ];
+      }
+   }
 
-	// Store the result back in global memory
-	if( tid == 0 )
-		d_output[ blockIdx. x ] = sdata[ 0 ];
+   // Store the result back in global memory
+   if( tid == 0 )
+      d_output[ blockIdx. x ] = sdata[ 0 ];
 }
 
 /*
@@ -213,15 +213,14 @@ __global__ void tnlCUDAReductionKernel( const int size,
  */
 template< class T, tnlOperation operation >
 bool tnlCUDAReduction( const int size,
-	                   const T* device_input,
-	                   T& result,
-	                   T* device_output = 0 )
+                       const T* device_input,
+                       T& result,
+                       T* device_output = 0 )
 {
    //Calculate necessary block/grid dimensions
    const int cpuThreshold = 1;
    const int desBlockSize = 16;    //Desired block size
-
-   T* dbg_array1;
+   const int desGridSize = 2048;
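+   // The reduction proceeds in passes: each kernel launch produces one
+   // partial result per block, and the loop below repeats until at most
+   // cpuThreshold elements remain.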
 
    bool device_output_allocated( false );
    if( ! device_output )
@@ -234,9 +233,6 @@ bool tnlCUDAReduction( const int size,
     	   return false;
        }
        device_output_allocated = true;
-
-       //cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!
-       //cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!!
    }
    dim3 block_size( 0 ), grid_size( 0 );
    int shmem;
@@ -245,42 +241,41 @@ bool tnlCUDAReduction( const int size,
    while( size_reduced > cpuThreshold )
    {
       block_size. x = :: Min( size_reduced, desBlockSize );
-      grid_size. x = :: Min( ( size_reduced / block_size. x + 1 ) / 2, desGridSize );
+      grid_size. x = :: Min( ( int ) ( size_reduced / block_size. x + 1 ) / 2, desGridSize );
       shmem = block_size. x * sizeof( T );
       cout << "Size: " << size_reduced
            << " Grid size: " << grid_size. x
            << " Block size: " << block_size. x
            << " Shmem: " << shmem << endl;
-      //tnlCUDASimpleReductionKernel4< T, operation ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
       tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." );
       switch( block_size. x )
       {
 		  case 512:
-			  tnlCUDASimpleReductionKernel5< T, operation, 512 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
+			  tnlCUDAReductionKernel< T, operation, 512 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
 			  break;
 		  case 256:
-			  tnlCUDASimpleReductionKernel5< T, operation, 256 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
+			  tnlCUDAReductionKernel< T, operation, 256 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
 			  break;
 		  case 128:
-			  tnlCUDASimpleReductionKernel5< T, operation, 128 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
+			  tnlCUDAReductionKernel< T, operation, 128 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
 			  break;
 		  case  64:
-			  tnlCUDASimpleReductionKernel5< T, operation,  64 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
+			  tnlCUDAReductionKernel< T, operation,  64 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
 			  break;
 		  case  32:
-			  tnlCUDASimpleReductionKernel5< T, operation,  32 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
+			  tnlCUDAReductionKernel< T, operation,  32 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
 			  break;
 		  case  16:
-			  tnlCUDASimpleReductionKernel5< T, operation,  16 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
+			  tnlCUDAReductionKernel< T, operation,  16 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
 			  break;
 		  case   8:
-			  tnlCUDASimpleReductionKernel5< T, operation,   8 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
+			  tnlCUDAReductionKernel< T, operation,   8 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
 			  break;
 		  case   4:
-			  tnlCUDASimpleReductionKernel5< T, operation,   4 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
+			  tnlCUDAReductionKernel< T, operation,   4 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
 			  break;
 		  case   2:
-			  tnlCUDASimpleReductionKernel5< T, operation,   2 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
+			  tnlCUDAReductionKernel< T, operation,   2 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
 			  break;
 		  case   1:
 			  tnlAssert( false, cerr << "blockSize should not be 1." << endl );
diff --git a/src/core/tnlCUDAKernelsTester.h b/src/core/tnlCUDAKernelsTester.h
index aa14fddc11..82edc2341a 100644
--- a/src/core/tnlCUDAKernelsTester.h
+++ b/src/core/tnlCUDAKernelsTester.h
@@ -30,198 +30,7 @@
 using namespace std;
 
 #ifdef HAVE_CUDA
-int tnlCUDAReductionMin( const int size,
-                         const int block_size,
-                         const int grid_size,
-                         const int* input );
-int tnlCUDAReductionMax( const int size,
-                         const int block_size,
-                         const int grid_size,
-                         const int* input );
-int tnlCUDAReductionSum( const int size,
-                         const int block_size,
-                         const int grid_size,
-                         const int* input );
-float tnlCUDAReductionMin( const int size,
-                           const int block_size,
-                           const int grid_size,
-                           const float* input );
-float tnlCUDAReductionMax( const int size,
-                           const int block_size,
-                           const int grid_size,
-                           const float* input );
-float tnlCUDAReductionSum( const int size,
-                           const int block_size,
-                           const int grid_size,
-                           const float* input );
-double tnlCUDAReductionMin( const int size,
-                            const int block_size,
-                            const int grid_size,
-                            const double* input );
-double tnlCUDAReductionMax( const int size,
-                            const int block_size,
-                            const int grid_size,
-                            const double* input );
-double tnlCUDAReductionSum( const int size,
-                            const int block_size,
-                            const int grid_size,
-                            const double* input );
-/*
- * Simple reduction 5
- */
-bool tnlCUDASimpleReduction5Min( const int size,
-                                 const int* input,
-                                 int& result );
-bool tnlCUDASimpleReduction5Max( const int size,
-                                 const int* input,
-                                 int& result );
-bool tnlCUDASimpleReduction5Sum( const int size,
-                                 const int* input,
-                                 int& result );
-bool tnlCUDASimpleReduction5Min( const int size,
-                                 const float* input,
-                                 float& result);
-bool tnlCUDASimpleReduction5Max( const int size,
-                                 const float* input,
-                                 float& result);
-bool tnlCUDASimpleReduction5Sum( const int size,
-                                 const float* input,
-                                 float& result);
-bool tnlCUDASimpleReduction5Min( const int size,
-                                 const double* input,
-                                 double& result);
-bool tnlCUDASimpleReduction5Max( const int size,
-                                 const double* input,
-                                 double& result );
-bool tnlCUDASimpleReduction5Sum( const int size,
-                                 const double* input,
-                                 double& result );
-
-
-/*
- * Simple reduction 4
- */
-bool tnlCUDASimpleReduction4Min( const int size,
-                          const int* input,
-                          int& result );
-bool tnlCUDASimpleReduction4Max( const int size,
-                          const int* input,
-                          int& result );
-bool tnlCUDASimpleReduction4Sum( const int size,
-                          const int* input,
-                          int& result );
-bool tnlCUDASimpleReduction4Min( const int size,
-                            const float* input,
-                            float& result);
-bool tnlCUDASimpleReduction4Max( const int size,
-                            const float* input,
-                            float& result);
-bool tnlCUDASimpleReduction4Sum( const int size,
-                            const float* input,
-                            float& result);
-bool tnlCUDASimpleReduction4Min( const int size,
-                             const double* input,
-                             double& result);
-bool tnlCUDASimpleReduction4Max( const int size,
-                             const double* input,
-                             double& result );
-bool tnlCUDASimpleReduction4Sum( const int size,
-                             const double* input,
-                             double& result );
-
-/*
- * Simple reduction 3
- */
-bool tnlCUDASimpleReduction3Min( const int size,
-                          const int* input,
-                          int& result );
-bool tnlCUDASimpleReduction3Max( const int size,
-                          const int* input,
-                          int& result );
-bool tnlCUDASimpleReduction3Sum( const int size,
-                          const int* input,
-                          int& result );
-bool tnlCUDASimpleReduction3Min( const int size,
-                            const float* input,
-                            float& result);
-bool tnlCUDASimpleReduction3Max( const int size,
-                            const float* input,
-                            float& result);
-bool tnlCUDASimpleReduction3Sum( const int size,
-                            const float* input,
-                            float& result);
-bool tnlCUDASimpleReduction3Min( const int size,
-                             const double* input,
-                             double& result);
-bool tnlCUDASimpleReduction3Max( const int size,
-                             const double* input,
-                             double& result );
-bool tnlCUDASimpleReduction3Sum( const int size,
-                             const double* input,
-                             double& result );
-
-/*
- * Simple reduction 2
- */
-bool tnlCUDASimpleReduction2Min( const int size,
-                          const int* input,
-                          int& result );
-bool tnlCUDASimpleReduction2Max( const int size,
-                          const int* input,
-                          int& result );
-bool tnlCUDASimpleReduction2Sum( const int size,
-                          const int* input,
-                          int& result );
-bool tnlCUDASimpleReduction2Min( const int size,
-                            const float* input,
-                            float& result);
-bool tnlCUDASimpleReduction2Max( const int size,
-                            const float* input,
-                            float& result);
-bool tnlCUDASimpleReduction2Sum( const int size,
-                            const float* input,
-                            float& result);
-bool tnlCUDASimpleReduction2Min( const int size,
-                             const double* input,
-                             double& result);
-bool tnlCUDASimpleReduction2Max( const int size,
-                             const double* input,
-                             double& result );
-bool tnlCUDASimpleReduction2Sum( const int size,
-                             const double* input,
-                             double& result );
-
-/*
- * Simple reduction 1
- */
-bool tnlCUDASimpleReduction1Min( const int size,
-                          const int* input,
-                          int& result );
-bool tnlCUDASimpleReduction1Max( const int size,
-                          const int* input,
-                          int& result );
-bool tnlCUDASimpleReduction1Sum( const int size,
-                          const int* input,
-                          int& result );
-bool tnlCUDASimpleReduction1Min( const int size,
-                            const float* input,
-                            float& result);
-bool tnlCUDASimpleReduction1Max( const int size,
-                            const float* input,
-                            float& result);
-bool tnlCUDASimpleReduction1Sum( const int size,
-                            const float* input,
-                            float& result);
-bool tnlCUDASimpleReduction1Min( const int size,
-                             const double* input,
-                             double& result);
-bool tnlCUDASimpleReduction1Max( const int size,
-                             const double* input,
-                             double& result );
-bool tnlCUDASimpleReduction1Sum( const int size,
-                             const double* input,
-                             double& result );
-
+#include <core/tnl-cuda-kernels.cu.h>
 #endif
 
 
@@ -238,24 +47,30 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
       CppUnit :: TestSuite* suiteOfTests = new CppUnit :: TestSuite( "tnlCUDAKernelsTester" );
       CppUnit :: TestResult result;
 
+      T param;
+      tnlString test_name = tnlString( "testSimpleReduction1< " ) + GetParameterType( param ) + tnlString( " >" );
       suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
-    		                   "testSimpleReduction1",
+    		               test_name. Data(),
                                & tnlCUDAKernelsTester< T > :: testSimpleReduction1 )
                              );
+      test_name = tnlString( "testSimpleReduction2< " ) + GetParameterType( param ) + tnlString( " >" );
       suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
-        		               "testSimpleReduction2",
+                               test_name. Data(),
                                & tnlCUDAKernelsTester< T > :: testSimpleReduction2 )
                               );
+      test_name = tnlString( "testSimpleReduction3< " ) + GetParameterType( param ) + tnlString( " >" );
       suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
-                      		   "testSimpleReduction3",
+                               test_name. Data(),
                                & tnlCUDAKernelsTester< T > :: testSimpleReduction3 )
                               );
+      test_name = tnlString( "testSimpleReduction4< " ) + GetParameterType( param ) + tnlString( " >" );
       suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
-                               "testSimpleReduction4",
+                               test_name. Data(),
                                & tnlCUDAKernelsTester< T > :: testSimpleReduction4 )
                               );
+      test_name = tnlString( "testSimpleReduction5< " ) + GetParameterType( param ) + tnlString( " >" );
       suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
-                               "testSimpleReduction5",
+                               test_name. Data(),
                                & tnlCUDAKernelsTester< T > :: testSimpleReduction5 )
                               );
       /*suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
@@ -337,9 +152,9 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
 			   tnlCUDASimpleReduction5Sum( size, device_input. Data(), sum );
 			   break;
 		   default:
-			   min = tnlCUDAReductionMin( size, block_size, grid_size, device_input. Data() );
-			   max = tnlCUDAReductionMax( size, block_size, grid_size, device_input. Data() );
-			   sum = tnlCUDAReductionSum( size, block_size, grid_size, device_input. Data() );
+			   tnlCUDAReductionMin( size, device_input. Data(), min );
+			   tnlCUDAReductionMax( size, device_input. Data(), max );
+			   tnlCUDAReductionSum( size, device_input. Data(), sum );
 	   }
 
 
diff --git a/src/tnl-unit-tests.cpp b/src/tnl-unit-tests.cpp
index e75f2578d8..571a8f1c4c 100644
--- a/src/tnl-unit-tests.cpp
+++ b/src/tnl-unit-tests.cpp
@@ -29,7 +29,7 @@ using namespace std;
 
 int main( int argc, char* argv[] )
 {
-   CppUnit::TextUi::TestRunner runner;
+   CppUnit :: TextTestRunner runner;
    
    runner.addTest( tnlLongVectorCUDATester< int > :: suite() );
    runner.addTest( tnlLongVectorCUDATester< float > :: suite() );
-- 
GitLab