From 0076d54f55291d33fe43fe8200fba4f29c2e94ee Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz> Date: Tue, 19 Jan 2010 20:34:05 +0000 Subject: [PATCH] Implementing CUDA parallel reduction. --- src/core/Makefile.am | 1 + src/core/tnl-cuda-kernels.cu | 92 +++++---- src/core/tnl-cuda-kernels.cu.h | 222 +++++++++++++++++++++ src/core/tnl-cuda-kernels.h | 333 ++++++++++++++++---------------- src/core/tnlCUDAKernelsTester.h | 215 ++------------------- src/tnl-unit-tests.cpp | 2 +- 6 files changed, 448 insertions(+), 417 deletions(-) create mode 100644 src/core/tnl-cuda-kernels.cu.h diff --git a/src/core/Makefile.am b/src/core/Makefile.am index 23c69744c3..097fe1d20e 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -38,6 +38,7 @@ headers = tnlAssert.h \ tnlTester.h \ tnlVector.h \ tnl-cuda-kernels.h \ + tnl-cuda-kernels.cu.h \ tnlCUDAKernelsTester.h \ compress-file.h \ mfilename.h \ diff --git a/src/core/tnl-cuda-kernels.cu b/src/core/tnl-cuda-kernels.cu index b36083a498..01a739d057 100644 --- a/src/core/tnl-cuda-kernels.cu +++ b/src/core/tnl-cuda-kernels.cu @@ -22,76 +22,74 @@ using namespace std; int tnlCUDAReductionMin( const int size, - const int block_size, - const int grid_size, - const int* input ) + const int* input, + int& result, + int* device_aux_array = 0 ) { - return tnlCUDAReduction< int, tnlMin >( size, block_size, grid_size, input ); + return tnlCUDAReduction< int, tnlMin >( size, input, result, device_aux_array ); } int tnlCUDAReductionMax( const int size, - const int block_size, - const int grid_size, - const int* input ) + const int* input, + int& result, + int* device_aux_array = 0 ) { - return tnlCUDAReduction< int, tnlMax >( size, block_size, grid_size, input ); + return tnlCUDAReduction< int, tnlMax >( size, input, result, device_aux_array ); } - + int tnlCUDAReductionSum( const int size, - const int block_size, - const int grid_size, - const int* input ) + const int* input, + int& result, + int* device_aux_array = 0 ) { - return tnlCUDAReduction< int, tnlSum >( size, block_size, grid_size, input ); + return tnlCUDAReduction< int, tnlSum >( size, input, result, device_aux_array ); } - -float tnlCUDAReductionMin( const int size, - const int block_size, - const int grid_size, - const float* input ) +bool tnlCUDAReductionMin( const int size, + const float* input, + float& result, + float* device_aux_array = 0 ) { - return tnlCUDAReduction< float, tnlMin >( size, block_size, grid_size, input ); + return tnlCUDAReduction< float, tnlMin >( size, input, result, device_aux_array ); } -float tnlCUDAReductionMax( const int size, - const int block_size, - const int grid_size, - const float* input ) +bool tnlCUDAReductionMax( const int size, + const float* input, + float& result, + float* device_aux_array = 0 ) { - return tnlCUDAReduction< float, tnlMax >( size, block_size, grid_size, input ); + return tnlCUDAReduction< float, tnlMax >( size, input, result, device_aux_array ); } - -float tnlCUDAReductionSum( const int size, - const int block_size, - const int grid_size, - const float* input ) + +bool tnlCUDAReductionSum( const int size, + const float* input, + float& result, + float* device_aux_array = 0 ) { - return tnlCUDAReduction< float, tnlSum >( size, block_size, grid_size, input ); + return tnlCUDAReduction< float, tnlSum >( size, input, result, device_aux_array ); } - -double tnlCUDAReductionMin( const int size, - const int block_size, - const int grid_size, - const double* input ) +bool tnlCUDAReductionMin( const int size, + const double* input, + double& result, + double* device_aux_array = 0 ) { - return tnlCUDAReduction< double, tnlMin >( size, block_size, grid_size, input ); + return tnlCUDAReduction< double, tnlMin >( size, input, result, device_aux_array ); } -double tnlCUDAReductionMax( const int size, - const int block_size, - const int grid_size, - const double* input ) +bool tnlCUDAReductionMax( const int size, + const double* input, + double& result, + double* device_aux_array = 0 ) { - return tnlCUDAReduction< double, tnlMax >( size, block_size, grid_size, input ); + return tnlCUDAReduction< double, tnlMax >( size, input, result, device_aux_array ); } - -double tnlCUDAReductionSum( const int size, - const int block_size, - const int grid_size, - const double* input ) + +bool tnlCUDAReductionSum( const int size, + const double* input, + double& result, + double* device_aux_array = 0 ) { - return tnlCUDAReduction< double, tnlSum >( size, block_size, grid_size, input ); + return tnlCUDAReduction< double, tnlSum >( size, input, result, device_aux_array ); } /* diff --git a/src/core/tnl-cuda-kernels.cu.h b/src/core/tnl-cuda-kernels.cu.h new file mode 100644 index 0000000000..6a0da10bfc --- /dev/null +++ b/src/core/tnl-cuda-kernels.cu.h @@ -0,0 +1,222 @@ +/*************************************************************************** + tnl-cuda-kernels.cu.h - description + ------------------- + begin : Jan 19, 2010 + copyright : (C) 2010 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + ***************************************************************************/ + +#ifndef TNLCUDAKERNELS_CU_H_ +#define TNLCUDAKERNELS_CU_H_ + +int tnlCUDAReductionMin( const int size, + const int* input, + int& result, + int* device_aux_array = 0 ); + +int tnlCUDAReductionMax( const int size, + const int* input, + int& result, + int* device_aux_array = 0 ); + +int tnlCUDAReductionSum( const int size, + const int* input, + int& result, + int* device_aux_array = 0 ); + +bool tnlCUDAReductionMin( const int size, + const float* input, + float& result, + float* device_aux_array = 0 ); + +bool tnlCUDAReductionMax( const int size, + const float* input, + float& result, + float* device_aux_array = 0 ); + +bool tnlCUDAReductionSum( const int size, + const float* input, + float& result, + float* device_aux_array = 0 ); + +bool tnlCUDAReductionMin( const int size, + const double* input, + double& result, + double* device_aux_array = 0 ); + +bool tnlCUDAReductionMax( const int size, + const double* input, + double& result, + double* device_aux_array = 0 ); + +bool tnlCUDAReductionSum( const int size, + const double* input, + double& result, + double* device_aux_array = 0 ); + +/* + * Simple reduction 5 + */ +bool tnlCUDASimpleReduction5Min( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction5Max( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction5Sum( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction5Min( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction5Max( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction5Sum( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction5Min( const int size, + const double* input, + double& result); +bool tnlCUDASimpleReduction5Max( const int size, + const double* input, + double& result ); +bool tnlCUDASimpleReduction5Sum( const int size, + const double* input, + double& result ); + +/* + * Simple reduction 4 + */ +bool tnlCUDASimpleReduction4Min( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction4Max( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction4Sum( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction4Min( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction4Max( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction4Sum( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction4Min( const int size, + const double* input, + double& result); +bool tnlCUDASimpleReduction4Max( const int size, + const double* input, + double& result ); +bool tnlCUDASimpleReduction4Sum( const int size, + const double* input, + double& result ); + +/* + * Simple reduction 3 + */ +bool tnlCUDASimpleReduction3Min( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction3Max( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction3Sum( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction3Min( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction3Max( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction3Sum( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction3Min( const int size, + const double* input, + double& result); +bool tnlCUDASimpleReduction3Max( const int size, + const double* input, + double& result ); +bool tnlCUDASimpleReduction3Sum( const int size, + const double* input, + double& result ); + +/* + * Simple reduction 2 + */ +bool tnlCUDASimpleReduction2Min( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction2Max( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction2Sum( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction2Min( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction2Max( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction2Sum( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction2Min( const int size, + const double* input, + double& result); +bool tnlCUDASimpleReduction2Max( const int size, + const double* input, + double& result ); +bool tnlCUDASimpleReduction2Sum( const int size, + const double* input, + double& result ); + +/* + * Simple reduction 1 + */ +bool tnlCUDASimpleReduction1Min( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction1Max( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction1Sum( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction1Min( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction1Max( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction1Sum( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction1Min( const int size, + const double* input, + double& result); +bool tnlCUDASimpleReduction1Max( const int size, + const double* input, + double& result ); +bool tnlCUDASimpleReduction1Sum( const int size, + const double* input, + double& result ); + + +#endif /* TNLCUDAKERNELS_CU_H_ */ diff --git a/src/core/tnl-cuda-kernels.h b/src/core/tnl-cuda-kernels.h index c56306f051..ba9bc974b2 100644 --- a/src/core/tnl-cuda-kernels.h +++ b/src/core/tnl-cuda-kernels.h @@ -27,15 +27,15 @@ enum tnlOperation { tnlMin, tnlMax, tnlSum }; #ifdef HAVE_CUDA template< class T > __device__ T tnlCudaMin( const T& a, - const T& b ) + const T& b ) { - return a < b ? a : b; + return a < b ? a : b; } template< class T > __device__ T tnlCudaMax( const T& a, - const T& b ) + const T& b ) { - return a > b ? a : b; + return a > b ? a : b; } /* @@ -51,159 +51,159 @@ template< class T > __device__ T tnlCudaMax( const T& a, template < class T, tnlOperation operation, int blockSize > __global__ void tnlCUDAReductionKernel( const int size, - const int grid_size, - const T* d_input, - T* d_output, - T* dbg_array1 = 0 ) + const int grid_size, + const T* d_input, + T* d_output, + T* dbg_array1 = 0 ) { - extern __shared__ T sdata[]; + extern __shared__ __align__ ( 8 ) T sdata[]; - // Read data into the shared memory - int tid = threadIdx. x; - int gid = 2 * blockIdx. x * blockDim. x + threadIdx. x; - // Last thread ID which manipulates meaningful data + // Read data into the shared memory + int tid = threadIdx. x; + int gid = 2 * blockIdx. x * blockDim. x + threadIdx. x; + // Last thread ID which manipulates meaningful data - int grid_size = 2 * blockSize * gridDim. x; - if( gid + blockSize < size ) - { - if( operation == tnlMin ) sdata[ tid ] = :: Min( d_input[ gid ], d_input[ gid + blockSize ] ); - if( operation == tnlMax ) sdata[ tid ] = :: Max( d_input[ gid ], d_input[ gid + blockSize ] ); - if( operation == tnlSum ) sdata[ tid ] = d_input[ gid ] + d_input[ gid + blockSize ]; - } - else - { - sdata[ tid ] = d_input[ gid ]; - } - gid += grid_size; + //int grid_size = 2 * blockSize * gridDim. x; + if( gid + blockSize < size ) + { + if( operation == tnlMin ) sdata[ tid ] = :: tnlCudaMin( d_input[ gid ], d_input[ gid + blockSize ] ); + if( operation == tnlMax ) sdata[ tid ] = :: tnlCudaMax( d_input[ gid ], d_input[ gid + blockSize ] ); + if( operation == tnlSum ) sdata[ tid ] = d_input[ gid ] + d_input[ gid + blockSize ]; + } + else + { + sdata[ tid ] = d_input[ gid ]; + } + gid += grid_size; - while (gid < size) - { - if( operation == tnlMin ) sdata[ tid ] = :: Min( sdata[ tid ], :: Min( d_input[gid], d_input[gid+blockSize] ); - if( operation == tnlMax ) sdata[ tid ] = :: Max( sdata[ tid ], :: Max( d_input[gid], d_input[gid+blockSize] ); - if( operation == tnlSum ) sdata[ tid ] += d_input[gid] + d_input[ gid + blockSize ]; - gid += gridSize; - } - __syncthreads(); + while( gid < size ) + { + if( operation == tnlMin ) sdata[ tid ] = :: tnlCudaMin( sdata[ tid ], :: tnlCudaMin( d_input[gid], d_input[gid+blockSize] ) ); + if( operation == tnlMax ) sdata[ tid ] = :: tnlCudaMax( sdata[ tid ], :: tnlCudaMax( d_input[gid], d_input[gid+blockSize] ) ); + if( operation == tnlSum ) sdata[ tid ] += d_input[gid] + d_input[ gid + blockSize ]; + gid += grid_size; + } + __syncthreads(); - if( gid + blockDim. x < size ) - { - if( operation == tnlMin ) - sdata[ tid ] = tnlCudaMin( d_input[ gid ], d_input[ gid + blockDim. x ] ); - if( operation == tnlMax ) - sdata[ tid ] = tnlCudaMax( d_input[ gid ], d_input[ gid + blockDim. x ] ); - if( operation == tnlSum ) - sdata[ tid ] = d_input[ gid ] + d_input[ gid + blockDim. x ]; - } - else if( gid < size ) - { - sdata[ tid ] = d_input[ gid ]; - } - __syncthreads(); + if( gid + blockDim. x < size ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( d_input[ gid ], d_input[ gid + blockDim. x ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( d_input[ gid ], d_input[ gid + blockDim. x ] ); + if( operation == tnlSum ) + sdata[ tid ] = d_input[ gid ] + d_input[ gid + blockDim. x ]; + } + else if( gid < size ) + { + sdata[ tid ] = d_input[ gid ]; + } + __syncthreads(); - // Parallel reduction - if( blockSize == 512 ) - { - if( tid < 256 ) - { - if( operation == tnlMin ) - sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 256 ] ); - if( operation == tnlMax ) - sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 256 ] ); - if( operation == tnlSum ) - sdata[ tid ] += sdata[ tid + 256 ]; - } - __syncthreads(); - } - if( blockSize >= 256 ) - { - if( tid < 128 ) - { - if( operation == tnlMin ) - sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 128 ] ); - if( operation == tnlMax ) - sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 128 ] ); - if( operation == tnlSum ) - sdata[ tid ] += sdata[ tid + 128 ]; - } - __syncthreads(); - } - if( blockSize >= 128 ) - { - if (tid< 64) - { - if( operation == tnlMin ) - sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 64 ] ); - if( operation == tnlMax ) - sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 64 ] ); - if( operation == tnlSum ) - sdata[ tid ] += sdata[ tid + 64 ]; - } - __syncthreads(); - } - /* - * What follows runs in warp so it does not need to be synchronised. - */ - if( tid < 32 ) - { - if( blockSize >= 64 ) - { - if( operation == tnlMin ) - sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 32 ] ); - if( operation == tnlMax ) - sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 32 ] ); - if( operation == tnlSum ) - sdata[ tid ] += sdata[ tid + 32 ]; - } - if( blockSize >= 32 ) - { - if( operation == tnlMin ) - sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 16 ] ); - if( operation == tnlMax ) - sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 16 ] ); - if( operation == tnlSum ) - sdata[ tid ] += sdata[ tid + 16 ]; - } - if( blockSize >= 16 ) - { - if( operation == tnlMin ) - sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 8 ] ); - if( operation == tnlMax ) - sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 8 ] ); - if( operation == tnlSum ) - sdata[ tid ] += sdata[ tid + 8 ]; - } - if( blockSize >= 8 ) - { - if( operation == tnlMin ) - sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 4 ] ); - if( operation == tnlMax ) - sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 4 ] ); - if( operation == tnlSum ) - sdata[ tid ] += sdata[ tid + 4 ]; - } - if( blockSize >= 4 ) - { - if( operation == tnlMin ) - sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 2 ] ); - if( operation == tnlMax ) - sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 2 ] ); - if( operation == tnlSum ) - sdata[ tid ] += sdata[ tid + 2 ]; - } - if( blockSize >= 2 ) - { - if( operation == tnlMin ) - sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 1 ] ); - if( operation == tnlMax ) - sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 1 ] ); - if( operation == tnlSum ) - sdata[ tid ] += sdata[ tid + 1 ]; - } - } + // Parallel reduction + if( blockSize == 512 ) + { + if( tid < 256 ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 256 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 256 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 256 ]; + } + __syncthreads(); + } + if( blockSize >= 256 ) + { + if( tid < 128 ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 128 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 128 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 128 ]; + } + __syncthreads(); + } + if( blockSize >= 128 ) + { + if (tid< 64) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 64 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 64 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 64 ]; + } + __syncthreads(); + } + /* + * What follows runs in warp so it does not need to be synchronised. + */ + if( tid < 32 ) + { + if( blockSize >= 64 ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 32 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 32 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 32 ]; + } + if( blockSize >= 32 ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 16 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 16 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 16 ]; + } + if( blockSize >= 16 ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 8 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 8 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 8 ]; + } + if( blockSize >= 8 ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 4 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 4 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 4 ]; + } + if( blockSize >= 4 ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 2 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 2 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 2 ]; + } + if( blockSize >= 2 ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 1 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 1 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 1 ]; + } + } - // Store the result back in global memory - if( tid == 0 ) - d_output[ blockIdx. x ] = sdata[ 0 ]; + // Store the result back in global memory + if( tid == 0 ) + d_output[ blockIdx. x ] = sdata[ 0 ]; } /* @@ -213,15 +213,14 @@ __global__ void tnlCUDAReductionKernel( const int size, */ template< class T, tnlOperation operation > bool tnlCUDAReduction( const int size, - const T* device_input, - T& result, - T* device_output = 0 ) + const T* device_input, + T& result, + T* device_output = 0 ) { //Calculate necessary block/grid dimensions const int cpuThreshold = 1; const int desBlockSize = 16; //Desired block size - - T* dbg_array1; + const int desGridSize = 2048; bool device_output_allocated( false ); if( ! device_output ) @@ -234,9 +233,6 @@ bool tnlCUDAReduction( const int size, return false; } device_output_allocated = true; - - //cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!! - //cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!! } dim3 block_size( 0 ), grid_size( 0 ); int shmem; @@ -245,42 +241,41 @@ bool tnlCUDAReduction( const int size, while( size_reduced > cpuThreshold ) { block_size. x = :: Min( size_reduced, desBlockSize ); - grid_size. x = :: Min( ( size_reduced / block_size. x + 1 ) / 2, desGridSize ); + grid_size. x = :: Min( ( int ) ( size_reduced / block_size. x + 1 ) / 2, desGridSize ); shmem = block_size. x * sizeof( T ); cout << "Size: " << size_reduced << " Grid size: " << grid_size. x << " Block size: " << block_size. x << " Shmem: " << shmem << endl; - //tnlCUDASimpleReductionKernel4< T, operation ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." ); switch( block_size. x ) { case 512: - tnlCUDASimpleReductionKernel5< T, operation, 512 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); + tnlCUDAReductionKernel< T, operation, 512 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); break; case 256: - tnlCUDASimpleReductionKernel5< T, operation, 256 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); + tnlCUDAReductionKernel< T, operation, 256 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); break; case 128: - tnlCUDASimpleReductionKernel5< T, operation, 128 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); + tnlCUDAReductionKernel< T, operation, 128 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); break; case 64: - tnlCUDASimpleReductionKernel5< T, operation, 64 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); + tnlCUDAReductionKernel< T, operation, 64 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); break; case 32: - tnlCUDASimpleReductionKernel5< T, operation, 32 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); + tnlCUDAReductionKernel< T, operation, 32 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); break; case 16: - tnlCUDASimpleReductionKernel5< T, operation, 16 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); + tnlCUDAReductionKernel< T, operation, 16 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); break; case 8: - tnlCUDASimpleReductionKernel5< T, operation, 8 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); + tnlCUDAReductionKernel< T, operation, 8 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); break; case 4: - tnlCUDASimpleReductionKernel5< T, operation, 4 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); + tnlCUDAReductionKernel< T, operation, 4 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); break; case 2: - tnlCUDASimpleReductionKernel5< T, operation, 2 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); + tnlCUDAReductionKernel< T, operation, 2 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); break; case 1: tnlAssert( false, cerr << "blockSize should not be 1." << endl ); diff --git a/src/core/tnlCUDAKernelsTester.h b/src/core/tnlCUDAKernelsTester.h index aa14fddc11..82edc2341a 100644 --- a/src/core/tnlCUDAKernelsTester.h +++ b/src/core/tnlCUDAKernelsTester.h @@ -30,198 +30,7 @@ using namespace std; #ifdef HAVE_CUDA -int tnlCUDAReductionMin( const int size, - const int block_size, - const int grid_size, - const int* input ); -int tnlCUDAReductionMax( const int size, - const int block_size, - const int grid_size, - const int* input ); -int tnlCUDAReductionSum( const int size, - const int block_size, - const int grid_size, - const int* input ); -float tnlCUDAReductionMin( const int size, - const int block_size, - const int grid_size, - const float* input ); -float tnlCUDAReductionMax( const int size, - const int block_size, - const int grid_size, - const float* input ); -float tnlCUDAReductionSum( const int size, - const int block_size, - const int grid_size, - const float* input ); -double tnlCUDAReductionMin( const int size, - const int block_size, - const int grid_size, - const double* input ); -double tnlCUDAReductionMax( const int size, - const int block_size, - const int grid_size, - const double* input ); -double tnlCUDAReductionSum( const int size, - const int block_size, - const int grid_size, - const double* input ); -/* - * Simple reduction 5 - */ -bool tnlCUDASimpleReduction5Min( const int size, - const int* input, - int& result ); -bool tnlCUDASimpleReduction5Max( const int size, - const int* input, - int& result ); -bool tnlCUDASimpleReduction5Sum( const int size, - const int* input, - int& result ); -bool tnlCUDASimpleReduction5Min( const int size, - const float* input, - float& result); -bool tnlCUDASimpleReduction5Max( const int size, - const float* input, - float& result); -bool tnlCUDASimpleReduction5Sum( const int size, - const float* input, - float& result); -bool tnlCUDASimpleReduction5Min( const int size, - const double* input, - double& result); -bool tnlCUDASimpleReduction5Max( const int size, - const double* input, - double& result ); -bool tnlCUDASimpleReduction5Sum( const int size, - const double* input, - double& result ); - - -/* - * Simple reduction 4 - */ -bool tnlCUDASimpleReduction4Min( const int size, - const int* input, - int& result ); -bool tnlCUDASimpleReduction4Max( const int size, - const int* input, - int& result ); -bool tnlCUDASimpleReduction4Sum( const int size, - const int* input, - int& result ); -bool tnlCUDASimpleReduction4Min( const int size, - const float* input, - float& result); -bool tnlCUDASimpleReduction4Max( const int size, - const float* input, - float& result); -bool tnlCUDASimpleReduction4Sum( const int size, - const float* input, - float& result); -bool tnlCUDASimpleReduction4Min( const int size, - const double* input, - double& result); -bool tnlCUDASimpleReduction4Max( const int size, - const double* input, - double& result ); -bool tnlCUDASimpleReduction4Sum( const int size, - const double* input, - double& result ); - -/* - * Simple reduction 3 - */ -bool tnlCUDASimpleReduction3Min( const int size, - const int* input, - int& result ); -bool tnlCUDASimpleReduction3Max( const int size, - const int* input, - int& result ); -bool tnlCUDASimpleReduction3Sum( const int size, - const int* input, - int& result ); -bool tnlCUDASimpleReduction3Min( const int size, - const float* input, - float& result); -bool tnlCUDASimpleReduction3Max( const int size, - const float* input, - float& result); -bool tnlCUDASimpleReduction3Sum( const int size, - const float* input, - float& result); -bool tnlCUDASimpleReduction3Min( const int size, - const double* input, - double& result); -bool tnlCUDASimpleReduction3Max( const int size, - const double* input, - double& result ); -bool tnlCUDASimpleReduction3Sum( const int size, - const double* input, - double& result ); - -/* - * Simple reduction 2 - */ -bool tnlCUDASimpleReduction2Min( const int size, - const int* input, - int& result ); -bool tnlCUDASimpleReduction2Max( const int size, - const int* input, - int& result ); -bool tnlCUDASimpleReduction2Sum( const int size, - const int* input, - int& result ); -bool tnlCUDASimpleReduction2Min( const int size, - const float* input, - float& result); -bool tnlCUDASimpleReduction2Max( const int size, - const float* input, - float& result); -bool tnlCUDASimpleReduction2Sum( const int size, - const float* input, - float& result); -bool tnlCUDASimpleReduction2Min( const int size, - const double* input, - double& result); -bool tnlCUDASimpleReduction2Max( const int size, - const double* input, - double& result ); -bool tnlCUDASimpleReduction2Sum( const int size, - const double* input, - double& result ); - -/* - * Simple reduction 1 - */ -bool tnlCUDASimpleReduction1Min( const int size, - const int* input, - int& result ); -bool tnlCUDASimpleReduction1Max( const int size, - const int* input, - int& result ); -bool tnlCUDASimpleReduction1Sum( const int size, - const int* input, - int& result ); -bool tnlCUDASimpleReduction1Min( const int size, - const float* input, - float& result); -bool tnlCUDASimpleReduction1Max( const int size, - const float* input, - float& result); -bool tnlCUDASimpleReduction1Sum( const int size, - const float* input, - float& result); -bool tnlCUDASimpleReduction1Min( const int size, - const double* input, - double& result); -bool tnlCUDASimpleReduction1Max( const int size, - const double* input, - double& result ); -bool tnlCUDASimpleReduction1Sum( const int size, - const double* input, - double& result ); - +#include <core/tnl-cuda-kernels.cu.h> #endif @@ -238,24 +47,30 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase CppUnit :: TestSuite* suiteOfTests = new CppUnit :: TestSuite( "tnlCUDAKernelsTester" ); CppUnit :: TestResult result; + T param; + tnlString test_name = tnlString( "testSimpleReduction1< " ) + GetParameterType( param ) + tnlString( " >" ); suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >( - "testSimpleReduction1", + test_name. Data(), & tnlCUDAKernelsTester< T > :: testSimpleReduction1 ) ); + test_name = tnlString( "testSimpleReduction2< " ) + GetParameterType( param ) + tnlString( " >" ); suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >( - "testSimpleReduction2", + test_name. Data(), & tnlCUDAKernelsTester< T > :: testSimpleReduction2 ) ); + test_name = tnlString( "testSimpleReduction3< " ) + GetParameterType( param ) + tnlString( " >" ); suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >( - "testSimpleReduction3", + test_name. Data(), & tnlCUDAKernelsTester< T > :: testSimpleReduction3 ) ); + test_name = tnlString( "testSimpleReduction4< " ) + GetParameterType( param ) + tnlString( " >" ); suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >( - "testSimpleReduction4", + test_name. Data(), & tnlCUDAKernelsTester< T > :: testSimpleReduction4 ) ); + test_name = tnlString( "testSimpleReduction5< " ) + GetParameterType( param ) + tnlString( " >" ); suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >( - "testSimpleReduction5", + test_name. Data(), & tnlCUDAKernelsTester< T > :: testSimpleReduction5 ) ); /*suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >( @@ -337,9 +152,9 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase tnlCUDASimpleReduction5Sum( size, device_input. Data(), sum ); break; default: - min = tnlCUDAReductionMin( size, block_size, grid_size, device_input. Data() ); - max = tnlCUDAReductionMax( size, block_size, grid_size, device_input. Data() ); - sum = tnlCUDAReductionSum( size, block_size, grid_size, device_input. Data() ); + tnlCUDAReductionMin( size, device_input. Data(), min ); + tnlCUDAReductionMax( size, device_input. Data(), max ); + tnlCUDAReductionSum( size, device_input. Data(), sum ); } diff --git a/src/tnl-unit-tests.cpp b/src/tnl-unit-tests.cpp index e75f2578d8..571a8f1c4c 100644 --- a/src/tnl-unit-tests.cpp +++ b/src/tnl-unit-tests.cpp @@ -29,7 +29,7 @@ using namespace std; int main( int argc, char* argv[] ) { - CppUnit::TextUi::TestRunner runner; + CppUnit :: TextTestRunner runner; runner.addTest( tnlLongVectorCUDATester< int > :: suite() ); runner.addTest( tnlLongVectorCUDATester< float > :: suite() ); -- GitLab