diff --git a/TODO.txt b/TODO.txt index 76c317a4e0007dd82a008bb5e063716f7039c344..b489c016c0bd45409a2c0448af874ffc77097820 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,5 +1,8 @@ TODO: implementovat tridu tnlFileName pro generovani jmen souboru +TODO: metodu pro tnlString pro nahrazeni napr. podretezce XXXXX indexem 00001 tj. uXXXXX.bin -> u00001.bin + to by melo byt robustnejsi, nez doposavadni pristup + TODO: implementovat tridu tnlParabolicSolver pro odvozovani resicu k casove promennym uloham TODO: Nahradit mGrid2D, mGrid3D za mGrid obecne dimenze @@ -8,6 +11,4 @@ TODO: zavets iteratory pres uzle site misto for cyklu TODO: implementovat Mersona v CUDA -TODO: metoda Test do tnlObject - -TODO: trida tnlTester pro rizeni testu \ No newline at end of file +TODO: objekt pro osetreni chyb - zavedeni funkce tnlGetError \ No newline at end of file diff --git a/configure.ac b/configure.ac index d88c476315156db9075bf1d9792fc3962c8e21d5..ac6ad487dcfe9bccaa8385e3aecb38e7eec7ecca 100644 --- a/configure.ac +++ b/configure.ac @@ -37,6 +37,12 @@ AC_ARG_WITH(cuda_libdir, AS_HELP_STRING([--with-cuda-libdir], [says where the CUDA libraries can be found, default is /usr/local/cuda/lib]), CUDA_LIBS=$withval) +AC_ARG_WITH(cuda_arch, + AS_HELP_STRING([--with-cuda-arch], + [specifies the CUDA architecture, can be 1.0, 1.1, 1.2 or 1.3 - default is 1.3]), + CUDA_ARCH=$withval, + CUDA_ARCH="1.3") + working_nvcc="no" if test x$with_cuda = xyes; then @@ -89,6 +95,21 @@ then CUDA_LDFLAGS="$CUDA_LDFLAGS -lcudart" CC="nvcc" CXX="nvcc" + case "$CUDA_ARCH" in + 1.0 ) + CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_10" + ;; + 1.1 ) + CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_11" + ;; + 1.2 ) + CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_12" + ;; + 1.3 ) + CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_13" + ;; + esac + DBGCXXFLAGS="$DBGCXXFLAGS -deviceemu" else CUDA_LDFLAGS="" CUDA_CXXFLAGS="" @@ -121,7 +142,7 @@ dnl ----------- check for debug-------------- dnl ----------------------------------------- AC_ARG_ENABLE(debug,[ --enable-debug=[no/yes] turn on debugging [default=no]] ) if test x"$enable_debug" = xyes; then - DBGCXXFLAGS="-O0 -DDEBUG" + DBGCXXFLAGS="$DBGCXXFLAGS -O0 -DDEBUG" if test x$CXX != xnvcc; then DBGCXXFLAGS="$DBGCXXFLAGS -g3 -Wall -W -ansi -Wno-unused" diff --git a/src/core/tnl-cuda-kernels.cu b/src/core/tnl-cuda-kernels.cu index 8d043e78e2e25603579c1215318ba59552e6db77..b36083a49803993bd1d6483c3a16c38ace12b234 100644 --- a/src/core/tnl-cuda-kernels.cu +++ b/src/core/tnl-cuda-kernels.cu @@ -95,133 +95,409 @@ double tnlCUDAReductionSum( const int size, } /* - * Simple redcution 1 + * Simple redcution 5 */ -int tnlCUDASimpleReduction1Min( const int size, - const int block_size, - const int grid_size, - const int* input, - int* output ) -{ - -} - -int tnlCUDASimpleReduction1Max( const int size, - const int block_size, - const int grid_size, - const int* input, - int* output ) -{ - //Calculate necessary block/grid dimensions - const int cpuThreshold = 1; - const int desBlockSize = 128; //Desired block size - dim3 blockSize = :: Min( size, desBlockSize ); - dim3 gridSize = size / blockSize. x; - unsigned int shmem = blockSize. x * sizeof( int ); - cout << "Grid size: " << gridSize. x << endl - << "Block size: " << blockSize. x << endl - << "Shmem: " << shmem << endl; - tnlCUDASimpleReductionKernel1< int, tnlMax ><<< gridSize, blockSize, shmem >>>( size, input, output ); - int sizeReduced = gridSize. x; - while( sizeReduced > cpuThreshold ) - { - cout << "Reducing with size reduced = " << sizeReduced << endl; - blockSize. x = :: Min( sizeReduced, desBlockSize ); - gridSize. x = sizeReduced / blockSize. x; - shmem = blockSize. x * sizeof(int); - tnlCUDASimpleReductionKernel1< int, tnlMax ><<< gridSize, blockSize, shmem >>>( size, input, output ); - sizeReduced = gridSize. x; - } - int* host_output = new int[ sizeReduced ]; - cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost ); - int result = host_output[ 0 ]; - for( int i = 1;i < sizeReduced; i++ ) - result = :: Max( result, host_output[ i ] ); - delete[] host_output; - return result; +bool tnlCUDASimpleReduction5Min( const int size, + const int* device_input, + int& result ) +{ + return tnlCUDASimpleReduction5< int, tnlMin >( size, + device_input, + result ); } - -int tnlCUDASimpleReduction1Sum( const int size, - const int block_size, - const int grid_size, - const int* input, - int* output ) -{ - //Calculate necessary block/grid dimensions - const int cpuThreshold = 1; - const int desBlockSize = 128; //Desired block size - dim3 blockSize = :: Min( size, desBlockSize ); - dim3 gridSize = size / blockSize. x; - unsigned int shmem = blockSize. x * sizeof( int ); - cout << "Grid size: " << gridSize. x << endl - << "Block size: " << blockSize. x << endl - << "Shmem: " << shmem << endl; - tnlCUDASimpleReductionKernel1< int, tnlSum ><<< gridSize, blockSize, shmem >>>( size, input, output ); - int sizeReduced = gridSize. x; - while( sizeReduced > cpuThreshold ) - { - cout << "Reducing with size reduced = " << sizeReduced << endl; - blockSize. x = :: Min( sizeReduced, desBlockSize ); - gridSize. x = sizeReduced / blockSize. x; - shmem = blockSize. x * sizeof(int); - tnlCUDASimpleReductionKernel1< int, tnlSum ><<< gridSize, blockSize, shmem >>>( size, input, output ); - sizeReduced = gridSize. x; - } - int* host_output = new int[ sizeReduced ]; - cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost ); - int result = host_output[ 0 ]; - for( int i = 1;i < sizeReduced; i++ ) - result += host_output[ i ]; - delete[] host_output; - return result; + +bool tnlCUDASimpleReduction5Max( const int size, + const int* device_input, + int& result ) +{ + return tnlCUDASimpleReduction5< int, tnlMax >( size, + device_input, + result ); +} +bool tnlCUDASimpleReduction5Sum( const int size, + const int* device_input, + int& result ) +{ + return tnlCUDASimpleReduction5< int, tnlSum >( size, + device_input, + result ); } +bool tnlCUDASimpleReduction5Min( const int size, + const float* device_input, + float& result ) +{ + return tnlCUDASimpleReduction5< float, tnlMin >( size, + device_input, + result ); +} + +bool tnlCUDASimpleReduction5Max( const int size, + const float* device_input, + float& result ) +{ + return tnlCUDASimpleReduction5< float, tnlMax >( size, + device_input, + result ); +} +bool tnlCUDASimpleReduction5Sum( const int size, + const float* device_input, + float& result ) +{ + return tnlCUDASimpleReduction5< float, tnlSum >( size, + device_input, + result ); +} +bool tnlCUDASimpleReduction5Min( const int size, + const double* device_input, + double& result ) +{ + return tnlCUDASimpleReduction5< double, tnlMin >( size, + device_input, + result ); +} + +bool tnlCUDASimpleReduction5Max( const int size, + const double* device_input, + double& result ) +{ + return tnlCUDASimpleReduction5< double, tnlMax >( size, + device_input, + result ); +} +bool tnlCUDASimpleReduction5Sum( const int size, + const double* device_input, + double& result ) +{ + return tnlCUDASimpleReduction5< double, tnlSum >( size, + device_input, + result ); +} + + /* -float tnlCUDASimpleReduction1Min( const int size, - const int block_size, - const int grid_size, - const float* input ) + * Simple redcution 4 + */ + +bool tnlCUDASimpleReduction4Min( const int size, + const int* device_input, + int& result ) { - return tnlCUDASimpleReduction1< float, tnlMin >( size, block_size, grid_size, input ); + return tnlCUDASimpleReduction4< int, tnlMin >( size, + device_input, + result ); } -float tnlCUDASimpleReduction1Max( const int size, - const int block_size, - const int grid_size, - const float* input ) +bool tnlCUDASimpleReduction4Max( const int size, + const int* device_input, + int& result ) { - return tnlCUDASimpleReduction1< float, tnlMax >( size, block_size, grid_size, input ); + return tnlCUDASimpleReduction4< int, tnlMax >( size, + device_input, + result ); } - -float tnlCUDASimpleReduction1Sum( const int size, - const int block_size, - const int grid_size, - const float* input ) +bool tnlCUDASimpleReduction4Sum( const int size, + const int* device_input, + int& result ) { - return tnlCUDASimpleReduction1< float, tnlSum >( size, block_size, grid_size, input ); + return tnlCUDASimpleReduction4< int, tnlSum >( size, + device_input, + result ); } -double tnlCUDASimpleReduction1Min( const int size, - const int block_size, - const int grid_size, - const double* input ) +bool tnlCUDASimpleReduction4Min( const int size, + const float* device_input, + float& result ) { - return tnlCUDASimpleReduction1< double, tnlMin >( size, block_size, grid_size, input ); + return tnlCUDASimpleReduction4< float, tnlMin >( size, + device_input, + result ); } -double tnlCUDASimpleReduction1Max( const int size, - const int block_size, - const int grid_size, - const double* input ) +bool tnlCUDASimpleReduction4Max( const int size, + const float* device_input, + float& result ) { - return tnlCUDASimpleReduction1< double, tnlMax >( size, block_size, grid_size, input ); + return tnlCUDASimpleReduction4< float, tnlMax >( size, + device_input, + result ); } - -double tnlCUDASimpleReduction1Sum( const int size, - const int block_size, - const int grid_size, - const double* input ) +bool tnlCUDASimpleReduction4Sum( const int size, + const float* device_input, + float& result ) +{ + return tnlCUDASimpleReduction4< float, tnlSum >( size, + device_input, + result ); +} +bool tnlCUDASimpleReduction4Min( const int size, + const double* device_input, + double& result ) +{ + return tnlCUDASimpleReduction4< double, tnlMin >( size, + device_input, + result ); +} + +bool tnlCUDASimpleReduction4Max( const int size, + const double* device_input, + double& result ) +{ + return tnlCUDASimpleReduction4< double, tnlMax >( size, + device_input, + result ); +} +bool tnlCUDASimpleReduction4Sum( const int size, + const double* device_input, + double& result ) +{ + return tnlCUDASimpleReduction4< double, tnlSum >( size, + device_input, + result ); +} + +/* + * Simple redcution 3 + */ + +bool tnlCUDASimpleReduction3Min( const int size, + const int* device_input, + int& result ) +{ + return tnlCUDASimpleReduction3< int, tnlMin >( size, + device_input, + result ); +} + +bool tnlCUDASimpleReduction3Max( const int size, + const int* device_input, + int& result ) +{ + return tnlCUDASimpleReduction3< int, tnlMax >( size, + device_input, + result ); +} +bool tnlCUDASimpleReduction3Sum( const int size, + const int* device_input, + int& result ) +{ + return tnlCUDASimpleReduction3< int, tnlSum >( size, + device_input, + result ); +} + +bool tnlCUDASimpleReduction3Min( const int size, + const float* device_input, + float& result ) +{ + return tnlCUDASimpleReduction3< float, tnlMin >( size, + device_input, + result ); +} + +bool tnlCUDASimpleReduction3Max( const int size, + const float* device_input, + float& result ) +{ + return tnlCUDASimpleReduction3< float, tnlMax >( size, + device_input, + result ); +} +bool tnlCUDASimpleReduction3Sum( const int size, + const float* device_input, + float& result ) +{ + return tnlCUDASimpleReduction3< float, tnlSum >( size, + device_input, + result ); +} +bool tnlCUDASimpleReduction3Min( const int size, + const double* device_input, + double& result ) +{ + return tnlCUDASimpleReduction3< double, tnlMin >( size, + device_input, + result ); +} + +bool tnlCUDASimpleReduction3Max( const int size, + const double* device_input, + double& result ) { - return tnlCUDASimpleReduction1< double, tnlSum >( size, block_size, grid_size, input ); -}*/ \ No newline at end of file + return tnlCUDASimpleReduction3< double, tnlMax >( size, + device_input, + result ); +} +bool tnlCUDASimpleReduction3Sum( const int size, + const double* device_input, + double& result ) +{ + return tnlCUDASimpleReduction3< double, tnlSum >( size, + device_input, + result ); +} + +/* + * Simple redcution 2 + */ + +bool tnlCUDASimpleReduction2Min( const int size, + const int* device_input, + int& result ) +{ + return tnlCUDASimpleReduction2< int, tnlMin >( size, + device_input, + result ); +} + +bool tnlCUDASimpleReduction2Max( const int size, + const int* device_input, + int& result ) +{ + return tnlCUDASimpleReduction2< int, tnlMax >( size, + device_input, + result ); +} +bool tnlCUDASimpleReduction2Sum( const int size, + const int* device_input, + int& result ) +{ + return tnlCUDASimpleReduction2< int, tnlSum >( size, + device_input, + result ); +} + +bool tnlCUDASimpleReduction2Min( const int size, + const float* device_input, + float& result ) +{ + return tnlCUDASimpleReduction2< float, tnlMin >( size, + device_input, + result ); +} + +bool tnlCUDASimpleReduction2Max( const int size, + const float* device_input, + float& result ) +{ + return tnlCUDASimpleReduction2< float, tnlMax >( size, + device_input, + result ); +} +bool tnlCUDASimpleReduction2Sum( const int size, + const float* device_input, + float& result ) +{ + return tnlCUDASimpleReduction2< float, tnlSum >( size, + device_input, + result ); +} +bool tnlCUDASimpleReduction2Min( const int size, + const double* device_input, + double& result ) +{ + return tnlCUDASimpleReduction2< double, tnlMin >( size, + device_input, + result ); +} + +bool tnlCUDASimpleReduction2Max( const int size, + const double* device_input, + double& result ) +{ + return tnlCUDASimpleReduction2< double, tnlMax >( size, + device_input, + result ); +} +bool tnlCUDASimpleReduction2Sum( const int size, + const double* device_input, + double& result ) +{ + return tnlCUDASimpleReduction2< double, tnlSum >( size, + device_input, + result ); +} + + +/* + * Simple redcution 1 + */ + +bool tnlCUDASimpleReduction1Min( const int size, + const int* device_input, + int& result ) +{ + return tnlCUDASimpleReduction1< int, tnlMin >( size, + device_input, + result ); +} + +bool tnlCUDASimpleReduction1Max( const int size, + const int* device_input, + int& result ) +{ + return tnlCUDASimpleReduction1< int, tnlMax >( size, + device_input, + result ); +} +bool tnlCUDASimpleReduction1Sum( const int size, + const int* device_input, + int& result ) +{ + return tnlCUDASimpleReduction1< int, tnlSum >( size, + device_input, + result ); +} + +bool tnlCUDASimpleReduction1Min( const int size, + const float* device_input, + float& result ) +{ + return tnlCUDASimpleReduction1< float, tnlMin >( size, + device_input, + result ); +} + +bool tnlCUDASimpleReduction1Max( const int size, + const float* device_input, + float& result ) +{ + return tnlCUDASimpleReduction1< float, tnlMax >( size, + device_input, + result ); +} +bool tnlCUDASimpleReduction1Sum( const int size, + const float* device_input, + float& result ) +{ + return tnlCUDASimpleReduction1< float, tnlSum >( size, + device_input, + result ); +} +bool tnlCUDASimpleReduction1Min( const int size, + const double* device_input, + double& result ) +{ + return tnlCUDASimpleReduction1< double, tnlMin >( size, + device_input, + result ); +} + +bool tnlCUDASimpleReduction1Max( const int size, + const double* device_input, + double& result ) +{ + return tnlCUDASimpleReduction1< double, tnlMax >( size, + device_input, + result ); +} +bool tnlCUDASimpleReduction1Sum( const int size, + const double* device_input, + double& result ) +{ + return tnlCUDASimpleReduction1< double, tnlSum >( size, + device_input, + result ); +} + diff --git a/src/core/tnl-cuda-kernels.h b/src/core/tnl-cuda-kernels.h index dc1e1c76562a1cff76073f62c30d477a2d5d4dd0..c56306f051b572e662fc300f0e658887e6b60970 100644 --- a/src/core/tnl-cuda-kernels.h +++ b/src/core/tnl-cuda-kernels.h @@ -45,208 +45,906 @@ template< class T > __device__ T tnlCudaMax( const T& a, * * Call this kernel with grid size divided by 2. * Maximum block size is 512. + * We also limit the grid size to 2048 - desGridSize. * */ template < class T, tnlOperation operation, int blockSize > __global__ void tnlCUDAReductionKernel( const int size, - const T* d_input, - T* d_output ) + const int grid_size, + const T* d_input, + T* d_output, + T* dbg_array1 = 0 ) { - extern __shared__ __align__( 8 ) T sdata[]; - // Read the data into shared memory - int tid = threadIdx.x; - int gid = blockIdx. x * blockSize * 2 + threadIdx. x; - int gridSize = blockSize * 2 * gridDim. x; - if( operation == tnlMin || - operation == tnlMax ) - sdata[ tid ] = d_input[ gid ]; + extern __shared__ T sdata[]; + + // Read data into the shared memory + int tid = threadIdx. x; + int gid = 2 * blockIdx. x * blockDim. x + threadIdx. x; + // Last thread ID which manipulates meaningful data + + int grid_size = 2 * blockSize * gridDim. x; + if( gid + blockSize < size ) + { + if( operation == tnlMin ) sdata[ tid ] = :: Min( d_input[ gid ], d_input[ gid + blockSize ] ); + if( operation == tnlMax ) sdata[ tid ] = :: Max( d_input[ gid ], d_input[ gid + blockSize ] ); + if( operation == tnlSum ) sdata[ tid ] = d_input[ gid ] + d_input[ gid + blockSize ]; + } else - sdata[ tid ] = 0; - while( gid < size ) + { + sdata[ tid ] = d_input[ gid ]; + } + gid += grid_size; + + while (gid < size) + { + if( operation == tnlMin ) sdata[ tid ] = :: Min( sdata[ tid ], :: Min( d_input[gid], d_input[gid+blockSize] ); + if( operation == tnlMax ) sdata[ tid ] = :: Max( sdata[ tid ], :: Max( d_input[gid], d_input[gid+blockSize] ); + if( operation == tnlSum ) sdata[ tid ] += d_input[gid] + d_input[ gid + blockSize ]; + gid += gridSize; + } + __syncthreads(); + + if( gid + blockDim. x < size ) { if( operation == tnlMin ) - sdata[ tid ] = tnlCudaMin( d_input[ gid ], d_input[ tnlCudaMin( gid + blockSize, size ) ] ); + sdata[ tid ] = tnlCudaMin( d_input[ gid ], d_input[ gid + blockDim. x ] ); if( operation == tnlMax ) - sdata[ tid ] = tnlCudaMax( d_input[ gid ], d_input[ tnlCudaMin( gid + blockSize, size ) ] ); + sdata[ tid ] = tnlCudaMax( d_input[ gid ], d_input[ gid + blockDim. x ] ); if( operation == tnlSum ) - sdata[ tid ] += d_input[gid] + d_input[gid+blockSize]; - gid += gridSize; + sdata[ tid ] = d_input[ gid ] + d_input[ gid + blockDim. x ]; + } + else if( gid < size ) + { + sdata[ tid ] = d_input[ gid ]; } __syncthreads(); // Parallel reduction - if( blockSize == 512 ) - { - if( tid < 256 ) + if( blockSize == 512 ) { - if( operation == tnlMin ) - sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 256 ] ); - if( operation == tnlMax ) - sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 256 ] ); - if( operation == tnlSum ) - sdata[ tid ] += sdata[ tid + 256 ]; + if( tid < 256 ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 256 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 256 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 256 ]; + } + __syncthreads(); } - __syncthreads(); - } - if( blockSize >= 256 ) - { - if( tid < 128 ) + if( blockSize >= 256 ) { - if( operation == tnlMin ) - sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 128 ] ); - if( operation == tnlMax ) - sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 128 ] ); - if( operation == tnlSum ) - sdata[ tid ] += sdata[ tid + 128 ]; + if( tid < 128 ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 128 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 128 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 128 ]; + } + __syncthreads(); } - __syncthreads(); - } - if( blockSize >= 128 ) - { - if (tid< 64) + if( blockSize >= 128 ) { - if( operation == tnlMin ) - sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 64 ] ); - if( operation == tnlMax ) - sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 64 ] ); - if( operation == tnlSum ) - sdata[ tid ] += sdata[ tid + 64 ]; + if (tid< 64) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 64 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 64 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 64 ]; + } + __syncthreads(); } - __syncthreads(); + /* + * What follows runs in warp so it does not need to be synchronised. + */ + if( tid < 32 ) + { + if( blockSize >= 64 ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 32 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 32 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 32 ]; + } + if( blockSize >= 32 ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 16 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 16 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 16 ]; + } + if( blockSize >= 16 ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 8 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 8 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 8 ]; + } + if( blockSize >= 8 ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 4 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 4 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 4 ]; + } + if( blockSize >= 4 ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 2 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 2 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 2 ]; + } + if( blockSize >= 2 ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 1 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 1 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 1 ]; + } + } + + // Store the result back in global memory + if( tid == 0 ) + d_output[ blockIdx. x ] = sdata[ 0 ]; +} + +/* + * CUDA reduction kernel caller. + * block_size can be only some of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512. + * d_input must reside on the device. + */ +template< class T, tnlOperation operation > +bool tnlCUDAReduction( const int size, + const T* device_input, + T& result, + T* device_output = 0 ) +{ + //Calculate necessary block/grid dimensions + const int cpuThreshold = 1; + const int desBlockSize = 16; //Desired block size + + T* dbg_array1; + + bool device_output_allocated( false ); + if( ! device_output ) + { + int bytes_alloc = :: Max( 1, size / desBlockSize ) * sizeof( T ); + cudaMalloc( ( void** ) &device_output, bytes_alloc ); + if( cudaGetLastError() != cudaSuccess ) + { + cerr << "Unable to allocate device memory with size " << bytes_alloc << "." << endl; + return false; + } + device_output_allocated = true; + + //cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!! + //cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!! + } + dim3 block_size( 0 ), grid_size( 0 ); + int shmem; + int size_reduced = size; + const T* reduction_input = device_input; + while( size_reduced > cpuThreshold ) + { + block_size. x = :: Min( size_reduced, desBlockSize ); + grid_size. x = :: Min( ( size_reduced / block_size. x + 1 ) / 2, desGridSize ); + shmem = block_size. x * sizeof( T ); + cout << "Size: " << size_reduced + << " Grid size: " << grid_size. x + << " Block size: " << block_size. x + << " Shmem: " << shmem << endl; + //tnlCUDASimpleReductionKernel4< T, operation ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); + tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." ); + switch( block_size. x ) + { + case 512: + tnlCUDASimpleReductionKernel5< T, operation, 512 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); + break; + case 256: + tnlCUDASimpleReductionKernel5< T, operation, 256 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); + break; + case 128: + tnlCUDASimpleReductionKernel5< T, operation, 128 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); + break; + case 64: + tnlCUDASimpleReductionKernel5< T, operation, 64 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); + break; + case 32: + tnlCUDASimpleReductionKernel5< T, operation, 32 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); + break; + case 16: + tnlCUDASimpleReductionKernel5< T, operation, 16 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); + break; + case 8: + tnlCUDASimpleReductionKernel5< T, operation, 8 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); + break; + case 4: + tnlCUDASimpleReductionKernel5< T, operation, 4 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); + break; + case 2: + tnlCUDASimpleReductionKernel5< T, operation, 2 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output ); + break; + case 1: + tnlAssert( false, cerr << "blockSize should not be 1." << endl ); + break; + default: + tnlAssert( false, cerr << "Block size is " << block_size. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); + break; + } + size_reduced = grid_size. x; + reduction_input = device_output; + + // debuging part + /*T* host_array = new T[ desBlockSize ]; + cudaMemcpy( host_array, dbg_array1, desBlockSize * sizeof( T ), cudaMemcpyDeviceToHost ); + for( int i = 0; i< :: Min( ( int ) block_size. x, desBlockSize ); i ++ ) + cout << host_array[ i ] << " - "; + cout << endl; + + T* output = new T[ size_reduced ]; + cudaMemcpy( output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); + cout << endl; + for( int i = 0; i < size_reduced; i ++ ) + cout << output[ i ] << " "; + cout << endl; + delete[] output;*/ + } + T* host_output = new T[ size_reduced ]; + cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); + result = host_output[ 0 ]; + for( int i = 1; i < size_reduced; i++ ) + { + if( operation == tnlMin) + result = :: Min( result, host_output[ i ] ); + if( operation == tnlMax ) + result = :: Max( result, host_output[ i ] ); + if( operation == tnlSum ) + result += host_output[ i ]; + } + delete[] host_output; + if( device_output_allocated ) + cudaFree( device_output ); + return true; +} + + +/* + * Modified parallel reduction - version 5. + * We have replaced the for cycle with switch + * and template parameter blockSize + */ + +template < class T, tnlOperation operation, int blockSize > +__global__ void tnlCUDASimpleReductionKernel5( const int size, + const T* d_input, + T* d_output, + T* dbg_array1 = 0 ) +{ + extern __shared__ T sdata[]; + + // Read data into the shared memory + int tid = threadIdx. x; + int gid = 2 * blockIdx. x * blockDim. x + threadIdx. x; + // Last thread ID which manipulates meaningful data + //int last_tid = size - 2 * blockIdx. x * blockDim. x; + if( gid + blockDim. x < size ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( d_input[ gid ], d_input[ gid + blockDim. x ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( d_input[ gid ], d_input[ gid + blockDim. x ] ); + if( operation == tnlSum ) + sdata[ tid ] = d_input[ gid ] + d_input[ gid + blockDim. x ]; } - /* - * What follows runs in warp so it does not need to be synchronised. - */ - if( tid < 32 ) + else if( gid < size ) { - if( blockSize >= 64 ) + sdata[ tid ] = d_input[ gid ]; + } + __syncthreads(); + + // Parallel reduction + if( blockSize == 512 ) { - if( operation == tnlMin ) - sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 32 ] ); - if( operation == tnlMax ) - sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 32 ] ); - if( operation == tnlSum ) - sdata[ tid ] += sdata[ tid + 32 ]; + if( tid < 256 ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 256 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 256 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 256 ]; + } + __syncthreads(); } - if( blockSize >= 32 ) + if( blockSize >= 256 ) { - if( operation == tnlMin ) - sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 16 ] ); - if( operation == tnlMax ) - sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 16 ] ); - if( operation == tnlSum ) - sdata[ tid ] += sdata[ tid + 16 ]; + if( tid < 128 ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 128 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 128 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 128 ]; + } + __syncthreads(); } - if( blockSize >= 16 ) + if( blockSize >= 128 ) { - if( operation == tnlMin ) - sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 8 ] ); - if( operation == tnlMax ) - sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 8 ] ); - if( operation == tnlSum ) - sdata[ tid ] += sdata[ tid + 8 ]; + if (tid< 64) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 64 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 64 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 64 ]; + } + __syncthreads(); } - if( blockSize >= 8 ) - { - if( operation == tnlMin ) - sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 4 ] ); - if( operation == tnlMax ) - sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 4 ] ); - if( operation == tnlSum ) - sdata[ tid ] += sdata[ tid + 4 ]; - } - if( blockSize >= 4 ) + /* + * What follows runs in warp so it does not need to be synchronised. + */ + if( tid < 32 ) + { + if( blockSize >= 64 ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 32 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 32 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 32 ]; + } + if( blockSize >= 32 ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 16 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 16 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 16 ]; + } + if( blockSize >= 16 ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 8 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 8 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 8 ]; + } + if( blockSize >= 8 ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 4 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 4 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 4 ]; + } + if( blockSize >= 4 ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 2 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 2 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 2 ]; + } + if( blockSize >= 2 ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 1 ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 1 ] ); + if( operation == tnlSum ) + sdata[ tid ] += sdata[ tid + 1 ]; + } + } + + // Store the result back in global memory + if( tid == 0 ) + d_output[ blockIdx. x ] = sdata[ 0 ]; +} + +template< class T, tnlOperation operation > +bool tnlCUDASimpleReduction5( const int size, + const T* device_input, + T& result, + T* device_output = 0 ) +{ + //Calculate necessary block/grid dimensions + const int cpuThreshold = 1; + const int desBlockSize = 16; //Desired block size + + T* dbg_array1; + + bool device_output_allocated( false ); + if( ! device_output ) + { + int bytes_alloc = :: Max( 1, size / desBlockSize ) * sizeof( T ); + cudaMalloc( ( void** ) &device_output, bytes_alloc ); + if( cudaGetLastError() != cudaSuccess ) + { + cerr << "Unable to allocate device memory with size " << bytes_alloc << "." << endl; + return false; + } + device_output_allocated = true; + + //cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!! + //cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!! + } + dim3 block_size( 0 ), grid_size( 0 ); + int shmem; + int size_reduced = size; + const T* reduction_input = device_input; + while( size_reduced > cpuThreshold ) + { + block_size. x = :: Min( size_reduced, desBlockSize ); + grid_size. x = ( size_reduced / block_size. x + 1 ) / 2; + shmem = block_size. x * sizeof( T ); + cout << "Size: " << size_reduced + << " Grid size: " << grid_size. x + << " Block size: " << block_size. x + << " Shmem: " << shmem << endl; + //tnlCUDASimpleReductionKernel4< T, operation ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); + tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." ); + switch( block_size. x ) + { + case 512: + tnlCUDASimpleReductionKernel5< T, operation, 512 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); + break; + case 256: + tnlCUDASimpleReductionKernel5< T, operation, 256 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); + break; + case 128: + tnlCUDASimpleReductionKernel5< T, operation, 128 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); + break; + case 64: + tnlCUDASimpleReductionKernel5< T, operation, 64 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); + break; + case 32: + tnlCUDASimpleReductionKernel5< T, operation, 32 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); + break; + case 16: + tnlCUDASimpleReductionKernel5< T, operation, 16 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); + break; + case 8: + tnlCUDASimpleReductionKernel5< T, operation, 8 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); + break; + case 4: + tnlCUDASimpleReductionKernel5< T, operation, 4 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); + break; + case 2: + tnlCUDASimpleReductionKernel5< T, operation, 2 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); + break; + case 1: + tnlAssert( false, cerr << "blockSize should not be 1." << endl ); + break; + default: + tnlAssert( false, cerr << "Block size is " << block_size. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); + break; + } + size_reduced = grid_size. x; + reduction_input = device_output; + + // debuging part + /*T* host_array = new T[ desBlockSize ]; + cudaMemcpy( host_array, dbg_array1, desBlockSize * sizeof( T ), cudaMemcpyDeviceToHost ); + for( int i = 0; i< :: Min( ( int ) block_size. x, desBlockSize ); i ++ ) + cout << host_array[ i ] << " - "; + cout << endl; + + T* output = new T[ size_reduced ]; + cudaMemcpy( output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); + cout << endl; + for( int i = 0; i < size_reduced; i ++ ) + cout << output[ i ] << " "; + cout << endl; + delete[] output;*/ + } + T* host_output = new T[ size_reduced ]; + cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); + result = host_output[ 0 ]; + for( int i = 1; i < size_reduced; i++ ) + { + if( operation == tnlMin) + result = :: Min( result, host_output[ i ] ); + if( operation == tnlMax ) + result = :: Max( result, host_output[ i ] ); + if( operation == tnlSum ) + result += host_output[ i ]; + } + delete[] host_output; + if( device_output_allocated ) + cudaFree( device_output ); + return true; +} + + +/* + * Modified parallel reduction - version 4. + * We have reduced the grid size to one half + * to avoid inactive threads. + */ + + +template < class T, tnlOperation operation > +__global__ void tnlCUDASimpleReductionKernel4( const int size, + const T* d_input, + T* d_output, + T* dbg_array1 = 0 ) +{ + extern __shared__ T sdata[]; + + // Read data into the shared memory + int tid = threadIdx. x; + int gid = 2 * blockIdx. x * blockDim. x + threadIdx. x; + // Last thread ID which manipulates meaningful data + int last_tid = size - 2 * blockIdx. x * blockDim. x; + if( gid + blockDim. x < size ) + { + if( operation == tnlMin ) + sdata[ tid ] = tnlCudaMin( d_input[ gid ], d_input[ gid + blockDim. x ] ); + if( operation == tnlMax ) + sdata[ tid ] = tnlCudaMax( d_input[ gid ], d_input[ gid + blockDim. x ] ); + if( operation == tnlSum ) + sdata[ tid ] = d_input[ gid ] + d_input[ gid + blockDim. x ]; + } + else if( gid < size ) + { + sdata[ tid ] = d_input[ gid ]; + } + __syncthreads(); + + // Parallel reduction + for( int s = blockDim. x / 2; s > 0; s >>= 1 ) + { + if( tid < s && tid + s < last_tid ) { if( operation == tnlMin ) - sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 2 ] ); + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlMax ) - sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 2 ] ); + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlSum ) - sdata[ tid ] += sdata[ tid + 2 ]; + sdata[ tid ] += sdata[ tid + s ]; } - if( blockSize >= 2 ) + __syncthreads(); + } + + // Store the result back in global memory + if( tid == 0 ) + d_output[ blockIdx. x ] = sdata[ 0 ]; +} + +template< class T, tnlOperation operation > +bool tnlCUDASimpleReduction4( const int size, + const T* device_input, + T& result, + T* device_output = 0 ) +{ + //Calculate necessary block/grid dimensions + const int cpuThreshold = 1; + const int desBlockSize = 16; //Desired block size + + T* dbg_array1; + + bool device_output_allocated( false ); + if( ! device_output ) + { + int bytes_alloc = :: Max( 1, size / desBlockSize ) * sizeof( T ); + cudaMalloc( ( void** ) &device_output, bytes_alloc ); + if( cudaGetLastError() != cudaSuccess ) + { + cerr << "Unable to allocate device memory with size " << bytes_alloc << "." << endl; + return false; + } + device_output_allocated = true; + + //cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!! + //cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!! + } + dim3 block_size( 0 ), grid_size( 0 ); + int shmem; + int size_reduced = size; + const T* reduction_input = device_input; + while( size_reduced > cpuThreshold ) + { + block_size. x = :: Min( size_reduced, desBlockSize ); + grid_size. x = ( size_reduced / block_size. x + 1 ) / 2; + shmem = block_size. x * sizeof( T ); + /*cout << "Size: " << size_reduced + << " Grid size: " << grid_size. x + << " Block size: " << block_size. x + << " Shmem: " << shmem << endl;*/ + tnlCUDASimpleReductionKernel4< T, operation ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 ); + size_reduced = grid_size. x; + reduction_input = device_output; + + // debuging part + /*T* host_array = new T[ desBlockSize ]; + cudaMemcpy( host_array, dbg_array1, desBlockSize * sizeof( T ), cudaMemcpyDeviceToHost ); + for( int i = 0; i< :: Min( ( int ) block_size. x, desBlockSize ); i ++ ) + cout << host_array[ i ] << " - "; + cout << endl; + + T* output = new T[ size_reduced ]; + cudaMemcpy( output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); + cout << endl; + for( int i = 0; i < size_reduced; i ++ ) + cout << output[ i ] << " "; + cout << endl; + delete[] output;*/ + } + T* host_output = new T[ size_reduced ]; + cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); + result = host_output[ 0 ]; + for( int i = 1; i < size_reduced; i++ ) + { + if( operation == tnlMin) + result = :: Min( result, host_output[ i ] ); + if( operation == tnlMax ) + result = :: Max( result, host_output[ i ] ); + if( operation == tnlSum ) + result += host_output[ i ]; + } + delete[] host_output; + if( device_output_allocated ) + cudaFree( device_output ); + return true; +} + +/* + * Modified parallel reduction - version 3. + * We have avoided conflicting memory accesses. + */ + +template < class T, tnlOperation operation > +__global__ void tnlCUDASimpleReductionKernel3( const int size, + const T* d_input, + T* d_output ) +{ + extern __shared__ T sdata[]; + + // Read data into the shared memory + int tid = threadIdx. x; + int gid = blockIdx. x * blockDim. x + threadIdx. x; + // Last thread ID which manipulates meaningful data + int last_tid = size - blockIdx. x * blockDim. x; + if( gid < size ) + sdata[ tid ] = d_input[gid]; + __syncthreads(); + + // Parallel reduction + for( int s = blockDim. x / 2; s > 0; s >>= 1 ) + { + if( tid < s && tid + s < last_tid ) { if( operation == tnlMin ) - sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 1 ] ); + sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlMax ) - sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 1 ] ); + sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlSum ) - sdata[ tid ] += sdata[ tid + 1 ]; + sdata[ tid ] += sdata[ tid + s ]; } + __syncthreads(); } - // Store the result back to the global memory of the device - if( tid == 0 ) d_output[ blockIdx. x ] = sdata[ 0 ]; + + // Store the result back in global memory + if( tid == 0 ) + d_output[ blockIdx. x ] = sdata[ 0 ]; +} + +template< class T, tnlOperation operation > +bool tnlCUDASimpleReduction3( const int size, + const T* device_input, + T& result, + T* device_output = 0 ) +{ + //Calculate necessary block/grid dimensions + const int cpuThreshold = 1; + const int desBlockSize = 256; //Desired block size + + bool device_output_allocated( false ); + if( ! device_output ) + { + int bytes_alloc = :: Max( 1, size / desBlockSize ) * sizeof( T ); + cudaMalloc( ( void** ) &device_output, bytes_alloc ); + if( cudaGetLastError() != cudaSuccess ) + { + cerr << "Unable to allocate device memory with size " << bytes_alloc << "." << endl; + return false; + } + device_output_allocated = true; + } + dim3 block_size( 0 ), grid_size( 0 ); + int shmem; + int size_reduced = size; + const T* reduction_input = device_input; + while( size_reduced > cpuThreshold ) + { + block_size. x = :: Min( size_reduced, desBlockSize ); + grid_size. x = size_reduced / block_size. x; + shmem = block_size. x * sizeof( T ); + /*cout << "Size: " << size_reduced + << " Grid size: " << grid_size. x + << " Block size: " << block_size. x + << " Shmem: " << shmem << endl;*/ + tnlCUDASimpleReductionKernel3< T, operation ><<< grid_size, block_size, shmem >>>( size_reduced, reduction_input, device_output ); + size_reduced = grid_size. x; + reduction_input = device_output; + + // debuging part + /*T* output = new T[ size_reduced ]; + cudaMemcpy( output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); + cout << endl; + for( int i = 0; i < size_reduced; i ++ ) + cout << output[ i ] << " "; + cout << endl; + delete[] output;*/ + } + T* host_output = new T[ size_reduced ]; + cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); + result = host_output[ 0 ]; + for( int i = 1; i < size_reduced; i++ ) + { + if( operation == tnlMin) + result = :: Min( result, host_output[ i ] ); + if( operation == tnlMax ) + result = :: Max( result, host_output[ i ] ); + if( operation == tnlSum ) + result += host_output[ i ]; + } + delete[] host_output; + if( device_output_allocated ) + cudaFree( device_output ); + return true; } /* - * CUDA reduction kernel caller. - * block_size can be only some of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512. - * d_input must reside on the device. + * Modified parallel reduction - version 2. + * We have avoided operation modulo. */ -template< class T, tnlOperation operation > -T tnlCUDAReduction( const int size, - const int block_size, - const int grid_size, - const T* d_input ) + +template < class T, tnlOperation operation > +__global__ void tnlCUDASimpleReductionKernel2( const int size, + const T* d_input, + T* d_output ) { - T result; + extern __shared__ T sdata[]; - dim3 blockSize( block_size ); - dim3 gridSize( grid_size ); - int shmem = 512 * sizeof( T ); - tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." ); - switch( block_size ) + // Read data into the shared memory + int tid = threadIdx. x; + int gid = blockIdx. x * blockDim. x + threadIdx. x; + // Last thread ID which manipulates meaningful data + int last_tid = size - blockIdx. x * blockDim. x; + if( gid < size ) + sdata[tid] = d_input[gid]; + __syncthreads(); + + // Parallel reduction + for( int s = 1; s < blockDim. x; s *= 2 ) { - case 512: - tnlCUDAReductionKernel< T, operation, 512 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); - break; - case 256: - tnlCUDAReductionKernel< T, operation, 256 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); - break; - case 128: - tnlCUDAReductionKernel< T, operation, 128 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); - break; - case 64: - tnlCUDAReductionKernel< T, operation, 64 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); - break; - case 32: - tnlCUDAReductionKernel< T, operation, 32 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); - break; - case 16: - tnlCUDAReductionKernel< T, operation, 16 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); - break; - case 8: - tnlCUDAReductionKernel< T, operation, 8 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); - break; - case 4: - tnlCUDAReductionKernel< T, operation, 4 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); - break; - case 2: - tnlCUDAReductionKernel< T, operation, 2 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); - break; - case 1: - tnlCUDAReductionKernel< T, operation, 1 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); - break; - default: - tnlAssert( false, cerr << "Block size is " << block_size << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); - break; + int inds = 2 * s * tid; + if( inds < blockDim. x && inds + s < last_tid ) + { + if( operation == tnlMin ) + sdata[ inds ] = tnlCudaMin( sdata[ inds ], sdata[ inds + s ] ); + if( operation == tnlMax ) + sdata[ inds ] = tnlCudaMax( sdata[ inds ], sdata[ inds + s ] ); + if( operation == tnlSum ) + sdata[ inds ] += sdata[ inds + s ]; + } + __syncthreads(); } - return result; + + // Store the result back in global memory + if( tid == 0 ) + d_output[ blockIdx. x ] = sdata[ 0 ]; +} + +template< class T, tnlOperation operation > +bool tnlCUDASimpleReduction2( const int size, + const T* device_input, + T& result, + T* device_output = 0 ) +{ + //Calculate necessary block/grid dimensions + const int cpuThreshold = 1; + const int desBlockSize = 256; //Desired block size + + bool device_output_allocated( false ); + if( ! device_output ) + { + int bytes_alloc = :: Max( 1, size / desBlockSize ) * sizeof( T ); + cudaMalloc( ( void** ) &device_output, bytes_alloc ); + if( cudaGetLastError() != cudaSuccess ) + { + cerr << "Unable to allocate device memory with size " << bytes_alloc << "." << endl; + return false; + } + device_output_allocated = true; + } + dim3 block_size( 0 ), grid_size( 0 ); + int shmem; + int size_reduced = size; + const T* reduction_input = device_input; + while( size_reduced > cpuThreshold ) + { + block_size. x = :: Min( size_reduced, desBlockSize ); + grid_size. x = size_reduced / block_size. x; + shmem = block_size. x * sizeof( T ); + /*cout << "Size: " << size_reduced + << " Grid size: " << grid_size. x + << " Block size: " << block_size. x + << " Shmem: " << shmem << endl;*/ + tnlCUDASimpleReductionKernel2< T, operation ><<< grid_size, block_size, shmem >>>( size_reduced, reduction_input, device_output ); + size_reduced = grid_size. x; + reduction_input = device_output; + + // debuging part + /*T* output = new T[ size_reduced ]; + cudaMemcpy( output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); + cout << endl; + for( int i = 0; i < size_reduced; i ++ ) + cout << output[ i ] << " "; + cout << endl; + delete[] output;*/ + } + T* host_output = new T[ size_reduced ]; + cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); + result = host_output[ 0 ]; + for( int i = 1; i < size_reduced; i++ ) + { + if( operation == tnlMin) + result = :: Min( result, host_output[ i ] ); + if( operation == tnlMax ) + result = :: Max( result, host_output[ i ] ); + if( operation == tnlSum ) + result += host_output[ i ]; + } + delete[] host_output; + if( device_output_allocated ) + cudaFree( device_output ); + return true; } +/* + * The simplest and very slow parallel reduction - version 1. + */ + template < class T, tnlOperation operation > __global__ void tnlCUDASimpleReductionKernel1( const int size, const T* d_input, T* d_output ) { - extern __shared__ int sdata[]; + extern __shared__ T sdata[]; // Read data into the shared memory int tid = threadIdx. x; int gid = blockIdx. x * blockDim. x + threadIdx. x; // Last thread ID which manipulates meaningful data int last_tid = size - blockIdx. x * blockDim. x; + if( gid < size ) sdata[tid] = d_input[gid]; __syncthreads(); @@ -272,50 +970,69 @@ __global__ void tnlCUDASimpleReductionKernel1( const int size, } template< class T, tnlOperation operation > -T tnlCUDASimpleReduction1( const int size, - const T* d_input, - T* output = 0 ) +bool tnlCUDASimpleReduction1( const int size, + const T* device_input, + T& result, + T* device_output = 0 ) { //Calculate necessary block/grid dimensions const int cpuThreshold = 1; - const int desBlockSize = 128; //Desired block size - dim3 block_size = :: Min( size, desBlockSize ); - dim3 grid_size = size / blockSize. x; - unsigned int shmem = blockSize. x * sizeof( int ); - cout << "Grid size: " << grid_size. x << endl - << "Block size: " << block_size. x << endl - << "Shmem: " << shmem << endl; - if( ! output ) + const int desBlockSize = 256; //Desired block size + + bool device_output_allocated( false ); + if( ! device_output ) { - cudaMalloc( ( void** ) &output, grid_size * sizeof( T ) ); - if( cuda // TODO: add allocation check + int bytes_alloc = :: Max( 1, size / desBlockSize ) * sizeof( T ); + cudaMalloc( ( void** ) &device_output, bytes_alloc ); + if( cudaGetLastError() != cudaSuccess ) + { + cerr << "Unable to allocate device memory with size " << bytes_alloc << "." << endl; + return false; + } + device_output_allocated = true; } - tnlCUDASimpleReductionKernel1< T, operation ><<< grid_size, block_size, shmem >>>( size, input, output ); - int size_reduced = grid_size. x; - while( sizeReduced > cpuThreshold ) + dim3 block_size( 0 ), grid_size( 0 ); + int shmem; + int size_reduced = size; + const T* reduction_input = device_input; + while( size_reduced > cpuThreshold ) { - cout << "Reducing with size reduced = " << size_reduced << endl; block_size. x = :: Min( size_reduced, desBlockSize ); grid_size. x = size_reduced / block_size. x; - shmem = block_size. x * sizeof(int); - tnlCUDASimpleReductionKernel1< T, operation ><<< grid_size, block_size, shmem >>>( size, input, output ); + shmem = block_size. x * sizeof( T ); + /*cout << "Size: " << size_reduced + << " Grid size: " << grid_size. x + << " Block size: " << block_size. x + << " Shmem: " << shmem << endl;*/ + tnlCUDASimpleReductionKernel1< T, operation ><<< grid_size, block_size, shmem >>>( size_reduced, reduction_input, device_output ); size_reduced = grid_size. x; + reduction_input = device_output; + + // debuging part + /*T* output = new T[ size_reduced ]; + cudaMemcpy( output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); + cout << endl; + for( int i = 0; i < size_reduced; i ++ ) + cout << output[ i ] << " "; + cout << endl; + delete[] output;*/ } - int* host_output = new int[ size_reduced ]; - cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost ); - int result = host_output[ 0 ]; - for( int i = 1;i < size_reduced; i++ ) + T* host_output = new T[ size_reduced ]; + cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); + result = host_output[ 0 ]; + for( int i = 1; i < size_reduced; i++ ) { if( operation == tnlMin) result = :: Min( result, host_output[ i ] ); if( operation == tnlMax ) result = :: Max( result, host_output[ i ] ); if( operation == tnlSum ) - result += host_ouput[ i ]; + result += host_output[ i ]; } delete[] host_output; - - return result; + if( device_output_allocated ) + cudaFree( device_output ); + return true; } #endif /* HAVE_CUDA */ diff --git a/src/core/tnlCUDAKernelsTester.h b/src/core/tnlCUDAKernelsTester.h index 6d0ba3e330fa08b25e8787134887740c1b5df879..aa14fddc113459c512d1372d3460df1326c8df52 100644 --- a/src/core/tnlCUDAKernelsTester.h +++ b/src/core/tnlCUDAKernelsTester.h @@ -66,49 +66,161 @@ double tnlCUDAReductionSum( const int size, const int block_size, const int grid_size, const double* input ); +/* + * Simple reduction 5 + */ +bool tnlCUDASimpleReduction5Min( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction5Max( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction5Sum( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction5Min( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction5Max( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction5Sum( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction5Min( const int size, + const double* input, + double& result); +bool tnlCUDASimpleReduction5Max( const int size, + const double* input, + double& result ); +bool tnlCUDASimpleReduction5Sum( const int size, + const double* input, + double& result ); + + +/* + * Simple reduction 4 + */ +bool tnlCUDASimpleReduction4Min( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction4Max( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction4Sum( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction4Min( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction4Max( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction4Sum( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction4Min( const int size, + const double* input, + double& result); +bool tnlCUDASimpleReduction4Max( const int size, + const double* input, + double& result ); +bool tnlCUDASimpleReduction4Sum( const int size, + const double* input, + double& result ); + +/* + * Simple reduction 3 + */ +bool tnlCUDASimpleReduction3Min( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction3Max( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction3Sum( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction3Min( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction3Max( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction3Sum( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction3Min( const int size, + const double* input, + double& result); +bool tnlCUDASimpleReduction3Max( const int size, + const double* input, + double& result ); +bool tnlCUDASimpleReduction3Sum( const int size, + const double* input, + double& result ); + +/* + * Simple reduction 2 + */ +bool tnlCUDASimpleReduction2Min( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction2Max( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction2Sum( const int size, + const int* input, + int& result ); +bool tnlCUDASimpleReduction2Min( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction2Max( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction2Sum( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction2Min( const int size, + const double* input, + double& result); +bool tnlCUDASimpleReduction2Max( const int size, + const double* input, + double& result ); +bool tnlCUDASimpleReduction2Sum( const int size, + const double* input, + double& result ); /* * Simple reduction 1 */ -int tnlCUDASimpleReduction1Min( const int size, - const int block_size, - const int grid_size, +bool tnlCUDASimpleReduction1Min( const int size, const int* input, - int* output ); -int tnlCUDASimpleReduction1Max( const int size, - const int block_size, - const int grid_size, + int& result ); +bool tnlCUDASimpleReduction1Max( const int size, const int* input, - int* output ); -int tnlCUDASimpleReduction1Sum( const int size, - const int block_size, - const int grid_size, + int& result ); +bool tnlCUDASimpleReduction1Sum( const int size, const int* input, - int* output ); -float tnlCUDASimpleReduction1Min( const int size, - const int block_size, - const int grid_size, - const float* input ); -float tnlCUDASimpleReduction1Max( const int size, - const int block_size, - const int grid_size, - const float* input ); -float tnlCUDASimpleReduction1Sum( const int size, - const int block_size, - const int grid_size, - const float* input ); -double tnlCUDASimpleReduction1Min( const int size, - const int block_size, - const int grid_size, - const double* input ); -double tnlCUDASimpleReduction1Max( const int size, - const int block_size, - const int grid_size, - const double* input ); -double tnlCUDASimpleReduction1Sum( const int size, - const int block_size, - const int grid_size, - const double* input ); + int& result ); +bool tnlCUDASimpleReduction1Min( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction1Max( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction1Sum( const int size, + const float* input, + float& result); +bool tnlCUDASimpleReduction1Min( const int size, + const double* input, + double& result); +bool tnlCUDASimpleReduction1Max( const int size, + const double* input, + double& result ); +bool tnlCUDASimpleReduction1Sum( const int size, + const double* input, + double& result ); #endif @@ -125,57 +237,71 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase { CppUnit :: TestSuite* suiteOfTests = new CppUnit :: TestSuite( "tnlCUDAKernelsTester" ); CppUnit :: TestResult result; + suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >( "testSimpleReduction1", & tnlCUDAKernelsTester< T > :: testSimpleReduction1 ) ); suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >( + "testSimpleReduction2", + & tnlCUDAKernelsTester< T > :: testSimpleReduction2 ) + ); + suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >( + "testSimpleReduction3", + & tnlCUDAKernelsTester< T > :: testSimpleReduction3 ) + ); + suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >( + "testSimpleReduction4", + & tnlCUDAKernelsTester< T > :: testSimpleReduction4 ) + ); + suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >( + "testSimpleReduction5", + & tnlCUDAKernelsTester< T > :: testSimpleReduction5 ) + ); + /*suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >( "testReduction", & tnlCUDAKernelsTester< T > :: testFastReduction ) - ); + );*/ return suiteOfTests; }; bool testSetup( tnlLongVector< T >& host_input, - tnlLongVector< T >& host_output, tnlLongVectorCUDA< T >& device_input, - tnlLongVectorCUDA< T >& device_output, int size ) { if( ! host_input. SetNewSize( size ) ) return false; - if( ! host_output. SetNewSize( size ) ) - return false; if( ! device_input. SetNewSize( size ) ) return false; - if( ! device_output. SetNewSize( size ) ) - return false; for( int i=0; i < size; i ++ ) - { host_input[ i ] = i + 1; - host_output[ i ] = 0; - } + device_input. copyFrom( host_input ); return true; } void testReduction( int algorithm_efficiency = 0 ) { - int size = 1<<16; + int size = 1<<10; int desBlockSize = 128; //Desired block size int desGridSize = 2048; //Impose limitation on grid size so that threads could perform sequential work - tnlLongVector< T > host_input, host_output; - tnlLongVectorCUDA< T > device_input, device_output; + tnlLongVector< T > host_input; + tnlLongVectorCUDA< T > device_input; CPPUNIT_ASSERT( testSetup( host_input, - host_output, device_input, - device_output, size ) ); - - + T seq_min( host_input[ 0 ] ), + seq_max( host_input[ 0 ] ), + seq_sum( host_input[ 0 ] ); + for( int i = 1; i < size; i ++ ) + { + seq_min = :: Min( seq_min, host_input[ i ] ); + seq_max = :: Max( seq_max, host_input[ i ] ); + seq_sum += host_input[ i ]; + } //Calculate necessary block/grid dimensions int block_size = :: Min( size/2, desBlockSize ); @@ -186,9 +312,29 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase switch( algorithm_efficiency ) { case 1: - min = tnlCUDASimpleReduction1Min( size, block_size, grid_size, device_input. Data(), device_output. Data() ); - max = tnlCUDASimpleReduction1Max( size, block_size, grid_size, device_input. Data(), device_output. Data() ); - sum = tnlCUDASimpleReduction1Sum( size, block_size, grid_size, device_input. Data(), device_output. Data() ); + tnlCUDASimpleReduction1Min( size, device_input. Data(), min ); + tnlCUDASimpleReduction1Max( size, device_input. Data(), max ); + tnlCUDASimpleReduction1Sum( size, device_input. Data(), sum ); + break; + case 2: + tnlCUDASimpleReduction2Min( size, device_input. Data(), min ); + tnlCUDASimpleReduction2Max( size, device_input. Data(), max ); + tnlCUDASimpleReduction2Sum( size, device_input. Data(), sum ); + break; + case 3: + tnlCUDASimpleReduction3Min( size, device_input. Data(), min ); + tnlCUDASimpleReduction3Max( size, device_input. Data(), max ); + tnlCUDASimpleReduction3Sum( size, device_input. Data(), sum ); + break; + case 4: + tnlCUDASimpleReduction4Min( size, device_input. Data(), min ); + tnlCUDASimpleReduction4Max( size, device_input. Data(), max ); + tnlCUDASimpleReduction4Sum( size, device_input. Data(), sum ); + break; + case 5: + tnlCUDASimpleReduction5Min( size, device_input. Data(), min ); + tnlCUDASimpleReduction5Max( size, device_input. Data(), max ); + tnlCUDASimpleReduction5Sum( size, device_input. Data(), sum ); break; default: min = tnlCUDAReductionMin( size, block_size, grid_size, device_input. Data() ); @@ -196,14 +342,44 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase sum = tnlCUDAReductionSum( size, block_size, grid_size, device_input. Data() ); } - cout << "Min: " << min << endl - << "Max: " << max << endl - << "Sum: " << sum << endl; + cout << "Min: " << min << " Seq. min: " << seq_min << endl + << "Max: " << max << " Seq. max: " << seq_max << endl + << "Sum: " << sum << " Seq. sum: " << seq_sum << endl; + + CPPUNIT_ASSERT( min == seq_min ); + CPPUNIT_ASSERT( max == seq_max ); + CPPUNIT_ASSERT( sum == seq_sum ); + + }; + + void testSimpleReduction5() + { + cout << "Test reduction 5" << endl; + testReduction( 5 ); + }; + + void testSimpleReduction4() + { + cout << "Test reduction 4" << endl; + testReduction( 4 ); + }; + + void testSimpleReduction3() + { + cout << "Test reduction 3" << endl; + testReduction( 3 ); + }; + + void testSimpleReduction2() + { + cout << "Test reduction 2" << endl; + testReduction( 2 ); }; void testSimpleReduction1() { + cout << "Test reduction 1" << endl; testReduction( 1 ); }; diff --git a/src/tnl-unit-tests.cpp b/src/tnl-unit-tests.cpp index 0e4be63e7064c1421668db1c34b908c634363b96..e75f2578d89ed83484577c73f7d55e55b67ff502 100644 --- a/src/tnl-unit-tests.cpp +++ b/src/tnl-unit-tests.cpp @@ -44,8 +44,8 @@ int main( int argc, char* argv[] ) runner.addTest( tnlGridCUDA2DTester< double > :: suite() ); runner.addTest( tnlCUDAKernelsTester< int > :: suite() ); - //runner.addTest( tnlCUDAKernelsTester< float > :: suite() ); - //runner.addTest( tnlCUDAKernelsTester< double > :: suite() ); + runner.addTest( tnlCUDAKernelsTester< float > :: suite() ); + runner.addTest( tnlCUDAKernelsTester< double > :: suite() ); runner.run(); return 0;