diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index 2686ec83119569e4cc263781b3fa8045460c872a..e506c547a44d75369730954090a5c09939884166 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -57,9 +57,9 @@ CudaReductionKernel( Operation operation, * gridSize is the number of element processed by all blocks at the * same time. */ - IndexType tid = threadIdx. x; - IndexType gid = blockIdx. x * blockDim. x + threadIdx. x; - IndexType gridSize = blockDim. x * gridDim.x; + const IndexType tid = threadIdx.x; + IndexType gid = blockIdx.x * blockDim.x + threadIdx.x; + const IndexType gridSize = blockDim.x * gridDim.x; sdata[ tid ] = operation.initialValue(); /*** @@ -72,13 +72,13 @@ CudaReductionKernel( Operation operation, operation.cudaFirstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + 2 * gridSize, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + 3 * gridSize, input1, input2 ); - gid += 4*gridSize; + gid += 4 * gridSize; } while( gid + 2 * gridSize < size ) { operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); - gid += 2*gridSize; + gid += 2 * gridSize; } while( gid < size ) { @@ -260,8 +260,8 @@ CudaReductionKernelLauncher( Operation& operation, cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 4 >, cudaFuncCachePreferShared); CudaReductionKernel< Operation, 4 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); - break; + <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); + break; case 2: cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 2 >, cudaFuncCachePreferShared); diff --git a/src/TNL/Containers/Algorithms/Multireduction_impl.h b/src/TNL/Containers/Algorithms/Multireduction_impl.h 
index cb7efdfa274fc73ff0b3265621d83e2fab8c2646..1eff176f014355a78afc7f0207d086a0f218e8c4 100644 --- a/src/TNL/Containers/Algorithms/Multireduction_impl.h +++ b/src/TNL/Containers/Algorithms/Multireduction_impl.h @@ -103,7 +103,7 @@ reduce( Operation& operation, deviceAux1 ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); - cout << " Multireduction of " << n << " datasets on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << endl; + std::cout << " Multireduction of " << n << " datasets on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl; timer.reset(); timer.start(); #endif @@ -117,18 +117,18 @@ reduce( Operation& operation, #ifdef CUDA_REDUCTION_PROFILING timer.stop(); - cout << " Transferring data to CPU took " << timer.getRealTime() << " sec. " << endl; + std::cout << " Transferring data to CPU took " << timer.getRealTime() << " sec. " << std::endl; timer.reset(); timer.start(); #endif -// cout << "resultArray = ["; +// std::cout << "resultArray = ["; // for( int i = 0; i < n * reducedSize; i++ ) { -// cout << resultArray[ i ]; +// std::cout << resultArray[ i ]; // if( i < n * reducedSize - 1 ) -// cout << ", "; +// std::cout << ", "; // } -// cout << "]" << endl; +// std::cout << "]" << std::endl; /*** * Reduce the data on the host system. @@ -138,7 +138,7 @@ reduce( Operation& operation, #ifdef CUDA_REDUCTION_PROFILING timer.stop(); - cout << " Multireduction of small data set on CPU took " << timer.getRealTime() << " sec. " << endl; + std::cout << " Multireduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl;
#endif return checkCudaDevice; diff --git a/src/TNL/Containers/Algorithms/Reduction_impl.h b/src/TNL/Containers/Algorithms/Reduction_impl.h index 64c503c106565ce654f8830d4b484f521df65fc6..bbc312da0c574c90cfaa7ab637a5c8efaf035dc6 100644 --- a/src/TNL/Containers/Algorithms/Reduction_impl.h +++ b/src/TNL/Containers/Algorithms/Reduction_impl.h @@ -36,9 +36,6 @@ namespace Algorithms { */ const int minGPUReductionDataSize = 256;//65536; //16384;//1024;//256; -#ifdef HAVE_CUDA -#endif - template< typename Operation > bool reductionOnCudaDevice( Operation& operation,