Loading src/TNL/Containers/Algorithms/cuda-reduction_impl.h +21 −18 Original line number Diff line number Diff line Loading @@ -15,7 +15,6 @@ #ifdef HAVE_CUDA #include <cuda.h> #endif #include <iostream> #include <TNL/Assert.h> #include <TNL/Containers/Algorithms/reduction-operations.h> #include <TNL/Containers/ArrayOperations.h> Loading @@ -24,6 +23,7 @@ #include <TNL/Containers/Algorithms/CudaReduction.h> #ifdef CUDA_REDUCTION_PROFILING #include <iostream> #include <TNL/Timer.h> #endif Loading @@ -42,7 +42,8 @@ const int minGPUReductionDataSize = 256;//65536; //16384;//1024;//256; #ifdef HAVE_CUDA template< typename Operation, int blockSize > __global__ void CudaReductionKernel( Operation operation, __global__ void CudaReductionKernel( Operation operation, const typename Operation::IndexType size, const typename Operation::RealType* input1, const typename Operation::RealType* input2, Loading @@ -53,7 +54,8 @@ __global__ void CudaReductionKernel( Operation operation, }; template< typename Operation > typename Operation::IndexType reduceOnCudaDevice( Operation& operation, typename Operation::IndexType reduceOnCudaDevice( Operation& operation, const typename Operation::IndexType size, const typename Operation::RealType* input1, const typename Operation::RealType* input2, Loading Loading @@ -127,7 +129,8 @@ typename Operation::IndexType reduceOnCudaDevice( Operation& operation, #endif template< typename Operation > bool reductionOnCudaDevice( Operation& operation, bool reductionOnCudaDevice( Operation& operation, const typename Operation::IndexType size, const typename Operation::RealType* deviceInput1, const typename Operation::RealType* deviceInput2, Loading @@ -144,10 +147,10 @@ bool reductionOnCudaDevice( Operation& operation, * First check if the input array(s) is/are large enough for the reduction on GPU. * Otherwise copy it/them to host and reduce on CPU. */ RealType hostArray1[ minGPUReductionDataSize ]; RealType hostArray2[ minGPUReductionDataSize ]; if( size <= minGPUReductionDataSize ) { RealType hostArray1[ minGPUReductionDataSize ]; RealType hostArray2[ minGPUReductionDataSize ]; if( ! Containers::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( hostArray1, deviceInput1, size ) ) return false; if( deviceInput2 && ! Loading Loading
src/TNL/Containers/Algorithms/cuda-reduction_impl.h +21 −18 Original line number Diff line number Diff line Loading @@ -15,7 +15,6 @@ #ifdef HAVE_CUDA #include <cuda.h> #endif #include <iostream> #include <TNL/Assert.h> #include <TNL/Containers/Algorithms/reduction-operations.h> #include <TNL/Containers/ArrayOperations.h> Loading @@ -24,6 +23,7 @@ #include <TNL/Containers/Algorithms/CudaReduction.h> #ifdef CUDA_REDUCTION_PROFILING #include <iostream> #include <TNL/Timer.h> #endif Loading @@ -42,7 +42,8 @@ const int minGPUReductionDataSize = 256;//65536; //16384;//1024;//256; #ifdef HAVE_CUDA template< typename Operation, int blockSize > __global__ void CudaReductionKernel( Operation operation, __global__ void CudaReductionKernel( Operation operation, const typename Operation::IndexType size, const typename Operation::RealType* input1, const typename Operation::RealType* input2, Loading @@ -53,7 +54,8 @@ __global__ void CudaReductionKernel( Operation operation, }; template< typename Operation > typename Operation::IndexType reduceOnCudaDevice( Operation& operation, typename Operation::IndexType reduceOnCudaDevice( Operation& operation, const typename Operation::IndexType size, const typename Operation::RealType* input1, const typename Operation::RealType* input2, Loading Loading @@ -127,7 +129,8 @@ typename Operation::IndexType reduceOnCudaDevice( Operation& operation, #endif template< typename Operation > bool reductionOnCudaDevice( Operation& operation, bool reductionOnCudaDevice( Operation& operation, const typename Operation::IndexType size, const typename Operation::RealType* deviceInput1, const typename Operation::RealType* deviceInput2, Loading @@ -144,10 +147,10 @@ bool reductionOnCudaDevice( Operation& operation, * First check if the input array(s) is/are large enough for the reduction on GPU. * Otherwise copy it/them to host and reduce on CPU. */ RealType hostArray1[ minGPUReductionDataSize ]; RealType hostArray2[ minGPUReductionDataSize ]; if( size <= minGPUReductionDataSize ) { RealType hostArray1[ minGPUReductionDataSize ]; RealType hostArray2[ minGPUReductionDataSize ]; if( ! Containers::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( hostArray1, deviceInput1, size ) ) return false; if( deviceInput2 && ! Loading