Loading src/TNL/Containers/Algorithms/Reduction_impl.h +66 −29 Original line number Diff line number Diff line Loading @@ -51,11 +51,18 @@ reductionOnCudaDevice( Operation& operation, typedef typename Operation::ResultType ResultType; typedef typename Operation::LaterReductionOperation LaterReductionOperation; /*** * Only fundamental and pointer types can be safely reduced on host. Complex * objects stored on the device might contain pointers into the device memory, * in which case reduction on host might fail. */ constexpr bool can_copy_to_host = std::is_fundamental< RealType >::value || std::is_pointer< RealType >::value; /*** * First check if the input array(s) is/are large enough for the reduction on GPU. * Otherwise copy it/them to host and reduce on CPU. */ if( size <= minGPUReductionDataSize ) if( can_copy_to_host && size <= minGPUReductionDataSize ) { RealType hostArray1[ minGPUReductionDataSize ]; RealType hostArray2[ minGPUReductionDataSize ]; Loading Loading @@ -92,6 +99,7 @@ reductionOnCudaDevice( Operation& operation, timer.start(); #endif if( can_copy_to_host ) { /*** * Transfer the reduced data from device to host. */ Loading @@ -102,9 +110,6 @@ reductionOnCudaDevice( Operation& operation, #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Transferring data to CPU took " << timer.getRealTime() << " sec. " << std::endl; #endif #ifdef CUDA_REDUCTION_PROFILING timer.reset(); timer.start(); #endif Loading @@ -121,6 +126,38 @@ reductionOnCudaDevice( Operation& operation, timer.stop(); std::cout << " Reduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl; #endif } else { /*** * Data can't be safely reduced on host, so continue with the reduction on the CUDA device. */ LaterReductionOperation laterReductionOperation; while( reducedSize > 1 ) { // TODO: copy the intermediate result somewhere else, in-place reduction probably does not work reducedSize = CudaReductionKernelLauncher( laterReductionOperation, reducedSize, deviceAux1, (ResultType*) 0, deviceAux1 ); } #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Reduction of small data set on GPU took " << timer.getRealTime() << " sec. " << std::endl; timer.reset(); timer.start(); #endif ResultType resultArray[ 1 ]; if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< ResultType, ResultType, IndexType >( resultArray, deviceAux1, reducedSize ) ) return false; result = resultArray[ 0 ]; #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Transferring the result to CPU took " << timer.getRealTime() << " sec. " << std::endl; #endif } return checkCudaDevice; #else Loading Loading
src/TNL/Containers/Algorithms/Reduction_impl.h +66 −29 Original line number Diff line number Diff line Loading @@ -51,11 +51,18 @@ reductionOnCudaDevice( Operation& operation, typedef typename Operation::ResultType ResultType; typedef typename Operation::LaterReductionOperation LaterReductionOperation; /*** * Only fundamental and pointer types can be safely reduced on host. Complex * objects stored on the device might contain pointers into the device memory, * in which case reduction on host might fail. */ constexpr bool can_copy_to_host = std::is_fundamental< RealType >::value || std::is_pointer< RealType >::value; /*** * First check if the input array(s) is/are large enough for the reduction on GPU. * Otherwise copy it/them to host and reduce on CPU. */ if( size <= minGPUReductionDataSize ) if( can_copy_to_host && size <= minGPUReductionDataSize ) { RealType hostArray1[ minGPUReductionDataSize ]; RealType hostArray2[ minGPUReductionDataSize ]; Loading Loading @@ -92,6 +99,7 @@ reductionOnCudaDevice( Operation& operation, timer.start(); #endif if( can_copy_to_host ) { /*** * Transfer the reduced data from device to host. */ Loading @@ -102,9 +110,6 @@ reductionOnCudaDevice( Operation& operation, #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Transferring data to CPU took " << timer.getRealTime() << " sec. " << std::endl; #endif #ifdef CUDA_REDUCTION_PROFILING timer.reset(); timer.start(); #endif Loading @@ -121,6 +126,38 @@ reductionOnCudaDevice( Operation& operation, timer.stop(); std::cout << " Reduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl; #endif } else { /*** * Data can't be safely reduced on host, so continue with the reduction on the CUDA device. */ LaterReductionOperation laterReductionOperation; while( reducedSize > 1 ) { // TODO: copy the intermediate result somewhere else, in-place reduction probably does not work reducedSize = CudaReductionKernelLauncher( laterReductionOperation, reducedSize, deviceAux1, (ResultType*) 0, deviceAux1 ); } #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Reduction of small data set on GPU took " << timer.getRealTime() << " sec. " << std::endl; timer.reset(); timer.start(); #endif ResultType resultArray[ 1 ]; if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< ResultType, ResultType, IndexType >( resultArray, deviceAux1, reducedSize ) ) return false; result = resultArray[ 0 ]; #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Transferring the result to CPU took " << timer.getRealTime() << " sec. " << std::endl; #endif } return checkCudaDevice; #else Loading