Non-fundamental types must be reduced completely on the CUDA device (28938623) · Commits · TNL / tnl-dev

src/TNL/Containers/Algorithms/Reduction_impl.h

+66 −29

Original line number	Diff line number	Diff line
		@@ -51,11 +51,18 @@ reductionOnCudaDevice( Operation& operation,
		typedef typename Operation::ResultType ResultType;
		typedef typename Operation::LaterReductionOperation LaterReductionOperation;

		/***
		* Only fundamental and pointer types can be safely reduced on host. Complex
		* objects stored on the device might contain pointers into the device memory,
		* in which case reduction on host might fail.
		*/
		constexpr bool can_copy_to_host = std::is_fundamental< RealType >::value || std::is_pointer< RealType >::value;

		/***
		* First check if the input array(s) is/are large enough for the reduction on GPU.
		* Otherwise copy it/them to host and reduce on CPU.
		*/
		if( size <= minGPUReductionDataSize )
		if( can_copy_to_host && size <= minGPUReductionDataSize )
		{
		RealType hostArray1[ minGPUReductionDataSize ];
		RealType hostArray2[ minGPUReductionDataSize ];
		@@ -92,6 +99,7 @@ reductionOnCudaDevice( Operation& operation,
		timer.start();
		#endif

		if( can_copy_to_host ) {
		/***
		* Transfer the reduced data from device to host.
		*/
		@@ -102,9 +110,6 @@ reductionOnCudaDevice( Operation& operation,
		#ifdef CUDA_REDUCTION_PROFILING
		timer.stop();
		std::cout << " Transferring data to CPU took " << timer.getRealTime() << " sec. " << std::endl;
		#endif

		#ifdef CUDA_REDUCTION_PROFILING
		timer.reset();
		timer.start();
		#endif
		@@ -121,6 +126,38 @@ reductionOnCudaDevice( Operation& operation,
		timer.stop();
		std::cout << " Reduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl;
		#endif
		}
		else {
		/***
		* Data can't be safely reduced on host, so continue with the reduction on the CUDA device.
		*/
		LaterReductionOperation laterReductionOperation;
		while( reducedSize > 1 ) {
		// TODO: copy the intermediate result somewhere else, in-place reduction probably does not work
		reducedSize = CudaReductionKernelLauncher( laterReductionOperation,
		reducedSize,
		deviceAux1,
		(ResultType*) 0,
		deviceAux1 );
		}

		#ifdef CUDA_REDUCTION_PROFILING
		timer.stop();
		std::cout << " Reduction of small data set on GPU took " << timer.getRealTime() << " sec. " << std::endl;
		timer.reset();
		timer.start();
		#endif

		ResultType resultArray[ 1 ];
		if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< ResultType, ResultType, IndexType >( resultArray, deviceAux1, reducedSize ) )
		return false;
		result = resultArray[ 0 ];

		#ifdef CUDA_REDUCTION_PROFILING
		timer.stop();
		std::cout << " Transferring the result to CPU took " << timer.getRealTime() << " sec. " << std::endl;
		#endif
		}

		return checkCudaDevice;
		#else