Commit 28938623 authored by Jakub Klinkovský
Browse files

Non-fundamental types must be reduced completely on the CUDA device

parent 3c03735e
Loading
Loading
Loading
Loading
+66 −29
Original line number Diff line number Diff line
@@ -51,11 +51,18 @@ reductionOnCudaDevice( Operation& operation,
   typedef typename Operation::ResultType ResultType;
   typedef typename Operation::LaterReductionOperation LaterReductionOperation;
 
   /***
    * Only fundamental and pointer types can be safely reduced on host. Complex
    * objects stored on the device might contain pointers into the device memory,
    * in which case reduction on host might fail.
    */
   constexpr bool can_copy_to_host = std::is_fundamental< RealType >::value || std::is_pointer< RealType >::value;

   /***
    * First check if the input array(s) is/are large enough for the reduction on GPU.
    * Otherwise copy it/them to host and reduce on CPU.
    */
   if( size <= minGPUReductionDataSize )
   if( can_copy_to_host && size <= minGPUReductionDataSize )
   {
      RealType hostArray1[ minGPUReductionDataSize ];
      RealType hostArray2[ minGPUReductionDataSize ];
@@ -92,6 +99,7 @@ reductionOnCudaDevice( Operation& operation,
      timer.start();
   #endif

   if( can_copy_to_host ) {
      /***
       * Transfer the reduced data from device to host.
       */
@@ -102,9 +110,6 @@ reductionOnCudaDevice( Operation& operation,
      #ifdef CUDA_REDUCTION_PROFILING
         timer.stop();
         std::cout << "   Transferring data to CPU took " << timer.getRealTime() << " sec. " << std::endl;
   #endif

   #ifdef CUDA_REDUCTION_PROFILING
         timer.reset();
         timer.start();
      #endif
@@ -121,6 +126,38 @@ reductionOnCudaDevice( Operation& operation,
         timer.stop();
         std::cout << "   Reduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl;
      #endif
   }
   else {
      /***
       * Data can't be safely reduced on host, so continue with the reduction on the CUDA device.
       */
      LaterReductionOperation laterReductionOperation;
      while( reducedSize > 1 ) {
         // TODO: copy the intermediate result somewhere else, in-place reduction probably does not work
         reducedSize = CudaReductionKernelLauncher( laterReductionOperation,
                                                    reducedSize,
                                                    deviceAux1,
                                                    (ResultType*) 0,
                                                    deviceAux1 );
      }

      #ifdef CUDA_REDUCTION_PROFILING
         timer.stop();
         std::cout << "   Reduction of small data set on GPU took " << timer.getRealTime() << " sec. " << std::endl;
         timer.reset();
         timer.start();
      #endif

      ResultType resultArray[ 1 ];
      if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< ResultType, ResultType, IndexType >( resultArray, deviceAux1, reducedSize ) )
         return false;
      result = resultArray[ 0 ];

      #ifdef CUDA_REDUCTION_PROFILING
         timer.stop();
         std::cout << "   Transferring the result to CPU took " << timer.getRealTime() << " sec. " << std::endl;
      #endif
   }
 
   return checkCudaDevice;
#else