Commit 639df716 authored by Jakub Klinkovský
Browse files

Fixed bug in parallel reduction on CUDA

parent ca8e0b12
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -39,7 +39,7 @@ static constexpr int Multireduction_maxThreadsPerBlock = 256; // must be a powe
template< typename Operation, int blockSizeX >      
__global__ void
__launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor )
-CudaMultireductionKernel( Operation& operation,
+CudaMultireductionKernel( Operation operation,
                          const typename Operation::IndexType n,
                          const typename Operation::IndexType size,
                          const typename Operation::RealType* input1,
+1 −1
Original line number Diff line number Diff line
@@ -39,7 +39,7 @@ static constexpr int Reduction_maxThreadsPerBlock = 256; // must be a power of
template< typename Operation, int blockSize >
__global__ void
__launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
-CudaReductionKernel( Operation& operation,
+CudaReductionKernel( Operation operation,
                     const typename Operation::IndexType size,
                     const typename Operation::RealType* input1,
                     const typename Operation::RealType* input2,
+1 −1
Original line number Diff line number Diff line
@@ -98,7 +98,7 @@ reductionOnCudaDevice( Operation& operation,
   /***
    * Transfer the reduced data from device to host.
    */
-   ResultType resultArray[ minGPUReductionDataSize ];
+   ResultType resultArray[ reducedSize ];
   if( ! Containers::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< ResultType, ResultType, IndexType >( resultArray, deviceAux1, reducedSize ) )
      return false;