Commit af6d1d6b authored by Jakub Klinkovský
Browse files

Added default stream synchronizations after kernel launches in CudaPrefixSumKernel.h

parent 8d0d2638
Loading
Loading
Loading
Loading
+9 −8
Original line number Original line Diff line number Diff line
@@ -220,7 +220,7 @@ struct CudaPrefixSumKernelLauncher
       */
       */
      const std::size_t sharedDataSize = elementsInBlock +
      const std::size_t sharedDataSize = elementsInBlock +
                                         elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2;
                                         elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2;
      const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize()  ) * sizeof( Real );
      const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real );
      cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>>
      cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>>
         ( prefixSumType_,
         ( prefixSumType_,
           reduction,
           reduction,
@@ -231,6 +231,7 @@ struct CudaPrefixSumKernelLauncher
           output,
           output,
           auxArray1.getData(),
           auxArray1.getData(),
           gridShift );
           gridShift );
      cudaStreamSynchronize(0);
      TNL_CHECK_CUDA_DEVICE;
      TNL_CHECK_CUDA_DEVICE;




@@ -260,6 +261,7 @@ struct CudaPrefixSumKernelLauncher
           gridShift,
           gridShift,
           auxArray2.getData(),
           auxArray2.getData(),
           output );
           output );
      cudaStreamSynchronize(0);
      TNL_CHECK_CUDA_DEVICE;
      TNL_CHECK_CUDA_DEVICE;


      cudaMemcpy( &gridShift,
      cudaMemcpy( &gridShift,
@@ -284,11 +286,11 @@ struct CudaPrefixSumKernelLauncher
   template< typename Reduction >
   template< typename Reduction >
   static void
   static void
   start( const Index size,
   start( const Index size,
      const Index blockSize,
          const Index blockSize,
      const Real *deviceInput,
          const Real *deviceInput,
      Real* deviceOutput,
          Real* deviceOutput,
      Reduction& reduction,
          Reduction& reduction,
      const Real& zero )
          const Real& zero )
   {
   {
      /****
      /****
       * Compute the number of grids
       * Compute the number of grids
@@ -323,11 +325,10 @@ struct CudaPrefixSumKernelLauncher
            gridShift,
            gridShift,
            &deviceInput[ gridOffset ],
            &deviceInput[ gridOffset ],
            &deviceOutput[ gridOffset ] );
            &deviceOutput[ gridOffset ] );
         TNL_CHECK_CUDA_DEVICE;
      }
      }


      /***
      /***
       * Store the number of CUDA grids for a purpose of unit testing, i.e.
       * Store the number of CUDA grids for the purpose of unit testing, i.e.
       * to check if we test the algorithm with more than one CUDA grid.
       * to check if we test the algorithm with more than one CUDA grid.
       */
       */
      gridsCount = numberOfGrids;
      gridsCount = numberOfGrids;