Added default stream synchronizations after kernel launches in CudaPrefixSumKernel.h (af6d1d6b) · Commits · TNL / tnl-dev

src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h

+9 −8

Original line number	Diff line number	Diff line
		@@ -231,6 +231,7 @@ struct CudaPrefixSumKernelLauncher
		output,
		auxArray1.getData(),
		gridShift );
		+ cudaStreamSynchronize(0);
		TNL_CHECK_CUDA_DEVICE;


		@@ -260,6 +261,7 @@ struct CudaPrefixSumKernelLauncher
		gridShift,
		auxArray2.getData(),
		output );
		+ cudaStreamSynchronize(0);
		TNL_CHECK_CUDA_DEVICE;

		cudaMemcpy( &gridShift,
		@@ -323,11 +325,10 @@ struct CudaPrefixSumKernelLauncher
		gridShift,
		&deviceInput[ gridOffset ],
		&deviceOutput[ gridOffset ] );
		TNL_CHECK_CUDA_DEVICE;
		}

		/***
		- * Store the number of CUDA grids for a purpose of unit testing, i.e.
		+ * Store the number of CUDA grids for the purpose of unit testing, i.e.
		* to check if we test the algorithm with more than one CUDA grid.
		*/
		gridsCount = numberOfGrids;