Added missing TNL_CHECK_CUDA_DEVICE and fixed stream synchronization after segments CUDA kernels (ce8f92fc) · Commits · TNL / tnl-dev

src/TNL/Algorithms/Segments/BiEllpackView.hpp

+2 −2

Original line number	Diff line number	Diff line
		@@ -428,9 +428,9 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction&
		detail::BiEllpackreduceSegmentsKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim >
		<<< cudaGridSize, cudaBlockSize, sharedMemory >>>
		( *this, gridIdx, first, last, fetch, reduction, keeper, zero );
		cudaThreadSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}
		cudaStreamSynchronize(0);
		TNL_CHECK_CUDA_DEVICE;
		#endif
		}
		}

+2 −0

Original line number	Diff line number	Diff line
		@@ -460,6 +460,8 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction&
		<<< cudaGridSize, cudaBlockSize, sharedMemory >>>
		( *this, gridIdx, first, last, fetch, reduction, keeper, zero );
		}
		cudaStreamSynchronize(0);
		TNL_CHECK_CUDA_DEVICE;
		#endif
		}
		}

+4 −2

Original line number	Diff line number	Diff line
		@@ -111,7 +111,8 @@ struct EllpackCudaReductionDispatcher
		dim3 blockSize( 256 );
		dim3 gridSize( blocksCount );
		EllpackCudaReductionKernelFull<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize );
		cudaDeviceSynchronize();
		cudaStreamSynchronize(0);
		TNL_CHECK_CUDA_DEVICE;
		#endif
		}
		};
		@@ -133,7 +134,8 @@ struct EllpackCudaReductionDispatcher< Index, Fetch, Reduction, ResultKeeper, Re
		dim3 blockSize( 256 );
		dim3 gridSize( blocksCount );
		EllpackCudaReductionKernelCompact<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize );
		cudaDeviceSynchronize();
		cudaStreamSynchronize(0);
		TNL_CHECK_CUDA_DEVICE;
		#endif
		}
		};

+2 −0

Original line number	Diff line number	Diff line
		@@ -272,6 +272,8 @@ struct CSRAdaptiveKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reductio
		zero,
		args... );
		}
		cudaStreamSynchronize(0);
		TNL_CHECK_CUDA_DEVICE;
		#endif
		}
		};

+2 −0

Original line number	Diff line number	Diff line
		@@ -297,6 +297,8 @@ reduceSegments( const OffsetsView& offsets,
		throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) );
		}
		}
		cudaStreamSynchronize(0);
		TNL_CHECK_CUDA_DEVICE;
		#endif
		}