Loading src/TNL/Algorithms/Segments/BiEllpackView.hpp +2 −2 Original line number Diff line number Diff line Loading @@ -428,9 +428,9 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& detail::BiEllpackreduceSegmentsKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( *this, gridIdx, first, last, fetch, reduction, keeper, zero ); cudaThreadSynchronize(); TNL_CHECK_CUDA_DEVICE; } cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #endif } } Loading src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp +2 −0 Original line number Diff line number Diff line Loading @@ -460,6 +460,8 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( *this, gridIdx, first, last, fetch, reduction, keeper, zero ); } cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #endif } } Loading src/TNL/Algorithms/Segments/EllpackView.hpp +4 −2 Original line number Diff line number Diff line Loading @@ -111,7 +111,8 @@ struct EllpackCudaReductionDispatcher dim3 blockSize( 256 ); dim3 gridSize( blocksCount ); EllpackCudaReductionKernelFull<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize ); cudaDeviceSynchronize(); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #endif } }; Loading @@ -133,7 +134,8 @@ struct EllpackCudaReductionDispatcher< Index, Fetch, Reduction, ResultKeeper, Re dim3 blockSize( 256 ); dim3 gridSize( blocksCount ); EllpackCudaReductionKernelCompact<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize ); cudaDeviceSynchronize(); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #endif } }; Loading src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp +2 −0 Original line number Diff line number Diff line Loading @@ -272,6 +272,8 @@ struct CSRAdaptiveKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reductio zero, args... ); } cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #endif } }; Loading src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp +2 −0 Original line number Diff line number Diff line Loading @@ -297,6 +297,8 @@ reduceSegments( const OffsetsView& offsets, throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) ); } } cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #endif } Loading Loading
src/TNL/Algorithms/Segments/BiEllpackView.hpp +2 −2 Original line number Diff line number Diff line Loading @@ -428,9 +428,9 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& detail::BiEllpackreduceSegmentsKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( *this, gridIdx, first, last, fetch, reduction, keeper, zero ); cudaThreadSynchronize(); TNL_CHECK_CUDA_DEVICE; } cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #endif } } Loading
src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp +2 −0 Original line number Diff line number Diff line Loading @@ -460,6 +460,8 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( *this, gridIdx, first, last, fetch, reduction, keeper, zero ); } cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #endif } } Loading
src/TNL/Algorithms/Segments/EllpackView.hpp +4 −2 Original line number Diff line number Diff line Loading @@ -111,7 +111,8 @@ struct EllpackCudaReductionDispatcher dim3 blockSize( 256 ); dim3 gridSize( blocksCount ); EllpackCudaReductionKernelFull<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize ); cudaDeviceSynchronize(); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #endif } }; Loading @@ -133,7 +134,8 @@ struct EllpackCudaReductionDispatcher< Index, Fetch, Reduction, ResultKeeper, Re dim3 blockSize( 256 ); dim3 gridSize( blocksCount ); EllpackCudaReductionKernelCompact<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize ); cudaDeviceSynchronize(); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #endif } }; Loading
src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp +2 −0 Original line number Diff line number Diff line Loading @@ -272,6 +272,8 @@ struct CSRAdaptiveKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reductio zero, args... ); } cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #endif } }; Loading
src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp +2 −0 Original line number Diff line number Diff line Loading @@ -297,6 +297,8 @@ reduceSegments( const OffsetsView& offsets, throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) ); } } cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #endif } Loading