Loading src/TNL/Algorithms/Segments/BiEllpackView.hpp +2 −2 Original line number Diff line number Diff line Loading @@ -428,9 +428,9 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& detail::BiEllpackreduceSegmentsKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( *this, gridIdx, first, last, fetch, reduction, keeper, zero ); cudaThreadSynchronize(); TNL_CHECK_CUDA_DEVICE; } cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #endif } } Loading src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp +2 −0 Original line number Diff line number Diff line Loading @@ -460,6 +460,8 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( *this, gridIdx, first, last, fetch, reduction, keeper, zero ); } cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #endif } } Loading src/TNL/Algorithms/Segments/EllpackView.hpp +8 −2 Original line number Diff line number Diff line Loading @@ -105,13 +105,16 @@ struct EllpackCudaReductionDispatcher exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize ) { #ifdef HAVE_CUDA if( last <= first ) return; const Index segmentsCount = last - first; const Index threadsCount = segmentsCount * 32; const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 ); dim3 blockSize( 256 ); dim3 gridSize( blocksCount ); EllpackCudaReductionKernelFull<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize ); cudaDeviceSynchronize(); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #endif } }; Loading @@ -127,13 +130,16 @@ struct EllpackCudaReductionDispatcher< Index, Fetch, Reduction, ResultKeeper, Re exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize ) { #ifdef HAVE_CUDA if( last <= first ) return; const Index segmentsCount = last - first; const Index threadsCount = segmentsCount * 32; const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 ); dim3 blockSize( 256 ); dim3 gridSize( blocksCount ); EllpackCudaReductionKernelCompact<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize ); cudaDeviceSynchronize(); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #endif } }; Loading src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp +2 −0 Original line number Diff line number Diff line Loading @@ -272,6 +272,8 @@ struct CSRAdaptiveKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reductio zero, args... ); } cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #endif } }; Loading src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp +7 −0 Original line number Diff line number Diff line Loading @@ -175,7 +175,10 @@ void CSRHybridKernel< Index, Device, ThreadsInBlock >:: init( const Offsets& offsets ) { TNL_ASSERT_GT( offsets.getSize(), 0, "offsets size must be strictly positive" ); const Index segmentsCount = offsets.getSize() - 1; if( segmentsCount <= 0 ) return; const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount ); this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ), ThreadsInBlock ); //TNL::Cuda::getWarpSize() ); TNL_ASSERT_GE( threadsPerSegment, 0, "" ); Loading Loading @@ -245,6 +248,8 @@ reduceSegments( const OffsetsView& offsets, TNL_ASSERT_LE( this->threadsPerSegment, ThreadsInBlock, "" ); #ifdef HAVE_CUDA if( last <= first ) return; const size_t threadsCount = this->threadsPerSegment * ( last - first ); dim3 blocksCount, gridsCount, blockSize( ThreadsInBlock ); TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); Loading Loading @@ -297,6 +302,8 @@ reduceSegments( const OffsetsView& offsets, throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) ); } } cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #endif } Loading Loading
src/TNL/Algorithms/Segments/BiEllpackView.hpp +2 −2 Original line number Diff line number Diff line Loading @@ -428,9 +428,9 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& detail::BiEllpackreduceSegmentsKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( *this, gridIdx, first, last, fetch, reduction, keeper, zero ); cudaThreadSynchronize(); TNL_CHECK_CUDA_DEVICE; } cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #endif } } Loading
src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp +2 −0 Original line number Diff line number Diff line Loading @@ -460,6 +460,8 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( *this, gridIdx, first, last, fetch, reduction, keeper, zero ); } cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #endif } } Loading
src/TNL/Algorithms/Segments/EllpackView.hpp +8 −2 Original line number Diff line number Diff line Loading @@ -105,13 +105,16 @@ struct EllpackCudaReductionDispatcher exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize ) { #ifdef HAVE_CUDA if( last <= first ) return; const Index segmentsCount = last - first; const Index threadsCount = segmentsCount * 32; const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 ); dim3 blockSize( 256 ); dim3 gridSize( blocksCount ); EllpackCudaReductionKernelFull<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize ); cudaDeviceSynchronize(); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #endif } }; Loading @@ -127,13 +130,16 @@ struct EllpackCudaReductionDispatcher< Index, Fetch, Reduction, ResultKeeper, Re exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize ) { #ifdef HAVE_CUDA if( last <= first ) return; const Index segmentsCount = last - first; const Index threadsCount = segmentsCount * 32; const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 ); dim3 blockSize( 256 ); dim3 gridSize( blocksCount ); EllpackCudaReductionKernelCompact<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize ); cudaDeviceSynchronize(); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #endif } }; Loading
src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp +2 −0 Original line number Diff line number Diff line Loading @@ -272,6 +272,8 @@ struct CSRAdaptiveKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reductio zero, args... ); } cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #endif } }; Loading
src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp +7 −0 Original line number Diff line number Diff line Loading @@ -175,7 +175,10 @@ void CSRHybridKernel< Index, Device, ThreadsInBlock >:: init( const Offsets& offsets ) { TNL_ASSERT_GT( offsets.getSize(), 0, "offsets size must be strictly positive" ); const Index segmentsCount = offsets.getSize() - 1; if( segmentsCount <= 0 ) return; const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount ); this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ), ThreadsInBlock ); //TNL::Cuda::getWarpSize() ); TNL_ASSERT_GE( threadsPerSegment, 0, "" ); Loading Loading @@ -245,6 +248,8 @@ reduceSegments( const OffsetsView& offsets, TNL_ASSERT_LE( this->threadsPerSegment, ThreadsInBlock, "" ); #ifdef HAVE_CUDA if( last <= first ) return; const size_t threadsCount = this->threadsPerSegment * ( last - first ); dim3 blocksCount, gridsCount, blockSize( ThreadsInBlock ); TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); Loading Loading @@ -297,6 +302,8 @@ reduceSegments( const OffsetsView& offsets, throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) ); } } cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; #endif } Loading