Merge branch 'JK/segments-fix' into 'develop' (a0c48268) · Commits · TNL / tnl-dev

src/TNL/Algorithms/Segments/BiEllpackView.hpp

+2 −2

Original line number	Diff line number	Diff line
		@@ -428,9 +428,9 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction&
		detail::BiEllpackreduceSegmentsKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim >
		<<< cudaGridSize, cudaBlockSize, sharedMemory >>>
		( *this, gridIdx, first, last, fetch, reduction, keeper, zero );
		cudaThreadSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}
		cudaStreamSynchronize(0);
		TNL_CHECK_CUDA_DEVICE;
		#endif
		}
		}

+2 −0

Original line number	Diff line number	Diff line
		@@ -460,6 +460,8 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction&
		<<< cudaGridSize, cudaBlockSize, sharedMemory >>>
		( *this, gridIdx, first, last, fetch, reduction, keeper, zero );
		}
		cudaStreamSynchronize(0);
		TNL_CHECK_CUDA_DEVICE;
		#endif
		}
		}

+8 −2

Original line number	Diff line number	Diff line
		@@ -105,13 +105,16 @@ struct EllpackCudaReductionDispatcher
		exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize )
		{
		#ifdef HAVE_CUDA
		if( last <= first )
		return;
		const Index segmentsCount = last - first;
		const Index threadsCount = segmentsCount * 32;
		const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 );
		dim3 blockSize( 256 );
		dim3 gridSize( blocksCount );
		EllpackCudaReductionKernelFull<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize );
		cudaDeviceSynchronize();
		cudaStreamSynchronize(0);
		TNL_CHECK_CUDA_DEVICE;
		#endif
		}
		};
		@@ -127,13 +130,16 @@ struct EllpackCudaReductionDispatcher< Index, Fetch, Reduction, ResultKeeper, Re
		exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize )
		{
		#ifdef HAVE_CUDA
		if( last <= first )
		return;
		const Index segmentsCount = last - first;
		const Index threadsCount = segmentsCount * 32;
		const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 );
		dim3 blockSize( 256 );
		dim3 gridSize( blocksCount );
		EllpackCudaReductionKernelCompact<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize );
		cudaDeviceSynchronize();
		cudaStreamSynchronize(0);
		TNL_CHECK_CUDA_DEVICE;
		#endif
		}
		};

+2 −0

Original line number	Diff line number	Diff line
		@@ -272,6 +272,8 @@ struct CSRAdaptiveKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reductio
		zero,
		args... );
		}
		cudaStreamSynchronize(0);
		TNL_CHECK_CUDA_DEVICE;
		#endif
		}
		};

+7 −0

Original line number	Diff line number	Diff line
		@@ -175,7 +175,10 @@ void
		CSRHybridKernel< Index, Device, ThreadsInBlock >::
		init( const Offsets& offsets )
		{
		TNL_ASSERT_GT( offsets.getSize(), 0, "offsets size must be strictly positive" );
		const Index segmentsCount = offsets.getSize() - 1;
		if( segmentsCount <= 0 )
		return;
		const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount );
		this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ), ThreadsInBlock ); //TNL::Cuda::getWarpSize() );
		TNL_ASSERT_GE( threadsPerSegment, 0, "" );
		@@ -245,6 +248,8 @@ reduceSegments( const OffsetsView& offsets,
		TNL_ASSERT_LE( this->threadsPerSegment, ThreadsInBlock, "" );

		#ifdef HAVE_CUDA
		if( last <= first )
		return;
		const size_t threadsCount = this->threadsPerSegment * ( last - first );
		dim3 blocksCount, gridsCount, blockSize( ThreadsInBlock );
		TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount );
		@@ -297,6 +302,8 @@ reduceSegments( const OffsetsView& offsets,
		throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) );
		}
		}
		cudaStreamSynchronize(0);
		TNL_CHECK_CUDA_DEVICE;
		#endif
		}