Changed smart lambdas in segments, implemented fast segments reduction in ChunkedEllpack. (e93c933e) · Commits · TNL / tnl-dev

src/TNL/Containers/Segments/ChunkedEllpackView.h

+18 −0

Original line number	Diff line number	Diff line
		@@ -148,6 +148,21 @@ class ChunkedEllpackView
		protected:

		#ifdef HAVE_CUDA
		template< typename Fetch,
		typename Reduction,
		typename ResultKeeper,
		typename Real,
		typename... Args >
		__device__
		void segmentsReductionKernelWithAllParameters( IndexType gridIdx,
		IndexType first,
		IndexType last,
		Fetch fetch,
		Reduction reduction,
		ResultKeeper keeper,
		Real zero,
		Args... args ) const;

		template< typename Fetch,
		typename Reduction,
		typename ResultKeeper,
		@@ -206,6 +221,9 @@ class ChunkedEllpackView
		ResultKeeper_ keeper,
		Real_ zero,
		Args_... args );

		template< typename Index_, typename Fetch_, bool B_ >
		friend struct details::ChunkedEllpackSegmentsReductionDispatcher;
		#endif
		};
		} // namespace Segements

src/TNL/Containers/Segments/ChunkedEllpackView.hpp

+130 −92

Original line number	Diff line number	Diff line
		@@ -20,30 +20,6 @@ namespace TNL {
		namespace Containers {
		namespace Segments {

		#ifdef HAVE_CUDA
		template< typename View,
		typename Index,
		typename Fetch,
		typename Reduction,
		typename ResultKeeper,
		typename Real,
		typename... Args >
		__global__
		void ChunkedEllpackSegmentsReductionKernel( View chunkedEllpack,
		Index gridIdx,
		Index first,
		Index last,
		Fetch fetch,
		Reduction reduction,
		ResultKeeper keeper,
		Real zero,
		Args... args )
		{
		chunkedEllpack.segmentsReductionKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
		}
		#endif


		template< typename Device,
		typename Index,
		bool RowMajorOrder >
		@@ -460,9 +436,9 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
		{
		if( gridIdx == cudaGrids - 1 )
		cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
		//ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... >
		// <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
		// ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
		details::ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... >
		<<< cudaGridSize, cudaBlockSize, sharedMemory >>>
		( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
		}
		#endif
		}
		@@ -571,7 +547,7 @@ template< typename Device,
		__device__
		void
		ChunkedEllpackView< Device, Index, RowMajorOrder >::
		segmentsReductionKernel( IndexType gridIdx,
		segmentsReductionKernelWithAllParameters( IndexType gridIdx,
		IndexType first,
		IndexType last,
		Fetch fetch,
		@@ -582,17 +558,14 @@ segmentsReductionKernel( IndexType gridIdx,
		{
		using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );

		const IndexType firstSlice = rowToChunkMapping[ first ] / chunksInSlice;
		const IndexType lastSlice = rowToChunkMapping[ last - 1 ] / chunksInSlice;
		//for( IndexType sliceIdx = firstSlice; sliceIdx < lastSlice; sliceIdx++ )
		{
		const IndexType sliceIdx = gridIdx * Cuda::getMaxGridSize() + blockIdx.x;
		if( sliceIdx >= lastSlice )
		const IndexType firstSlice = rowToSliceMapping[ first ];
		const IndexType lastSlice = rowToSliceMapping[ last - 1 ];

		const IndexType sliceIdx = firstSlice + gridIdx * Cuda::getMaxGridSize() + blockIdx.x;
		if( sliceIdx > lastSlice )
		return;

		RealType* chunksResults = Cuda::getSharedMemory< RealType >();
		//for( IndexType threadIdx = 0; threadIdx < 256; threadIdx++ )
		//{
		__shared__ details::ChunkedEllpackSliceInfo< IndexType > sliceInfo;
		if( threadIdx.x == 0 )
		sliceInfo = this->slices[ sliceIdx ];
		@@ -616,23 +589,89 @@ segmentsReductionKernel( IndexType gridIdx,
		IndexType begin = sliceOffset + threadIdx.x * chunkSize; // threadIdx.x = chunkIdx within the slice
		IndexType end = begin + chunkSize;
		for( IndexType j = begin; j < end && compute; j++ )
		reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute, args...) );
		reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute ) );
		}
		else
		{
		const IndexType begin = sliceOffset + threadIdx.x; // threadIdx.x = chunkIdx within the slice
		const IndexType end = begin + chunksInSlice * chunkSize;
		for( IndexType j = begin; j < end && compute; j += chunksInSlice )
		reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute ) );
		}
		__syncthreads();
		if( threadIdx.x < sliceInfo.size )
		{
		const IndexType row = sliceInfo.firstSegment + threadIdx.x;
		IndexType chunkIndex( 0 );
		if( threadIdx.x != 0 )
		chunkIndex = this->rowToChunkMapping[ row - 1 ];
		const IndexType lastChunk = this->rowToChunkMapping[ row ];
		RealType result( zero );
		while( chunkIndex < lastChunk )
		reduction( result, chunksResults[ chunkIndex++ ] );
		if( row >= first && row < last )
		keeper( row, result );
		}
		}

		template< typename Device,
		typename Index,
		bool RowMajorOrder >
		template< typename Fetch,
		typename Reduction,
		typename ResultKeeper,
		typename Real,
		typename... Args >
		__device__
		void
		ChunkedEllpackView< Device, Index, RowMajorOrder >::
		segmentsReductionKernel( IndexType gridIdx,
		IndexType first,
		IndexType last,
		Fetch fetch,
		Reduction reduction,
		ResultKeeper keeper,
		Real zero,
		Args... args ) const
		{
		using RealType = decltype( fetch( IndexType(), std::declval< bool& >(), args... ) );

		const IndexType firstSlice = rowToSliceMapping[ first ];
		const IndexType lastSlice = rowToSliceMapping[ last - 1 ];

		const IndexType sliceIdx = firstSlice + gridIdx * Cuda::getMaxGridSize() + blockIdx.x;
		if( sliceIdx > lastSlice )
		return;

		RealType* chunksResults = Cuda::getSharedMemory< RealType >();
		__shared__ details::ChunkedEllpackSliceInfo< IndexType > sliceInfo;

		if( threadIdx.x == 0 )
		sliceInfo = this->slices[ sliceIdx ];
		chunksResults[ threadIdx.x ] = zero;
		__syncthreads();

		const IndexType sliceOffset = sliceInfo.pointer;
		const IndexType chunkSize = sliceInfo.chunkSize;
		const IndexType chunkIdx = sliceIdx * chunksInSlice + threadIdx.x;
		bool compute( true );

		if( RowMajorOrder )
		{
		IndexType begin = sliceOffset + threadIdx.x * chunkSize; // threadIdx.x = chunkIdx within the slice
		IndexType end = begin + chunkSize;
		for( IndexType j = begin; j < end && compute; j++ )
		reduction( chunksResults[ threadIdx.x ], fetch( j, compute ) );
		}
		else
		{
		const IndexType begin = sliceOffset + threadIdx.x; // threadIdx.x = chunkIdx within the slice
		const IndexType end = begin + chunksInSlice * chunkSize;
		for( IndexType j = begin; j < end && compute; j += chunksInSlice )
		reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute, args...) );
		reduction( chunksResults[ threadIdx.x ], fetch( j, compute ) );
		}
		__syncthreads();
		//}

		//details::ChunkedEllpackSliceInfo< IndexType > sliceInfo;
		//for( IndexType threadIdx = 0; threadIdx < 256; threadIdx++ )
		//{
		//if( threadIdx == 0 )
		// sliceInfo = this->slices[ sliceIdx ];
		if( threadIdx.x < sliceInfo.size )
		{
		const IndexType row = sliceInfo.firstSegment + threadIdx.x;
		@@ -643,10 +682,9 @@ segmentsReductionKernel( IndexType gridIdx,
		RealType result( zero );
		while( chunkIndex < lastChunk )
		reduction( result, chunksResults[ chunkIndex++ ] );
		if( row >= first && row < last )
		keeper( row, result );
		}
		//} // threadIdx
		} // sliceIdx
		}
		#endif

src/TNL/Containers/Segments/details/CheckLambdas.h

+7 −48

Original line number	Diff line number	Diff line
		@@ -18,7 +18,7 @@ namespace TNL {

		template< typename Index,
		typename Lambda >
		class CheckFetchLambdaAcceptsSegmentIdxAndCompute
		class CheckFetchLambda
		{
		private:
		typedef char YesType[1];
		@@ -27,52 +27,11 @@ class CheckFetchLambdaAcceptsSegmentIdxAndCompute
		template< typename C > static YesType& test( decltype(std::declval< C >()( Index(), Index(), Index(), std::declval< bool& >() ) ) );
		template< typename C > static NoType& test(...);

		public:
		static constexpr bool value = ( sizeof( test< Lambda >(0) ) == sizeof( YesType ) );
		};

		template< typename Index,
		typename Lambda >
		class CheckFetchLambdaAcceptsSegmentIdx
		{
		private:
		typedef char YesType[1];
		typedef char NoType[2];

		template< typename C > static YesType& test( decltype(std::declval< C >()( Index(), Index(), Index() ) ) );
		template< typename C > static NoType& test(...);

		public:
		static constexpr bool value = ( sizeof( test< Lambda >(0) ) == sizeof( YesType ) );
		};

		template< typename Index,
		typename Lambda >
		class CheckFetchLambdaAcceptsCompute
		{
		private:
		typedef char YesType[1];
		typedef char NoType[2];

		template< typename C > static YesType& test( decltype(std::declval< C >()( Index(), Index(), std::declval< bool& >() ) ) );
		template< typename C > static NoType& test(...);

		public:
		static constexpr bool value = ( sizeof( test< Lambda >(0) ) == sizeof( YesType ) );
		};


		template< typename Index,
		typename Lambda >
		class CheckFetchLambda
		{
		static constexpr bool AcceptsSegmentIdxAndCompute = CheckFetchLambdaAcceptsSegmentIdxAndCompute< Index, Lambda >::value;
		static constexpr bool AcceptsSegmentIdx = CheckFetchLambdaAcceptsSegmentIdx< Index, Lambda >::value;
		static constexpr bool AcceptsCompute = CheckFetchLambdaAcceptsCompute< Index, Lambda >::value;

		public:
		static constexpr bool acceptsSegmentIdx() { return AcceptsSegmentIdxAndCompute \|\| AcceptsSegmentIdx; };
		static constexpr bool acceptsCompute() { return AcceptsSegmentIdxAndCompute \|\| AcceptsCompute; };
		static constexpr bool hasAllParameters() { return value; };
		};

		} // namespace details

src/TNL/Containers/Segments/details/ChunkedEllpack.h

+76 −0

Original line number	Diff line number	Diff line
		@@ -13,6 +13,7 @@
		#include <type_traits>
		#include <TNL/Containers/Vector.h>
		#include <TNL/Containers/Segments/ChunkedEllpackSegmentView.h>
		#include <TNL/Containers/Segments/details/CheckLambdas.h>

		namespace TNL {
		namespace Containers {
		@@ -223,6 +224,81 @@ class ChunkedEllpack
		chunksInSlice );
		}
		};

		#ifdef HAVE_CUDA
		template< typename Index,
		typename Fetch,
		bool HasAllParameters = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
		struct ChunkedEllpackSegmentsReductionDispatcher{};

		template< typename Index, typename Fetch >
		struct ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch, true >
		{
		template< typename View,
		typename Reduction,
		typename ResultKeeper,
		typename Real,
		typename... Args >
		__device__
		static void exec( View chunkedEllpack,
		Index gridIdx,
		Index first,
		Index last,
		Fetch fetch,
		Reduction reduction,
		ResultKeeper keeper,
		Real zero,
		Args... args )
		{
		chunkedEllpack.segmentsReductionKernelWithAllParameters( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
		}
		};

		template< typename Index, typename Fetch >
		struct ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch, false >
		{
		template< typename View,
		typename Reduction,
		typename ResultKeeper,
		typename Real,
		typename... Args >
		__device__
		static void exec( View chunkedEllpack,
		Index gridIdx,
		Index first,
		Index last,
		Fetch fetch,
		Reduction reduction,
		ResultKeeper keeper,
		Real zero,
		Args... args )
		{
		chunkedEllpack.segmentsReductionKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... );
		}
		};

		template< typename View,
		typename Index,
		typename Fetch,
		typename Reduction,
		typename ResultKeeper,
		typename Real,
		typename... Args >
		__global__
		void ChunkedEllpackSegmentsReductionKernel( View chunkedEllpack,
		Index gridIdx,
		Index first,
		Index last,
		Fetch fetch,
		Reduction reduction,
		ResultKeeper keeper,
		Real zero,
		Args... args )
		{
		ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch >::exec( chunkedEllpack, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
		}
		#endif

		} //namespace details
		} //namespace Segments
		} //namespace Containers

src/TNL/Containers/Segments/details/LambdaAdapter.h

+3 −3

Original line number	Diff line number	Diff line
		@@ -21,7 +21,7 @@ namespace TNL {

		template< typename Index,
		typename Lambda,
		bool AcceptsSegmentIdx = CheckFetchLambda< Index, Lambda >::acceptsSegmentIdx() >
		bool AllParameters = CheckFetchLambda< Index, Lambda >::hasAllParameters() >
		struct FetchLambdaAdapter
		{
		};
		@@ -42,10 +42,10 @@ template< typename Index,
		typename Lambda >
		struct FetchLambdaAdapter< Index, Lambda, false >
		{
		using ReturnType = decltype( std::declval< Lambda >()( Index(), Index(), std::declval< bool& >() ) );
		using ReturnType = decltype( std::declval< Lambda >()( Index(), std::declval< bool& >() ) );
		static ReturnType call( Lambda& f, Index segmentIdx, Index localIdx, Index globalIdx, bool& compute )
		{
		return f( localIdx, globalIdx, compute );
		return f( globalIdx, compute );
		}
		};