Reduction: renamed zero and idempotent to identity (0d329226) · Commits · TNL / tnl-dev

src/TNL/Algorithms/Multireduction.h

+12 −6

Original line number	Diff line number	Diff line
		@@ -29,7 +29,9 @@ struct Multireduction< Devices::Sequential >
		{
		/**
		* Parameters:
		* zero: starting value for reduction
		* identity: the [identity element](https://en.wikipedia.org/wiki/Identity_element)
		* for the reduction operation, i.e. element which does not
		* change the result of the reduction
		* dataFetcher: callable object such that `dataFetcher( i, j )` yields
		* the i-th value to be reduced from the j-th dataset
		* (i = 0,...,size-1; j = 0,...,n-1)
		@@ -45,7 +47,7 @@ struct Multireduction< Devices::Sequential >
		typename Reduction,
		typename Index >
		static constexpr void
		reduce( const Result zero,
		reduce( const Result identity,
		DataFetcher dataFetcher,
		const Reduction reduction,
		const Index size,
		@@ -58,7 +60,9 @@ struct Multireduction< Devices::Host >
		{
		/**
		* Parameters:
		* zero: starting value for reduction
		* identity: the [identity element](https://en.wikipedia.org/wiki/Identity_element)
		* for the reduction operation, i.e. element which does not
		* change the result of the reduction
		* dataFetcher: callable object such that `dataFetcher( i, j )` yields
		* the i-th value to be reduced from the j-th dataset
		* (i = 0,...,size-1; j = 0,...,n-1)
		@@ -74,7 +78,7 @@ struct Multireduction< Devices::Host >
		typename Reduction,
		typename Index >
		static void
		reduce( const Result zero,
		reduce( const Result identity,
		DataFetcher dataFetcher,
		const Reduction reduction,
		const Index size,
		@@ -87,7 +91,9 @@ struct Multireduction< Devices::Cuda >
		{
		/**
		* Parameters:
		* zero: starting value for reduction
		* identity: the [identity element](https://en.wikipedia.org/wiki/Identity_element)
		* for the reduction operation, i.e. element which does not
		* change the result of the reduction
		* dataFetcher: callable object such that `dataFetcher( i, j )` yields
		* the i-th value to be reduced from the j-th dataset
		* (i = 0,...,size-1; j = 0,...,n-1)
		@@ -103,7 +109,7 @@ struct Multireduction< Devices::Cuda >
		typename Reduction,
		typename Index >
		static void
		reduce( const Result zero,
		reduce( const Result identity,
		DataFetcher dataFetcher,
		const Reduction reduction,
		const Index size,

src/TNL/Algorithms/Multireduction.hpp

+10 −10

Original line number	Diff line number	Diff line
		@@ -35,7 +35,7 @@ template< typename Result,
		typename Index >
		void constexpr
		Multireduction< Devices::Sequential >::
		reduce( const Result zero,
		reduce( const Result identity,
		DataFetcher dataFetcher,
		const Reduction reduction,
		const Index size,
		@@ -53,7 +53,7 @@ reduce( const Result zero,
		// (it is accessed as a row-major matrix with n rows and 4 columns)
		Result r[ n * 4 ];
		for( int k = 0; k < n * 4; k++ )
		r[ k ] = zero;
		r[ k ] = identity;

		// main reduction (explicitly unrolled loop)
		for( int b = 0; b < blocks; b++ ) {
		@@ -89,7 +89,7 @@ reduce( const Result zero,
		}
		else {
		for( int k = 0; k < n; k++ )
		result[ k ] = zero;
		result[ k ] = identity;

		for( int b = 0; b < blocks; b++ ) {
		const Index offset = b * block_size;
		@@ -112,7 +112,7 @@ template< typename Result,
		typename Index >
		void
		Multireduction< Devices::Host >::
		reduce( const Result zero,
		reduce( const Result identity,
		DataFetcher dataFetcher,
		const Reduction reduction,
		const Index size,
		@@ -134,14 +134,14 @@ reduce( const Result zero,
		#pragma omp single nowait
		{
		for( int k = 0; k < n; k++ )
		result[ k ] = zero;
		result[ k ] = identity;
		}

		// initialize array for thread-local results
		// (it is accessed as a row-major matrix with n rows and 4 columns)
		Result r[ n * 4 ];
		for( int k = 0; k < n * 4; k++ )
		r[ k ] = zero;
		r[ k ] = identity;

		#pragma omp for nowait
		for( int b = 0; b < blocks; b++ ) {
		@@ -185,7 +185,7 @@ reduce( const Result zero,
		}
		else
		#endif
		Multireduction< Devices::Sequential >::reduce( zero, dataFetcher, reduction, size, n, result );
		Multireduction< Devices::Sequential >::reduce( identity, dataFetcher, reduction, size, n, result );
		}

		template< typename Result,
		@@ -194,7 +194,7 @@ template< typename Result,
		typename Index >
		void
		Multireduction< Devices::Cuda >::
		reduce( const Result zero,
		reduce( const Result identity,
		DataFetcher dataFetcher,
		const Reduction reduction,
		const Index size,
		@@ -212,7 +212,7 @@ reduce( const Result zero,

		// start the reduction on the GPU
		Result* deviceAux1 = nullptr;
		const int reducedSize = detail::CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, size, n, deviceAux1 );
		const int reducedSize = detail::CudaMultireductionKernelLauncher( identity, dataFetcher, reduction, size, n, deviceAux1 );

		#ifdef CUDA_REDUCTION_PROFILING
		timer.stop();
		@@ -234,7 +234,7 @@ reduce( const Result zero,

		// finish the reduction on the host
		auto dataFetcherFinish = [&] ( int i, int k ) { return resultArray[ i + k * reducedSize ]; };
		Multireduction< Devices::Sequential >::reduce( zero, dataFetcherFinish, reduction, reducedSize, n, hostResult );
		Multireduction< Devices::Sequential >::reduce( identity, dataFetcherFinish, reduction, reducedSize, n, hostResult );

		#ifdef CUDA_REDUCTION_PROFILING
		timer.stop();

src/TNL/Algorithms/SegmentedScan.h

+12 −9

Original line number	Diff line number	Diff line
		@@ -81,8 +81,9 @@ struct SegmentedScan< Devices::Sequential, Type >
		* \param begin the first element in the array to be scanned
		* \param end the last element in the array to be scanned
		* \param reduction lambda function implementing the reduction operation
		* \param zero is the idempotent element for the reduction operation, i.e. element which
		* does not change the result of the reduction.
		* \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
		* for the reduction operation, i.e. element which does not
		* change the result of the reduction.
		*
		* The reduction lambda function takes two variables which are supposed to be reduced:
		*
		@@ -107,7 +108,7 @@ struct SegmentedScan< Devices::Sequential, Type >
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::ValueType zero );
		const typename Vector::ValueType identity );
		};

		template< detail::ScanType Type >
		@@ -125,8 +126,9 @@ struct SegmentedScan< Devices::Host, Type >
		* \param begin the first element in the array to be scanned
		* \param end the last element in the array to be scanned
		* \param reduction lambda function implementing the reduction operation
		* \param zero is the idempotent element for the reduction operation, i.e. element which
		* does not change the result of the reduction.
		* \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
		* for the reduction operation, i.e. element which does not
		* change the result of the reduction.
		*
		* The reduction lambda function takes two variables which are supposed to be reduced:
		*
		@@ -151,7 +153,7 @@ struct SegmentedScan< Devices::Host, Type >
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::ValueType zero );
		const typename Vector::ValueType identity );
		};

		template< detail::ScanType Type >
		@@ -169,8 +171,9 @@ struct SegmentedScan< Devices::Cuda, Type >
		* \param begin the first element in the array to be scanned
		* \param end the last element in the array to be scanned
		* \param reduction lambda function implementing the reduction operation
		* \param zero is the idempotent element for the reduction operation, i.e. element which
		* does not change the result of the reduction.
		* \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
		* for the reduction operation, i.e. element which does not
		* change the result of the reduction.
		*
		* The reduction lambda function takes two variables which are supposed to be reduced:
		*
		@@ -197,7 +200,7 @@ struct SegmentedScan< Devices::Cuda, Type >
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::ValueType zero );
		const typename Vector::ValueType identity );
		};

		} // namespace Algorithms

src/TNL/Algorithms/SegmentedScan.hpp

+7 −7

Original line number	Diff line number	Diff line
		@@ -30,7 +30,7 @@ perform( Vector& v,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::ValueType zero )
		const typename Vector::ValueType identity )
		{
		using ValueType = typename Vector::ValueType;
		using IndexType = typename Vector::IndexType;
		@@ -44,12 +44,12 @@ perform( Vector& v,
		else // Exclusive scan
		{
		ValueType aux( v[ begin ] );
		v[ begin ] = zero;
		v[ begin ] = identity;
		for( IndexType i = begin + 1; i < end; i++ )
		{
		ValueType x = v[ i ];
		if( flags[ i ] )
		aux = zero;
		aux = identity;
		v[ i ] = aux;
		aux = reduction( aux, x );
		}
		@@ -67,13 +67,13 @@ perform( Vector& v,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::ValueType zero )
		const typename Vector::ValueType identity )
		{
		#ifdef HAVE_OPENMP
		// TODO: parallelize with OpenMP
		SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, zero );
		SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, identity );
		#else
		SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, zero );
		SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, identity );
		#endif
		}

		@@ -88,7 +88,7 @@ perform( Vector& v,
		const typename Vector::IndexType begin,
		const typename Vector::IndexType end,
		const Reduction& reduction,
		const typename Vector::ValueType zero )
		const typename Vector::ValueType identity )
		{
		#ifdef HAVE_CUDA
		using ValueType = typename Vector::ValueType;

src/TNL/Algorithms/detail/CudaMultireductionKernel.h

+12 −12

Original line number	Diff line number	Diff line
		@@ -47,7 +47,7 @@ template< int blockSizeX,
		typename Index >
		__global__ void
		__launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor )
		CudaMultireductionKernel( const Result zero,
		CudaMultireductionKernel( const Result identity,
		DataFetcher dataFetcher,
		const Reduction reduction,
		const Index size,
		@@ -65,7 +65,7 @@ CudaMultireductionKernel( const Result zero,
		const int y = blockIdx.y * blockDim.y + threadIdx.y;
		if( y >= n ) return;

		sdata[ tid ] = zero;
		sdata[ tid ] = identity;

		// Start with the sequential reduction and push the result into the shared memory.
		while( gid + 4 * gridSizeX < size ) {
		@@ -145,7 +145,7 @@ template< typename Result,
		typename Reduction,
		typename Index >
		int
		CudaMultireductionKernelLauncher( const Result zero,
		CudaMultireductionKernelLauncher( const Result identity,
		DataFetcher dataFetcher,
		const Reduction reduction,
		const Index size,
		@@ -217,55 +217,55 @@ CudaMultireductionKernelLauncher( const Result zero,
		{
		case 512:
		CudaMultireductionKernel< 512 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
		<<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
		break;
		case 256:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< 256 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
		<<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
		break;
		case 128:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< 128 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
		<<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
		break;
		case 64:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< 64 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
		<<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
		break;
		case 32:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< 32 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
		<<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
		break;
		case 16:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< 16 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
		<<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
		break;
		case 8:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< 8 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
		<<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
		break;
		case 4:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< 4 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
		<<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
		break;
		case 2:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< 2 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
		<<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
		break;
		case 1:
		throw std::logic_error( "blockSize should not be 1." );