Swaping parameters of tuple returned by reductionWithArgument. It makes better... (58fc7ce1) · Commits · TNL / tnl-dev

Documentation/Tutorials/ReductionAndScan/ReductionWithArgument.cpp

+4 −4

Original line number	Diff line number	Diff line
		@@ -8,13 +8,13 @@ using namespace TNL::Containers;
		using namespace TNL::Algorithms;

		template< typename Device >
		std::pair< int, double >
		std::pair< double, int >
		maximumNorm( const Vector< double, Device >& v )
		{
		auto view = v.getConstView();

		auto fetch = [=] __cuda_callable__ ( int i ) { return abs( view[ i ] ); };
		auto reduction = [] __cuda_callable__ ( int& aIdx, const int& bIdx, double& a, const double& b ) {
		auto reduction = [] __cuda_callable__ ( double& a, const double& b, int& aIdx, const int& bIdx ) {
		if( a < b ) {
		a = b;
		aIdx = bIdx;
		@@ -31,13 +31,13 @@ int main( int argc, char* argv[] )
		host_v.evaluate( [] __cuda_callable__ ( int i )->double { return i - 7; } );
		std::cout << "host_v = " << host_v << std::endl;
		auto maxNormHost = maximumNorm( host_v );
		std::cout << "The maximum norm of the host vector elements is " << maxNormHost.second << " at position " << maxNormHost.first << "." << std::endl;
		std::cout << "The maximum norm of the host vector elements is " << maxNormHost.first << " at position " << maxNormHost.second << "." << std::endl;
		#ifdef HAVE_CUDA
		Vector< double, Devices::Cuda > cuda_v( 10 );
		cuda_v.evaluate( [] __cuda_callable__ ( int i )->double { return i - 7; } );
		std::cout << "cuda_v = " << cuda_v << std::endl;
		auto maxNormCuda = maximumNorm( cuda_v );
		std::cout << "The maximum norm of the device vector elements is " << maxNormCuda.second << " at position " << maxNormCuda.first << "." << std::endl;
		std::cout << "The maximum norm of the device vector elements is " << maxNormCuda.first << " at position " << maxNormCuda.second << "." << std::endl;
		#endif
		return EXIT_SUCCESS;
		}

src/TNL/Algorithms/CudaReductionKernel.h

+28 −28

Original line number	Diff line number	Diff line
		@@ -166,19 +166,19 @@ CudaReductionWithArgumentKernel( const Result zero,
		sdata[ tid ] = zero;
		}
		while( gid + 4 * gridSize < size ) {
		reduction( sidx[ tid ], idxInput[ gid ], sdata[ tid ], dataFetcher( gid ) );
		reduction( sidx[ tid ], idxInput[ gid + gridSize ], sdata[ tid ], dataFetcher( gid + gridSize ) );
		reduction( sidx[ tid ], idxInput[ gid + 2 * gridSize ], sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
		reduction( sidx[ tid ], idxInput[ gid + 3 * gridSize ], sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
		reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], idxInput[ gid ] );
		reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], idxInput[ gid + gridSize ] );
		reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ), sidx[ tid ], idxInput[ gid + 2 * gridSize ] );
		reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ), sidx[ tid ], idxInput[ gid + 3 * gridSize ] );
		gid += 4 * gridSize;
		}
		while( gid + 2 * gridSize < size ) {
		reduction( sidx[ tid ], idxInput[ gid ], sdata[ tid ], dataFetcher( gid ) );
		reduction( sidx[ tid ], idxInput[ gid + gridSize ], sdata[ tid ], dataFetcher( gid + gridSize ) );
		reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], idxInput[ gid ] );
		reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], idxInput[ gid + gridSize ] );
		gid += 2 * gridSize;
		}
		while( gid < size ) {
		reduction( sidx[ tid ], idxInput[ gid ], sdata[ tid ], dataFetcher( gid ) );
		reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], idxInput[ gid ] );
		gid += gridSize;
		}
		}
		@@ -191,19 +191,19 @@ CudaReductionWithArgumentKernel( const Result zero,
		sdata[ tid ] = zero;
		}
		while( gid + 4 * gridSize < size ) {
		reduction( sidx[ tid ], gid, sdata[ tid ], dataFetcher( gid ) );
		reduction( sidx[ tid ], gid + gridSize, sdata[ tid ], dataFetcher( gid + gridSize ) );
		reduction( sidx[ tid ], gid + 2 * gridSize, sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
		reduction( sidx[ tid ], gid + 3 * gridSize, sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
		reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], gid );
		reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], gid + gridSize );
		reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ), sidx[ tid ], gid + 2 * gridSize );
		reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ), sidx[ tid ], gid + 3 * gridSize );
		gid += 4 * gridSize;
		}
		while( gid + 2 * gridSize < size ) {
		reduction( sidx[ tid ], gid, sdata[ tid ], dataFetcher( gid ) );
		reduction( sidx[ tid ], gid + gridSize, sdata[ tid ], dataFetcher( gid + gridSize ) );
		reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], gid );
		reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], gid + gridSize );
		gid += 2 * gridSize;
		}
		while( gid < size ) {
		reduction( sidx[ tid ], gid, sdata[ tid ], dataFetcher( gid ) );
		reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], gid );
		gid += gridSize;
		}
		}
		@@ -212,48 +212,48 @@ CudaReductionWithArgumentKernel( const Result zero,
		// Perform the parallel reduction.
		if( blockSize >= 1024 ) {
		if( tid < 512 )
		reduction( sidx[ tid ], sidx[ tid + 512 ], sdata[ tid ], sdata[ tid + 512 ] );
		reduction( sdata[ tid ], sdata[ tid + 512 ], sidx[ tid ], sidx[ tid + 512 ] );
		__syncthreads();
		}
		if( blockSize >= 512 ) {
		if( tid < 256 )
		reduction( sidx[ tid ], sidx[ tid + 256 ], sdata[ tid ], sdata[ tid + 256 ] );
		reduction( sdata[ tid ], sdata[ tid + 256 ], sidx[ tid ], sidx[ tid + 256 ] );
		__syncthreads();
		}
		if( blockSize >= 256 ) {
		if( tid < 128 )
		reduction( sidx[ tid ], sidx[ tid + 128 ], sdata[ tid ], sdata[ tid + 128 ] );
		reduction( sdata[ tid ], sdata[ tid + 128 ], sidx[ tid ], sidx[ tid + 128 ] );
		__syncthreads();
		}
		if( blockSize >= 128 ) {
		if( tid < 64 )
		reduction( sidx[ tid ], sidx[ tid + 64 ], sdata[ tid ], sdata[ tid + 64 ] );
		reduction( sdata[ tid ], sdata[ tid + 64 ], sidx[ tid ], sidx[ tid + 64 ] );
		__syncthreads();
		}

		// This runs in one warp so we use __syncwarp() instead of __syncthreads().
		if( tid < 32 ) {
		if( blockSize >= 64 )
		reduction( sidx[ tid ], sidx[ tid + 32 ], sdata[ tid ], sdata[ tid + 32 ] );
		reduction( sdata[ tid ], sdata[ tid + 32 ], sidx[ tid ], sidx[ tid + 32 ] );
		__syncwarp();
		// Note that here we do not have to check if tid < 16 etc, because we have
		// 2 * blockSize.x elements of shared memory per block, so we do not
		// access out of bounds. The results for the upper half will be undefined,
		// but unused anyway.
		if( blockSize >= 32 )
		reduction( sidx[ tid ], sidx[ tid + 16 ], sdata[ tid ], sdata[ tid + 16 ] );
		reduction( sdata[ tid ], sdata[ tid + 16 ], sidx[ tid ], sidx[ tid + 16 ] );
		__syncwarp();
		if( blockSize >= 16 )
		reduction( sidx[ tid ], sidx[ tid + 8 ], sdata[ tid ], sdata[ tid + 8 ] );
		reduction( sdata[ tid ], sdata[ tid + 8 ], sidx[ tid ], sidx[ tid + 8 ] );
		__syncwarp();
		if( blockSize >= 8 )
		reduction( sidx[ tid ], sidx[ tid + 4 ], sdata[ tid ], sdata[ tid + 4 ] );
		reduction( sdata[ tid ], sdata[ tid + 4 ], sidx[ tid ], sidx[ tid + 4 ] );
		__syncwarp();
		if( blockSize >= 4 )
		reduction( sidx[ tid ], sidx[ tid + 2 ], sdata[ tid ], sdata[ tid + 2 ] );
		reduction( sdata[ tid ], sdata[ tid + 2 ], sidx[ tid ], sidx[ tid + 2 ] );
		__syncwarp();
		if( blockSize >= 2 )
		reduction( sidx[ tid ], sidx[ tid + 1 ], sdata[ tid ], sdata[ tid + 1 ] );
		reduction( sdata[ tid ], sdata[ tid + 1 ], sidx[ tid ], sidx[ tid + 1 ] );
		}

		// Store the result back in the global memory.
		@@ -357,7 +357,7 @@ struct CudaReductionKernelLauncher
		}

		template< typename Reduction >
		std::pair< Index, Result >
		std::pair< Result, Index >
		finishWithArgument( const Reduction& reduction,
		const Result& zero )
		{
		@@ -384,9 +384,9 @@ struct CudaReductionKernelLauncher

		////
		// Copy result on CPU
		std::pair< Index, Result > result;
		MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result.first, idxOutput, 1 );
		MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result.second, output, 1 );
		std::pair< Result, Index > result;
		MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result.first, output, 1 );
		MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result.second, idxOutput, 1 );
		return result;
		}

src/TNL/Algorithms/Reduction.h

+5 −5

Original line number	Diff line number	Diff line
		@@ -54,7 +54,7 @@ struct Reduction< Devices::Sequential >
		typename Result,
		typename ReductionOperation,
		typename DataFetcher >
		static constexpr std::pair< Index, Result >
		static constexpr std::pair< Result, Index >
		reduceWithArgument( const Index size,
		const ReductionOperation& reduction,
		DataFetcher& dataFetcher,
		@@ -138,7 +138,7 @@ struct Reduction< Devices::Host >
		* The reduction lambda function takes two variables which are supposed to be reduced:
		*
		* ```
		* auto reduction = [] __cuda_callable__ ( Index& aIdx, const Index& bIdx, const Result& a, const Result& b ) { return ... };
		* auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };
		* ```
		*
		* \par Example
		@@ -153,7 +153,7 @@ struct Reduction< Devices::Host >
		typename Result,
		typename ReductionOperation,
		typename DataFetcher >
		static std::pair< Index, Result >
		static std::pair< Result, Index >
		reduceWithArgument( const Index size,
		const ReductionOperation& reduction,
		DataFetcher& dataFetcher,
		@@ -237,7 +237,7 @@ struct Reduction< Devices::Cuda >
		* The reduction lambda function takes two variables which are supposed to be reduced:
		*
		* ```
		* auto reduction = [] __cuda_callable__ ( Index& aIdx, const Index& bIdx, const Result& a, const Result& b ) { return ... };
		* auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };
		* ```
		*
		* \par Example
		@@ -252,7 +252,7 @@ struct Reduction< Devices::Cuda >
		typename Result,
		typename ReductionOperation,
		typename DataFetcher >
		static std::pair< Index, Result >
		static std::pair< Result, Index >
		reduceWithArgument( const Index size,
		const ReductionOperation& reduction,
		DataFetcher& dataFetcher,

src/TNL/Algorithms/Reduction.hpp

+28 −28

Original line number	Diff line number	Diff line
		@@ -86,7 +86,7 @@ template< typename Index,
		typename Result,
		typename ReductionOperation,
		typename DataFetcher >
		constexpr std::pair< Index, Result >
		constexpr std::pair< Result, Index >
		Reduction< Devices::Sequential >::
		reduceWithArgument( const Index size,
		const ReductionOperation& reduction,
		@@ -119,27 +119,27 @@ reduceWithArgument( const Index size,
		initialized = true;
		continue;
		}
		reduction( arg[ 0 ], offset + i, r[ 0 ], dataFetcher( offset + i ) );
		reduction( arg[ 1 ], offset + i + 1, r[ 1 ], dataFetcher( offset + i + 1 ) );
		reduction( arg[ 2 ], offset + i + 2, r[ 2 ], dataFetcher( offset + i + 2 ) );
		reduction( arg[ 3 ], offset + i + 3, r[ 3 ], dataFetcher( offset + i + 3 ) );
		reduction( r[ 0 ], dataFetcher( offset + i ), arg[ 0 ], offset + i );
		reduction( r[ 1 ], dataFetcher( offset + i + 1 ), arg[ 1 ], offset + i + 1 );
		reduction( r[ 2 ], dataFetcher( offset + i + 2 ), arg[ 2 ], offset + i + 2 );
		reduction( r[ 3 ], dataFetcher( offset + i + 3 ), arg[ 3 ], offset + i + 3 );
		}
		}

		// reduction of the last, incomplete block (not unrolled)
		for( Index i = blocks * block_size; i < size; i++ )
		reduction( arg[ 0 ], i, r[ 0 ], dataFetcher( i ) );
		reduction( r[ 0 ], dataFetcher( i ), arg[ 0 ], i );

		// reduction of unrolled results
		reduction( arg[ 0 ], arg[ 2 ], r[ 0 ], r[ 2 ] );
		reduction( arg[ 1 ], arg[ 3 ], r[ 1 ], r[ 3 ] );
		reduction( arg[ 0 ], arg[ 1 ], r[ 0 ], r[ 1 ] );
		return std::make_pair( arg[ 0 ], r[ 0 ] );
		reduction( r[ 0 ], r[ 2 ], arg[ 0 ], arg[ 2 ] );
		reduction( r[ 1 ], r[ 3 ], arg[ 1 ], arg[ 3 ] );
		reduction( r[ 0 ], r[ 1 ], arg[ 0 ], arg[ 1 ] );
		return std::make_pair( r[ 0 ], arg[ 0 ] );
		}
		else {
		std::pair< Index, Result > result( 0, dataFetcher( 0 ) );
		std::pair< Result, Index > result( dataFetcher( 0 ), 0 );
		for( Index i = 1; i < size; i++ )
		reduction( result.first, i, result.second, dataFetcher( i ) );
		reduction( result.first, dataFetcher( i ), result.second, i );
		return result;
		}
		}
		@@ -208,7 +208,7 @@ template< typename Index,
		typename Result,
		typename ReductionOperation,
		typename DataFetcher >
		std::pair< Index, Result >
		std::pair< Result, Index >
		Reduction< Devices::Host >::
		reduceWithArgument( const Index size,
		const ReductionOperation& reduction,
		@@ -221,7 +221,7 @@ reduceWithArgument( const Index size,

		if( Devices::Host::isOMPEnabled() && blocks >= 2 ) {
		// global result variable
		std::pair< Index, Result > result( -1, zero );
		std::pair< Result, Index > result( zero, -1 );
		const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() );
		#pragma omp parallel num_threads(threads)
		{
		@@ -246,10 +246,10 @@ reduceWithArgument( const Index size,
		initialized = true;
		continue;
		}
		reduction( arg[ 0 ], offset + i, r[ 0 ], dataFetcher( offset + i ) );
		reduction( arg[ 1 ], offset + i + 1, r[ 1 ], dataFetcher( offset + i + 1 ) );
		reduction( arg[ 2 ], offset + i + 2, r[ 2 ], dataFetcher( offset + i + 2 ) );
		reduction( arg[ 3 ], offset + i + 3, r[ 3 ], dataFetcher( offset + i + 3 ) );
		reduction( r[ 0 ], dataFetcher( offset + i ), arg[ 0 ], offset + i );
		reduction( r[ 1 ], dataFetcher( offset + i + 1 ), arg[ 1 ], offset + i + 1 );
		reduction( r[ 2 ], dataFetcher( offset + i + 2 ), arg[ 2 ], offset + i + 2 );
		reduction( r[ 3 ], dataFetcher( offset + i + 3 ), arg[ 3 ], offset + i + 3 );
		}
		}

		@@ -257,20 +257,20 @@ reduceWithArgument( const Index size,
		#pragma omp single nowait
		{
		for( Index i = blocks * block_size; i < size; i++ )
		reduction( arg[ 0 ], i, r[ 0 ], dataFetcher( i ) );
		reduction( r[ 0 ], dataFetcher( i ), arg[ 0 ], i );
		}

		// local reduction of unrolled results
		reduction( arg[ 0 ], arg[ 2 ], r[ 0 ], r[ 2 ] );
		reduction( arg[ 1 ], arg[ 3 ], r[ 1 ], r[ 3 ] );
		reduction( arg[ 0 ], arg[ 1 ], r[ 0 ], r[ 1 ] );
		reduction( r[ 0 ], r[ 2 ], arg[ 0 ], arg[ 2 ] );
		reduction( r[ 1 ], r[ 3 ], arg[ 1 ], arg[ 3 ] );
		reduction( r[ 0 ], r[ 1 ], arg[ 0 ], arg[ 1 ] );

		// inter-thread reduction of local results
		#pragma omp critical
		{
		if( result.first == -1 )
		result.first = arg[ 0 ];
		reduction( result.first, arg[ 0 ], result.second, r[ 0 ] );
		if( result.second == -1 )
		result.second = arg[ 0 ];
		reduction( result.first, r[ 0 ], result.second, arg[ 0 ] );
		}
		}
		return result;
		@@ -373,7 +373,7 @@ template< typename Index,
		typename Result,
		typename ReductionOperation,
		typename DataFetcher >
		std::pair< Index, Result >
		std::pair< Result, Index >
		Reduction< Devices::Cuda >::
		reduceWithArgument( const Index size,
		const ReductionOperation& reduction,
		@@ -454,13 +454,13 @@ reduceWithArgument( const Index size,
		// auto fetch = [&] ( Index i ) { return resultArray[ i ]; };
		// const Result result = Reduction< Devices::Sequential >::reduceWithArgument( reducedSize, argument, reduction, fetch, zero );
		for( Index i = 1; i < reducedSize; i++ )
		reduction( indexArray[ 0 ], indexArray[ i ], resultArray[ 0 ], resultArray[ i ] );
		reduction( resultArray[ 0 ], resultArray[ i ], indexArray[ 0 ], indexArray[ i ] );

		#ifdef CUDA_REDUCTION_PROFILING
		timer.stop();
		std::cout << " Reduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl;
		#endif
		return std::make_pair( indexArray[ 0 ], resultArray[ 0 ] );
		return std::make_pair( resultArray[ 0 ], indexArray[ 0 ] );
		}
		else {
		// data can't be safely reduced on host, so continue with the reduction on the GPU

src/TNL/Containers/Expressions/DistributedVerticalOperations.h

+2 −2

Original line number	Diff line number	Diff line
		@@ -35,7 +35,7 @@ auto DistributedExpressionMin( const Expression& expression ) -> std::decay_t< d

		template< typename Expression >
		auto DistributedExpressionArgMin( const Expression& expression )
		-> std::pair< typename Expression::IndexType, std::decay_t< decltype( expression[0] ) > >
		-> std::pair< std::decay_t< decltype( expression[0] ) >, typename Expression::IndexType >
		{
		using RealType = std::decay_t< decltype( expression[0] ) >;
		using IndexType = typename Expression::IndexType;
		@@ -92,7 +92,7 @@ auto DistributedExpressionMax( const Expression& expression ) -> std::decay_t< d

		template< typename Expression >
		auto DistributedExpressionArgMax( const Expression& expression )
		-> std::pair< typename Expression::IndexType, std::decay_t< decltype( expression[0] ) > >
		-> std::pair< std::decay_t< decltype( expression[0] ) >, typename Expression::IndexType >
		{
		using RealType = std::decay_t< decltype( expression[0] ) >;
		using IndexType = typename Expression::IndexType;