[WIP] Rewritting reduction using lambda functions. (5c983e49) · Commits · TNL / tnl-dev

src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp

+4 −3

Original line number	Diff line number	Diff line
		@@ -220,9 +220,10 @@ containsValue( const Element* data,
		TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
		TNL_ASSERT_GE( size, 0, "" );
		if( size == 0 ) return false;
		Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue;
		reductionContainsValue.setValue( value );
		return Reduction< Devices::Cuda >::reduce( reductionContainsValue, size, data, nullptr );
		//Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue;
		//reductionContainsValue.setValue( value );
		auto fetch = [=] __cuda_callable__ ( Index i ) { return ( data[ i ] == value ); };
		return Reduction< Devices::Cuda >::reduce( size, TNL::sum, fetch, 0.0 );
		}

		template< typename Element,

src/TNL/Containers/Algorithms/CudaReductionKernel.h

+59 −53

Original line number	Diff line number	Diff line
		@@ -41,20 +41,21 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi

		template< int blockSize,
		typename Real,
		typename FirstPhase,
		typename SecondPhase,
		typename DataFetcher,
		typename Reduction,
		typename Index,
		typename ResultType = decltype( std::declval< FirstPhase >( 0,0 ) ) >
		typename Result = decltype( std::declval< DataFetcher >( 0,0 ) ) >
		__global__ void
		__launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
		CudaReductionKernel( const Real& initialValue,
		FirstReduction& firstReduction,
		SecondReduction& secondReduction,
		CudaReductionKernel( const Real& zero,
		const DataFetcher& dataFetcher,
		const Reduction& reduction,
		const Index size,
		ResultType* output )
		Result* output )
		{
		typedef Index IndexType;
		typedef typename Operation::ResultType ResultType;
		using RealType = Real;
		using IndexType = Index;
		using ResultType = Result;

		ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();

		@@ -67,28 +68,28 @@ CudaReductionKernel( const Real& initialValue,
		IndexType gid = blockIdx.x * blockDim. x + threadIdx.x;
		const IndexType gridSize = blockDim.x * gridDim.x;

		sdata[ tid ] = initialValue;
		sdata[ tid ] = zero;
		/***
		* Read data into the shared memory. We start with the
		* sequential reduction.
		*/
		while( gid + 4 * gridSize < size )
		{
		sdata[ tid ] = firstReduction( sdata[ tid ], gid );
		sdata[ tid ] = firstReduction( sdata[ tid ], gid + gridSize );
		sdata[ tid ] = firstReduction( sdata[ tid ], gid + 2 * gridSize );
		sdata[ tid ] = firstReduction( sdata[ tid ], gid + 3 * gridSize );
		sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
		sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
		sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
		sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
		gid += 4 * gridSize;
		}
		while( gid + 2 * gridSize < size )
		{
		firstReduction( sdata[ tid ], gid, input1, input2 );
		firstReduction( sdata[ tid ], gid + gridSize, input1, input2 );
		sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
		sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
		gid += 2 * gridSize;
		}
		while( gid < size )
		{
		operation.firstReduction( sdata[ tid ], gid, input1, input2 );
		sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
		gid += gridSize;
		}
		__syncthreads();
		@@ -103,19 +104,19 @@ CudaReductionKernel( const Real& initialValue,
		if( blockSize >= 1024 )
		{
		if( tid < 512 )
		operation.commonReduction( sdata[ tid ], sdata[ tid + 512 ] );
		reduction( sdata[ tid ], sdata[ tid + 512 ] );
		__syncthreads();
		}
		if( blockSize >= 512 )
		{
		if( tid < 256 )
		operation.commonReduction( sdata[ tid ], sdata[ tid + 256 ] );
		reduction( sdata[ tid ], sdata[ tid + 256 ] );
		__syncthreads();
		}
		if( blockSize >= 256 )
		{
		if( tid < 128 )
		operation.commonReduction( sdata[ tid ], sdata[ tid + 128 ] );
		reduction( sdata[ tid ], sdata[ tid + 128 ] );
		__syncthreads();
		//printf( "2: tid %d data %f \n", tid, sdata[ tid ] );
		}
		@@ -123,7 +124,7 @@ CudaReductionKernel( const Real& initialValue,
		if( blockSize >= 128 )
		{
		if( tid < 64 )
		operation.commonReduction( sdata[ tid ], sdata[ tid + 64 ] );
		reduction( sdata[ tid ], sdata[ tid + 64 ] );
		__syncthreads();
		//printf( "3: tid %d data %f \n", tid, sdata[ tid ] );
		}
		@@ -137,34 +138,34 @@ CudaReductionKernel( const Real& initialValue,
		volatile ResultType* vsdata = sdata;
		if( blockSize >= 64 )
		{
		operation.commonReduction( vsdata[ tid ], vsdata[ tid + 32 ] );
		reduction( vsdata[ tid ], vsdata[ tid + 32 ] );
		//printf( "4: tid %d data %f \n", tid, sdata[ tid ] );
		}
		// TODO: If blocksize == 32, the following does not work
		// We do not check if tid < 16. Fix it!!!
		if( blockSize >= 32 )
		{
		operation.commonReduction( vsdata[ tid ], vsdata[ tid + 16 ] );
		reduction( vsdata[ tid ], vsdata[ tid + 16 ] );
		//printf( "5: tid %d data %f \n", tid, sdata[ tid ] );
		}
		if( blockSize >= 16 )
		{
		operation.commonReduction( vsdata[ tid ], vsdata[ tid + 8 ] );
		reduction( vsdata[ tid ], vsdata[ tid + 8 ] );
		//printf( "6: tid %d data %f \n", tid, sdata[ tid ] );
		}
		if( blockSize >= 8 )
		{
		operation.commonReduction( vsdata[ tid ], vsdata[ tid + 4 ] );
		reduction( vsdata[ tid ], vsdata[ tid + 4 ] );
		//printf( "7: tid %d data %f \n", tid, sdata[ tid ] );
		}
		if( blockSize >= 4 )
		{
		operation.commonReduction( vsdata[ tid ], vsdata[ tid + 2 ] );
		reduction( vsdata[ tid ], vsdata[ tid + 2 ] );
		//printf( "8: tid %d data %f \n", tid, sdata[ tid ] );
		}
		if( blockSize >= 2 )
		{
		operation.commonReduction( vsdata[ tid ], vsdata[ tid + 1 ] );
		reduction( vsdata[ tid ], vsdata[ tid + 1 ] );
		//printf( "9: tid %d data %f \n", tid, sdata[ tid ] );
		}
		}
		@@ -180,16 +181,21 @@ CudaReductionKernel( const Real& initialValue,

		}

		template< typename Operation, typename Index >
		template< typename Real,
		typename DataFetcher,
		typename Reduction,
		typename Index,
		typename Result >
		int
		CudaReductionKernelLauncher( Operation& operation,
		const Index size,
		const typename Operation::DataType1* input1,
		const typename Operation::DataType2* input2,
		typename Operation::ResultType*& output )
		CudaReductionKernelLauncher( const Index size,
		const Reduction& reduction,
		const DataFetcher& dataFetcher,
		const Real& zero,
		Result*& output )
		{
		typedef Index IndexType;
		typedef typename Operation::ResultType ResultType;
		using RealType = Real;
		using IndexType = Index;
		using ResultType = Result;

		// The number of blocks should be a multiple of the number of multiprocessors
		// to ensure optimum balancing of the load. This is very important, because
		@@ -227,55 +233,55 @@ CudaReductionKernelLauncher( Operation& operation,
		{
		case 512:
		CudaReductionKernel< 512 >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 256:
		cudaFuncSetCacheConfig(CudaReductionKernel< 256, Operation, Index >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaReductionKernel< 256, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);

		CudaReductionKernel< 256 >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 128:
		cudaFuncSetCacheConfig(CudaReductionKernel< 128, Operation, Index >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaReductionKernel< 128, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);

		CudaReductionKernel< 128 >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 64:
		cudaFuncSetCacheConfig(CudaReductionKernel< 64, Operation, Index >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaReductionKernel< 64, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);

		CudaReductionKernel< 64 >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 32:
		cudaFuncSetCacheConfig(CudaReductionKernel< 32, Operation, Index >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaReductionKernel< 32, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);

		CudaReductionKernel< 32 >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 16:
		cudaFuncSetCacheConfig(CudaReductionKernel< 16, Operation, Index >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaReductionKernel< 16, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);

		CudaReductionKernel< 16 >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 8:
		cudaFuncSetCacheConfig(CudaReductionKernel< 8, Operation, Index >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaReductionKernel< 8, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);

		CudaReductionKernel< 8 >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 4:
		cudaFuncSetCacheConfig(CudaReductionKernel< 4, Operation, Index >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaReductionKernel< 4, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);

		CudaReductionKernel< 4 >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 2:
		cudaFuncSetCacheConfig(CudaReductionKernel< 2, Operation, Index >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaReductionKernel< 2, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);

		CudaReductionKernel< 2 >
		<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
		break;
		case 1:
		TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );

src/TNL/Containers/Algorithms/Reduction.h

+29 −20

Original line number	Diff line number	Diff line
		@@ -29,36 +29,45 @@ template<>
		class Reduction< Devices::Cuda >
		{
		public:
		template< typename Operation, typename Index >
		static typename Operation::ResultType
		reduce( Operation& operation,
		const Index size,
		const typename Operation::DataType1* deviceInput1,
		const typename Operation::DataType2* deviceInput2 );
		template< typename Index,
		typename ReductionOperation,
		typename DataFetcher,
		typename Result = decltype( DataFetcher::operator() ) >
		static Result
		reduce( const Index size,
		ReductionOperation& reduction,
		DataFetcher& dataFetcher,
		const Result& zero );
		};

		template<>
		class Reduction< Devices::Host >
		{
		public:
		template< typename Operation, typename Index >
		static typename Operation::ResultType
		reduce( Operation& operation,
		const Index size,
		const typename Operation::DataType1* deviceInput1,
		const typename Operation::DataType2* deviceInput2 );
		template< typename Index,
		typename ReductionOperation,
		typename DataFetcher,
		typename Result = decltype( DataFetcher::operator() ) >
		static Result
		reduce( const Index size,
		ReductionOperation& reduction,
		DataFetcher& dataFetcher,
		const Result& zero );
		};

		template<>
		class Reduction< Devices::MIC >
		{
		public:
		template< typename Operation, typename Index >
		static typename Operation::ResultType
		reduce( Operation& operation,
		const Index size,
		const typename Operation::DataType1* deviceInput1,
		const typename Operation::DataType2* deviceInput2 );
		template< typename Index,
		typename ReductionOperation,
		typename DataFetcher,
		typename Result = decltype( DataFetcher::operator() ) >
		static Result
		reduce( const Index size,
		ReductionOperation& reduction,
		DataFetcher& dataFetcher,
		const Result& zero );
		};

		} // namespace Algorithms

src/TNL/Containers/Algorithms/Reduction_impl.h

+76 −64

Original line number	Diff line number	Diff line
		@@ -38,35 +38,37 @@ namespace Algorithms {
		*/
		static constexpr int Reduction_minGpuDataSize = 256;//65536; //16384;//1024;//256;

		template< typename Operation, typename Index >
		typename Operation::ResultType

		template< typename Index,
		typename ReductionOperation,
		typename DataFetcher,
		typename Result >
		Result
		Reduction< Devices::Cuda >::
		reduce( Operation& operation,
		const Index size,
		const typename Operation::DataType1* deviceInput1,
		const typename Operation::DataType2* deviceInput2 )
		reduce( const Index size,
		ReductionOperation& reduction,
		DataFetcher& dataFetcher,
		const Result& zero )
		{
		#ifdef HAVE_CUDA

		typedef Index IndexType;
		typedef typename Operation::DataType1 DataType1;
		typedef typename Operation::DataType2 DataType2;
		typedef typename Operation::ResultType ResultType;
		typedef typename Operation::LaterReductionOperation LaterReductionOperation;
		using IndexType = Index;
		using ResultType = Result;

		/***
		* Only fundamental and pointer types can be safely reduced on host. Complex
		* objects stored on the device might contain pointers into the device memory,
		* in which case reduction on host might fail.
		*/
		constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value \|\| std::is_fundamental< DataType2 >::value \|\| std::is_pointer< DataType1 >::value \|\| std::is_pointer< DataType2 >::value;
		//constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value \|\| std::is_fundamental< DataType2 >::value \|\| std::is_pointer< DataType1 >::value \|\| std::is_pointer< DataType2 >::value;
		constexpr bool can_reduce_later_on_host = std::is_fundamental< ResultType >::value \|\| std::is_pointer< ResultType >::value;

		/***
		* First check if the input array(s) is/are large enough for the reduction on GPU.
		* Otherwise copy it/them to host and reduce on CPU.
		*/
		if( can_reduce_all_on_host && size <= Reduction_minGpuDataSize )
		// With lambda function we cannot reduce data on host - we do not know the data here.
		/*if( can_reduce_all_on_host && size <= Reduction_minGpuDataSize )
		{
		typename std::remove_const< DataType1 >::type hostArray1[ Reduction_minGpuDataSize ];
		ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray1, deviceInput1, size );
		@@ -74,12 +76,12 @@ reduce( Operation& operation,
		using _DT2 = typename std::conditional< std::is_same< DataType2, void >::value, DataType1, DataType2 >::type;
		typename std::remove_const< _DT2 >::type hostArray2[ Reduction_minGpuDataSize ];
		ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray2, (_DT2*) deviceInput2, size );
		return Reduction< Devices::Host >::reduce( operation, size, hostArray1, hostArray2 );
		return Reduction< Devices::Host >::reduce( zero, dataFetcher, reduction, size, hostArray1, hostArray2 );
		}
		else {
		return Reduction< Devices::Host >::reduce( operation, size, hostArray1, (DataType2*) nullptr );
		}
		}
		}*/

		#ifdef CUDA_REDUCTION_PROFILING
		Timer timer;
		@@ -91,10 +93,10 @@ reduce( Operation& operation,
		* Reduce the data on the CUDA device.
		*/
		ResultType* deviceAux1( 0 );
		IndexType reducedSize = CudaReductionKernelLauncher( operation,
		size,
		deviceInput1,
		deviceInput2,
		IndexType reducedSize = CudaReductionKernelLauncher( size,
		reduction,
		dataFetcher,
		zero,
		deviceAux1 );
		#ifdef CUDA_REDUCTION_PROFILING
		timer.stop();
		@@ -120,8 +122,8 @@ reduce( Operation& operation,
		/***
		* Reduce the data on the host system.
		*/
		LaterReductionOperation laterReductionOperation;
		const ResultType result = Reduction< Devices::Host >::reduce( laterReductionOperation, reducedSize, resultArray, (void*) nullptr );
		auto copyFetch = [=] ( IndexType i ) { return resultArray[ i ]; };
		const ResultType result = Reduction< Devices::Host >::reduce( reducedSize, reduction, copyFetch, zero );

		#ifdef CUDA_REDUCTION_PROFILING
		timer.stop();
		@@ -134,12 +136,13 @@ reduce( Operation& operation,
		/***
		* Data can't be safely reduced on host, so continue with the reduction on the CUDA device.
		*/
		LaterReductionOperation laterReductionOperation;
		//LaterReductionOperation laterReductionOperation;
		auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return deviceAux1[ i ]; };
		while( reducedSize > 1 ) {
		reducedSize = CudaReductionKernelLauncher( laterReductionOperation,
		reducedSize,
		deviceAux1,
		(ResultType*) 0,
		reducedSize = CudaReductionKernelLauncher( reducedSize,
		reduction,
		copyFetch,
		zero,
		deviceAux1 );
		}

		@@ -166,18 +169,19 @@ reduce( Operation& operation,
		#endif
		};

		template< typename Operation, typename Index >
		typename Operation::ResultType
		template< typename Index,
		typename ReductionOperation,
		typename DataFetcher,
		typename Result >
		Result
		Reduction< Devices::Host >::
		reduce( Operation& operation,
		const Index size,
		const typename Operation::DataType1* input1,
		const typename Operation::DataType2* input2 )
		reduce( const Index size,
		ReductionOperation& reduction,
		DataFetcher& dataFetcher,
		const Result& zero )
		{
		typedef Index IndexType;
		typedef typename Operation::DataType1 DataType1;
		typedef typename Operation::DataType2 DataType2;
		typedef typename Operation::ResultType ResultType;
		using IndexType = Index;
		using ResultType = Result;

		constexpr int block_size = 128;
		const int blocks = size / block_size;
		@@ -185,23 +189,24 @@ reduce( Operation& operation,
		#ifdef HAVE_OPENMP
		if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) {
		// global result variable
		ResultType result = operation.initialValue();
		ResultType result = zero;
		#pragma omp parallel
		{
		// initialize array for thread-local results
		ResultType r[ 4 ] = { operation.initialValue(),
		operation.initialValue(),
		operation.initialValue(),
		operation.initialValue() };
		ResultType r[ 4 ] = { zero, zero, zero, zero };

		#pragma omp for nowait
		for( int b = 0; b < blocks; b++ ) {
		const IndexType offset = b * block_size;
		for( int i = 0; i < block_size; i += 4 ) {
		operation.firstReduction( r[ 0 ], offset + i, input1, input2 );
		operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 );
		operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 );
		operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 );
		r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) );
		r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) );
		r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) );
		r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) );
		/*operation.dataFetcher( r[ 0 ], offset + i, input1, input2 );
		operation.dataFetcher( r[ 1 ], offset + i + 1, input1, input2 );
		operation.dataFetcher( r[ 2 ], offset + i + 2, input1, input2 );
		operation.dataFetcher( r[ 3 ], offset + i + 3, input1, input2 );*/
		}
		}

		@@ -209,18 +214,19 @@ reduce( Operation& operation,
		#pragma omp single nowait
		{
		for( IndexType i = blocks * block_size; i < size; i++ )
		operation.firstReduction( r[ 0 ], i, input1, input2 );
		r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) );
		//operation.dataFetcher( r[ 0 ], i, input1, input2 );
		}

		// local reduction of unrolled results
		operation.commonReduction( r[ 0 ], r[ 2 ] );
		operation.commonReduction( r[ 1 ], r[ 3 ] );
		operation.commonReduction( r[ 0 ], r[ 1 ] );
		r[ 0 ] = reduction( r[ 0 ], r[ 2 ] );
		r[ 1 ] = reduction( r[ 1 ], r[ 3 ] );
		r[ 0 ] = reduction( r[ 0 ], r[ 1 ] );

		// inter-thread reduction of local results
		#pragma omp critical
		{
		operation.commonReduction( result, r[ 0 ] );
		result = reduction( result, r[ 0 ] );
		}
		}
		return result;
		@@ -229,37 +235,43 @@ reduce( Operation& operation,
		#endif
		if( blocks > 1 ) {
		// initialize array for unrolled results
		ResultType r[ 4 ] = { operation.initialValue(),
		operation.initialValue(),
		operation.initialValue(),
		operation.initialValue() };
		ResultType r[ 4 ] = { zero, zero, zero, zero };

		// main reduction (explicitly unrolled loop)
		for( int b = 0; b < blocks; b++ ) {
		const IndexType offset = b * block_size;
		for( int i = 0; i < block_size; i += 4 ) {
		operation.firstReduction( r[ 0 ], offset + i, input1, input2 );
		operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 );
		operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 );
		operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 );
		r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) );
		r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) );
		r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) );
		r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) );
		/*operation.dataFetcher( r[ 0 ], offset + i, input1, input2 );
		operation.dataFetcher( r[ 1 ], offset + i + 1, input1, input2 );
		operation.dataFetcher( r[ 2 ], offset + i + 2, input1, input2 );
		operation.dataFetcher( r[ 3 ], offset + i + 3, input1, input2 );*/
		}
		}

		// reduction of the last, incomplete block (not unrolled)
		for( IndexType i = blocks * block_size; i < size; i++ )
		operation.firstReduction( r[ 0 ], i, input1, input2 );
		r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) );
		//operation.dataFetcher( r[ 0 ], i, input1, input2 );

		// reduction of unrolled results
		operation.commonReduction( r[ 0 ], r[ 2 ] );
		operation.commonReduction( r[ 1 ], r[ 3 ] );
		operation.commonReduction( r[ 0 ], r[ 1 ] );
		r[ 0 ] = reduction( r[ 0 ], r[ 2 ] );
		r[ 1 ] = reduction( r[ 1 ], r[ 3 ] );
		r[ 0 ] = reduction( r[ 0 ], r[ 1 ] );

		/*operation.reduction( r[ 0 ], r[ 2 ] );
		operation.reduction( r[ 1 ], r[ 3 ] );
		operation.reduction( r[ 0 ], r[ 1 ] );*/

		return r[ 0 ];
		}
		else {
		ResultType result = operation.initialValue();
		ResultType result = zero;
		for( IndexType i = 0; i < size; i++ )
		operation.firstReduction( result, i, input1, input2 );
		result = reduction( result, dataFetcher( i ) );
		return result;
		}
		#ifdef HAVE_OPENMP

src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h

+18 −6

File changed.

Preview size limit exceeded, changes collapsed.