Commit e20a0930 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Changed the reduction operation to use functions with `return a + b` instead of `a += b`

This is nicer because it more clearly separates data load, computation
and data store. Furthermore, it allows using instances of std::plus,
std::logical_and, std::logical_or, etc. instead of custom lambda
functions.
parent d0fc1bb7
Loading
Loading
Loading
Loading
+28 −28
Original line number Diff line number Diff line
@@ -29,7 +29,7 @@ getVectorMax( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}

@@ -46,7 +46,7 @@ getVectorMin( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return data[ i ]; };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a =  TNL::min( a, b ); };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}

@@ -63,7 +63,7 @@ getVectorAbsMax( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}

@@ -80,7 +80,7 @@ getVectorAbsMin( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}

@@ -97,7 +97,7 @@ getVectorL1Norm( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 );
}

@@ -114,7 +114,7 @@ getVectorL2Norm( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; };
   return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ) );
}

@@ -138,7 +138,7 @@ getVectorLpNorm( const Vector& v,

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; };
   return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ), 1.0 / p );
}

@@ -158,7 +158,7 @@ getVectorSum( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i )  -> ResultType { return data[ i ]; };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 );
}

@@ -178,7 +178,7 @@ getVectorDifferenceMax( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}

@@ -198,7 +198,7 @@ getVectorDifferenceMin( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}

@@ -218,7 +218,7 @@ getVectorDifferenceAbsMax( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}

@@ -238,7 +238,7 @@ getVectorDifferenceAbsMin( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}

@@ -258,7 +258,7 @@ getVectorDifferenceL1Norm( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 );
}

@@ -281,7 +281,7 @@ getVectorDifferenceL2Norm( const Vector1& v1,
      auto diff = data1[ i ] - data2[ i ];
      return diff * diff;
   };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; };
   return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ) );
}

@@ -308,7 +308,7 @@ getVectorDifferenceLpNorm( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; };
   return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ), 1.0 / p );
}

@@ -328,7 +328,7 @@ getVectorDifferenceSum( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 );
}

@@ -348,7 +348,7 @@ getScalarProduct( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 );
}

+6 −6
Original line number Diff line number Diff line
@@ -132,8 +132,8 @@ compare( const Element1* destination,
   TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." );
   TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." );

   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return  ( destination[ i ] == source[ i ] ); };
   auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; };
   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return destination[ i ] == source[ i ]; };
   auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; };
   return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true );
}

@@ -149,8 +149,8 @@ containsValue( const Element* data,
   TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
   TNL_ASSERT_GE( size, (Index) 0, "" );

   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return  ( data[ i ] == value ); };
   auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a |= b; };
   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; };
   auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a || b; };
   return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, false );
}

@@ -166,8 +166,8 @@ containsOnlyValue( const Element* data,
   TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
   TNL_ASSERT_GE( size, 0, "" );

   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return  ( data[ i ] == value ); };
   auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; };
   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; };
   auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; };
   return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true );
}

+17 −17
Original line number Diff line number Diff line
@@ -67,19 +67,19 @@ CudaMultireductionKernel( const Result zero,

   // Start with the sequential reduction and push the result into the shared memory.
   while( gid + 4 * gridSizeX < size ) {
      reduction( sdata[ tid ], dataFetcher( gid,                 y ) );
      reduction( sdata[ tid ], dataFetcher( gid + gridSizeX,     y ) );
      reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSizeX, y ) );
      reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSizeX, y ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid,                 y ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSizeX,     y ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSizeX, y ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSizeX, y ) );
      gid += 4 * gridSizeX;
   }
   while( gid + 2 * gridSizeX < size ) {
      reduction( sdata[ tid ], dataFetcher( gid, y ) );
      reduction( sdata[ tid ], dataFetcher( gid + gridSizeX, y ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid, y ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSizeX, y ) );
      gid += 2 * gridSizeX;
   }
   while( gid < size ) {
      reduction( sdata[ tid ], dataFetcher( gid, y ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid, y ) );
      gid += gridSizeX;
   }
   __syncthreads();
@@ -87,48 +87,48 @@ CudaMultireductionKernel( const Result zero,
   // Perform the parallel reduction.
   if( blockSizeX >= 1024 ) {
      if( threadIdx.x < 512 )
         reduction( sdata[ tid ], sdata[ tid + 512 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] );
      __syncthreads();
   }
   if( blockSizeX >= 512 ) {
      if( threadIdx.x < 256 )
         reduction( sdata[ tid ], sdata[ tid + 256 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] );
      __syncthreads();
   }
   if( blockSizeX >= 256 ) {
      if( threadIdx.x < 128 )
         reduction( sdata[ tid ], sdata[ tid + 128 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] );
      __syncthreads();
   }
   if( blockSizeX >= 128 ) {
      if( threadIdx.x <  64 )
         reduction( sdata[ tid ], sdata[ tid + 64 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] );
      __syncthreads();
   }

   // This runs in one warp so we use __syncwarp() instead of __syncthreads().
   if( threadIdx.x < 32 ) {
      if( blockSizeX >= 64 )
         reduction( sdata[ tid ], sdata[ tid + 32 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 32 ] );
      __syncwarp();
      // Note that here we do not have to check if tid < 16 etc, because we have
      // 2 * blockSize.x elements of shared memory per block, so we do not
      // access out of bounds. The results for the upper half will be undefined,
      // but unused anyway.
      if( blockSizeX >= 32 )
         reduction( sdata[ tid ], sdata[ tid + 16 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 16 ] );
      __syncwarp();
      if( blockSizeX >= 16 )
         reduction( sdata[ tid ], sdata[ tid + 8 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 8 ] );
      __syncwarp();
      if( blockSizeX >=  8 )
         reduction( sdata[ tid ], sdata[ tid + 4 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 4 ] );
      __syncwarp();
      if( blockSizeX >=  4 )
         reduction( sdata[ tid ], sdata[ tid + 2 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 2 ] );
      __syncwarp();
      if( blockSizeX >=  2 )
         reduction( sdata[ tid ], sdata[ tid + 1 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 1 ] );
   }

   // Store the result back in the global memory.
+17 −17
Original line number Diff line number Diff line
@@ -63,19 +63,19 @@ CudaReductionKernel( const Result zero,

   // Start with the sequential reduction and push the result into the shared memory.
   while( gid + 4 * gridSize < size ) {
      reduction( sdata[ tid ], dataFetcher( gid ) );
      reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
      reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
      reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
      gid += 4 * gridSize;
   }
   while( gid + 2 * gridSize < size ) {
      reduction( sdata[ tid ], dataFetcher( gid ) );
      reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
      gid += 2 * gridSize;
   }
   while( gid < size ) {
      reduction( sdata[ tid ], dataFetcher( gid ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
      gid += gridSize;
   }
   __syncthreads();
@@ -83,48 +83,48 @@ CudaReductionKernel( const Result zero,
   // Perform the parallel reduction.
   if( blockSize >= 1024 ) {
      if( tid < 512 )
         reduction( sdata[ tid ], sdata[ tid + 512 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] );
      __syncthreads();
   }
   if( blockSize >= 512 ) {
      if( tid < 256 )
         reduction( sdata[ tid ], sdata[ tid + 256 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] );
      __syncthreads();
   }
   if( blockSize >= 256 ) {
      if( tid < 128 )
         reduction( sdata[ tid ], sdata[ tid + 128 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] );
      __syncthreads();
   }
   if( blockSize >= 128 ) {
      if( tid <  64 )
         reduction( sdata[ tid ], sdata[ tid + 64 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] );
      __syncthreads();
   }

   // This runs in one warp so we use __syncwarp() instead of __syncthreads().
   if( tid < 32 ) {
      if( blockSize >= 64 )
         reduction( sdata[ tid ], sdata[ tid + 32 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 32 ] );
      __syncwarp();
      // Note that here we do not have to check if tid < 16 etc, because we have
      // 2 * blockSize.x elements of shared memory per block, so we do not
      // access out of bounds. The results for the upper half will be undefined,
      // but unused anyway.
      if( blockSize >= 32 )
         reduction( sdata[ tid ], sdata[ tid + 16 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 16 ] );
      __syncwarp();
      if( blockSize >= 16 )
         reduction( sdata[ tid ], sdata[ tid + 8 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 8 ] );
      __syncwarp();
      if( blockSize >=  8 )
         reduction( sdata[ tid ], sdata[ tid + 4 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 4 ] );
      __syncwarp();
      if( blockSize >=  4 )
         reduction( sdata[ tid ], sdata[ tid + 2 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 2 ] );
      __syncwarp();
      if( blockSize >=  2 )
         reduction( sdata[ tid ], sdata[ tid + 1 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 1 ] );
   }

   // Store the result back in the global memory.
+19 −19
Original line number Diff line number Diff line
@@ -72,10 +72,10 @@ reduce( const Result zero,
         for( int k = 0; k < n; k++ ) {
            Result* _r = r + 4 * k;
            for( int i = 0; i < block_size; i += 4 ) {
               reduction( _r[ 0 ], dataFetcher( offset + i,     k ) );
               reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) );
               reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) );
               reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) );
               _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( offset + i,     k ) );
               _r[ 1 ] = reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) );
               _r[ 2 ] = reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) );
               _r[ 3 ] = reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) );
            }
         }
      }
@@ -86,23 +86,23 @@ reduce( const Result zero,
         for( int k = 0; k < n; k++ ) {
            Result* _r = r + 4 * k;
            for( Index i = blocks * block_size; i < size; i++ )
               reduction( _r[ 0 ], dataFetcher( i, k ) );
               _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( i, k ) );
         }
      }

      // local reduction of unrolled results
      for( int k = 0; k < n; k++ ) {
         Result* _r = r + 4 * k;
         reduction( _r[ 0 ], _r[ 1 ] );
         reduction( _r[ 0 ], _r[ 2 ] );
         reduction( _r[ 0 ], _r[ 3 ] );
         _r[ 0 ] = reduction( _r[ 0 ], _r[ 1 ] );
         _r[ 0 ] = reduction( _r[ 0 ], _r[ 2 ] );
         _r[ 0 ] = reduction( _r[ 0 ], _r[ 3 ] );
      }

      // inter-thread reduction of local results
      #pragma omp critical
      {
         for( int k = 0; k < n; k++ )
            reduction( result[ k ], r[ 4 * k ] );
            result[ k ] = reduction( result[ k ], r[ 4 * k ] );
      }
   }
   else {
@@ -120,10 +120,10 @@ reduce( const Result zero,
            for( int k = 0; k < n; k++ ) {
               Result* _r = r + 4 * k;
               for( int i = 0; i < block_size; i += 4 ) {
                  reduction( _r[ 0 ], dataFetcher( offset + i,     k ) );
                  reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) );
                  reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) );
                  reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) );
                  _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( offset + i,     k ) );
                  _r[ 1 ] = reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) );
                  _r[ 2 ] = reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) );
                  _r[ 3 ] = reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) );
               }
            }
         }
@@ -132,15 +132,15 @@ reduce( const Result zero,
         for( int k = 0; k < n; k++ ) {
            Result* _r = r + 4 * k;
            for( Index i = blocks * block_size; i < size; i++ )
               reduction( _r[ 0 ], dataFetcher( i, k ) );
               _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( i, k ) );
         }

         // reduction of unrolled results
         for( int k = 0; k < n; k++ ) {
            Result* _r = r + 4 * k;
            reduction( _r[ 0 ], _r[ 1 ] );
            reduction( _r[ 0 ], _r[ 2 ] );
            reduction( _r[ 0 ], _r[ 3 ] );
            _r[ 0 ] = reduction( _r[ 0 ], _r[ 1 ] );
            _r[ 0 ] = reduction( _r[ 0 ], _r[ 2 ] );
            _r[ 0 ] = reduction( _r[ 0 ], _r[ 3 ] );

            // copy the result into the output parameter
            result[ k ] = _r[ 0 ];
@@ -154,13 +154,13 @@ reduce( const Result zero,
            const Index offset = b * block_size;
            for( int k = 0; k < n; k++ ) {
               for( int i = 0; i < block_size; i++ )
                  reduction( result[ k ], dataFetcher( offset + i, k ) );
                  result[ k ] = reduction( result[ k ], dataFetcher( offset + i, k ) );
            }
         }

         for( int k = 0; k < n; k++ ) {
            for( Index i = blocks * block_size; i < size; i++ )
               reduction( result[ k ], dataFetcher( i, k ) );
               result[ k ] = reduction( result[ k ], dataFetcher( i, k ) );
         }
      }
#ifdef HAVE_OPENMP
Loading