Commit e20a0930 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Changed the reduction operation to use functions with `return a + b` instead of `a += b`

This is nicer because it more clearly separates data load, computation
and data store. Furthermore, it allows using instances of std::plus,
std::logical_and, std::logical_or, etc. instead of custom lambda
functions.
parent d0fc1bb7
Loading
Loading
Loading
Loading
+28 −28
Original line number Diff line number Diff line
@@ -29,7 +29,7 @@ getVectorMax( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}

@@ -46,7 +46,7 @@ getVectorMin( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return data[ i ]; };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a =  TNL::min( a, b ); };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}

@@ -63,7 +63,7 @@ getVectorAbsMax( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}

@@ -80,7 +80,7 @@ getVectorAbsMin( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}

@@ -97,7 +97,7 @@ getVectorL1Norm( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 );
}

@@ -114,7 +114,7 @@ getVectorL2Norm( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; };
   return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ) );
}

@@ -138,7 +138,7 @@ getVectorLpNorm( const Vector& v,

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; };
   return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ), 1.0 / p );
}

@@ -158,7 +158,7 @@ getVectorSum( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i )  -> ResultType { return data[ i ]; };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 );
}

@@ -178,7 +178,7 @@ getVectorDifferenceMax( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}

@@ -198,7 +198,7 @@ getVectorDifferenceMin( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}

@@ -218,7 +218,7 @@ getVectorDifferenceAbsMax( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}

@@ -238,7 +238,7 @@ getVectorDifferenceAbsMin( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}

@@ -258,7 +258,7 @@ getVectorDifferenceL1Norm( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 );
}

@@ -281,7 +281,7 @@ getVectorDifferenceL2Norm( const Vector1& v1,
      auto diff = data1[ i ] - data2[ i ];
      return diff * diff;
   };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; };
   return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ) );
}

@@ -308,7 +308,7 @@ getVectorDifferenceLpNorm( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; };
   return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ), 1.0 / p );
}

@@ -328,7 +328,7 @@ getVectorDifferenceSum( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 );
}

@@ -348,7 +348,7 @@ getScalarProduct( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; };
   return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 );
}

+6 −6
Original line number Diff line number Diff line
@@ -132,8 +132,8 @@ compare( const Element1* destination,
   TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." );
   TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." );

   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return  ( destination[ i ] == source[ i ] ); };
   auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; };
   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return destination[ i ] == source[ i ]; };
   auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; };
   return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true );
}

@@ -149,8 +149,8 @@ containsValue( const Element* data,
   TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
   TNL_ASSERT_GE( size, (Index) 0, "" );

   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return  ( data[ i ] == value ); };
   auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a |= b; };
   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; };
   auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a || b; };
   return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, false );
}

@@ -166,8 +166,8 @@ containsOnlyValue( const Element* data,
   TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
   TNL_ASSERT_GE( size, 0, "" );

   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return  ( data[ i ] == value ); };
   auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; };
   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; };
   auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; };
   return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true );
}

+17 −17
Original line number Diff line number Diff line
@@ -67,19 +67,19 @@ CudaMultireductionKernel( const Result zero,

   // Start with the sequential reduction and push the result into the shared memory.
   while( gid + 4 * gridSizeX < size ) {
      reduction( sdata[ tid ], dataFetcher( gid,                 y ) );
      reduction( sdata[ tid ], dataFetcher( gid + gridSizeX,     y ) );
      reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSizeX, y ) );
      reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSizeX, y ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid,                 y ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSizeX,     y ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSizeX, y ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSizeX, y ) );
      gid += 4 * gridSizeX;
   }
   while( gid + 2 * gridSizeX < size ) {
      reduction( sdata[ tid ], dataFetcher( gid, y ) );
      reduction( sdata[ tid ], dataFetcher( gid + gridSizeX, y ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid, y ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSizeX, y ) );
      gid += 2 * gridSizeX;
   }
   while( gid < size ) {
      reduction( sdata[ tid ], dataFetcher( gid, y ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid, y ) );
      gid += gridSizeX;
   }
   __syncthreads();
@@ -87,48 +87,48 @@ CudaMultireductionKernel( const Result zero,
   // Perform the parallel reduction.
   if( blockSizeX >= 1024 ) {
      if( threadIdx.x < 512 )
         reduction( sdata[ tid ], sdata[ tid + 512 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] );
      __syncthreads();
   }
   if( blockSizeX >= 512 ) {
      if( threadIdx.x < 256 )
         reduction( sdata[ tid ], sdata[ tid + 256 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] );
      __syncthreads();
   }
   if( blockSizeX >= 256 ) {
      if( threadIdx.x < 128 )
         reduction( sdata[ tid ], sdata[ tid + 128 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] );
      __syncthreads();
   }
   if( blockSizeX >= 128 ) {
      if( threadIdx.x <  64 )
         reduction( sdata[ tid ], sdata[ tid + 64 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] );
      __syncthreads();
   }

   // This runs in one warp so we use __syncwarp() instead of __syncthreads().
   if( threadIdx.x < 32 ) {
      if( blockSizeX >= 64 )
         reduction( sdata[ tid ], sdata[ tid + 32 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 32 ] );
      __syncwarp();
      // Note that here we do not have to check if tid < 16 etc, because we have
      // 2 * blockSize.x elements of shared memory per block, so we do not
      // access out of bounds. The results for the upper half will be undefined,
      // but unused anyway.
      if( blockSizeX >= 32 )
         reduction( sdata[ tid ], sdata[ tid + 16 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 16 ] );
      __syncwarp();
      if( blockSizeX >= 16 )
         reduction( sdata[ tid ], sdata[ tid + 8 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 8 ] );
      __syncwarp();
      if( blockSizeX >=  8 )
         reduction( sdata[ tid ], sdata[ tid + 4 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 4 ] );
      __syncwarp();
      if( blockSizeX >=  4 )
         reduction( sdata[ tid ], sdata[ tid + 2 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 2 ] );
      __syncwarp();
      if( blockSizeX >=  2 )
         reduction( sdata[ tid ], sdata[ tid + 1 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 1 ] );
   }

   // Store the result back in the global memory.
+17 −17
Original line number Diff line number Diff line
@@ -63,19 +63,19 @@ CudaReductionKernel( const Result zero,

   // Start with the sequential reduction and push the result into the shared memory.
   while( gid + 4 * gridSize < size ) {
      reduction( sdata[ tid ], dataFetcher( gid ) );
      reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
      reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
      reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
      gid += 4 * gridSize;
   }
   while( gid + 2 * gridSize < size ) {
      reduction( sdata[ tid ], dataFetcher( gid ) );
      reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
      gid += 2 * gridSize;
   }
   while( gid < size ) {
      reduction( sdata[ tid ], dataFetcher( gid ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
      gid += gridSize;
   }
   __syncthreads();
@@ -83,48 +83,48 @@ CudaReductionKernel( const Result zero,
   // Perform the parallel reduction.
   if( blockSize >= 1024 ) {
      if( tid < 512 )
         reduction( sdata[ tid ], sdata[ tid + 512 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] );
      __syncthreads();
   }
   if( blockSize >= 512 ) {
      if( tid < 256 )
         reduction( sdata[ tid ], sdata[ tid + 256 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] );
      __syncthreads();
   }
   if( blockSize >= 256 ) {
      if( tid < 128 )
         reduction( sdata[ tid ], sdata[ tid + 128 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] );
      __syncthreads();
   }
   if( blockSize >= 128 ) {
      if( tid <  64 )
         reduction( sdata[ tid ], sdata[ tid + 64 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] );
      __syncthreads();
   }

   // This runs in one warp so we use __syncwarp() instead of __syncthreads().
   if( tid < 32 ) {
      if( blockSize >= 64 )
         reduction( sdata[ tid ], sdata[ tid + 32 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 32 ] );
      __syncwarp();
      // Note that here we do not have to check if tid < 16 etc, because we have
      // 2 * blockSize.x elements of shared memory per block, so we do not
      // access out of bounds. The results for the upper half will be undefined,
      // but unused anyway.
      if( blockSize >= 32 )
         reduction( sdata[ tid ], sdata[ tid + 16 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 16 ] );
      __syncwarp();
      if( blockSize >= 16 )
         reduction( sdata[ tid ], sdata[ tid + 8 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 8 ] );
      __syncwarp();
      if( blockSize >=  8 )
         reduction( sdata[ tid ], sdata[ tid + 4 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 4 ] );
      __syncwarp();
      if( blockSize >=  4 )
         reduction( sdata[ tid ], sdata[ tid + 2 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 2 ] );
      __syncwarp();
      if( blockSize >=  2 )
         reduction( sdata[ tid ], sdata[ tid + 1 ] );
         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 1 ] );
   }

   // Store the result back in the global memory.
+19 −19
Original line number Diff line number Diff line
@@ -72,10 +72,10 @@ reduce( const Result zero,
         for( int k = 0; k < n; k++ ) {
            Result* _r = r + 4 * k;
            for( int i = 0; i < block_size; i += 4 ) {
               reduction( _r[ 0 ], dataFetcher( offset + i,     k ) );
               reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) );
               reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) );
               reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) );
               _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( offset + i,     k ) );
               _r[ 1 ] = reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) );
               _r[ 2 ] = reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) );
               _r[ 3 ] = reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) );
            }
         }
      }
@@ -86,23 +86,23 @@ reduce( const Result zero,
         for( int k = 0; k < n; k++ ) {
            Result* _r = r + 4 * k;
            for( Index i = blocks * block_size; i < size; i++ )
               reduction( _r[ 0 ], dataFetcher( i, k ) );
               _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( i, k ) );
         }
      }

      // local reduction of unrolled results
      for( int k = 0; k < n; k++ ) {
         Result* _r = r + 4 * k;
         reduction( _r[ 0 ], _r[ 1 ] );
         reduction( _r[ 0 ], _r[ 2 ] );
         reduction( _r[ 0 ], _r[ 3 ] );
         _r[ 0 ] = reduction( _r[ 0 ], _r[ 1 ] );
         _r[ 0 ] = reduction( _r[ 0 ], _r[ 2 ] );
         _r[ 0 ] = reduction( _r[ 0 ], _r[ 3 ] );
      }

      // inter-thread reduction of local results
      #pragma omp critical
      {
         for( int k = 0; k < n; k++ )
            reduction( result[ k ], r[ 4 * k ] );
            result[ k ] = reduction( result[ k ], r[ 4 * k ] );
      }
   }
   else {
@@ -120,10 +120,10 @@ reduce( const Result zero,
            for( int k = 0; k < n; k++ ) {
               Result* _r = r + 4 * k;
               for( int i = 0; i < block_size; i += 4 ) {
                  reduction( _r[ 0 ], dataFetcher( offset + i,     k ) );
                  reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) );
                  reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) );
                  reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) );
                  _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( offset + i,     k ) );
                  _r[ 1 ] = reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) );
                  _r[ 2 ] = reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) );
                  _r[ 3 ] = reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) );
               }
            }
         }
@@ -132,15 +132,15 @@ reduce( const Result zero,
         for( int k = 0; k < n; k++ ) {
            Result* _r = r + 4 * k;
            for( Index i = blocks * block_size; i < size; i++ )
               reduction( _r[ 0 ], dataFetcher( i, k ) );
               _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( i, k ) );
         }

         // reduction of unrolled results
         for( int k = 0; k < n; k++ ) {
            Result* _r = r + 4 * k;
            reduction( _r[ 0 ], _r[ 1 ] );
            reduction( _r[ 0 ], _r[ 2 ] );
            reduction( _r[ 0 ], _r[ 3 ] );
            _r[ 0 ] = reduction( _r[ 0 ], _r[ 1 ] );
            _r[ 0 ] = reduction( _r[ 0 ], _r[ 2 ] );
            _r[ 0 ] = reduction( _r[ 0 ], _r[ 3 ] );

            // copy the result into the output parameter
            result[ k ] = _r[ 0 ];
@@ -154,13 +154,13 @@ reduce( const Result zero,
            const Index offset = b * block_size;
            for( int k = 0; k < n; k++ ) {
               for( int i = 0; i < block_size; i++ )
                  reduction( result[ k ], dataFetcher( offset + i, k ) );
                  result[ k ] = reduction( result[ k ], dataFetcher( offset + i, k ) );
            }
         }

         for( int k = 0; k < n; k++ ) {
            for( Index i = blocks * block_size; i < size; i++ )
               reduction( result[ k ], dataFetcher( i, k ) );
               result[ k ] = reduction( result[ k ], dataFetcher( i, k ) );
         }
      }
#ifdef HAVE_OPENMP
Loading