Commit 40d68215 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Explicit unrolling in the Reduction::reduce specialization for host

parent d165d1b4
Loading
Loading
Loading
Loading
+16 −6
Original line number Diff line number Diff line
@@ -188,27 +188,37 @@ reduce( Operation& operation,
      {
         const int blocks = size / block_size;

         // initialize thread-local result variable
         ResultType r = operation.initialValue();
         // initialize array for thread-local results
         ResultType r[ 4 ] = { operation.initialValue() };

         #pragma omp for nowait
         for( int b = 0; b < blocks; b++ ) {
            const int offset = b * block_size;
            for( IndexType i = 0; i < block_size; i++ )
               operation.firstReduction( r, offset + i, input1, input2 );
            for( IndexType i = 0; i < block_size; i += 4 ) {
               operation.firstReduction( r[ 0 ], offset + i,     input1, input2 );
               operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 );
               operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 );
               operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 );
            }
         }

         // the first thread that reaches here processes the last, incomplete block
         #pragma omp single nowait
         {
            for( IndexType i = blocks * block_size; i < size; i++ )
               operation.firstReduction( r, i, input1, input2 );
               operation.firstReduction( r[ 0 ], i, input1, input2 );
         }

         // reduction of local results
         operation.commonReduction( r[ 0 ], r[ 1 ] );
         operation.commonReduction( r[ 0 ], r[ 2 ] );
         operation.commonReduction( r[ 0 ], r[ 3 ] );

         // inter-thread reduction of local results
         #pragma omp critical
         {
            operation.commonReduction( result, r );
            operation.commonReduction( result, r[ 0 ] );
         }
      }
      return result;
   }