Commit 40a6fbec authored by Jakub Klinkovský
Browse files

Optimized sequential reduction and multireduction - explicit unrolling

parent 6571c834
Loading
Loading
Loading
Loading
+54 −11
Original line number Diff line number Diff line
@@ -225,7 +225,7 @@ reduce( Operation& operation,
         }
      }

      // reduction of local results
      // local reduction of unrolled results
      for( int k = 0; k < n; k++ ) {
         ResultType* _r = r + 4 * k;
         operation.commonReduction( _r[ 0 ], _r[ 1 ] );
@@ -242,6 +242,48 @@ reduce( Operation& operation,
   }
   else {
#endif
      if( blocks > 1 ) {
         // initialize array for unrolled results
         // (it is accessed as a row-major matrix with n rows and 4 columns)
         ResultType r[ n * 4 ];
         for( int k = 0; k < n * 4; k++ )
            r[ k ] = operation.initialValue();

         // main reduction (explicitly unrolled loop)
         for( int b = 0; b < blocks; b++ ) {
            const IndexType offset = b * block_size;
            for( int k = 0; k < n; k++ ) {
               const DataType1* _input1 = input1 + k * ldInput1;
               ResultType* _r = r + 4 * k;
               for( int i = 0; i < block_size; i += 4 ) {
                  operation.firstReduction( _r[ 0 ], offset + i,     _input1, input2 );
                  operation.firstReduction( _r[ 1 ], offset + i + 1, _input1, input2 );
                  operation.firstReduction( _r[ 2 ], offset + i + 2, _input1, input2 );
                  operation.firstReduction( _r[ 3 ], offset + i + 3, _input1, input2 );
               }
            }
         }

         // reduction of the last, incomplete block (not unrolled)
         for( int k = 0; k < n; k++ ) {
            const DataType1* _input1 = input1 + k * ldInput1;
            ResultType* _r = r + 4 * k;
            for( IndexType i = blocks * block_size; i < size; i++ )
               operation.firstReduction( _r[ 0 ], i, _input1, input2 );
         }

         // reduction of unrolled results
         for( int k = 0; k < n; k++ ) {
            ResultType* _r = r + 4 * k;
            operation.commonReduction( _r[ 0 ], _r[ 1 ] );
            operation.commonReduction( _r[ 0 ], _r[ 2 ] );
            operation.commonReduction( _r[ 0 ], _r[ 3 ] );

            // copy the result into the output parameter
            result[ k ] = _r[ 0 ];
         }
      }
      else {
         for( int k = 0; k < n; k++ )
            result[ k ] = operation.initialValue();

@@ -259,6 +301,7 @@ reduce( Operation& operation,
            for( IndexType i = blocks * block_size; i < size; i++ )
               operation.firstReduction( result[ k ], i, _input1, input2 );
         }
      }
#ifdef HAVE_OPENMP
   }
#endif
+36 −8
Original line number Diff line number Diff line
@@ -179,15 +179,15 @@ reduce( Operation& operation,
   typedef typename Operation::DataType2 DataType2;
   typedef typename Operation::ResultType ResultType;

#ifdef HAVE_OPENMP
   constexpr int block_size = 128;
   const int blocks = size / block_size;

#ifdef HAVE_OPENMP
   if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) {
      // global result variable
      ResultType result = operation.initialValue();
#pragma omp parallel
      {
         const int blocks = size / block_size;

         // initialize array for thread-local results
         ResultType r[ 4 ] = { operation.initialValue() };

@@ -209,7 +209,7 @@ reduce( Operation& operation,
               operation.firstReduction( r[ 0 ], i, input1, input2 );
         }

         // reduction of local results
         // local reduction of unrolled results
         operation.commonReduction( r[ 0 ], r[ 1 ] );
         operation.commonReduction( r[ 0 ], r[ 2 ] );
         operation.commonReduction( r[ 0 ], r[ 3 ] );
@@ -224,10 +224,38 @@ reduce( Operation& operation,
   }
   else {
#endif
      if( blocks > 1 ) {
         // initialize array for unrolled results
         ResultType r[ 4 ] = { operation.initialValue() };

         // main reduction (explicitly unrolled loop)
         for( int b = 0; b < blocks; b++ ) {
            const IndexType offset = b * block_size;
            for( int i = 0; i < block_size; i += 4 ) {
               operation.firstReduction( r[ 0 ], offset + i,     input1, input2 );
               operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 );
               operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 );
               operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 );
            }
         }

         // reduction of the last, incomplete block (not unrolled)
         for( IndexType i = blocks * block_size; i < size; i++ )
            operation.firstReduction( r[ 0 ], i, input1, input2 );

         // reduction of unrolled results
         operation.commonReduction( r[ 0 ], r[ 1 ] );
         operation.commonReduction( r[ 0 ], r[ 2 ] );
         operation.commonReduction( r[ 0 ], r[ 3 ] );

         return r[ 0 ];
      }
      else {
         ResultType result = operation.initialValue();
         for( IndexType i = 0; i < size; i++ )
            operation.firstReduction( result, i, input1, input2 );
         return result;
      }
#ifdef HAVE_OPENMP
   }
#endif