Commit 311fcf36 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Refactored splitting of the scan operation in two phases

- sequential scan does not need to be split, so "perform" performs the
  whole simple scan algorithm, "performFirstPhase" only reduces the
  block (i.e. the whole vector), "performSecondPhase" performs the scan
  operation with the block result combined with a global offset as the
  initial value
- parallel OpenMP scan calls the sequential scan to process the block
  results
- parallel CUDA scan was changed such that the block results array is an
  exclusive scan after the first phase, same as in the other device
  specializations
parent ee8e4e92
Loading
Loading
Loading
Loading
+3 −3
Original line number Diff line number Diff line
@@ -154,7 +154,7 @@ struct Scan< Devices::Sequential, Type >
                       const typename Vector::IndexType begin,
                       const typename Vector::IndexType end,
                       const Reduction& reduction,
                       const typename Vector::ValueType shift );
                       const typename Vector::ValueType zero );
};

template< ScanType Type >
@@ -214,7 +214,7 @@ struct Scan< Devices::Host, Type >
                       const typename Vector::IndexType begin,
                       const typename Vector::IndexType end,
                       const Reduction& reduction,
                       const typename Vector::ValueType shift );
                       const typename Vector::ValueType zero );
};

template< ScanType Type >
@@ -274,7 +274,7 @@ struct Scan< Devices::Cuda, Type >
                       const typename Vector::IndexType begin,
                       const typename Vector::IndexType end,
                       const Reduction& reduction,
                       const typename Vector::ValueType shift );
                       const typename Vector::ValueType zero );
};

template< ScanType Type >
+28 −32
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@
#pragma once

#include "Scan.h"
#include "reduce.h"

#include <TNL/Assert.h>
#include <TNL/Containers/Array.h>
@@ -34,25 +35,11 @@ perform( Vector& v,
         const typename Vector::IndexType end,
         const Reduction& reduction,
         const typename Vector::ValueType zero )
{
   // sequential scan does not need a second phase
   performFirstPhase( v, begin, end, reduction, zero );
}

template< ScanType Type >
   template< typename Vector,
             typename Reduction >
auto
Scan< Devices::Sequential, Type >::
performFirstPhase( Vector& v,
                   const typename Vector::IndexType begin,
                   const typename Vector::IndexType end,
                   const Reduction& reduction,
                   const typename Vector::ValueType zero )
{
   using ValueType = typename Vector::ValueType;
   using IndexType = typename Vector::IndexType;

   // simple sequential algorithm - not split into phases
   ValueType aux = zero;
   if( Type == ScanType::Inclusive ) {
      for( IndexType i = begin; i < end; i++ )
@@ -66,12 +53,25 @@ performFirstPhase( Vector& v,
         aux = reduction( aux, x );
      }
   }
}

   // sequential scan = one block, so the exclusive scan is trivially [zero]
template< ScanType Type >
   template< typename Vector,
             typename Reduction >
auto
Scan< Devices::Sequential, Type >::
performFirstPhase( Vector& v,
                   const typename Vector::IndexType begin,
                   const typename Vector::IndexType end,
                   const Reduction& reduction,
                   const typename Vector::ValueType zero )
{
   // FIXME: StaticArray does not have getElement() which is used in DistributedScan
//   Containers::StaticArray< 1, ValueType > block_results;
   Containers::Array< ValueType, Devices::Host > block_results( 1 );
//   Containers::StaticArray< 2, ValueType > block_results;
   Containers::Array< typename Vector::ValueType, Devices::Sequential > block_results( 2 );
   // artificial first phase - only reduce the block
   block_results[ 0 ] = zero;
   block_results[ 1 ] = reduce< Devices::Sequential >( begin, end, v, reduction, zero );
   return block_results;
}

@@ -86,12 +86,10 @@ performSecondPhase( Vector& v,
                    const typename Vector::IndexType begin,
                    const typename Vector::IndexType end,
                    const Reduction& reduction,
                    const typename Vector::ValueType shift )
                    const typename Vector::ValueType zero )
{
   using IndexType = typename Vector::IndexType;

   for( IndexType i = begin; i < end; i++ )
      v[ i ] = reduction( v[ i ], shift );
   // artificial second phase - only one block, use the shift as the initial value
   perform( v, begin, end, reduction, reduction( zero, blockShifts[ 0 ] ) );
}

template< ScanType Type >
@@ -159,14 +157,12 @@ performFirstPhase( Vector& v,
      }

      // write the block result into the buffer
      block_results[ thread_idx + 1 ] = block_result;
      block_results[ thread_idx ] = block_result;
   }

   // block_results now contains scan results for each block. The first phase
   // ends by computing an exclusive scan of this array.
   block_results[ 0 ] = zero;
   for( int i = 1; i < threads + 1; i++ )
      block_results[ i ] = reduction( block_results[ i ], block_results[ i - 1 ] );
   Scan< Devices::Sequential, ScanType::Exclusive >::perform( block_results, 0, threads + 1, reduction, zero );

   // block_results now contains shift values for each block - to be used in the second phase
   return block_results;
@@ -186,7 +182,7 @@ performSecondPhase( Vector& v,
                    const typename Vector::IndexType begin,
                    const typename Vector::IndexType end,
                    const Reduction& reduction,
                    const typename Vector::ValueType shift )
                    const typename Vector::ValueType zero )
{
#ifdef HAVE_OPENMP
   using ValueType = typename Vector::ValueType;
@@ -198,7 +194,7 @@ performSecondPhase( Vector& v,
   #pragma omp parallel num_threads(threads)
   {
      const int thread_idx = omp_get_thread_num();
      const ValueType offset = reduction( blockShifts[ thread_idx ], shift );
      const ValueType offset = reduction( zero, blockShifts[ thread_idx ] );

      // shift intermediate results by the offset
      #pragma omp for schedule(static)
@@ -206,7 +202,7 @@ performSecondPhase( Vector& v,
         v[ i ] = reduction( v[ i ], offset );
   }
#else
   Scan< Devices::Sequential, Type >::performSecondPhase( v, blockShifts, begin, end, reduction, shift );
   Scan< Devices::Sequential, Type >::performSecondPhase( v, blockShifts, begin, end, reduction, zero );
#endif
}

@@ -273,7 +269,7 @@ performSecondPhase( Vector& v,
                    const typename Vector::IndexType begin,
                    const typename Vector::IndexType end,
                    const Reduction& reduction,
                    const typename Vector::ValueType shift )
                    const typename Vector::ValueType zero )
{
#ifdef HAVE_CUDA
   using ValueType = typename Vector::ValueType;
@@ -284,7 +280,7 @@ performSecondPhase( Vector& v,
      &v.getData()[ begin ],  // output
      blockShifts.getData(),
      reduction,
      shift );
      zero );
#else
   throw Exceptions::CudaSupportMissing();
#endif
+7 −4
Original line number Diff line number Diff line
@@ -168,8 +168,7 @@ cudaSecondPhaseBlockScan( Reduction reduction,
                          Real* data,
                          Real shift )
{
   if( gridIdx > 0 || blockIdx.x > 0 )
      shift = reduction( shift, blockResults[ gridIdx * maxGridSize + blockIdx.x - 1 ] );
   shift = reduction( shift, blockResults[ gridIdx * maxGridSize + blockIdx.x ] );
   const int readOffset = blockIdx.x * elementsInBlock;
   int readIdx = threadIdx.x;
   while( readIdx < elementsInBlock && readOffset + readIdx < size )
@@ -250,7 +249,8 @@ struct CudaScanKernelLauncher

      // allocate array for the block results
      Containers::Array< Real, Devices::Cuda > blockResults;
      blockResults.setSize( numberOfBlocks );
      blockResults.setSize( numberOfBlocks + 1 );
      blockResults.setElement( 0, zero );

      // loop over all grids
      for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) {
@@ -278,7 +278,8 @@ struct CudaScanKernelLauncher
              elementsInBlock,
              &deviceInput[ gridOffset ],
              &deviceOutput[ gridOffset ],
              &blockResults.getData()[ gridIdx * maxGridSize() ] );
              // blockResults are shifted by 1, because the 0-th element should stay zero
              &blockResults.getData()[ gridIdx * maxGridSize() + 1 ] );
      }

      // synchronize the null-stream after all grids
@@ -288,6 +289,8 @@ struct CudaScanKernelLauncher
      // blockResults now contains scan results for each block. The first phase
      // ends by computing an exclusive scan of this array.
      if( numberOfBlocks > 1 ) {
         // we perform an inclusive scan, but the 0-th is zero and block results
         // were shifted by 1, so effectively we get an exclusive scan
         CudaScanKernelLauncher< ScanType::Inclusive, Real, Index >::perform(
            blockResults.getSize(),
            blockResults.getData(),