Commit dfe6b1e8 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Refactoring scan

- used ValueType instead of RealType - closes #87
- replaced prefix-sum with scan in the comments
- renamed variables containing "sum" to "result"
- fixed artificial blockShifts in the sequential implementation
parent f6a5cb16
Loading
Loading
Loading
Loading
+13 −13
Original line number Diff line number Diff line
@@ -29,9 +29,9 @@ struct DistributedScan
            typename DistributedVector::IndexType begin,
            typename DistributedVector::IndexType end,
            const Reduction& reduction,
            const typename DistributedVector::RealType zero )
            const typename DistributedVector::ValueType zero )
   {
      using RealType = typename DistributedVector::RealType;
      using ValueType = typename DistributedVector::ValueType;
      using DeviceType = typename DistributedVector::DeviceType;

      const auto group = v.getCommunicationGroup();
@@ -43,23 +43,23 @@ struct DistributedScan

         // perform first phase on the local data
         auto localView = v.getLocalView();
         const auto blockShifts = Scan< DeviceType, Type >::performFirstPhase( localView, begin, end, reduction, zero );
         const RealType localSum = blockShifts.getElement( blockShifts.getSize() - 1 );
         const auto block_results = Scan< DeviceType, Type >::performFirstPhase( localView, begin, end, reduction, zero );
         const ValueType local_result = block_results.getElement( block_results.getSize() - 1 );

         // exchange local sums between ranks
         // exchange local results between ranks
         const int nproc = MPI::GetSize( group );
         RealType dataForScatter[ nproc ];
         for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = localSum;
         Containers::Vector< RealType, Devices::Host > rankSums( nproc );
         ValueType dataForScatter[ nproc ];
         for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = local_result;
         Containers::Vector< ValueType, Devices::Host > rank_results( nproc );
         // NOTE: exchanging general data types does not work with MPI
         MPI::Alltoall( dataForScatter, 1, rankSums.getData(), 1, group );
         MPI::Alltoall( dataForScatter, 1, rank_results.getData(), 1, group );

         // compute the scan of the per-rank sums
         Scan< Devices::Host, ScanType::Exclusive >::perform( rankSums, 0, nproc, reduction, zero );
         // compute the scan of the per-rank results
         Scan< Devices::Host, ScanType::Exclusive >::perform( rank_results, 0, nproc, reduction, zero );

         // perform second phase: shift by the per-block and per-rank offsets
         // perform the second phase, using the per-block and per-rank results
         const int rank = MPI::GetRank( group );
         Scan< DeviceType, Type >::performSecondPhase( localView, blockShifts, begin, end, reduction, rankSums[ rank ] );
         Scan< DeviceType, Type >::performSecondPhase( localView, block_results, begin, end, reduction, rank_results[ rank ] );
      }
   }
};
+12 −12
Original line number Diff line number Diff line
@@ -134,7 +134,7 @@ struct Scan< Devices::Sequential, Type >
            const typename Vector::IndexType begin,
            const typename Vector::IndexType end,
            const Reduction& reduction,
            const typename Vector::RealType zero );
            const typename Vector::ValueType zero );

   template< typename Vector,
             typename Reduction >
@@ -143,7 +143,7 @@ struct Scan< Devices::Sequential, Type >
                      const typename Vector::IndexType begin,
                      const typename Vector::IndexType end,
                      const Reduction& reduction,
                      const typename Vector::RealType zero );
                      const typename Vector::ValueType zero );

   template< typename Vector,
             typename BlockShifts,
@@ -154,7 +154,7 @@ struct Scan< Devices::Sequential, Type >
                       const typename Vector::IndexType begin,
                       const typename Vector::IndexType end,
                       const Reduction& reduction,
                       const typename Vector::RealType shift );
                       const typename Vector::ValueType shift );
};

template< ScanType Type >
@@ -194,7 +194,7 @@ struct Scan< Devices::Host, Type >
            const typename Vector::IndexType begin,
            const typename Vector::IndexType end,
            const Reduction& reduction,
            const typename Vector::RealType zero );
            const typename Vector::ValueType zero );

   template< typename Vector,
             typename Reduction >
@@ -203,7 +203,7 @@ struct Scan< Devices::Host, Type >
                      const typename Vector::IndexType begin,
                      const typename Vector::IndexType end,
                      const Reduction& reduction,
                      const typename Vector::RealType zero );
                      const typename Vector::ValueType zero );

   template< typename Vector,
             typename BlockShifts,
@@ -214,7 +214,7 @@ struct Scan< Devices::Host, Type >
                       const typename Vector::IndexType begin,
                       const typename Vector::IndexType end,
                       const Reduction& reduction,
                       const typename Vector::RealType shift );
                       const typename Vector::ValueType shift );
};

template< ScanType Type >
@@ -254,7 +254,7 @@ struct Scan< Devices::Cuda, Type >
            const typename Vector::IndexType begin,
            const typename Vector::IndexType end,
            const Reduction& reduction,
            const typename Vector::RealType zero );
            const typename Vector::ValueType zero );

   template< typename Vector,
             typename Reduction >
@@ -263,7 +263,7 @@ struct Scan< Devices::Cuda, Type >
                      const typename Vector::IndexType begin,
                      const typename Vector::IndexType end,
                      const Reduction& reduction,
                      const typename Vector::RealType zero );
                      const typename Vector::ValueType zero );

   template< typename Vector,
             typename BlockShifts,
@@ -274,7 +274,7 @@ struct Scan< Devices::Cuda, Type >
                       const typename Vector::IndexType begin,
                       const typename Vector::IndexType end,
                       const Reduction& reduction,
                       const typename Vector::RealType shift );
                       const typename Vector::ValueType shift );
};

template< ScanType Type >
@@ -318,7 +318,7 @@ struct SegmentedScan< Devices::Sequential, Type >
            const typename Vector::IndexType begin,
            const typename Vector::IndexType end,
            const Reduction& reduction,
            const typename Vector::RealType zero );
            const typename Vector::ValueType zero );
};

template< ScanType Type >
@@ -362,7 +362,7 @@ struct SegmentedScan< Devices::Host, Type >
            const typename Vector::IndexType begin,
            const typename Vector::IndexType end,
            const Reduction& reduction,
            const typename Vector::RealType zero );
            const typename Vector::ValueType zero );
};

template< ScanType Type >
@@ -408,7 +408,7 @@ struct SegmentedScan< Devices::Cuda, Type >
            const typename Vector::IndexType begin,
            const typename Vector::IndexType end,
            const Reduction& reduction,
            const typename Vector::RealType zero );
            const typename Vector::ValueType zero );
};

} // namespace Algorithms
+53 −55
Original line number Diff line number Diff line
@@ -33,9 +33,9 @@ perform( Vector& v,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType end,
         const Reduction& reduction,
         const typename Vector::RealType zero )
         const typename Vector::ValueType zero )
{
   // sequential prefix-sum does not need a second phase
   // sequential scan does not need a second phase
   performFirstPhase( v, begin, end, reduction, zero );
}

@@ -48,33 +48,31 @@ performFirstPhase( Vector& v,
                   const typename Vector::IndexType begin,
                   const typename Vector::IndexType end,
                   const Reduction& reduction,
                   const typename Vector::RealType zero )
                   const typename Vector::ValueType zero )
{
   using RealType = typename Vector::RealType;
   using ValueType = typename Vector::ValueType;
   using IndexType = typename Vector::IndexType;

   // FIXME: StaticArray does not have getElement() which is used in DistributedScan
//   return Containers::StaticArray< 1, RealType > block_sums;
   Containers::Array< RealType, Devices::Host > block_sums( 1 );
   block_sums[ 0 ] = zero;

   if( Type == ScanType::Inclusive ) {
      for( IndexType i = begin + 1; i < end; i++ )
         v[ i ] = reduction( v[ i ], v[ i - 1 ] );
      block_sums[ 0 ] = v[ end - 1 ];
   }
   else // Exclusive prefix sum
   else // Exclusive scan
   {
      RealType aux = zero;
      ValueType aux = zero;
      for( IndexType i = begin; i < end; i++ ) {
         const RealType x = v[ i ];
         const ValueType x = v[ i ];
         v[ i ] = aux;
         aux = reduction( aux, x );
      }
      block_sums[ 0 ] = aux;
   }

   return block_sums;
   // sequential scan = one block, so the exclusive scan is trivially [zero]
   // FIXME: StaticArray does not have getElement() which is used in DistributedScan
//   Containers::StaticArray< 1, ValueType > block_results;
   Containers::Array< ValueType, Devices::Host > block_results( 1 );
   block_results[ 0 ] = zero;
   return block_results;
}

template< ScanType Type >
@@ -88,7 +86,7 @@ performSecondPhase( Vector& v,
                    const typename Vector::IndexType begin,
                    const typename Vector::IndexType end,
                    const Reduction& reduction,
                    const typename Vector::RealType shift )
                    const typename Vector::ValueType shift )
{
   using IndexType = typename Vector::IndexType;

@@ -105,7 +103,7 @@ perform( Vector& v,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType end,
         const Reduction& reduction,
         const typename Vector::RealType zero )
         const typename Vector::ValueType zero )
{
#ifdef HAVE_OPENMP
   if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() >= 2 ) {
@@ -128,50 +126,50 @@ performFirstPhase( Vector& v,
                   const typename Vector::IndexType begin,
                   const typename Vector::IndexType end,
                   const Reduction& reduction,
                   const typename Vector::RealType zero )
                   const typename Vector::ValueType zero )
{
#ifdef HAVE_OPENMP
   using RealType = typename Vector::RealType;
   using ValueType = typename Vector::ValueType;
   using IndexType = typename Vector::IndexType;

   const int threads = Devices::Host::getMaxThreadsCount();
   Containers::Array< RealType > block_sums( threads + 1 );
   block_sums[ 0 ] = zero;
   Containers::Array< ValueType > block_results( threads + 1 );

   #pragma omp parallel num_threads(threads)
   {
      // init
      const int thread_idx = omp_get_thread_num();
      RealType block_sum = zero;
      ValueType block_result = zero;

      // perform prefix-sum on blocks statically assigned to threads
      // perform scan on blocks statically assigned to threads
      if( Type == ScanType::Inclusive ) {
         #pragma omp for schedule(static)
         for( IndexType i = begin; i < end; i++ ) {
            block_sum = reduction( block_sum, v[ i ] );
            v[ i ] = block_sum;
            block_result = reduction( block_result, v[ i ] );
            v[ i ] = block_result;
         }
      }
      else {
         #pragma omp for schedule(static)
         for( IndexType i = begin; i < end; i++ ) {
            const RealType x = v[ i ];
            v[ i ] = block_sum;
            block_sum = reduction( block_sum, x );
            const ValueType x = v[ i ];
            v[ i ] = block_result;
            block_result = reduction( block_result, x );
         }
      }

      // write the block sums into the buffer
      block_sums[ thread_idx + 1 ] = block_sum;
      // write the block result into the buffer
      block_results[ thread_idx + 1 ] = block_result;
   }

   // block_sums now contains sums of numbers in each block. The first phase
   // ends by computing prefix-sum of this array.
   // block_results now contains scan results for each block. The first phase
   // ends by computing an exclusive scan of this array.
   block_results[ 0 ] = zero;
   for( int i = 1; i < threads + 1; i++ )
      block_sums[ i ] = reduction( block_sums[ i ], block_sums[ i - 1 ] );
      block_results[ i ] = reduction( block_results[ i ], block_results[ i - 1 ] );

   // block_sums now contains shift values for each block - to be used in the second phase
   return block_sums;
   // block_results now contains shift values for each block - to be used in the second phase
   return block_results;
#else
   return Scan< Devices::Sequential, Type >::performFirstPhase( v, begin, end, reduction, zero );
#endif
@@ -188,10 +186,10 @@ performSecondPhase( Vector& v,
                    const typename Vector::IndexType begin,
                    const typename Vector::IndexType end,
                    const Reduction& reduction,
                    const typename Vector::RealType shift )
                    const typename Vector::ValueType shift )
{
#ifdef HAVE_OPENMP
   using RealType = typename Vector::RealType;
   using ValueType = typename Vector::ValueType;
   using IndexType = typename Vector::IndexType;

   const int threads = blockShifts.getSize() - 1;
@@ -200,7 +198,7 @@ performSecondPhase( Vector& v,
   #pragma omp parallel num_threads(threads)
   {
      const int thread_idx = omp_get_thread_num();
      const RealType offset = reduction( blockShifts[ thread_idx ], shift );
      const ValueType offset = reduction( blockShifts[ thread_idx ], shift );

      // shift intermediate results by the offset
      #pragma omp for schedule(static)
@@ -221,13 +219,13 @@ perform( Vector& v,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType end,
         const Reduction& reduction,
         const typename Vector::RealType zero )
         const typename Vector::ValueType zero )
{
#ifdef HAVE_CUDA
   using RealType = typename Vector::RealType;
   using ValueType = typename Vector::ValueType;
   using IndexType = typename Vector::IndexType;

   detail::CudaScanKernelLauncher< Type, RealType, IndexType >::perform(
   detail::CudaScanKernelLauncher< Type, ValueType, IndexType >::perform(
      end - begin,
      &v.getData()[ begin ],  // input
      &v.getData()[ begin ],  // output
@@ -247,13 +245,13 @@ performFirstPhase( Vector& v,
                   const typename Vector::IndexType begin,
                   const typename Vector::IndexType end,
                   const Reduction& reduction,
                   const typename Vector::RealType zero )
                   const typename Vector::ValueType zero )
{
#ifdef HAVE_CUDA
   using RealType = typename Vector::RealType;
   using ValueType = typename Vector::ValueType;
   using IndexType = typename Vector::IndexType;

   return detail::CudaScanKernelLauncher< Type, RealType, IndexType >::performFirstPhase(
   return detail::CudaScanKernelLauncher< Type, ValueType, IndexType >::performFirstPhase(
      end - begin,
      &v.getData()[ begin ],  // input
      &v.getData()[ begin ],  // output
@@ -275,13 +273,13 @@ performSecondPhase( Vector& v,
                    const typename Vector::IndexType begin,
                    const typename Vector::IndexType end,
                    const Reduction& reduction,
                    const typename Vector::RealType shift )
                    const typename Vector::ValueType shift )
{
#ifdef HAVE_CUDA
   using RealType = typename Vector::RealType;
   using ValueType = typename Vector::ValueType;
   using IndexType = typename Vector::IndexType;

   detail::CudaScanKernelLauncher< Type, RealType, IndexType >::performSecondPhase(
   detail::CudaScanKernelLauncher< Type, ValueType, IndexType >::performSecondPhase(
      end - begin,
      &v.getData()[ begin ],  // output
      blockShifts.getData(),
@@ -304,9 +302,9 @@ perform( Vector& v,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType end,
         const Reduction& reduction,
         const typename Vector::RealType zero )
         const typename Vector::ValueType zero )
{
   using RealType = typename Vector::RealType;
   using ValueType = typename Vector::ValueType;
   using IndexType = typename Vector::IndexType;

   if( Type == ScanType::Inclusive )
@@ -315,13 +313,13 @@ perform( Vector& v,
         if( ! flags[ i ] )
            v[ i ] = reduction( v[ i ], v[ i - 1 ] );
   }
   else // Exclusive prefix sum
   else // Exclusive scan
   {
       RealType aux( v[ begin ] );
      ValueType aux( v[ begin ] );
      v[ begin ] = zero;
      for( IndexType i = begin + 1; i < end; i++ )
      {
         RealType x = v[ i ];
         ValueType x = v[ i ];
         if( flags[ i ] )
            aux = zero;
         v[ i ] = aux;
@@ -341,7 +339,7 @@ perform( Vector& v,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType end,
         const Reduction& reduction,
         const typename Vector::RealType zero )
         const typename Vector::ValueType zero )
{
#ifdef HAVE_OPENMP
   // TODO: parallelize with OpenMP
@@ -362,10 +360,10 @@ perform( Vector& v,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType end,
         const Reduction& reduction,
         const typename Vector::RealType zero )
         const typename Vector::ValueType zero )
{
#ifdef HAVE_CUDA
   using RealType = typename Vector::RealType;
   using ValueType = typename Vector::ValueType;
   using IndexType = typename Vector::IndexType;

   throw Exceptions::NotImplementedError( "Segmented scan (prefix sum) is not implemented for CUDA." );
+19 −21
Original line number Diff line number Diff line
@@ -34,7 +34,7 @@ cudaFirstPhaseBlockScan( const ScanType scanType,
                         const int elementsInBlock,
                         const Real* input,
                         Real* output,
                         Real* auxArray )
                         Real* blockResults )
{
   Real* sharedData = TNL::Cuda::getSharedMemory< Real >();
   Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Cuda::getNumberOfSharedMemoryBanks() + 2 ];
@@ -147,11 +147,11 @@ cudaFirstPhaseBlockScan( const ScanType scanType,
   {
      if( scanType == ScanType::Exclusive )
      {
         auxArray[ blockIdx.x ] = reduction( sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ],
         blockResults[ blockIdx.x ] = reduction( sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ],
                                                 sharedData[ Cuda::getInterleaving( lastElementInBlock ) ] );
      }
      else
         auxArray[ blockIdx.x ] = sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ];
         blockResults[ blockIdx.x ] = sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ];
   }
}

@@ -164,12 +164,12 @@ cudaSecondPhaseBlockScan( Reduction reduction,
                          const int elementsInBlock,
                          const Index gridIdx,
                          const Index maxGridSize,
                          const Real* auxArray,
                          const Real* blockResults,
                          Real* data,
                          Real shift )
{
   if( gridIdx > 0 || blockIdx.x > 0 )
      shift = reduction( shift, auxArray[ gridIdx * maxGridSize + blockIdx.x - 1 ] );
      shift = reduction( shift, blockResults[ gridIdx * maxGridSize + blockIdx.x - 1 ] );
   const int readOffset = blockIdx.x * elementsInBlock;
   int readIdx = threadIdx.x;
   while( readIdx < elementsInBlock && readOffset + readIdx < size )
@@ -248,9 +248,9 @@ struct CudaScanKernelLauncher
      const Index numberOfGrids = Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );
      //std::cerr << "numberOfgrids =  " << numberOfGrids << std::endl;

      // allocate array for the block sums
      Containers::Array< Real, Devices::Cuda > blockSums;
      blockSums.setSize( numberOfBlocks );
      // allocate array for the block results
      Containers::Array< Real, Devices::Cuda > blockResults;
      blockResults.setSize( numberOfBlocks );

      // loop over all grids
      for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) {
@@ -278,20 +278,20 @@ struct CudaScanKernelLauncher
              elementsInBlock,
              &deviceInput[ gridOffset ],
              &deviceOutput[ gridOffset ],
              &blockSums.getData()[ gridIdx * maxGridSize() ] );
              &blockResults.getData()[ gridIdx * maxGridSize() ] );
      }

      // synchronize the null-stream after all grids
      cudaStreamSynchronize(0);
      TNL_CHECK_CUDA_DEVICE;

      // blockSums now contains sums of numbers in each block. The first phase
      // ends by computing prefix-sum of this array.
      // blockResults now contains scan results for each block. The first phase
      // ends by computing an exclusive scan of this array.
      if( numberOfBlocks > 1 ) {
         CudaScanKernelLauncher< ScanType::Inclusive, Real, Index >::perform(
            blockSums.getSize(),
            blockSums.getData(),
            blockSums.getData(),
            blockResults.getSize(),
            blockResults.getData(),
            blockResults.getData(),
            reduction,
            zero,
            blockSize );
@@ -301,8 +301,8 @@ struct CudaScanKernelLauncher
      // to check if we test the algorithm with more than one CUDA grid.
      gridsCount() = numberOfGrids;

      // blockSums now contains shift values for each block - to be used in the second phase
      return blockSums;
      // blockResults now contains shift values for each block - to be used in the second phase
      return blockResults;
   }

   /****
@@ -363,10 +363,8 @@ struct CudaScanKernelLauncher
      TNL_CHECK_CUDA_DEVICE;
   }

   /****
    * The following serves for setting smaller maxGridSize so that we can force
    * the prefix sum in CUDA to run with more the one grids in unit tests.
    */
   // The following serves for setting smaller maxGridSize so that we can force
   // the scan in CUDA to run with more than one grid in unit tests.
   static int& maxGridSize()
   {
      static int maxGridSize = Cuda::getMaxGridSize();