Commit 0d329226 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Reduction: renamed zero and idempotent to identity

parent 9ce0b169
Loading
Loading
Loading
Loading
+12 −6
Original line number Diff line number Diff line
@@ -29,7 +29,9 @@ struct Multireduction< Devices::Sequential >
{
   /**
    * Parameters:
    *    zero: starting value for reduction
    *    identity: the [identity element](https://en.wikipedia.org/wiki/Identity_element)
    *              for the reduction operation, i.e. element which does not
    *              change the result of the reduction
    *    dataFetcher: callable object such that `dataFetcher( i, j )` yields
    *                 the i-th value to be reduced from the j-th dataset
    *                 (i = 0,...,size-1; j = 0,...,n-1)
@@ -45,7 +47,7 @@ struct Multireduction< Devices::Sequential >
             typename Reduction,
             typename Index >
   static constexpr void
   reduce( const Result zero,
   reduce( const Result identity,
           DataFetcher dataFetcher,
           const Reduction reduction,
           const Index size,
@@ -58,7 +60,9 @@ struct Multireduction< Devices::Host >
{
   /**
    * Parameters:
    *    zero: starting value for reduction
    *    identity: the [identity element](https://en.wikipedia.org/wiki/Identity_element)
    *              for the reduction operation, i.e. element which does not
    *              change the result of the reduction
    *    dataFetcher: callable object such that `dataFetcher( i, j )` yields
    *                 the i-th value to be reduced from the j-th dataset
    *                 (i = 0,...,size-1; j = 0,...,n-1)
@@ -74,7 +78,7 @@ struct Multireduction< Devices::Host >
             typename Reduction,
             typename Index >
   static void
   reduce( const Result zero,
   reduce( const Result identity,
           DataFetcher dataFetcher,
           const Reduction reduction,
           const Index size,
@@ -87,7 +91,9 @@ struct Multireduction< Devices::Cuda >
{
   /**
    * Parameters:
    *    zero: starting value for reduction
    *    identity: the [identity element](https://en.wikipedia.org/wiki/Identity_element)
    *              for the reduction operation, i.e. element which does not
    *              change the result of the reduction
    *    dataFetcher: callable object such that `dataFetcher( i, j )` yields
    *                 the i-th value to be reduced from the j-th dataset
    *                 (i = 0,...,size-1; j = 0,...,n-1)
@@ -103,7 +109,7 @@ struct Multireduction< Devices::Cuda >
             typename Reduction,
             typename Index >
   static void
   reduce( const Result zero,
   reduce( const Result identity,
           DataFetcher dataFetcher,
           const Reduction reduction,
           const Index size,
+10 −10
Original line number Diff line number Diff line
@@ -35,7 +35,7 @@ template< typename Result,
          typename Index >
void constexpr
Multireduction< Devices::Sequential >::
reduce( const Result zero,
reduce( const Result identity,
        DataFetcher dataFetcher,
        const Reduction reduction,
        const Index size,
@@ -53,7 +53,7 @@ reduce( const Result zero,
      // (it is accessed as a row-major matrix with n rows and 4 columns)
      Result r[ n * 4 ];
      for( int k = 0; k < n * 4; k++ )
         r[ k ] = zero;
         r[ k ] = identity;

      // main reduction (explicitly unrolled loop)
      for( int b = 0; b < blocks; b++ ) {
@@ -89,7 +89,7 @@ reduce( const Result zero,
   }
   else {
      for( int k = 0; k < n; k++ )
         result[ k ] = zero;
         result[ k ] = identity;

      for( int b = 0; b < blocks; b++ ) {
         const Index offset = b * block_size;
@@ -112,7 +112,7 @@ template< typename Result,
          typename Index >
void
Multireduction< Devices::Host >::
reduce( const Result zero,
reduce( const Result identity,
        DataFetcher dataFetcher,
        const Reduction reduction,
        const Index size,
@@ -134,14 +134,14 @@ reduce( const Result zero,
         #pragma omp single nowait
         {
            for( int k = 0; k < n; k++ )
               result[ k ] = zero;
               result[ k ] = identity;
         }

         // initialize array for thread-local results
         // (it is accessed as a row-major matrix with n rows and 4 columns)
         Result r[ n * 4 ];
         for( int k = 0; k < n * 4; k++ )
            r[ k ] = zero;
            r[ k ] = identity;

         #pragma omp for nowait
         for( int b = 0; b < blocks; b++ ) {
@@ -185,7 +185,7 @@ reduce( const Result zero,
   }
   else
#endif
      Multireduction< Devices::Sequential >::reduce( zero, dataFetcher, reduction, size, n, result );
      Multireduction< Devices::Sequential >::reduce( identity, dataFetcher, reduction, size, n, result );
}

template< typename Result,
@@ -194,7 +194,7 @@ template< typename Result,
          typename Index >
void
Multireduction< Devices::Cuda >::
reduce( const Result zero,
reduce( const Result identity,
        DataFetcher dataFetcher,
        const Reduction reduction,
        const Index size,
@@ -212,7 +212,7 @@ reduce( const Result zero,

   // start the reduction on the GPU
   Result* deviceAux1 = nullptr;
   const int reducedSize = detail::CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, size, n, deviceAux1 );
   const int reducedSize = detail::CudaMultireductionKernelLauncher( identity, dataFetcher, reduction, size, n, deviceAux1 );

   #ifdef CUDA_REDUCTION_PROFILING
      timer.stop();
@@ -234,7 +234,7 @@ reduce( const Result zero,

   // finish the reduction on the host
   auto dataFetcherFinish = [&] ( int i, int k ) { return resultArray[ i + k * reducedSize ]; };
   Multireduction< Devices::Sequential >::reduce( zero, dataFetcherFinish, reduction, reducedSize, n, hostResult );
   Multireduction< Devices::Sequential >::reduce( identity, dataFetcherFinish, reduction, reducedSize, n, hostResult );

   #ifdef CUDA_REDUCTION_PROFILING
      timer.stop();
+12 −9
Original line number Diff line number Diff line
@@ -81,8 +81,9 @@ struct SegmentedScan< Devices::Sequential, Type >
    * \param begin the first element in the array to be scanned
    * \param end the last element in the array to be scanned
    * \param reduction lambda function implementing the reduction operation
    * \param zero is the idempotent element for the reduction operation, i.e. element which
    *             does not change the result of the reduction.
    * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
    *                 for the reduction operation, i.e. element which does not
    *                 change the result of the reduction.
    *
    * The reduction lambda function takes two variables which are supposed to be reduced:
    *
@@ -107,7 +108,7 @@ struct SegmentedScan< Devices::Sequential, Type >
            const typename Vector::IndexType begin,
            const typename Vector::IndexType end,
            const Reduction& reduction,
            const typename Vector::ValueType zero );
            const typename Vector::ValueType identity );
};

template< detail::ScanType Type >
@@ -125,8 +126,9 @@ struct SegmentedScan< Devices::Host, Type >
    * \param begin the first element in the array to be scanned
    * \param end the last element in the array to be scanned
    * \param reduction lambda function implementing the reduction operation
    * \param zero is the idempotent element for the reduction operation, i.e. element which
    *             does not change the result of the reduction.
    * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
    *                 for the reduction operation, i.e. element which does not
    *                 change the result of the reduction.
    *
    * The reduction lambda function takes two variables which are supposed to be reduced:
    *
@@ -151,7 +153,7 @@ struct SegmentedScan< Devices::Host, Type >
            const typename Vector::IndexType begin,
            const typename Vector::IndexType end,
            const Reduction& reduction,
            const typename Vector::ValueType zero );
            const typename Vector::ValueType identity );
};

template< detail::ScanType Type >
@@ -169,8 +171,9 @@ struct SegmentedScan< Devices::Cuda, Type >
    * \param begin the first element in the array to be scanned
    * \param end the last element in the array to be scanned
    * \param reduction lambda function implementing the reduction operation
    * \param zero is the idempotent element for the reduction operation, i.e. element which
    *             does not change the result of the reduction.
    * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
    *                 for the reduction operation, i.e. element which does not
    *                 change the result of the reduction.
    *
    * The reduction lambda function takes two variables which are supposed to be reduced:
    *
@@ -197,7 +200,7 @@ struct SegmentedScan< Devices::Cuda, Type >
            const typename Vector::IndexType begin,
            const typename Vector::IndexType end,
            const Reduction& reduction,
            const typename Vector::ValueType zero );
            const typename Vector::ValueType identity );
};

} // namespace Algorithms
+7 −7
Original line number Diff line number Diff line
@@ -30,7 +30,7 @@ perform( Vector& v,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType end,
         const Reduction& reduction,
         const typename Vector::ValueType zero )
         const typename Vector::ValueType identity )
{
   using ValueType = typename Vector::ValueType;
   using IndexType = typename Vector::IndexType;
@@ -44,12 +44,12 @@ perform( Vector& v,
   else // Exclusive scan
   {
      ValueType aux( v[ begin ] );
      v[ begin ] = zero;
      v[ begin ] = identity;
      for( IndexType i = begin + 1; i < end; i++ )
      {
         ValueType x = v[ i ];
         if( flags[ i ] )
            aux = zero;
            aux = identity;
         v[ i ] = aux;
         aux = reduction( aux, x );
      }
@@ -67,13 +67,13 @@ perform( Vector& v,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType end,
         const Reduction& reduction,
         const typename Vector::ValueType zero )
         const typename Vector::ValueType identity )
{
#ifdef HAVE_OPENMP
   // TODO: parallelize with OpenMP
   SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, zero );
   SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, identity );
#else
   SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, zero );
   SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, identity );
#endif
}

@@ -88,7 +88,7 @@ perform( Vector& v,
         const typename Vector::IndexType begin,
         const typename Vector::IndexType end,
         const Reduction& reduction,
         const typename Vector::ValueType zero )
         const typename Vector::ValueType identity )
{
#ifdef HAVE_CUDA
   using ValueType = typename Vector::ValueType;
+12 −12
Original line number Diff line number Diff line
@@ -47,7 +47,7 @@ template< int blockSizeX,
          typename Index >
__global__ void
__launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor )
CudaMultireductionKernel( const Result zero,
CudaMultireductionKernel( const Result identity,
                          DataFetcher dataFetcher,
                          const Reduction reduction,
                          const Index size,
@@ -65,7 +65,7 @@ CudaMultireductionKernel( const Result zero,
   const int y = blockIdx.y * blockDim.y + threadIdx.y;
   if( y >= n ) return;

   sdata[ tid ] = zero;
   sdata[ tid ] = identity;

   // Start with the sequential reduction and push the result into the shared memory.
   while( gid + 4 * gridSizeX < size ) {
@@ -145,7 +145,7 @@ template< typename Result,
          typename Reduction,
          typename Index >
int
CudaMultireductionKernelLauncher( const Result zero,
CudaMultireductionKernelLauncher( const Result identity,
                                  DataFetcher dataFetcher,
                                  const Reduction reduction,
                                  const Index size,
@@ -217,55 +217,55 @@ CudaMultireductionKernelLauncher( const Result zero,
   {
      case 512:
         CudaMultireductionKernel< 512 >
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
         break;
      case 256:
         cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

         CudaMultireductionKernel< 256 >
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
         break;
      case 128:
         cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

         CudaMultireductionKernel< 128 >
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
         break;
      case  64:
         cudaFuncSetCacheConfig(CudaMultireductionKernel<  64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

         CudaMultireductionKernel<  64 >
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
         break;
      case  32:
         cudaFuncSetCacheConfig(CudaMultireductionKernel<  32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

         CudaMultireductionKernel<  32 >
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
         break;
      case  16:
         cudaFuncSetCacheConfig(CudaMultireductionKernel<  16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

         CudaMultireductionKernel<  16 >
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
         break;
     case   8:
         cudaFuncSetCacheConfig(CudaMultireductionKernel<   8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

         CudaMultireductionKernel<   8 >
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
         break;
      case   4:
         cudaFuncSetCacheConfig(CudaMultireductionKernel<   4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

         CudaMultireductionKernel<   4 >
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
        break;
      case   2:
         cudaFuncSetCacheConfig(CudaMultireductionKernel<   2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

         CudaMultireductionKernel<   2 >
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
         break;
      case   1:
         throw std::logic_error( "blockSize should not be 1." );
Loading