Commit 99e854a7 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Fixed parallel prefix sum

parent 73744cf5
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -608,7 +608,7 @@ computePrefixSum( Vector& v,
                                   &v.getData()[ begin ],
                                   &v.getData()[ begin ],
                                   operation,
                                   Algorithms::inclusivePrefixSum );
                                   Algorithms::PrefixSumType::inclusive );
#else
   throw Exceptions::CudaSupportMissing();
#endif
@@ -633,7 +633,7 @@ computeExclusivePrefixSum( Vector& v,
                                   &v.getData()[ begin ],
                                   &v.getData()[ begin ],
                                   operation,
                                   Algorithms::exclusivePrefixSum );
                                   Algorithms::PrefixSumType::exclusive );
#endif
}

+2 −0
Original line number Diff line number Diff line
@@ -627,6 +627,7 @@ computePrefixSum( Vector& v,
{
   typedef typename Vector::IndexType Index;

   // TODO: parallelize with OpenMP
   for( Index i = begin + 1; i < end; i++ )
      v[ i ] += v[ i - 1 ];
}
@@ -641,6 +642,7 @@ computeExclusivePrefixSum( Vector& v,
   typedef typename Vector::IndexType Index;
   typedef typename Vector::RealType Real;

   // TODO: parallelize with OpenMP
   Real aux( v[ begin ] );
   v[ begin ] = 0.0;
   for( Index i = begin + 1; i < end; i++ )
+6 −3
Original line number Diff line number Diff line
@@ -14,8 +14,11 @@ namespace TNL {
namespace Containers {
namespace Algorithms {
   
enum enumPrefixSumType { exclusivePrefixSum = 0,
                         inclusivePrefixSum };
// Kind of prefix-sum (scan) to perform.
enum class PrefixSumType
{
   exclusive,  // output[i] covers input[0..i-1]; position 0 gets the operation's initial value (see the exclusive branch in the first-phase kernel)
   inclusive   // output[i] covers input[0..i]
};

template< typename DataType,
          typename Operation,
@@ -25,7 +28,7 @@ bool cudaPrefixSum( const Index size,
                    const DataType *deviceInput,
                    DataType* deviceOutput,
                    const Operation& operation,
                    const enumPrefixSumType prefixSumType = inclusivePrefixSum );
                    const PrefixSumType prefixSumType = PrefixSumType::inclusive );

} // namespace Algorithms
} // namespace Containers
+146 −171
Original line number Diff line number Diff line
@@ -12,6 +12,7 @@
   
#include <iostream>

#include <TNL/Math.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Exceptions/CudaBadAlloc.h>
#include <TNL/Containers/Algorithms/reduction-operations.h>
@@ -25,7 +26,8 @@ namespace Algorithms {
template< typename DataType,
          typename Operation,
          typename Index >
__global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumType,
__global__ void
cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType,
                              Operation operation,
                              const Index size,
                              const Index elementsInBlock,
@@ -46,7 +48,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
    */
   const Index blockOffset = blockIdx.x * elementsInBlock;
   Index idx = threadIdx.x;
   if( prefixSumType == exclusivePrefixSum )
   if( prefixSumType == PrefixSumType::exclusive )
   {
      if( idx == 0 )
         sharedData[ 0 ] = operation.initialValue();
@@ -69,8 +71,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
   __syncthreads();
   const int chunkSize = elementsInBlock / blockDim.x;
   const int chunkOffset = threadIdx.x * chunkSize;
   const int numberOfChunks = lastElementInBlock / chunkSize +
                            ( lastElementInBlock % chunkSize != 0 );
   const int numberOfChunks = roundUpDivision( lastElementInBlock, chunkSize );

   if( chunkOffset < lastElementInBlock )
   {
@@ -136,7 +137,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT

   if( threadIdx.x == 0 )
   {
      if( prefixSumType == exclusivePrefixSum )
      if( prefixSumType == PrefixSumType::exclusive )
      {
         /*auxArray[ blockIdx.x ] = operation.commonReductionOnDevice( Devices::Cuda::getInterleaving( lastElementInBlock - 1 ),
                                                                      Devices::Cuda::getInterleaving( lastElementInBlock ),
@@ -149,13 +150,13 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
      else
         auxArray[ blockIdx.x ] = sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ];
   }

}

template< typename DataType,
          typename Operation,
          typename Index >
__global__ void cudaSecondPhaseBlockPrefixSum( Operation operation,
__global__ void
cudaSecondPhaseBlockPrefixSum( Operation operation,
                               const Index size,
                               const Index elementsInBlock,
                               const Index gridShift,
@@ -181,7 +182,8 @@ __global__ void cudaSecondPhaseBlockPrefixSum( Operation operation,
template< typename DataType,
          typename Operation,
          typename Index >
bool cudaRecursivePrefixSum( const enumPrefixSumType prefixSumType,
void
cudaRecursivePrefixSum( const PrefixSumType prefixSumType,
                        Operation& operation,
                        const Index size,
                        const Index blockSize,
@@ -190,75 +192,59 @@ bool cudaRecursivePrefixSum( const enumPrefixSumType prefixSumType,
                        const DataType* input,
                        DataType *output )
{
   const Index numberOfBlocks = ceil( ( double ) size / ( double ) elementsInBlock );
   const Index numberOfBlocks = roundUpDivision( size, elementsInBlock );
   const Index auxArraySize = numberOfBlocks * sizeof( DataType );
   DataType *auxArray1, *auxArray2;

   if( cudaMalloc( ( void** ) &auxArray1, auxArraySize ) != cudaSuccess ||
       cudaMalloc( ( void** ) &auxArray2, auxArraySize ) != cudaSuccess  )
      throw Exceptions::CudaBadAlloc();
   Array< DataType, Devices::Cuda > auxArray1, auxArray2;
   auxArray1.setSize( auxArraySize );
   auxArray2.setSize( auxArraySize );

   /****
    * Setup block and grid size.
    */
   dim3 cudaBlockSize( 0 ), cudaGridSize( 0 );
   cudaBlockSize.x = blockSize;
   cudaGridSize. x = size / elementsInBlock +
                     ( size % elementsInBlock != 0 );
   cudaGridSize.x = roundUpDivision( size, elementsInBlock );

   /****
    * Run the kernel.
    */
   size_t sharedDataSize = elementsInBlock +
   const std::size_t sharedDataSize = elementsInBlock +
                                      elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2;
   size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize()  ) * sizeof( DataType );
   cudaFirstPhaseBlockPrefixSum< DataType, Operation, Index >
                                <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
   const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize()  ) * sizeof( DataType );
   cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>>
      ( prefixSumType,
        operation,
        size,
        elementsInBlock,
        input,
        output,
                                   auxArray1 );
   if( ! TNL_CHECK_CUDA_DEVICE )
   {
      std::cerr << "The CUDA kernel 'cudaFirstPhaseBlockPrefixSum' ended with error." << std::endl;
      cudaFree( auxArray1 );
      cudaFree( auxArray2 );
      return false;
   }
        auxArray1.getData() );
   TNL_CHECK_CUDA_DEVICE;

   /***
    * In auxArray1 there is now a sum of numbers in each block.
    * We must compute prefix-sum of auxArray1 and then shift
    * each block.
    */
   if( numberOfBlocks > 1 &&
       ! cudaRecursivePrefixSum< DataType, Operation, Index >
                               ( inclusivePrefixSum,
   if( numberOfBlocks > 1 )
       cudaRecursivePrefixSum( PrefixSumType::inclusive,
                               operation,
                               numberOfBlocks,
                               blockSize,
                               elementsInBlock,
                                 0,
                                 auxArray1,
                                 auxArray2 ) )
      return false;
   cudaSecondPhaseBlockPrefixSum< DataType, Operation, Index >
                                <<< cudaGridSize, cudaBlockSize >>>
                                 ( operation, size, elementsInBlock, gridShift, auxArray2, output );

   if( ! TNL_CHECK_CUDA_DEVICE )
   {
      std::cerr << "The CUDA kernel 'cudaSecondPhaseBlockPrefixSum' ended with error." << std::endl;
      cudaFree( auxArray1 );
      cudaFree( auxArray2 );
      return false;
   }
   cudaFree( auxArray1 );
   cudaFree( auxArray2 );
   return true;
                               (Index) 0,
                               auxArray1.getData(),
                               auxArray2.getData() );

   cudaSecondPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize >>>
      ( operation,
        size,
        elementsInBlock,
        gridShift,
        auxArray2.getData(),
        output );
   TNL_CHECK_CUDA_DEVICE;
}


@@ -266,7 +252,8 @@ bool cudaRecursivePrefixSum( const enumPrefixSumType prefixSumType,
template< typename DataType,
          typename Operation,
          typename Index >
bool cudaGridPrefixSum( enumPrefixSumType prefixSumType,
void
cudaGridPrefixSum( PrefixSumType prefixSumType,
                   Operation& operation,
                   const Index size,
                   const Index blockSize,
@@ -275,76 +262,64 @@ bool cudaGridPrefixSum( enumPrefixSumType prefixSumType,
                   DataType *deviceOutput,
                   Index& gridShift )
{

   if( ! cudaRecursivePrefixSum< DataType, Operation, Index >
                               ( prefixSumType,
   cudaRecursivePrefixSum( prefixSumType,
                           operation,
                           size,
                           blockSize,
                           elementsInBlock,
                           gridShift,
                           deviceInput,
                                 deviceOutput ) )
      return false;
   if( cudaMemcpy( &gridShift,
                           deviceOutput );

   cudaMemcpy( &gridShift,
               &deviceOutput[ size - 1 ],
               sizeof( DataType ),
                   cudaMemcpyDeviceToHost ) != cudaSuccess )
   {
      std::cerr << "I am not able to copy data from device to host." << std::endl;
      return false;
   }
   return true;
               cudaMemcpyDeviceToHost );
   TNL_CHECK_CUDA_DEVICE;
}

template< typename DataType,
          typename Operation,
          typename Index >
bool cudaPrefixSum( const Index size,
void
cudaPrefixSum( const Index size,
               const Index blockSize,
               const DataType *deviceInput,
               DataType* deviceOutput,
               Operation& operation,
                    const enumPrefixSumType prefixSumType )
               const PrefixSumType prefixSumType )
{
   /****
    * Compute the number of grids
    */
   const Index elementsInBlock = 8 * blockSize;
   const Index gridSize = size / elementsInBlock + ( size % elementsInBlock != 0 );
   const Index maxGridSize = 65536;
   const Index gridsNumber = gridSize / maxGridSize + ( gridSize % maxGridSize != 0 );
   const Index numberOfBlocks = roundUpDivision( size, elementsInBlock );
   const auto maxGridSize = Devices::Cuda::getMaxGridSize();
   const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize );

   /****
    * Loop over all grids.
    */
   Index gridShift( 0 );
   for( Index gridIdx = 0; gridIdx < gridsNumber; gridIdx ++ )
   Index gridShift = 0;
   for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ )
   {
      /****
       * Compute current grid size and size of data to be scanned
       */
      Index gridSize = ( size - gridIdx * maxGridSize * elementsInBlock ) /
                     elementsInBlock;
      Index currentSize = size - gridIdx * maxGridSize * elementsInBlock;
      if( gridSize > maxGridSize )
      {
         gridSize = maxGridSize;
      if( currentSize / elementsInBlock > maxGridSize )
         currentSize = maxGridSize * elementsInBlock;
      }
      Index gridOffset = gridIdx * maxGridSize * elementsInBlock;
      if( ! cudaGridPrefixSum< DataType, Operation, Index >
                             ( prefixSumType,
      const Index gridOffset = gridIdx * maxGridSize * elementsInBlock;

      cudaGridPrefixSum( prefixSumType,
                         operation,
                         currentSize,
                         blockSize,
                         elementsInBlock,
                         &deviceInput[ gridOffset ],
                         &deviceOutput[ gridOffset ],
                               gridShift ) )
         return false;
                         gridShift );
   }
   return true;
}

#ifdef TEMPLATE_EXPLICIT_INSTANTIATION
@@ -353,7 +328,7 @@ extern template bool cudaPrefixSum( const int size,
                                    const int *deviceInput,
                                    int* deviceOutput,
                                    tnlParallelReductionSum< int, int >& operation,
                                    const enumPrefixSumType prefixSumType );
                                    const PrefixSumType prefixSumType );


extern template bool cudaPrefixSum( const int size,
@@ -361,14 +336,14 @@ extern template bool cudaPrefixSum( const int size,
                                    const float *deviceInput,
                                    float* deviceOutput,
                                    tnlParallelReductionSum< float, int >& operation,
                                    const enumPrefixSumType prefixSumType );
                                    const PrefixSumType prefixSumType );

extern template bool cudaPrefixSum( const int size,
                                    const int blockSize,
                                    const double *deviceInput,
                                    double* deviceOutput,
                                    tnlParallelReductionSum< double, int >& operation,
                                    const enumPrefixSumType prefixSumType );
                                    const PrefixSumType prefixSumType );

#ifdef INSTANTIATE_LONG_DOUBLE
extern template bool cudaPrefixSum( const int size,
@@ -376,7 +351,7 @@ extern template bool cudaPrefixSum( const int size,
                                    const long double *deviceInput,
                                    long double* deviceOutput,
                                    tnlParallelReductionSum< long double, int >& operation,
                                    const enumPrefixSumType prefixSumType );
                                    const PrefixSumType prefixSumType );
#endif

#ifdef INSTANTIATE_LONG_INT
@@ -385,7 +360,7 @@ extern template bool cudaPrefixSum( const long int size,
                                    const int *deviceInput,
                                    int* deviceOutput,
                                    tnlParallelReductionSum< int, long int >& operation,
                                    const enumPrefixSumType prefixSumType );
                                    const PrefixSumType prefixSumType );


extern template bool cudaPrefixSum( const long int size,
@@ -393,14 +368,14 @@ extern template bool cudaPrefixSum( const long int size,
                                    const float *deviceInput,
                                    float* deviceOutput,
                                    tnlParallelReductionSum< float, long int >& operation,
                                    const enumPrefixSumType prefixSumType );
                                    const PrefixSumType prefixSumType );

extern template bool cudaPrefixSum( const long int size,
                                    const long int blockSize,
                                    const double *deviceInput,
                                    double* deviceOutput,
                                    tnlParallelReductionSum< double, long int >& operation,
                                    const enumPrefixSumType prefixSumType );
                                    const PrefixSumType prefixSumType );

#ifdef INSTANTIATE_LONG_DOUBLE
extern template bool cudaPrefixSum( const long int size,
@@ -408,7 +383,7 @@ extern template bool cudaPrefixSum( const long int size,
                                    const long double *deviceInput,
                                    long double* deviceOutput,
                                    tnlParallelReductionSum< long double, long int >& operation,
                                    const enumPrefixSumType prefixSumType );
                                    const PrefixSumType prefixSumType );
#endif
#endif