Commit 11898c70 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Implementing the prefix-sum in CUDA.

parent b2b2a5a4
Loading
Loading
Loading
Loading
+4 −3
Original line number Diff line number Diff line
@@ -21,7 +21,7 @@
enum enumPrefixSumType { exclusivePrefixSum = 0,
                         inclusivePrefixSum };

template< typename DataType >
/*template< typename DataType >
class operationSum
{
   public:
@@ -58,16 +58,17 @@ class operationSum
   };
#endif   

};
};*/


template< typename DataType,
          template< typename T > class Operation,
          typename Operation,
          typename Index >
bool cudaPrefixSum( const Index size,
                    const Index blockSize,
                    const DataType *deviceInput,
                    DataType* deviceOutput,
                    const Operation& operation,
                    const enumPrefixSumType prefixSumType = inclusivePrefixSum );


+19 −0
Original line number Diff line number Diff line
@@ -176,6 +176,25 @@ class tnlParallelReductionSum
   {
      return data[ idx1 ] + data[ idx2 ];
   };

   __device__ ResultType commonReductionOnDevice( const ResultType& a,
                                                  const ResultType& b ) const
   {
      return a + b;
   };


   __device__ RealType identity() const
   {
      return 0;
   };

   __device__ void performInPlace( ResultType& a,
                                   const ResultType& b )
   {
      a += b;
   }

#endif
};

+13 −11
Original line number Diff line number Diff line
@@ -47,7 +47,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
   if( prefixSumType == exclusivePrefixSum )
   {
      if( idx == 0 )
         sharedData[ 0 ] = operation. cudaIdentity();
         sharedData[ 0 ] = operation.identity();
      while( idx < elementsInBlock && blockOffset + idx < size )
      {
         sharedData[ tnlCuda::getInterleaving( idx + 1 ) ] = input[ blockOffset + idx ];
@@ -80,7 +80,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
   while( chunkPointer < chunkSize &&
          chunkOffset + chunkPointer < lastElementInBlock )
   {
      operation. cudaPerformInPlace( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ],
      operation.performInPlace( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ],
                                sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] );
      auxData[ threadIdx. x ] =
         sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer  ) ];
@@ -94,7 +94,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
   const int warpIdx = threadIdx. x / tnlCuda::getWarpSize();
   for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 )
      if( threadInWarpIdx >= stride && threadIdx. x < numberOfChunks )
         operation. cudaPerformInPlace( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] );
         operation.performInPlace( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] );

   if( threadInWarpIdx == tnlCuda::getWarpSize() - 1 )
      warpSums[ warpIdx ] = auxData[ threadIdx. x ];
@@ -106,14 +106,14 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
   if( warpIdx == 0 )
      for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 )
         if( threadInWarpIdx >= stride )
            operation. cudaPerformInPlace( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] );
            operation.performInPlace( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] );
   __syncthreads();

   /****
    * Shift the warp prefix-sums.
    */
   if( warpIdx > 0 )
      operation. cudaPerformInPlace( auxData[ threadIdx. x ], warpSums[ warpIdx - 1 ] );
      operation.performInPlace( auxData[ threadIdx. x ], warpSums[ warpIdx - 1 ] );

   /***
    *  Store the result back in global memory.
@@ -126,7 +126,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
      Index chunkShift( operation. cudaIdentity() );
      if( chunkIdx > 0 )
         chunkShift = auxData[ chunkIdx - 1 ];
      operation. cudaPerformInPlace( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift );
      operation.performInPlace( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift );
      output[ blockOffset + idx ] = sharedData[ tnlCuda::getInterleaving( idx ) ];
      idx += blockDim. x;
   }
@@ -136,8 +136,9 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
   {
      if( prefixSumType == exclusivePrefixSum )
         auxArray[ blockIdx. x ] =
            operation. cudaPerform( sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ],
                                    sharedData[ tnlCuda::getInterleaving( lastElementInBlock ) ] );
            operation.commonReductionOnDevice( tnlCuda::getInterleaving( lastElementInBlock - 1 ),
                                               tnlCuda::getInterleaving( lastElementInBlock ),
                                               sharedData );
      else
         auxArray[ blockIdx. x ] = sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ];
   }
@@ -156,7 +157,7 @@ __global__ void cudaSecondPhaseBlockPrefixSum( const Index size,
   Operation< DataType > operation;
   if( blockIdx. x > 0 )
   {
      const Index shift = operation. cudaPerform( gridShift, auxArray[ blockIdx. x - 1 ] );
      const Index shift = operation.commonReductionOnDevice( gridShift, auxArray[ blockIdx. x - 1 ] );

      const Index readOffset = blockIdx. x * elementsInBlock;
      Index readIdx = threadIdx. x;
@@ -288,12 +289,13 @@ bool cudaGridPrefixSum( enumPrefixSumType prefixSumType,
}

template< typename DataType,
          template< typename T > class Operation,
          typename Operation,
          typename Index >
bool cudaPrefixSum( const Index size,
                    const Index blockSize,
                    const DataType *deviceInput,
                    DataType* deviceOutput,
                    const Operation& operation,
                    const enumPrefixSumType prefixSumType )
{
   /****
+17 −6
Original line number Diff line number Diff line
@@ -565,12 +565,17 @@ void tnlVectorOperations< tnlCuda >::computePrefixSum( Vector& v,
                                                       typename Vector::IndexType begin,
                                                       typename Vector::IndexType end )
{
   typedef tnlParallelReductionSum< typename Vector::IndexType,
                                    typename Vector::RealType > OperationType;

   OperationType operation;
   cudaPrefixSum< typename Vector::RealType,
                  operationSum,
                  OperationType,
                  typename Vector::IndexType >( end - begin,
                                                256,
                                                &v.getData()[ begin ],
                                                &v.getData()[ begin ],
                                                operation,
                                                inclusivePrefixSum );
}

@@ -579,12 +584,18 @@ void tnlVectorOperations< tnlCuda >::computeExclusivePrefixSum( Vector& v,
                                                                typename Vector::IndexType begin,
                                                                typename Vector::IndexType end )
{
   typedef tnlParallelReductionSum< typename Vector::IndexType,
                                    typename Vector::RealType > OperationType;

   OperationType operation;

   cudaPrefixSum< typename Vector::RealType,
                  operationSum,
                  OperationType,
                  typename Vector::IndexType >( end - begin,
                                                256,
                                                &v.getData()[ begin ],
                                                &v.getData()[ begin ],
                                                operation,
                                                exclusivePrefixSum );
}