Commit b2b2a5a4 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Implementing the prefix-sum in CUDA.

parent da35b794
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -11,6 +11,7 @@ TEMPLATE_EXPLICIT_INSTANTIATION=yes
CMAKE="cmake"
CPUS=`grep -c processor /proc/cpuinfo`

CPUS="1"
echo "Building $TARGET using $CPUS processors."

if [ ! -d Debug ];
+42 −0
Original line number Diff line number Diff line
@@ -21,6 +21,46 @@
enum enumPrefixSumType { exclusivePrefixSum = 0,
                         inclusivePrefixSum };

template< typename DataType >
class operationSum
{
   public:

   DataType identity() const
   {
      return ( DataType ) 0.0;
   };

   void performInPlace( DataType& a, const DataType& b ) const
   {
      a += b;
   };

   DataType perform( const DataType& a, const DataType& b )
   {
      return a + b;
   };

#ifdef HAVE_CUDA
   __device__ DataType cudaIdentity() const
   {
      return ( DataType ) 0.0;
   };

   __device__ void cudaPerformInPlace( DataType& a, const DataType& b ) const
   {
      a += b;
   };

   __device__ DataType cudaPerform( const DataType& a, const DataType& b )
   {
      return a + b;
   };
#endif   

};


template< typename DataType,
          template< typename T > class Operation,
          typename Index >
@@ -31,4 +71,6 @@ bool cudaPrefixSum( const Index size,
                    const enumPrefixSumType prefixSumType = inclusivePrefixSum );


#include <implementation/core/cuda/cuda-prefix-sum_impl.h>

#endif /* CUDA_PREFIX_SUM_H_ */
+37 −2
Original line number Diff line number Diff line
@@ -42,10 +42,20 @@ class tnlCuda

   static int getGPUTransferBufferSize();

   static bool checkDevice( const char* file_name, int line );

   static size_t getFreeMemory();

#ifdef HAVE_CUDA
   static inline __host__ __device__ int getNumberOfSharedMemoryBanks();

   static inline __host__ __device__ int getWarpSize();

   template< typename Index >
   static __device__ Index getInterleaving( const Index index );
#endif


   static bool checkDevice( const char* file_name, int line );

   protected:

   static int maxGridSize, maxBlockSize;
@@ -56,4 +66,29 @@ class tnlCuda
#define tnlCudaSupportMissingMessage \
   std::cerr << "The CUDA support is missing in the source file " << __FILE__ << " at line " << __LINE__ << ". Please set WITH_CUDA=yes in the install script. " << std::endl;


// TODO: This would be nice in tnlCuda but C++ standard does not allow it.
#ifdef HAVE_CUDA
   template< typename Element >
   struct getSharedMemory
   {
       __device__ operator Element*();
   };

   template<>
   struct getSharedMemory< double >
   {
       inline __device__ operator double*();
   };

   template<>
   struct getSharedMemory< long int >
   {
       inline __device__ operator long int*();
   };

#endif

#include <implementation/core/tnlCuda_impl.h>

#endif /* TNLCUDA_H_ */
+0 −2
Original line number Diff line number Diff line
@@ -154,8 +154,6 @@ bool tnlFile :: read( Type* buffer,
      cerr << "File " << fileName << " was not opened for reading. " << endl;
      return false;
   }

   int bytesRead( 0 );
   this->readElements = 0;
   const Index host_buffer_size = :: Min( ( Index ) ( tnlFileGPUvsCPUTransferBufferSize / sizeof( Type ) ),
                                          elements );
+26 −21
Original line number Diff line number Diff line
@@ -18,6 +18,8 @@
#ifndef CUDA_PREFIX_SUM_IMPL_H_
#define CUDA_PREFIX_SUM_IMPL_H_

#ifdef HAVE_CUDA

template< typename DataType,
          template< typename T > class Operation,
          typename Index >
@@ -28,8 +30,8 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
                                              DataType* output,
                                              DataType* auxArray )
{
   DataType* sharedData = sharedMemory< DataType >();
   DataType* auxData = &sharedData[ elementsInBlock + elementsInBlock / shmBanks + 2 ];
   DataType* sharedData = getSharedMemory< DataType >();
   DataType* auxData = &sharedData[ elementsInBlock + elementsInBlock / tnlCuda::getNumberOfSharedMemoryBanks() + 2 ];
   DataType* warpSums = &auxData[ blockDim. x ];
   Operation< DataType > operation;

@@ -45,17 +47,17 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
   if( prefixSumType == exclusivePrefixSum )
   {
      if( idx == 0 )
         sharedData[ interleave< shmBanks >( 0 ) ] = operation. cudaIdentity();
         sharedData[ 0 ] = operation. cudaIdentity();
      while( idx < elementsInBlock && blockOffset + idx < size )
      {
         sharedData[ interleave< shmBanks >( idx + 1 ) ] = input[ blockOffset + idx ];
         sharedData[ tnlCuda::getInterleaving( idx + 1 ) ] = input[ blockOffset + idx ];
         idx += blockDim. x;
      }
   }
   else
      while( idx < elementsInBlock && blockOffset + idx < size )
      {
         sharedData[ interleave< shmBanks >( idx ) ] = input[ blockOffset + idx ];
         sharedData[ tnlCuda::getInterleaving( idx ) ] = input[ blockOffset + idx ];
         idx += blockDim. x;
      }

@@ -71,30 +73,30 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
   if( chunkOffset < lastElementInBlock )
   {
      auxData[ threadIdx. x ] =
         sharedData[ interleave< shmBanks >( chunkOffset ) ];
         sharedData[ tnlCuda::getInterleaving( chunkOffset ) ];
   }

   Index chunkPointer( 1 );
   while( chunkPointer < chunkSize &&
          chunkOffset + chunkPointer < lastElementInBlock )
   {
      operation. cudaPerformInPlace( sharedData[ interleave< shmBanks >( chunkOffset + chunkPointer ) ],
                                 sharedData[ interleave< shmBanks >( chunkOffset + chunkPointer - 1 ) ] );
      operation. cudaPerformInPlace( sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer ) ],
                                 sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] );
      auxData[ threadIdx. x ] =
         sharedData[ interleave< shmBanks >( chunkOffset + chunkPointer  ) ];
         sharedData[ tnlCuda::getInterleaving( chunkOffset + chunkPointer  ) ];
      chunkPointer ++;
   }

   /***
    *  Perform the parallel prefix-sum inside warps.
    */
   const int threadInWarpIdx = threadIdx. x % warpSize;
   const int warpIdx = threadIdx. x / warpSize;
   for( int stride = 1; stride < warpSize; stride *= 2 )
   const int threadInWarpIdx = threadIdx. x % tnlCuda::getWarpSize();
   const int warpIdx = threadIdx. x / tnlCuda::getWarpSize();
   for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 )
      if( threadInWarpIdx >= stride && threadIdx. x < numberOfChunks )
         operation. cudaPerformInPlace( auxData[ threadIdx. x ], auxData[ threadIdx. x - stride ] );

   if( threadInWarpIdx == warpSize - 1 )
   if( threadInWarpIdx == tnlCuda::getWarpSize() - 1 )
      warpSums[ warpIdx ] = auxData[ threadIdx. x ];
   __syncthreads();

@@ -102,7 +104,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
    * Compute prefix-sum of warp sums using one warp
    */
   if( warpIdx == 0 )
      for( int stride = 1; stride < warpSize; stride *= 2 )
      for( int stride = 1; stride < tnlCuda::getWarpSize(); stride *= 2 )
         if( threadInWarpIdx >= stride )
            operation. cudaPerformInPlace( warpSums[ threadInWarpIdx ], warpSums[ threadInWarpIdx - stride ] );
   __syncthreads();
@@ -124,8 +126,8 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
      Index chunkShift( operation. cudaIdentity() );
      if( chunkIdx > 0 )
         chunkShift = auxData[ chunkIdx - 1 ];
      operation. cudaPerformInPlace( sharedData[ interleave< shmBanks >( idx ) ], chunkShift );
      output[ blockOffset + idx ] = sharedData[ interleave< shmBanks >( idx ) ];
      operation. cudaPerformInPlace( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift );
      output[ blockOffset + idx ] = sharedData[ tnlCuda::getInterleaving( idx ) ];
      idx += blockDim. x;
   }
   __syncthreads();
@@ -134,10 +136,10 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
   {
      if( prefixSumType == exclusivePrefixSum )
         auxArray[ blockIdx. x ] =
            operation. cudaPerform( sharedData[ interleave< shmBanks >( lastElementInBlock - 1 ) ],
                                    sharedData[ interleave< shmBanks >( lastElementInBlock ) ] );
            operation. cudaPerform( sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ],
                                    sharedData[ tnlCuda::getInterleaving( lastElementInBlock ) ] );
      else
         auxArray[ blockIdx. x ] = sharedData[ interleave< shmBanks >( lastElementInBlock - 1 ) ];
         auxArray[ blockIdx. x ] = sharedData[ tnlCuda::getInterleaving( lastElementInBlock - 1 ) ];
   }

}
@@ -202,8 +204,9 @@ bool cudaRecursivePrefixSum( const enumPrefixSumType prefixSumType,
   /****
    * Run the kernel.
    */
   size_t sharedDataSize = elementsInBlock + elementsInBlock / shmBanks + 2;
   size_t sharedMemory = ( sharedDataSize + blockSize + warpSize  ) * sizeof( DataType );
   size_t sharedDataSize = elementsInBlock + 
                           elementsInBlock / tnlCuda::getNumberOfSharedMemoryBanks() + 2;
   size_t sharedMemory = ( sharedDataSize + blockSize + tnlCuda::getWarpSize()  ) * sizeof( DataType );
   cudaFirstPhaseBlockPrefixSum< DataType, Operation, Index >
                                <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
                                (  prefixSumType,
@@ -332,4 +335,6 @@ bool cudaPrefixSum( const Index size,
   return true;
}

#endif

#endif /* CUDA_PREFIX_SUM_IMPL_H_ */
Loading