Commit 3f891223 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Fixed use of buffer in CUDA parallel reduction.

parent 58af5093
Loading
Loading
Loading
Loading
+154 −96
Original line number Diff line number Diff line
@@ -178,22 +178,14 @@ CudaReductionKernel( const Result zero,

}

template< typename DataFetcher,
   typename Reduction,
   typename VolatileReduction,
   typename Index,
template< typename Index,
          typename Result >
int
CudaReductionKernelLauncher( const Index size,
                             const Reduction& reduction,
                             const VolatileReduction& volatileReduction,
                             const DataFetcher& dataFetcher,
                             const Result& zero,
                             Result*& output )
struct CudaReductionKernelLauncher
{
   using IndexType = Index;
   using ResultType = Result;

   ////
   // The number of blocks should be a multiple of the number of multiprocessors
   // to ensure optimum balancing of the load. This is very important, because
   // we run the kernel with a fixed number of blocks, so the amount of work per
@@ -203,20 +195,78 @@ CudaReductionKernelLauncher( const Index size,
   // where blocksPerMultiprocessor is determined according to the number of
   // available registers on the multiprocessor.
   // On Tesla K40c, desGridSize = 8 * 15 = 120.
   const int activeDevice = Devices::CudaDeviceInfo::getActiveDevice();
   const int blocksdPerMultiprocessor = Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice )
                                      / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread );
   const int desGridSize = blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice );
   dim3 blockSize, gridSize;
   blockSize.x = Reduction_maxThreadsPerBlock;
   gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
   CudaReductionKernelLauncher( const Index size )
   : activeDevice( Devices::CudaDeviceInfo::getActiveDevice() ),
     blocksdPerMultiprocessor( Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice )
                               / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread ) ),
     desGridSize( blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ) ),
     originalSize( size )
   {
   }

   template< typename DataFetcher,
             typename Reduction,
             typename VolatileReduction >
   int start( const Reduction& reduction,
              const VolatileReduction& volatileReduction,
              const DataFetcher& dataFetcher,
              const Result& zero,
              ResultType*& output )
   {
      ////
      // create reference to the reduction buffer singleton and set size
   const size_t buf_size = desGridSize * sizeof( ResultType );
      const size_t buf_size = 2 * desGridSize * sizeof( ResultType );
      CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
      cudaReductionBuffer.setSize( buf_size );
      output = cudaReductionBuffer.template getData< ResultType >();

      this-> reducedSize = this->launch( originalSize, reduction, volatileReduction, dataFetcher, zero, output );
      return this->reducedSize;
   }

   template< typename Reduction,
             typename VolatileReduction >
   Result finish( const Reduction& reduction,
                  const VolatileReduction& volatileReduction,
                  const Result& zero )
   {
      ////
      // Input is the first half of the buffer, output is the second half
      const size_t buf_size = desGridSize * sizeof( ResultType );
      CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
      ResultType* input = cudaReductionBuffer.template getData< ResultType >();
      ResultType* output = &input[ buf_size ];

      auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return input[ i ]; };
      while( this->reducedSize > 1 )
      {
         this-> reducedSize = this->launch( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output );
         std::swap( input, output );
      }

      ////
      // Copy result on CPU
      ResultType result;
      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( &result, output, 1 );
      return result;
   }

   protected:
      template< typename DataFetcher,
                typename Reduction,
                typename VolatileReduction >
      int launch( const Index size,
                  const Reduction& reduction,
                  const VolatileReduction& volatileReduction,
                  const DataFetcher& dataFetcher,
                  const Result& zero,
                  Result* output )
      {
         dim3 blockSize, gridSize;
         blockSize.x = Reduction_maxThreadsPerBlock;
         gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );

         ////
         // when there is only one warp per blockSize.x, we need to allocate two warps
         // worth of shared memory so that we don't index shared memory out of bounds
         const IndexType shmem = (blockSize.x <= 32)
@@ -287,9 +337,17 @@ CudaReductionKernelLauncher( const Index size,
        }
        TNL_CHECK_CUDA_DEVICE;

        ////
        // return the size of the output array on the CUDA device
        return gridSize.x;
      }

      const int activeDevice;
      const int blocksdPerMultiprocessor;
      const int desGridSize;
      const IndexType originalSize;
      IndexType reducedSize;
};
#endif

} // namespace Algorithms
+9 −16
Original line number Diff line number Diff line
@@ -91,11 +91,13 @@ Reduction< Devices::Cuda >::
      timer.start();
   #endif

   CudaReductionKernelLauncher< IndexType, ResultType > reductionLauncher( size );

   /****
    * Reduce the data on the CUDA device.
    */
   ResultType* deviceAux1( 0 );
   IndexType reducedSize = CudaReductionKernelLauncher( size,
   IndexType reducedSize = reductionLauncher.start( 
      reduction,
      volatileReduction,
      dataFetcher,
@@ -112,7 +114,6 @@ Reduction< Devices::Cuda >::
      /***
       * Transfer the reduced data from device to host.
       */
      //ResultType* resultArray[ reducedSize ];
      std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] };
      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray.get(), deviceAux1, reducedSize );

@@ -139,15 +140,7 @@ Reduction< Devices::Cuda >::
      /***
       * Data can't be safely reduced on host, so continue with the reduction on the CUDA device.
       */
      auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return deviceAux1[ i ]; };
      while( reducedSize > 1 ) {
         reducedSize = CudaReductionKernelLauncher( reducedSize,
            reduction,
            volatileReduction,
            copyFetch,
            zero,
            deviceAux1 );
      }
      auto result = reductionLauncher.finish( reduction, volatileReduction, zero );

      #ifdef CUDA_REDUCTION_PROFILING
         timer.stop();
@@ -156,14 +149,14 @@ Reduction< Devices::Cuda >::
         timer.start();
      #endif

      ResultType resultArray[ 1 ];
      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize );
      const ResultType result = resultArray[ 0 ];
      //ResultType resultArray[ 1 ];
      //ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize );
      //const ResultType result = resultArray[ 0 ];

      #ifdef CUDA_REDUCTION_PROFILING
      /*#ifdef CUDA_REDUCTION_PROFILING
         timer.stop();
         std::cout << "   Transferring the result to CPU took " << timer.getRealTime() << " sec. " << std::endl;
      #endif
      #endif*/

      return result;
   }
+2 −2

File changed.

Contains only whitespace changes.