/***************************************************************************
    
                              ParallelFor.h  -  description
    
                                 -------------------
        begin                : Mar 10, 2017
        copyright            : (C) 2017 by Tomas Oberhuber et al.
        email                : tomas.oberhuber@fjfi.cvut.cz
     ***************************************************************************/
    
    /* See Copyright Notice in tnl/Copyright */
    
    #pragma once
    
    #include <TNL/Devices/Host.h>
    #include <TNL/Devices/Cuda.h>
#include <TNL/Devices/CudaDeviceInfo.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
    
    #include <TNL/Math.h>
    
    
    /*
     * The implementation of ParallelFor is not meant to provide maximum performance
     * at every cost, but maximum flexibility for operating with data stored on the
     * device.
     *
 * The grid-stride loop for CUDA has been inspired by Nvidia's blog post:
 * https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
 *
 * Implemented by: Jakub Klinkovsky
 */
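
/*
 * Usage sketch (illustrative; the names `setValue`, `v`, `c` and `n` below
 * are hypothetical): the functor receives the loop index followed by the
 * forwarded arguments. Device lambdas require nvcc's extended-lambda mode.
 *
 *    auto setValue = [] __cuda_callable__ ( int i, double* v, double c ) { v[i] = c; };
 *    ParallelFor< Devices::Host >::exec( 0, n, setValue, v, 3.14 );  // sequential or OpenMP
 *    ParallelFor< Devices::Cuda >::exec( 0, n, setValue, v, 3.14 );  // CUDA grid-stride kernel
 */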
    
    namespace TNL {
    
    template< typename Device = Devices::Host >
    struct ParallelFor
    {
       template< typename Index,
                 typename Function,
                 typename... FunctionArgs >
       static void exec( Index start, Index end, Function f, FunctionArgs... args )
       {
    #ifdef HAVE_OPENMP
    
          // Benchmarks show that this is significantly faster compared
          // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )'
          if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )
          {
    #pragma omp parallel for
             for( Index i = start; i < end; i++ )
                f( i, args... );
          }
          else
             for( Index i = start; i < end; i++ )
                f( i, args... );
#else
      for( Index i = start; i < end; i++ )
         f( i, args... );
#endif
   }
};

    template< typename Device = Devices::Host >
    struct ParallelFor2D
    {
       template< typename Index,
                 typename Function,
                 typename... FunctionArgs >
       static void exec( Index startX, Index startY, Index endX, Index endY, Function f, FunctionArgs... args )
       {
    #ifdef HAVE_OPENMP
    
          // Benchmarks show that this is significantly faster compared
          // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )'
          if( TNL::Devices::Host::isOMPEnabled() )
          {
    #pragma omp parallel for
             for( Index i = startX; i < endX; i++ )
                for( Index j = startY; j < endY; j++ )
                   f( i, j, args... );
          }
          else
             for( Index i = startX; i < endX; i++ )
                for( Index j = startY; j < endY; j++ )
                   f( i, j, args... );
    #else
    
      for( Index i = startX; i < endX; i++ )
         for( Index j = startY; j < endY; j++ )
            f( i, j, args... );
    #endif
    
       }
    };
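
/*
 * Usage sketch for the 2D variant (hypothetical names): iterates over the
 * rectangle [startX, endX) x [startY, endY) and passes (i, j) plus the
 * forwarded arguments to the functor.
 *
 *    auto fill = [] __cuda_callable__ ( int i, int j, double* m, int pitch )
 *       { m[ j * pitch + i ] = 0.0; };
 *    ParallelFor2D<>::exec( 0, 0, nx, ny, fill, m, nx );
 */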
    
    template< typename Device = Devices::Host >
    struct ParallelFor3D
    {
       template< typename Index,
                 typename Function,
                 typename... FunctionArgs >
       static void exec( Index startX, Index startY, Index startZ, Index endX, Index endY, Index endZ, Function f, FunctionArgs... args )
       {
    #ifdef HAVE_OPENMP
    
          // Benchmarks show that this is significantly faster compared
          // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )'
      if( TNL::Devices::Host::isOMPEnabled() )
      {
#pragma omp parallel for collapse(2)
         for( Index i = startX; i < endX; i++ )
            for( Index j = startY; j < endY; j++ )
               for( Index k = startZ; k < endZ; k++ )
                  f( i, j, k, args... );
      }
      else
         for( Index i = startX; i < endX; i++ )
            for( Index j = startY; j < endY; j++ )
               for( Index k = startZ; k < endZ; k++ )
                  f( i, j, k, args... );
    #else
    
      for( Index i = startX; i < endX; i++ )
         for( Index j = startY; j < endY; j++ )
            for( Index k = startZ; k < endZ; k++ )
               f( i, j, k, args... );
#endif
   }
};

    #ifdef HAVE_CUDA
    
template< bool gridStrideX = true,
          typename Index,
          typename Function,
          typename... FunctionArgs >
__global__ void
ParallelForKernel( Index start, Index end, Function f, FunctionArgs... args )
{
   Index i = start + blockIdx.x * blockDim.x + threadIdx.x;
   while( i < end ) {
      f( i, args... );
      if( gridStrideX ) i += blockDim.x * gridDim.x;
      else break;
   }
}
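
/*
 * Worked example of the grid-stride loop above (illustrative numbers): with
 * blockDim.x = 256 and gridDim.x = 64, one pass of the grid covers
 * 256 * 64 = 16384 indices, so the first thread of the grid processes
 * i = start, start + 16384, start + 32768, ... until i >= end. When
 * gridStrideX is false, the launch is sized so that the grid covers the
 * whole range and each thread handles at most one index.
 */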
    
    template< bool gridStrideX = true,
              bool gridStrideY = true,
              typename Index,
              typename Function,
              typename... FunctionArgs >
    __global__ void
    ParallelFor2DKernel( Index startX, Index startY, Index endX, Index endY, Function f, FunctionArgs... args )
    {
   // remember the initial x-index so it can be reset for each row; without
   // the reset, grid-striding in y would skip all rows after the first
   const Index initialI = startX + blockIdx.x * blockDim.x + threadIdx.x;
   Index j = startY + blockIdx.y * blockDim.y + threadIdx.y;
   while( j < endY ) {
      Index i = initialI;
      while( i < endX ) {
         f( i, j, args... );
         if( gridStrideX ) i += blockDim.x * gridDim.x;
         else break;
      }
      if( gridStrideY ) j += blockDim.y * gridDim.y;
      else break;
   }
    }
    
    template< bool gridStrideX = true,
              bool gridStrideY = true,
              bool gridStrideZ = true,
              typename Index,
              typename Function,
              typename... FunctionArgs >
    __global__ void
    ParallelFor3DKernel( Index startX, Index startY, Index startZ, Index endX, Index endY, Index endZ, Function f, FunctionArgs... args )
    {
   // as in the 2D kernel, remember the initial x- and y-indices so they can
   // be reset in each iteration of the enclosing grid-stride loops
   const Index initialJ = startY + blockIdx.y * blockDim.y + threadIdx.y;
   const Index initialI = startX + blockIdx.x * blockDim.x + threadIdx.x;
   Index k = startZ + blockIdx.z * blockDim.z + threadIdx.z;
   while( k < endZ ) {
      Index j = initialJ;
      while( j < endY ) {
         Index i = initialI;
         while( i < endX ) {
            f( i, j, k, args... );
            if( gridStrideX ) i += blockDim.x * gridDim.x;
            else break;
         }
         if( gridStrideY ) j += blockDim.y * gridDim.y;
         else break;
      }
      if( gridStrideZ ) k += blockDim.z * gridDim.z;
      else break;
   }
    }
    #endif
    
    template<>
    struct ParallelFor< Devices::Cuda >
    {
       template< typename Index,
                 typename Function,
                 typename... FunctionArgs >
       static void exec( Index start, Index end, Function f, FunctionArgs... args )
       {
    #ifdef HAVE_CUDA
          if( end > start ) {
             dim3 blockSize( 256 );
             dim3 gridSize;
    
             gridSize.x = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( end - start, blockSize.x ) );
    
             if( Devices::Cuda::getNumberOfGrids( end - start ) == 1 )
                ParallelForKernel< false ><<< gridSize, blockSize >>>( start, end, f, args... );
             else {
                // decrease the grid size and align to the number of multiprocessors
                const int desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
                gridSize.x = TNL::min( desGridSize, Devices::Cuda::getNumberOfBlocks( end - start, blockSize.x ) );
                ParallelForKernel< true ><<< gridSize, blockSize >>>( start, end, f, args... );
             }
    
             cudaDeviceSynchronize();
             TNL_CHECK_CUDA_DEVICE;
          }
    #else
          throw Exceptions::CudaSupportMissing();
    #endif
       }
    };
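
/*
 * Launch-configuration sketch for the specialization above (illustrative
 * numbers): with blockSize.x = 256 and end - start = 10^6, up to 3907 blocks
 * are requested, capped at Devices::Cuda::getMaxGridSize(). When
 * Devices::Cuda::getNumberOfGrids( end - start ) reports more than one grid,
 * the launch instead uses at most 32 blocks per multiprocessor (e.g. 640
 * blocks on a 20-SM device) and the grid-stride loop sweeps the remaining
 * indices.
 */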
    
    template<>
    struct ParallelFor2D< Devices::Cuda >
    {
       template< typename Index,
                 typename Function,
                 typename... FunctionArgs >
       static void exec( Index startX, Index startY, Index endX, Index endY, Function f, FunctionArgs... args )
       {
    #ifdef HAVE_CUDA
          if( endX > startX && endY > startY ) {
             const Index sizeX = endX - startX;
             const Index sizeY = endY - startY;
    
         dim3 blockSize;
         if( sizeX >= sizeY * sizeY ) {
            blockSize.x = TNL::min( 256, sizeX );
            blockSize.y = 1;
         }
             else if( sizeY >= sizeX * sizeX ) {
                blockSize.x = 1;
                blockSize.y = TNL::min( 256, sizeY );
             }
             else {
                blockSize.x = TNL::min( 32, sizeX );
                blockSize.y = TNL::min( 8, sizeY );
             }
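
         // Note on the block-shape heuristic above: a strongly elongated
         // domain gets a 1D block of up to 256 threads along its long axis;
         // otherwise a 32x8 block (256 threads) keeps consecutive x indices
         // within a block, which favors coalesced accesses for row-major data.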
             dim3 gridSize;
             gridSize.x = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeX, blockSize.x ) );
             gridSize.y = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeY, blockSize.y ) );
    
             dim3 gridCount;
             gridCount.x = Devices::Cuda::getNumberOfGrids( sizeX );
             gridCount.y = Devices::Cuda::getNumberOfGrids( sizeY );
    
             if( gridCount.x == 1 && gridCount.y == 1 )
                ParallelFor2DKernel< false, false ><<< gridSize, blockSize >>>
                   ( startX, startY, endX, endY, f, args... );
             else if( gridCount.x == 1 && gridCount.y > 1 )
                ParallelFor2DKernel< false, true ><<< gridSize, blockSize >>>
                   ( startX, startY, endX, endY, f, args... );
             else if( gridCount.x > 1 && gridCount.y == 1 )
                ParallelFor2DKernel< true, false ><<< gridSize, blockSize >>>
                   ( startX, startY, endX, endY, f, args... );
             else
                ParallelFor2DKernel< true, true ><<< gridSize, blockSize >>>
                   ( startX, startY, endX, endY, f, args... );
    
             cudaDeviceSynchronize();
             TNL_CHECK_CUDA_DEVICE;
          }
    #else
          throw Exceptions::CudaSupportMissing();
    #endif
       }
    };
    
    template<>
    struct ParallelFor3D< Devices::Cuda >
    {
       template< typename Index,
                 typename Function,
                 typename... FunctionArgs >
       static void exec( Index startX, Index startY, Index startZ, Index endX, Index endY, Index endZ, Function f, FunctionArgs... args )
       {
    #ifdef HAVE_CUDA
          if( endX > startX && endY > startY && endZ > startZ ) {
             const Index sizeX = endX - startX;
             const Index sizeY = endY - startY;
             const Index sizeZ = endZ - startZ;
    
             dim3 blockSize;
             if( sizeX >= sizeY * sizeZ ) {
                blockSize.x = TNL::min( 256, sizeX );
                blockSize.y = 1;
                blockSize.z = 1;
             }
             else if( sizeY >= sizeX * sizeZ ) {
                blockSize.x = 1;
                blockSize.y = TNL::min( 256, sizeY );
                blockSize.z = 1;
             }
             else if( sizeZ >= sizeX * sizeY ) {
                blockSize.x = 1;
                blockSize.y = 1;
                blockSize.z = TNL::min( 256, sizeZ );
             }
             else {
                blockSize.x = TNL::min( 16, sizeX );
                blockSize.y = TNL::min( 4, sizeY );
                blockSize.z = TNL::min( 4, sizeZ );
             }
             dim3 gridSize;
             gridSize.x = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeX, blockSize.x ) );
             gridSize.y = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeY, blockSize.y ) );
             gridSize.z = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeZ, blockSize.z ) );
    
             dim3 gridCount;
             gridCount.x = Devices::Cuda::getNumberOfGrids( sizeX );
             gridCount.y = Devices::Cuda::getNumberOfGrids( sizeY );
             gridCount.z = Devices::Cuda::getNumberOfGrids( sizeZ );
    
             if( gridCount.x == 1 && gridCount.y == 1 && gridCount.z == 1 )
                ParallelFor3DKernel< false, false, false ><<< gridSize, blockSize >>>
                   ( startX, startY, startZ, endX, endY, endZ, f, args... );
             else if( gridCount.x == 1 && gridCount.y == 1 && gridCount.z > 1 )
                ParallelFor3DKernel< false, false, true ><<< gridSize, blockSize >>>
                   ( startX, startY, startZ, endX, endY, endZ, f, args... );
             else if( gridCount.x == 1 && gridCount.y > 1 && gridCount.z == 1 )
                ParallelFor3DKernel< false, true, false ><<< gridSize, blockSize >>>
                   ( startX, startY, startZ, endX, endY, endZ, f, args... );
             else if( gridCount.x > 1 && gridCount.y == 1 && gridCount.z == 1 )
                ParallelFor3DKernel< true, false, false ><<< gridSize, blockSize >>>
                   ( startX, startY, startZ, endX, endY, endZ, f, args... );
             else if( gridCount.x == 1 && gridCount.y > 1 && gridCount.z > 1 )
                ParallelFor3DKernel< false, true, true ><<< gridSize, blockSize >>>
                   ( startX, startY, startZ, endX, endY, endZ, f, args... );
             else if( gridCount.x > 1 && gridCount.y > 1 && gridCount.z == 1 )
                ParallelFor3DKernel< true, true, false ><<< gridSize, blockSize >>>
                   ( startX, startY, startZ, endX, endY, endZ, f, args... );
             else if( gridCount.x > 1 && gridCount.y == 1 && gridCount.z > 1 )
                ParallelFor3DKernel< true, false, true ><<< gridSize, blockSize >>>
                   ( startX, startY, startZ, endX, endY, endZ, f, args... );
             else
            ParallelFor3DKernel< true, true, true ><<< gridSize, blockSize >>>
               ( startX, startY, startZ, endX, endY, endZ, f, args... );

         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
      }
#else
      throw Exceptions::CudaSupportMissing();
#endif
       }
    };
    
    } // namespace TNL