Commit 3781cb67 authored by Yury Hayeu's avatar Yury Hayeu
Browse files

Move kernel launch configuration out to the convolution task

parent 868f233e
Loading
Loading
Loading
Loading
+64 −21
Original line number Diff line number Diff line
@@ -2,6 +2,7 @@
#ifdef HAVE_CUDA

   #include <TNL/Devices/Cuda.h>
   #include <TNL/Containers/StaticVector.h>
   #include <TNL/Cuda/LaunchHelpers.h>

template< int Dimension, typename Device >
@@ -12,10 +13,18 @@ struct Convolution< 1, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   static size_t
   getDynamicSharedMemorySize( Index kernelWidth, Index endX )
   using Vector = TNL::Containers::StaticVector< 1, Index >;

   template< typename Index >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
      return 0;
      configuration.dynamicSharedMemorySize = 0;

      // TODO: - Benchmark the best value
      configuration.blockSize.x = kernelSize.x();
      configuration.gridSize.x =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
   }
};

@@ -65,10 +74,22 @@ struct Convolution< 2, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   static size_t
   getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index endX, Index endY )
   using Vector = TNL::Containers::StaticVector< 2, Index >;

   template< typename Index >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
      return 0;
      configuration.dynamicSharedMemorySize = 0;

      // TODO: - Benchmark the best value
      configuration.blockSize.x = kernelSize.x();
      configuration.blockSize.y = kernelSize.y();

      configuration.gridSize.x =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
      configuration.gridSize.y =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
   }
};

@@ -111,7 +132,8 @@ convolution2D( Index kernelWidth,
         Index kernelIndexX = i + radiusX;

         if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY ) {
            result = convolve( result, fetchBoundary( elementIndexX, elementIndexY ), fetchKernel ( kernelIndexX, kernelIndexY ) );
            result =
               convolve( result, fetchBoundary( elementIndexX, elementIndexY ), fetchKernel( kernelIndexX, kernelIndexY ) );
         }
         else {
            result = convolve( result, fetchData( elementIndexX, elementIndexY ), fetchKernel( kernelIndexX, kernelIndexY ) );
@@ -127,10 +149,25 @@ struct Convolution< 3, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   static size_t
   getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ )
   using Vector = TNL::Containers::StaticVector< 3, Index >;

   template< typename Index >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
      return 0;
      configuration.dynamicSharedMemorySize = 0;

      // TODO: - Benchmark the best value
      configuration.blockSize.x = kernelSize.x();
      configuration.blockSize.y = kernelSize.y();
      configuration.blockSize.z = kernelSize.z();

      configuration.gridSize.x =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
      configuration.gridSize.y =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
      configuration.gridSize.y =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
   }
};

@@ -180,11 +217,17 @@ convolution3D( Index kernelWidth,
            Index elementIndexX = i + ix;
            Index kernelIndexX = i + radiusX;

            if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0 || elementIndexZ >= endZ ) {
               result = convolve( result, fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) );
            if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0
                || elementIndexZ >= endZ )
            {
               result = convolve( result,
                                  fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ),
                                  fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) );
            }
            else {
               result = convolve( result, fetchData( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) );
               result = convolve( result,
                                  fetchData( elementIndexX, elementIndexY, elementIndexZ ),
                                  fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) );
            }
         }
      }
+3 −2
Original line number Diff line number Diff line
@@ -32,13 +32,14 @@ public:
         start[ i ] = parameters.getParameter< int >( minDimensionIds[ i ] );
         end[ i ] = parameters.getParameter< int >( maxDimensionIds[ i ] );
         minKernelSize[ i ] = parameters.getParameter< int >( minKernelSizeIds[ i ] );
         maxKernelSizeIds[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] );
         maxKernelSize[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] );

         TNL_ASSERT_GT( start[ i ], 1, "Start dimension must be positive integer" );
         TNL_ASSERT_GT( end[ i ], start[ i ], "End dimension must be greater than start dimension" );

         TNL_ASSERT_GE( minKernelSize[ i ], 1, "Minimal kernel size must be a positive number" );
         TNL_ASSERT_EQ( minKernelSize[ i ] % 2, 1, "Minimal kernel size must be odd" );
         // Each message describes the comparison it is attached to: the first
         // check compares kernel sizes, the second compares dimensions.
         TNL_ASSERT_GT( maxKernelSize[ i ], minKernelSize[ i ], "End kernel size must be greater than start kernel size" );
         TNL_ASSERT_GT( end[ i ], start[ i ], "End dimension must be greater than start dimension" );
      }

@@ -85,7 +86,7 @@ public:
         for( size_t i = 0; i < currentDimension.getSize() - 1; i++ ) {
            if( currentDimension[ i ] >= maxDimension[ i ] ) {
               currentDimension[ i ] = minDimension[ i ];
               currentDimension[ i ] = maxDimension[ i ];
               currentDimension[ i + 1 ] *= dimensionStep;
            }
         }

+3 −83
Original line number Diff line number Diff line
@@ -29,20 +29,7 @@ public:
   {
      TNL::Cuda::LaunchConfiguration launchConfig;

      launchConfig.dynamicSharedMemorySize =
         ConvolutionKernel::getDynamicSharedMemorySize< Index >( kernelSize.x(), dimensions.x() );

      // TODO: - Benchmark the best value
      launchConfig.blockSize.x = 256;
      launchConfig.gridSize.x =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), launchConfig.blockSize.x ) );

      if( (std::size_t) launchConfig.blockSize.x * launchConfig.gridSize.x < (std::size_t) dimensions.x() ) {
         const int desGridSize = 32 * TNL::Cuda::DeviceInfo::getCudaMultiprocessors( TNL::Cuda::DeviceInfo::getActiveDevice() );

         launchConfig.gridSize.x =
            TNL::min( desGridSize, TNL::Cuda::getNumberOfBlocks( dimensions.x(), launchConfig.blockSize.x ) );
      }
      ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize);

      constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;

@@ -78,29 +65,7 @@ public:
   {
      TNL::Cuda::LaunchConfiguration launchConfig;

      launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >(
         kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y() );

      const Index sizeX = dimensions.x();
      const Index sizeY = dimensions.y();

      if( sizeX >= sizeY * sizeY ) {
         launchConfig.blockSize.x = TNL::min( 256, sizeX );
         launchConfig.blockSize.y = 1;
      }
      else if( sizeY >= sizeX * sizeX ) {
         launchConfig.blockSize.x = 1;
         launchConfig.blockSize.y = TNL::min( 256, sizeY );
      }
      else {
         launchConfig.blockSize.x = TNL::min( 32, sizeX );
         launchConfig.blockSize.y = TNL::min( 8, sizeY );
      }

      launchConfig.gridSize.x =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) );
      launchConfig.gridSize.y =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) );
      ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize);

      constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;

@@ -142,52 +107,7 @@ public:

      TNL::Cuda::LaunchConfiguration launchConfig;

      launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >(
         kernelSize.x(), kernelSize.y(), kernelSize.z(), dimensions.x(), dimensions.y(), dimensions.z() );

      if( sizeX >= sizeY * sizeY * sizeZ * sizeZ ) {
         launchConfig.blockSize.x = TNL::min( 256, sizeX );
         launchConfig.blockSize.y = 1;
         launchConfig.blockSize.z = 1;
      }
      else if( sizeY >= sizeX * sizeX * sizeZ * sizeZ ) {
         launchConfig.blockSize.x = 1;
         launchConfig.blockSize.y = TNL::min( 256, sizeY );
         launchConfig.blockSize.z = 1;
      }
      else if( sizeZ >= sizeX * sizeX * sizeY * sizeY ) {
         launchConfig.blockSize.x = TNL::min( 2, sizeX );
         launchConfig.blockSize.y = TNL::min( 2, sizeY );
         // CUDA allows max 64 for launchConfig.blockSize.z
         launchConfig.blockSize.z = TNL::min( 64, sizeZ );
      }
      else if( sizeX >= sizeZ * sizeZ && sizeY >= sizeZ * sizeZ ) {
         launchConfig.blockSize.x = TNL::min( 32, sizeX );
         launchConfig.blockSize.y = TNL::min( 8, sizeY );
         launchConfig.blockSize.z = 1;
      }
      else if( sizeX >= sizeY * sizeY && sizeZ >= sizeY * sizeY ) {
         launchConfig.blockSize.x = TNL::min( 32, sizeX );
         launchConfig.blockSize.y = 1;
         launchConfig.blockSize.z = TNL::min( 8, sizeZ );
      }
      else if( sizeY >= sizeX * sizeX && sizeZ >= sizeX * sizeX ) {
         launchConfig.blockSize.x = 1;
         launchConfig.blockSize.y = TNL::min( 32, sizeY );
         launchConfig.blockSize.z = TNL::min( 8, sizeZ );
      }
      else {
         launchConfig.blockSize.x = TNL::min( 16, sizeX );
         launchConfig.blockSize.y = TNL::min( 4, sizeY );
         launchConfig.blockSize.z = TNL::min( 4, sizeZ );
      }

      launchConfig.gridSize.x =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) );
      launchConfig.gridSize.y =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) );
      launchConfig.gridSize.z =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeZ, launchConfig.blockSize.z ) );
      ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize);

      constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;