Commit c710d0b2 authored by Yury Hayeu's avatar Yury Hayeu
Browse files

Implement naive 3D kernel

parent 5a5a294c
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -13,7 +13,7 @@ if (${BUILD_CUDA})

   FILE(WRITE ${SOURCE_FILE} "${TEMPLATE_CONTENT}")

   SET(EXECUTABLE_NAME "${PREFIX}_${DIMENSION}_${MODULE_NAME}")
   SET(EXECUTABLE_NAME "${PREFIX}_${DIMENSION}_${MODULE_NAME}_${TEMPLATE_NAME}")

   CUDA_ADD_EXECUTABLE(${EXECUTABLE_NAME} ${SOURCE_FILE})
else()
+78 −55
Original line number Diff line number Diff line
@@ -37,6 +37,10 @@ convolution1D( Index kernelWidth,
               Store store )
{
   Index ix =  threadIdx.x + blockIdx.x * blockDim.x;

   if (ix >= endX)
      return;

   Index radius = kernelWidth >> 1;

   Real result = 0;
@@ -90,8 +94,11 @@ convolution2D( Index kernelWidth,
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   if (ix >= endX || iy >= endY)
      return;

   Index radiusY = kernelHeight >> 1;
   Index radiusX = kernelHeight >> 1;
   Index radiusX = kernelWidth >> 1;

   Real result = 0;

@@ -115,59 +122,75 @@ convolution2D( Index kernelWidth,
   store( ix, iy, result );
}

// template<>
// struct Convolution< 3, TNL::Devices::Cuda >
// {
// public:
//    template< typename Index >
//    static size_t
//    getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ )
//    {
//       return 0;
//    }
// };

// template< typename Index,
//           typename Real,
//           typename FetchData,
//           typename FetchBoundary,
//           typename FetchKernel,
//           typename Convolve,
//           typename Store >
// __global__
// static void
// convolution3D( Index kernelWidth,
//                Index kernelHeight,
//                Index kernelDepth,
//                Index endX,
//                Index endY,
//                Index endZ,
//                FetchData& fetchData,
//                FetchBoundary& fetchBoundary,
//                FetchKernel& fetchKernel,
//                Convolve& convolve,
//                Store& store )
// {
//    int ix = threadIdx.x + blockIdx.x * blockDim.x;
//    int iy = threadIdx.y + blockIdx.y * blockDim.y;
//    int iz = threadIdx.z + blockIdx.z * blockDim.z;

//    Real result = 0;

//    for( Index k = iz - kernelDepth; k <= iz + kernelDepth; k++ ) {
//       for( Index j = iy - kernelHeight; j <= iy + kernelHeight; j++ ) {
//          for( Index i = ix - kernelWidth; i <= ix + kernelWidth; i++ ) {
//             if( i < 0 || i >= endX || j < 0 || j >= endY || k < 0 || k >= endZ ) {
//                result = convolve( result, fetchBoundary( i, j, k ) );
//             }
//             else {
//                result = convolve( result, fetchData( i, j, k ), fetchKernel( i, j, k ) );
//             }
//          }
//       }
//    }

//    store( ix, iy, iz, result );
// }
template<>
struct Convolution< 3, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   static size_t
   getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ )
   {
      return 0;
   }
};

template< typename Index,
          typename Real,
          typename FetchData,
          typename FetchBoundary,
          typename FetchKernel,
          typename Convolve,
          typename Store >
__global__
static void
convolution3D( Index kernelWidth,
               Index kernelHeight,
               Index kernelDepth,
               Index endX,
               Index endY,
               Index endZ,
               FetchData fetchData,
               FetchBoundary fetchBoundary,
               FetchKernel fetchKernel,
               Convolve convolve,
               Store store )
{
   Index iz = threadIdx.z + blockIdx.z * blockDim.z;
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   if (ix >= endX || iy >= endY || iz >= endZ)
      return;

   Index radiusZ = kernelDepth >> 1;
   Index radiusY = kernelHeight >> 1;
   Index radiusX = kernelWidth >> 1;

   Real result = 0;

   for( Index k = -radiusZ; k <= radiusZ; k++ ) {
      Index elementIndexZ = k + iz;
      Index kernelIndexZ = k + radiusZ;

      for( Index j = -radiusY; j <= radiusY; j++ ) {
         Index elementIndexY = j + iy;
         Index kernelIndexY = j + radiusY;

         for( Index i = -radiusX; i <= radiusX; i++ ) {
            Index elementIndexX = i + ix;
            Index kernelIndexX = i + radiusX;

            if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0 || elementIndexZ >= endZ ) {
               result = convolve( result, fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) );
            }
            else {
               result = convolve( result, fetchData( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) );
            }
         }
      }
   }

   store( ix, iy, iz, result );
}

#endif
+47 −37
Original line number Diff line number Diff line
@@ -17,7 +17,7 @@ public:
   using Launcher = Launcher< Dimension, Device >;

   static void
   exec( const Vector& dimensions, const Vector& kernelSize, DataStore input, DataStore result, DataStore kernel )
   exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
   {
      auto fetchData = [ = ] __cuda_callable__( Index i )
      {
@@ -108,46 +108,56 @@ public:
   }
};

// template< typename Index, typename Real >
// struct DummyTask< Index, Real, 3, TNL::Devices::Cuda >
// {
// public:
//    static constexpr int Dimension = 3;
//    using Device = TNL::Devices::Cuda;
//    using Vector = TNL::Containers::StaticVector< Dimension, Index >;
//    using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
//    using Launcher = Launcher< Dimension, Device >;

//    static void
//    exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
//    {
//       auto fetchData = [ = ] __cuda_callable__( Index i, Index j, Index k ) {
template< typename Index, typename Real >
struct DummyTask< Index, Real, 3, TNL::Devices::Cuda >
{
public:
   static constexpr int Dimension = 3;
   using Device = TNL::Devices::Cuda;
   using Vector = TNL::Containers::StaticVector< Dimension, Index >;
   using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
   using Launcher = Launcher< Dimension, Device >;

//       };
   static void
   exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
   {
      auto fetchData = [ = ] __cuda_callable__( Index i, Index j, Index k )
      {
         auto index = i + j * dimensions.x() + k * dimensions.x() * dimensions.y();

//       auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j, Index k ) {
         return input[index];
      };

//       };
      auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j, Index k )
      {
         return 1;
      };

//       auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j, Index k ) {
      auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j, Index k )
      {
         auto index = i + j * kernelSize.x() + k * kernelSize.x() * kernelSize.y();

//       };
         return kernel[ index ];
      };

//       auto convolve = [ = ] __cuda_callable__( float result, Index data, Index kernel )
//       {
//          return result + data * kernel;
//       };
      auto convolve = [ = ] __cuda_callable__( float result, Index data, Index kernel )
      {
         return result + data * kernel;
      };

//       auto store = [ = ] __cuda_callable__( Index i, Index j, Index k, Real result ) {
      auto store = [ = ] __cuda_callable__( Index i, Index j, Index k, Real resultValue ) mutable
      {
         auto index = i + j * dimensions.x() + k * dimensions.x() * dimensions.y();

//       };
         result[ index ] = resultValue;
      };

//       Launcher::exec< Index >( dimensions,
//                                kernelSize,
//                                std::forward< decltype( fetchData ) >( fetchData ),
//                                std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
//                                std::forward< decltype( fetchKernel ) >( fetchKernel ),
//                                std::forward< decltype( convolve ) >( convolve ),
//                                std::forward< decltype( store ) >( store ) );
//    }
// };
      Launcher::exec< Index, Real >( dimensions,
                                     kernelSize,
                                     std::forward< decltype( fetchData ) >( fetchData ),
                                     std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
                                     std::forward< decltype( fetchKernel ) >( fetchKernel ),
                                     std::forward< decltype( convolve ) >( convolve ),
                                     std::forward< decltype( store ) >( store ) );
   }
};
+88 −92
Original line number Diff line number Diff line
@@ -119,95 +119,91 @@ public:
   }
};

// template<>
// struct Launcher< 3, TNL::Devices::Cuda >
// {
// public:
//    using Vector = TNL::Containers::StaticVector< 3, int >;
//    using ConvolutionKernel = Convolution< 3, TNL::Devices::Cuda >;

//    template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store >
//    static inline void
//    exec( const Vector& dimensions,
//          const Vector& kernelSize,
//          FetchData&& fetchData,
//          FetchBoundary&& fetchBoundary,
//          FetchKernel&& fetchKernel,
//          Convolve&& convolve,
//          Store&& store )
//    {
//       const Index sizeX = dimensions.x();
//       const Index sizeY = dimensions.y();
//       const Index sizeZ = dimensions.z();

//       TNL::Cuda::LaunchConfiguration launchConfig;

//       launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >(
//          kernelSize.x(), kernelSize.y(), kernelSize.z(), dimensions.x(), dimensions.y(), dimensions.z() );

//       if( sizeX >= sizeY * sizeY * sizeZ * sizeZ ) {
//          launchConfig.blockSize.x = TNL::min( 256, sizeX );
//          launchConfig.blockSize.y = 1;
//          launchConfig.blockSize.z = 1;
//       }
//       else if( sizeY >= sizeX * sizeX * sizeZ * sizeZ ) {
//          launchConfig.blockSize.x = 1;
//          launchConfig.blockSize.y = TNL::min( 256, sizeY );
//          launchConfig.blockSize.z = 1;
//       }
//       else if( sizeZ >= sizeX * sizeX * sizeY * sizeY ) {
//          launchConfig.blockSize.x = TNL::min( 2, sizeX );
//          launchConfig.blockSize.y = TNL::min( 2, sizeY );
//          // CUDA allows max 64 for launchConfig.blockSize.z
//          launchConfig.blockSize.z = TNL::min( 64, sizeZ );
//       }
//       else if( sizeX >= sizeZ * sizeZ && sizeY >= sizeZ * sizeZ ) {
//          launchConfig.blockSize.x = TNL::min( 32, sizeX );
//          launchConfig.blockSize.y = TNL::min( 8, sizeY );
//          launchConfig.blockSize.z = 1;
//       }
//       else if( sizeX >= sizeY * sizeY && sizeZ >= sizeY * sizeY ) {
//          launchConfig.blockSize.x = TNL::min( 32, sizeX );
//          launchConfig.blockSize.y = 1;
//          launchConfig.blockSize.z = TNL::min( 8, sizeZ );
//       }
//       else if( sizeY >= sizeX * sizeX && sizeZ >= sizeX * sizeX ) {
//          launchConfig.blockSize.x = 1;
//          launchConfig.blockSize.y = TNL::min( 32, sizeY );
//          launchConfig.blockSize.z = TNL::min( 8, sizeZ );
//       }
//       else {
//          launchConfig.blockSize.x = TNL::min( 16, sizeX );
//          launchConfig.blockSize.y = TNL::min( 4, sizeY );
//          launchConfig.blockSize.z = TNL::min( 4, sizeZ );
//       }
//       launchConfig.gridSize.x =
//          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) );
//       launchConfig.gridSize.y =
//          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) );
//       launchConfig.gridSize.z =
//          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeZ, launchConfig.blockSize.z ) );

//       dim3 gridCount;
//       gridCount.x = roundUpDivision( sizeX, launchConfig.blockSize.x * launchConfig.gridSize.x );
//       gridCount.y = roundUpDivision( sizeY, launchConfig.blockSize.y * launchConfig.gridSize.y );
//       gridCount.z = roundUpDivision( sizeZ, launchConfig.blockSize.z * launchConfig.gridSize.z );

//       constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;

//       TNL::Cuda::launchKernel< true >( kernel,
//                                        0,
//                                        launchConfig,
//                                        kernelSize.x(),
//                                        kernelSize.y(),
//                                        kernelSize.z(),
//                                        dimensions.x(),
//                                        dimensions.y(),
//                                        dimensions.z(),
//                                        std::forward< FetchData >( fetchData ),
//                                        std::forward< FetchBoundary >( fetchBoundary ),
//                                        std::forward< FetchKernel >( fetchKernel ),
//                                        std::forward< Convolve >( convolve ),
//                                        std::forward< Store >( store ) );
//    }
// };
template<>
struct Launcher< 3, TNL::Devices::Cuda >
{
public:
   using Vector = TNL::Containers::StaticVector< 3, int >;
   using ConvolutionKernel = Convolution< 3, TNL::Devices::Cuda >;

   template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store >
   static inline void
   exec( const Vector& dimensions,
         const Vector& kernelSize,
         FetchData&& fetchData,
         FetchBoundary&& fetchBoundary,
         FetchKernel&& fetchKernel,
         Convolve&& convolve,
         Store&& store )
   {
      const Index sizeX = dimensions.x();
      const Index sizeY = dimensions.y();
      const Index sizeZ = dimensions.z();

      TNL::Cuda::LaunchConfiguration launchConfig;

      launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >(
         kernelSize.x(), kernelSize.y(), kernelSize.z(), dimensions.x(), dimensions.y(), dimensions.z() );

      if( sizeX >= sizeY * sizeY * sizeZ * sizeZ ) {
         launchConfig.blockSize.x = TNL::min( 256, sizeX );
         launchConfig.blockSize.y = 1;
         launchConfig.blockSize.z = 1;
      }
      else if( sizeY >= sizeX * sizeX * sizeZ * sizeZ ) {
         launchConfig.blockSize.x = 1;
         launchConfig.blockSize.y = TNL::min( 256, sizeY );
         launchConfig.blockSize.z = 1;
      }
      else if( sizeZ >= sizeX * sizeX * sizeY * sizeY ) {
         launchConfig.blockSize.x = TNL::min( 2, sizeX );
         launchConfig.blockSize.y = TNL::min( 2, sizeY );
         // CUDA allows max 64 for launchConfig.blockSize.z
         launchConfig.blockSize.z = TNL::min( 64, sizeZ );
      }
      else if( sizeX >= sizeZ * sizeZ && sizeY >= sizeZ * sizeZ ) {
         launchConfig.blockSize.x = TNL::min( 32, sizeX );
         launchConfig.blockSize.y = TNL::min( 8, sizeY );
         launchConfig.blockSize.z = 1;
      }
      else if( sizeX >= sizeY * sizeY && sizeZ >= sizeY * sizeY ) {
         launchConfig.blockSize.x = TNL::min( 32, sizeX );
         launchConfig.blockSize.y = 1;
         launchConfig.blockSize.z = TNL::min( 8, sizeZ );
      }
      else if( sizeY >= sizeX * sizeX && sizeZ >= sizeX * sizeX ) {
         launchConfig.blockSize.x = 1;
         launchConfig.blockSize.y = TNL::min( 32, sizeY );
         launchConfig.blockSize.z = TNL::min( 8, sizeZ );
      }
      else {
         launchConfig.blockSize.x = TNL::min( 16, sizeX );
         launchConfig.blockSize.y = TNL::min( 4, sizeY );
         launchConfig.blockSize.z = TNL::min( 4, sizeZ );
      }

      launchConfig.gridSize.x =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) );
      launchConfig.gridSize.y =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) );
      launchConfig.gridSize.z =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeZ, launchConfig.blockSize.z ) );

      constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;

      TNL::Cuda::launchKernel< true >( kernel,
                                       0,
                                       launchConfig,
                                       kernelSize.x(),
                                       kernelSize.y(),
                                       kernelSize.z(),
                                       dimensions.x(),
                                       dimensions.y(),
                                       dimensions.z(),
                                       fetchData,
                                       fetchBoundary,
                                       fetchKernel,
                                       convolve,
                                       store );
   }
};