Commit 5a5a294c authored by Yury Hayeu's avatar Yury Hayeu
Browse files

Implement naive 2D kernel

parent 77b4e081
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -23,3 +23,4 @@ endif()
endfunction()

GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/naive.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/naive.h")
+54 −45
Original line number Diff line number Diff line
@@ -56,55 +56,64 @@ convolution1D( Index kernelWidth,
   store( ix, result );
}

// template<>
// struct Convolution< 2, TNL::Devices::Cuda >
// {
// public:
//    template< typename Index >
//    static size_t
//    getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index endX, Index endY )
//    {
//       return 0;
//    }
// };
template<>
struct Convolution< 2, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   static size_t
   getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index endX, Index endY )
   {
      return 0;
   }
};

// template< typename Index,
//           typename Real,
//           typename FetchData,
//           typename FetchBoundary,
//           typename FetchKernel,
//           typename Convolve,
//           typename Store >
// __global__
// static void
// convolution2D( Index kernelWidth,
//                Index kernelHeight,
//                Index endX,
//                Index endY,
//                FetchData& fetchData,
//                FetchBoundary& fetchBoundary,
//                FetchKernel& fetchKernel,
//                Convolve& convolve,
//                Store& store )
// {
//    int iy = threadIdx.y + blockIdx.y * blockDim.y;
//    int ix = threadIdx.x + blockIdx.x * blockDim.x;
template< typename Index,
          typename Real,
          typename FetchData,
          typename FetchBoundary,
          typename FetchKernel,
          typename Convolve,
          typename Store >
__global__
static void
convolution2D( Index kernelWidth,
               Index kernelHeight,
               Index endX,
               Index endY,
               FetchData fetchData,
               FetchBoundary fetchBoundary,
               FetchKernel fetchKernel,
               Convolve convolve,
               Store store )
{
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

//    Real result = 0;
   Index radiusY = kernelHeight >> 1;
   Index radiusX = kernelHeight >> 1;

//    for( Index j = iy - kernelHeight; j <= iy + kernelHeight; j++ ) {
//       for( Index i = ix - kernelWidth; i <= ix + kernelWidth; i++ ) {
//          if( i < 0 || i >= endX || j < 0 || j >= endY ) {
//             result = convolve( result, fetchBoundary( i, j ) );
//          }
//          else {
//             result = convolve( result, fetchData( i, j ), fetchKernel( i, j ) );
//          }
//       }
//    }
   Real result = 0;

//    store( ix, iy, result );
// }
   for( Index j = - radiusY; j <= radiusY; j++ ) {
      Index elementIndexY = j + iy;
      Index kernelIndexY = j + radiusY;

      for( Index i = - radiusX; i <= radiusX; i++ ) {
         Index elementIndexX = i + ix;
         Index kernelIndexX = i + radiusX;

         if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY ) {
            result = convolve( result, fetchBoundary( elementIndexX, elementIndexY ), fetchKernel ( kernelIndexX, kernelIndexY ) );
         }
         else {
            result = convolve( result, fetchData( elementIndexX, elementIndexY ), fetchKernel( kernelIndexX, kernelIndexY ) );
         }
      }
   }

   store( ix, iy, result );
}

// template<>
// struct Convolution< 3, TNL::Devices::Cuda >
+44 −44
Original line number Diff line number Diff line
@@ -54,59 +54,59 @@ public:
   }
};

// template< typename Index, typename Real >
// struct DummyTask< Index, Real, 2, TNL::Devices::Cuda >
// {
// public:
//    static constexpr int Dimension = 2;
//    using Device = TNL::Devices::Cuda;
//    using Vector = TNL::Containers::StaticVector< Dimension, Index >;
//    using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
//    using Launcher = Launcher< Dimension, Device >;
template< typename Index, typename Real >
struct DummyTask< Index, Real, 2, TNL::Devices::Cuda >
{
public:
   static constexpr int Dimension = 2;
   using Device = TNL::Devices::Cuda;
   using Vector = TNL::Containers::StaticVector< Dimension, Index >;
   using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
   using Launcher = Launcher< Dimension, Device >;

//    static void
//    exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
//    {
//       auto fetchData = [ = ] __cuda_callable__( Index i, Index j )
//       {
//          auto index = i + j * dimensions.x();
   static void
   exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
   {
      auto fetchData = [ = ] __cuda_callable__( Index i, Index j )
      {
         auto index = i + j * dimensions.x();

//          return input[ index ];
//       };
         return input[ index ];
      };

//       auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j )
//       {
//          return -1;
//       };
      auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j )
      {
         return -1;
      };

//       auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j )
//       {
//          auto index = i + j * kernel.x();
      auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j )
      {
         auto index = i + j * kernelSize.x();

//          return kernel[ index ];
//       };
         return kernel[ index ];
      };

//       auto convolve = [ = ] __cuda_callable__( Real result, Index data, Index kernel )
//       {
//          return result + data * kernel;
//       };
      auto convolve = [ = ] __cuda_callable__( Real result, Index data, Index kernel )
      {
         return result + data * kernel;
      };

//       auto store = [ = ] __cuda_callable__( Index i, Index j, Real resultValue )
//       {
//          auto index = i + j * dimensions.x();
      auto store = [ = ] __cuda_callable__( Index i, Index j, Real resultValue ) mutable
      {
         auto index = i + j * dimensions.x();

//          result[ index ] = resultValue;
//       };
         result[ index ] = resultValue;
      };

//       Launcher::exec< Index >( dimensions,
//                                kernelSize,
//                                std::forward< decltype( fetchData ) >( fetchData ),
//                                std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
//                                std::forward< decltype( fetchKernel ) >( fetchKernel ),
//                                std::forward< decltype( convolve ) >( convolve ),
//                                std::forward< decltype( store ) >( store ) );
//    }
// };
      Launcher::exec< Index, Real >( dimensions,
                                     kernelSize,
                                     std::forward< decltype( fetchData ) >( fetchData ),
                                     std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
                                     std::forward< decltype( fetchKernel ) >( fetchKernel ),
                                     std::forward< decltype( convolve ) >( convolve ),
                                     std::forward< decltype( store ) >( store ) );
   }
};

// template< typename Index, typename Real >
// struct DummyTask< Index, Real, 3, TNL::Devices::Cuda >
+52 −57
Original line number Diff line number Diff line
@@ -59,70 +59,65 @@ public:
   }
};

// template<>
// struct Launcher< 2, TNL::Devices::Cuda >
// {
// public:
//    using Vector = TNL::Containers::StaticVector< 2, int >;
//    using ConvolutionKernel = Convolution< 2, TNL::Devices::Cuda >;

//    template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store >
//    static inline void
//    exec( const Vector& dimensions,
//          const Vector& kernelSize,
//          FetchData&& fetchData,
//          FetchBoundary&& fetchBoundary,
//          FetchKernel&& fetchKernel,
//          Convolve&& convolve,
//          Store&& store )
//    {
//       TNL::Cuda::LaunchConfiguration launchConfig;

//       launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >(
//          kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y() );
template<>
struct Launcher< 2, TNL::Devices::Cuda >
{
public:
   using Vector = TNL::Containers::StaticVector< 2, int >;
   using ConvolutionKernel = Convolution< 2, TNL::Devices::Cuda >;

//       const Index sizeX = dimensions.x();
//       const Index sizeY = dimensions.y();
   template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store >
   static inline void
   exec( const Vector& dimensions,
         const Vector& kernelSize,
         FetchData&& fetchData,
         FetchBoundary&& fetchBoundary,
         FetchKernel&& fetchKernel,
         Convolve&& convolve,
         Store&& store )
   {
      TNL::Cuda::LaunchConfiguration launchConfig;

//       if( sizeX >= sizeY * sizeY ) {
//          launchConfig.blockSize.x = TNL::min( 256, sizeX );
//          launchConfig.blockSize.y = 1;
//       }
//       else if( sizeY >= sizeX * sizeX ) {
//          launchConfig.blockSize.x = 1;
//          launchConfig.blockSize.y = TNL::min( 256, sizeY );
//       }
//       else {
//          launchConfig.blockSize.x = TNL::min( 32, sizeX );
//          launchConfig.blockSize.y = TNL::min( 8, sizeY );
//       }
      launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >(
         kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y() );

//       launchConfig.gridSize.x =
//          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) );
//       launchConfig.gridSize.y =
//          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) );
      const Index sizeX = dimensions.x();
      const Index sizeY = dimensions.y();

//       dim3 gridCount;
      if( sizeX >= sizeY * sizeY ) {
         launchConfig.blockSize.x = TNL::min( 256, sizeX );
         launchConfig.blockSize.y = 1;
      }
      else if( sizeY >= sizeX * sizeX ) {
         launchConfig.blockSize.x = 1;
         launchConfig.blockSize.y = TNL::min( 256, sizeY );
      }
      else {
         launchConfig.blockSize.x = TNL::min( 32, sizeX );
         launchConfig.blockSize.y = TNL::min( 8, sizeY );
      }

//       gridCount.x = roundUpDivision( sizeX, launchConfig.blockSize.x * launchConfig.gridSize.x );
//       gridCount.y = roundUpDivision( sizeY, launchConfig.blockSize.y * launchConfig.gridSize.y );
      launchConfig.gridSize.x =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) );
      launchConfig.gridSize.y =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) );

//       constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
      constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;

//       TNL::Cuda::launchKernel< true >( kernel,
//                                        0,
//                                        launchConfig,
//                                        kernelSize.x(),
//                                        kernelSize.y(),
//                                        dimensions.x(),
//                                        dimensions.y(),
//                                        std::forward< FetchData >( fetchData ),
//                                        std::forward< FetchBoundary >( fetchBoundary ),
//                                        std::forward< FetchKernel >( fetchKernel ),
//                                        std::forward< Convolve >( convolve ),
//                                        std::forward< Store >( store ) );
//    }
// };
      TNL::Cuda::launchKernel< true >( kernel,
                                       0,
                                       launchConfig,
                                       kernelSize.x(),
                                       kernelSize.y(),
                                       dimensions.x(),
                                       dimensions.y(),
                                       fetchData,
                                       fetchBoundary,
                                       fetchKernel,
                                       convolve,
                                       store );
   }
};

// template<>
// struct Launcher< 3, TNL::Devices::Cuda >