Commit 6cc6bffe authored by Yury Hayeu's avatar Yury Hayeu
Browse files

Fix heat equation solver

parent 82c7ee07
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -74,4 +74,4 @@ GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "k
GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedKernel.h")
GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedDataAndKernel.h")

GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/heatEquationSharedData.h")
GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/sharedDataAndKernel.h")
+0 −182
Original line number Diff line number Diff line

#ifdef HAVE_CUDA

/**
 * This kernel stores an image tile in shared memory
 * and then computes the convolution.
 *
 * Thanks for the idea: https://www.evl.uic.edu/sjames/cs525/final.html
 */

#include <TNL/Devices/Cuda.h>
#include <TNL/Containers/StaticVector.h>
#include <TNL/Cuda/LaunchHelpers.h>
#include <TNL/Cuda/SharedMemory.h>

template< int Dimension, typename Device >
struct Convolution;

template< typename Index,
          typename Real,
          typename FetchData,
          typename FetchBoundary,
          typename Convolve,
          typename Store >
__global__
static void
convolution2D( Index kernelWidth,
               Index kernelHeight,
               Index endX,
               Index endY,
               FetchData fetchData,
               FetchBoundary fetchBoundary,
               Convolve convolve,
               Store store )
{
   Real* data = TNL::Cuda::getSharedMemory< Real >();

   const Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   const Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   const Index radiusY = kernelHeight >> 1;
   const Index radiusX = kernelWidth >> 1;

   const Index dataBlockWidth = 2 * kernelWidth - 1;
   const Index dataBlockHeight = 2 * kernelHeight - 1;

   const Index dataBlockRadiusX = dataBlockWidth >> 1;
   const Index dataBlockRadiusY = dataBlockHeight >> 1;

   Index x, y, index;

   // Top Left
   x = ix - radiusX;
   y = iy - radiusY;
   index = threadIdx.x + threadIdx.y * dataBlockWidth;

   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
      data[ index ] = fetchBoundary( x, y );
   }
   else {
      data[ index ] = fetchData( x, y );
   }

   // Top right
   x = ix + radiusX;
   y = iy - radiusY;
   index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth;

   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
      data[ index ] = fetchBoundary( x, y );
   }
   else {
      data[ index ] = fetchData( x, y );
   }

   // Bottom Left
   x = ix - radiusX;
   y = iy + radiusY;
   index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth;

   if(x < 0 || y < 0 || x >= endX || y >= endY ) {
      data[ index ] = fetchBoundary( x, y );
   }
   else {
      data[ index ] = fetchData( x, y );
   }

   // Bottom Right
   x = ix + radiusX;
   y = iy + radiusY;
   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth;

   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
      data[ index ] = fetchBoundary( x, y );
   }
   else {
      data[ index ] = fetchData( x, y );
   }

   __syncthreads();

   if( ix >= endX || iy >= endY )
      return;

   Real result = 0;

   for( Index j = 0; j < kernelHeight; j++ ) {
      Index align = ( j + threadIdx.y ) * dataBlockWidth;

      for( Index i = 0; i < kernelWidth; i++ ) {
         Index index = i + threadIdx.x + align;

         result = convolve( result, ix, iy, i, j, data[ index ]);
      }
   }

   store( ix, iy, result );
}


template<>
struct Convolution< 2, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 2, Index >;

   template< typename Index, typename Real >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
      Index kernelElementCount = 1;

      for( Index i = 0; i < kernelSize.getSize(); i++ )
         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;

      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );

      configuration.blockSize.x = kernelSize.x();
      configuration.blockSize.y = kernelSize.y();

      configuration.gridSize.x =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
      configuration.gridSize.y =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
   }

   template< typename Index,
             typename Real,
             typename FetchData,
             typename FetchBoundary,
             typename Convolve,
             typename Store >
   static void
   execute( const Vector< Index >& dimensions,
            const Vector< Index >& kernelSize,
            FetchData&& fetchData,
            FetchBoundary&& fetchBoundary,
            Convolve&& convolve,
            Store&& store )
   {
      TNL::Cuda::LaunchConfiguration configuration;

      setup< Index, Real >( configuration, dimensions, kernelSize );

      constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, Convolve, Store >;

      TNL::Cuda::launchKernel< true >( kernel,
                                       0,
                                       configuration,
                                       kernelSize.x(),
                                       kernelSize.y(),
                                       dimensions.x(),
                                       dimensions.y(),
                                       fetchData,
                                       fetchBoundary,
                                       convolve,
                                       store );
   };
};

#endif
+54 −8
Original line number Diff line number Diff line
@@ -2,7 +2,7 @@
#pragma once

#include "Solver.h"
#include "HeatEquationTask.h"
#include "DummyTask.h"

#include <TNL/Algorithms/ParallelFor.h>
#include <TNL/Timer.h>
@@ -11,7 +11,11 @@ static std::vector< TNL::String > dimensionIds = { "grid-x-size", "grid-y-size"
static std::vector< TNL::String > kernelSizeIds = { "kernel-x-size", "kernel-y-size" };
static std::vector< TNL::String > domainIds = { "domain-x-size", "domain-y-size" };
static std::vector< TNL::String > kernelDomainIds = { "kernel-domain-x-size", "kernel-domain-y-size" };
static std::string sigmaKey = "sigma";

static std::string alphaKey = "alpha";
static std::string betaKey = "beta";
static std::string gammaKey = "gamma";

static std::string timeStepKey = "timeStep";
static std::string startTimeKey = "startTime";
static std::string finalTimeKey = "finalTime";
@@ -116,11 +120,15 @@ public:
      config.addEntry< Real >( domainIds[ 0 ], "Domain size along x-axis.", 4.0 );
      config.addEntry< Real >( domainIds[ 1 ], "Domain size along y-axis.", 4.0 );

      config.addEntry< Real >( kernelDomainIds[ 0 ], "Kernel domain size along x-axis.", 3.0 );
      config.addEntry< Real >( kernelDomainIds[ 1 ], "Kernel domain size along y-axis.", 3.0 );
      config.addEntry< Real >( kernelDomainIds[ 0 ], "Kernel domain size along x-axis.", 4.0 );
      config.addEntry< Real >( kernelDomainIds[ 1 ], "Kernel domain size along y-axis.", 4.0 );

      config.addEntry< Real >( sigmaKey, "Sigma in exponential initial condition.", 0.5);
      config.addDelimiter( "Initial condition settings ( (x^2/alpha + y^2/beta) + gamma)):" );
      config.addEntry< Real >( alphaKey, "Alpha value in initial condition", -0.05 );
      config.addEntry< Real >( betaKey, "Beta value in initial condition", -0.05 );
      config.addEntry< Real >( gammaKey, "Gamma key in initial condition", 15 );

      config.addDelimiter( "Time settings:" );
      config.addEntry< Real >( startTimeKey, "Final time of the simulation.", 0.0);
      config.addEntry< Real >( timeStepKey, "Time step of the simulation.", 0.005);
      config.addEntry< Real >( finalTimeKey, "Final time of the simulation.", 0.36);
@@ -142,7 +150,10 @@ public:

      auto xDomainSize = parameters.getParameter< Real >( domainIds[ 0 ] );
      auto yDomainSize = parameters.getParameter< Real >( domainIds[ 1 ] );
      auto sigma = parameters.getParameter< Real >( sigmaKey );

      auto alpha = parameters.getParameter< Real >( alphaKey );
      auto beta = parameters.getParameter< Real >( betaKey );
      auto gamma = parameters.getParameter< Real >( gammaKey );

      auto init = [ = ] __cuda_callable__( int i, int j ) mutable
      {
@@ -151,7 +162,7 @@ public:
         auto x = i * spaceSteps.x() - domain.x() / 2.;
         auto y = j * spaceSteps.y() - domain.y() / 2.;

         functionView[ index ] = exp( sigma * ( x * x + y * y ) );
         functionView[ index ] = TNL::max((x * x / alpha)  + (y * y / beta) + gamma, 0);
      };

      TNL::Algorithms::ParallelFor2D< Device >::exec( 0, 0, dimensions.x(), dimensions.y(), init );
@@ -168,7 +179,42 @@ public:
             typename DataStore::ViewType result,
             const Real time ) const
   {
      HeatEquationTask< int, Real, Dimension, Device >::exec( dimensions, kernelSize, domain, kernelDomain, time, input, result);
      DataStore kernel;
      kernel.resize(kernelSize.x() * kernelSize.y());

      auto kernelView = kernel.getView();
      auto domainSpaceSteps = Point(domain.x() / dimensions.x(), domain.y() / dimensions.y());
      auto kernelSpaceSteps = Point(kernelDomain.x() / (kernelSize.x() - 1), kernelDomain.y() / (kernelSize.y() - 1));

      auto init = [ = ] __cuda_callable__( int i, int j ) mutable {
         auto index = j * kernelSize.x() + i;

         auto x = i * kernelSpaceSteps.x() - kernelDomain.x() / 2.;
         auto y = j * kernelSpaceSteps.y() - kernelDomain.y() / 2.;

         // The space step is given by the function domain.
         // However, because the kernel is limited to a 31x31 size,
         // the user can specify a custom kernel domain from which values are taken.
         kernelView[ index ] = domainSpaceSteps.x() * domainSpaceSteps.y() * ( (Real)1 / ( (Real)4 * M_PI * time ) ) * exp( - ( pow(x, 2.) + pow(y, 2.)  ) / ( (Real)4 * time ) );
      };

      TNL::Algorithms::ParallelFor2D< Device >::exec( 0, 0, kernelSize.x(), kernelSize.y(), init );

      // std::cout << std::endl << std::endl << std::endl;

      for (int i = 0; i < kernelSize.x(); i++) {
         for (int j = 0; j < kernelSize.y(); j++) {
            auto index = j * kernelSize.x() + i;

            printf("%lf ", kernelView.getElement(index));
         }

         printf("\n");
      }

      auto kernelConstView = kernel.getConstView();

      DummyTask<int, Real, Dimension, Device>::exec(dimensions, kernelSize, input, result, kernelConstView, 0);
   }

   bool
+0 −94
Original line number Diff line number Diff line

#pragma once

template< int Dimension, typename Device >
struct Convolution
{
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< Dimension, Index >;

   template< typename Index,
             typename Real,
             typename FetchData,
             typename FetchBoundary,
             typename Convolve,
             typename Store >
   static void
   execute( const Vector< Index >& dimensions,
            const Vector< Index >& kernelSize,
            FetchData&& fetchData,
            FetchBoundary&& fetchBoundary,
            Convolve&& convolve,
            Store&& store );
};

template< typename Index, typename Real, int Dimension, typename Device >
struct HeatEquationTask;

template< typename Index, typename Real >
struct HeatEquationTask< Index, Real, 2, TNL::Devices::Cuda >
{
public:
   static constexpr int Dimension = 2;
   using Device = TNL::Devices::Cuda;
   using Vector = TNL::Containers::StaticVector< Dimension, Index >;
   using Point = TNL::Containers::StaticVector< Dimension, Real >;
   using ConstDataStore = typename TNL::Containers::Vector< Real, Device, Index >::ConstViewType;
   using DataStore = typename TNL::Containers::Vector< Real, Device, Index >::ViewType;
   using ConvolutionLauncher = Convolution< Dimension, Device >;

   static void
   exec( const Vector& dimensions,
         const Vector& kernelSize,
         const Point& functionDomain,
         const Point& kernelDomain,
         const Real time,
         ConstDataStore& input,
         DataStore& result)
   {
      auto functionSpaceSteps = Point(functionDomain.x() / dimensions.x(), functionDomain.y() / dimensions.y());
      auto kernelSpaceSteps = Point(kernelDomain.x() / kernelSize.x(), kernelDomain.y() / kernelSize.y());

      auto fetchData = [ = ] __cuda_callable__( Index i, Index j )
      {
         auto index = i + j * dimensions.x();

         return input[ index ];
      };

      auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j )
      {
         return 0;
      };

      auto convolve = [ = ] __cuda_callable__( Real result, Index dataX, Index dataY, Index kernelX, Index kernelY, Real data )
      {
         auto functionXPos = dataX * functionSpaceSteps.x() - (functionDomain.x() / 2),
              functionYPos = dataY * functionSpaceSteps.y() - (functionDomain.y() / 2);

         auto kernelXPos = (kernelX - kernelSize.x() / 2) * kernelSpaceSteps.x(),
              kernelYPos = (kernelY - kernelSize.y() / 2) * kernelSpaceSteps.y();

         auto deltaXPos = kernelXPos - functionXPos,
              deltaYPos = kernelYPos - functionYPos;

         auto kernel = kernelSpaceSteps.x() * kernelSpaceSteps.y() * ( (Real)1 / ( (Real)4 * M_PI * time ) ) * exp( - ( pow(deltaXPos, 2.) + pow(deltaYPos, 2.)  ) / ( (Real)4 * time ) );

         return result + data * kernel;
      };

      auto store = [ = ] __cuda_callable__( Index i, Index j, Real resultValue ) mutable
      {
         auto index = i + j * dimensions.x();

         result[ index ] = resultValue;
      };

      ConvolutionLauncher::execute< Index, Real >( dimensions,
                                                   kernelSize,
                                                   std::forward< decltype( fetchData ) >( fetchData ),
                                                   std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
                                                   std::forward< decltype( convolve ) >( convolve ),
                                                   std::forward< decltype( store ) >( store ) );
   }
};