Commit 6cc6bffe authored by Yury Hayeu's avatar Yury Hayeu
Browse files

Fix heat equation solver

parent 82c7ee07
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -74,4 +74,4 @@ GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "k
GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedKernel.h")
GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedDataAndKernel.h")

GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/heatEquationSharedData.h")
GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/sharedDataAndKernel.h")
+0 −182
Original line number Diff line number Diff line

#ifdef HAVE_CUDA

/**
 * This kernel stores an image tile in shared memory
 * and then computes the convolution.
 *
 * Thanks for the idea: https://www.evl.uic.edu/sjames/cs525/final.html
 */

#include <TNL/Devices/Cuda.h>
#include <TNL/Containers/StaticVector.h>
#include <TNL/Cuda/LaunchHelpers.h>
#include <TNL/Cuda/SharedMemory.h>

template< int Dimension, typename Device >
struct Convolution;

template< typename Index,
          typename Real,
          typename FetchData,
          typename FetchBoundary,
          typename Convolve,
          typename Store >
__global__
static void
convolution2D( Index kernelWidth,
               Index kernelHeight,
               Index endX,
               Index endY,
               FetchData fetchData,
               FetchBoundary fetchBoundary,
               Convolve convolve,
               Store store )
{
   Real* data = TNL::Cuda::getSharedMemory< Real >();

   const Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   const Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   const Index radiusY = kernelHeight >> 1;
   const Index radiusX = kernelWidth >> 1;

   const Index dataBlockWidth = 2 * kernelWidth - 1;
   const Index dataBlockHeight = 2 * kernelHeight - 1;

   const Index dataBlockRadiusX = dataBlockWidth >> 1;
   const Index dataBlockRadiusY = dataBlockHeight >> 1;

   Index x, y, index;

   // Top Left
   x = ix - radiusX;
   y = iy - radiusY;
   index = threadIdx.x + threadIdx.y * dataBlockWidth;

   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
      data[ index ] = fetchBoundary( x, y );
   }
   else {
      data[ index ] = fetchData( x, y );
   }

   // Top right
   x = ix + radiusX;
   y = iy - radiusY;
   index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth;

   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
      data[ index ] = fetchBoundary( x, y );
   }
   else {
      data[ index ] = fetchData( x, y );
   }

   // Bottom Left
   x = ix - radiusX;
   y = iy + radiusY;
   index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth;

   if(x < 0 || y < 0 || x >= endX || y >= endY ) {
      data[ index ] = fetchBoundary( x, y );
   }
   else {
      data[ index ] = fetchData( x, y );
   }

   // Bottom Right
   x = ix + radiusX;
   y = iy + radiusY;
   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth;

   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
      data[ index ] = fetchBoundary( x, y );
   }
   else {
      data[ index ] = fetchData( x, y );
   }

   __syncthreads();

   if( ix >= endX || iy >= endY )
      return;

   Real result = 0;

   for( Index j = 0; j < kernelHeight; j++ ) {
      Index align = ( j + threadIdx.y ) * dataBlockWidth;

      for( Index i = 0; i < kernelWidth; i++ ) {
         Index index = i + threadIdx.x + align;

         result = convolve( result, ix, iy, i, j, data[ index ]);
      }
   }

   store( ix, iy, result );
}


template<>
struct Convolution< 2, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 2, Index >;

   template< typename Index, typename Real >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
      Index kernelElementCount = 1;

      for( Index i = 0; i < kernelSize.getSize(); i++ )
         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;

      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );

      configuration.blockSize.x = kernelSize.x();
      configuration.blockSize.y = kernelSize.y();

      configuration.gridSize.x =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
      configuration.gridSize.y =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
   }

   template< typename Index,
             typename Real,
             typename FetchData,
             typename FetchBoundary,
             typename Convolve,
             typename Store >
   static void
   execute( const Vector< Index >& dimensions,
            const Vector< Index >& kernelSize,
            FetchData&& fetchData,
            FetchBoundary&& fetchBoundary,
            Convolve&& convolve,
            Store&& store )
   {
      TNL::Cuda::LaunchConfiguration configuration;

      setup< Index, Real >( configuration, dimensions, kernelSize );

      constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, Convolve, Store >;

      TNL::Cuda::launchKernel< true >( kernel,
                                       0,
                                       configuration,
                                       kernelSize.x(),
                                       kernelSize.y(),
                                       dimensions.x(),
                                       dimensions.y(),
                                       fetchData,
                                       fetchBoundary,
                                       convolve,
                                       store );
   };
};

#endif
+54 −8
Original line number Diff line number Diff line
@@ -2,7 +2,7 @@
#pragma once

#include "Solver.h"
#include "HeatEquationTask.h"
#include "DummyTask.h"

#include <TNL/Algorithms/ParallelFor.h>
#include <TNL/Timer.h>
@@ -11,7 +11,11 @@ static std::vector< TNL::String > dimensionIds = { "grid-x-size", "grid-y-size"
static std::vector< TNL::String > kernelSizeIds = { "kernel-x-size", "kernel-y-size" };
static std::vector< TNL::String > domainIds = { "domain-x-size", "domain-y-size" };
static std::vector< TNL::String > kernelDomainIds = { "kernel-domain-x-size", "kernel-domain-y-size" };
static std::string sigmaKey = "sigma";

static std::string alphaKey = "alpha";
static std::string betaKey = "beta";
static std::string gammaKey = "gamma";

static std::string timeStepKey = "timeStep";
static std::string startTimeKey = "startTime";
static std::string finalTimeKey = "finalTime";
@@ -116,11 +120,15 @@ public:
      config.addEntry< Real >( domainIds[ 0 ], "Domain size along x-axis.", 4.0 );
      config.addEntry< Real >( domainIds[ 1 ], "Domain size along y-axis.", 4.0 );

      config.addEntry< Real >( kernelDomainIds[ 0 ], "Kernel domain size along x-axis.", 3.0 );
      config.addEntry< Real >( kernelDomainIds[ 1 ], "Kernel domain size along y-axis.", 3.0 );
      config.addEntry< Real >( kernelDomainIds[ 0 ], "Kernel domain size along x-axis.", 4.0 );
      config.addEntry< Real >( kernelDomainIds[ 1 ], "Kernel domain size along y-axis.", 4.0 );

      config.addEntry< Real >( sigmaKey, "Sigma in exponential initial condition.", 0.5);
      config.addDelimiter( "Initial condition settings ( (x^2/alpha + y^2/beta) + gamma)):" );
      config.addEntry< Real >( alphaKey, "Alpha value in initial condition", -0.05 );
      config.addEntry< Real >( betaKey, "Beta value in initial condition", -0.05 );
      config.addEntry< Real >( gammaKey, "Gamma key in initial condition", 15 );

      config.addDelimiter( "Time settings:" );
      config.addEntry< Real >( startTimeKey, "Final time of the simulation.", 0.0);
      config.addEntry< Real >( timeStepKey, "Time step of the simulation.", 0.005);
      config.addEntry< Real >( finalTimeKey, "Final time of the simulation.", 0.36);
@@ -142,7 +150,10 @@ public:

      auto xDomainSize = parameters.getParameter< Real >( domainIds[ 0 ] );
      auto yDomainSize = parameters.getParameter< Real >( domainIds[ 1 ] );
      auto sigma = parameters.getParameter< Real >( sigmaKey );

      auto alpha = parameters.getParameter< Real >( alphaKey );
      auto beta = parameters.getParameter< Real >( betaKey );
      auto gamma = parameters.getParameter< Real >( gammaKey );

      auto init = [ = ] __cuda_callable__( int i, int j ) mutable
      {
@@ -151,7 +162,7 @@ public:
         auto x = i * spaceSteps.x() - domain.x() / 2.;
         auto y = j * spaceSteps.y() - domain.y() / 2.;

         functionView[ index ] = exp( sigma * ( x * x + y * y ) );
         functionView[ index ] = TNL::max((x * x / alpha)  + (y * y / beta) + gamma, 0);
      };

      TNL::Algorithms::ParallelFor2D< Device >::exec( 0, 0, dimensions.x(), dimensions.y(), init );
@@ -168,7 +179,42 @@ public:
             typename DataStore::ViewType result,
             const Real time ) const
   {
      HeatEquationTask< int, Real, Dimension, Device >::exec( dimensions, kernelSize, domain, kernelDomain, time, input, result);
      DataStore kernel;
      kernel.resize(kernelSize.x() * kernelSize.y());

      auto kernelView = kernel.getView();
      auto domainSpaceSteps = Point(domain.x() / dimensions.x(), domain.y() / dimensions.y());
      auto kernelSpaceSteps = Point(kernelDomain.x() / (kernelSize.x() - 1), kernelDomain.y() / (kernelSize.y() - 1));

      auto init = [ = ] __cuda_callable__( int i, int j ) mutable {
         auto index = j * kernelSize.x() + i;

         auto x = i * kernelSpaceSteps.x() - kernelDomain.x() / 2.;
         auto y = j * kernelSpaceSteps.y() - kernelDomain.y() / 2.;

         // The space step is given by the function domain.
         // However, because the kernel is limited to a 31x31 size,
         // the user can specify a custom kernel domain from which values are taken.
         kernelView[ index ] = domainSpaceSteps.x() * domainSpaceSteps.y() * ( (Real)1 / ( (Real)4 * M_PI * time ) ) * exp( - ( pow(x, 2.) + pow(y, 2.)  ) / ( (Real)4 * time ) );
      };

      TNL::Algorithms::ParallelFor2D< Device >::exec( 0, 0, kernelSize.x(), kernelSize.y(), init );

      // std::cout << std::endl << std::endl << std::endl;

      for (int i = 0; i < kernelSize.x(); i++) {
         for (int j = 0; j < kernelSize.y(); j++) {
            auto index = j * kernelSize.x() + i;

            printf("%lf ", kernelView.getElement(index));
         }

         printf("\n");
      }

      auto kernelConstView = kernel.getConstView();

      DummyTask<int, Real, Dimension, Device>::exec(dimensions, kernelSize, input, result, kernelConstView, 0);
   }

   bool
+0 −94
Original line number Diff line number Diff line

#pragma once

template< int Dimension, typename Device >
struct Convolution
{
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< Dimension, Index >;

   template< typename Index,
             typename Real,
             typename FetchData,
             typename FetchBoundary,
             typename Convolve,
             typename Store >
   static void
   execute( const Vector< Index >& dimensions,
            const Vector< Index >& kernelSize,
            FetchData&& fetchData,
            FetchBoundary&& fetchBoundary,
            Convolve&& convolve,
            Store&& store );
};

template< typename Index, typename Real, int Dimension, typename Device >
struct HeatEquationTask;

template< typename Index, typename Real >
struct HeatEquationTask< Index, Real, 2, TNL::Devices::Cuda >
{
public:
   static constexpr int Dimension = 2;
   using Device = TNL::Devices::Cuda;
   using Vector = TNL::Containers::StaticVector< Dimension, Index >;
   using Point = TNL::Containers::StaticVector< Dimension, Real >;
   using ConstDataStore = typename TNL::Containers::Vector< Real, Device, Index >::ConstViewType;
   using DataStore = typename TNL::Containers::Vector< Real, Device, Index >::ViewType;
   using ConvolutionLauncher = Convolution< Dimension, Device >;

   static void
   exec( const Vector& dimensions,
         const Vector& kernelSize,
         const Point& functionDomain,
         const Point& kernelDomain,
         const Real time,
         ConstDataStore& input,
         DataStore& result)
   {
      auto functionSpaceSteps = Point(functionDomain.x() / dimensions.x(), functionDomain.y() / dimensions.y());
      auto kernelSpaceSteps = Point(kernelDomain.x() / kernelSize.x(), kernelDomain.y() / kernelSize.y());

      auto fetchData = [ = ] __cuda_callable__( Index i, Index j )
      {
         auto index = i + j * dimensions.x();

         return input[ index ];
      };

      auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j )
      {
         return 0;
      };

      auto convolve = [ = ] __cuda_callable__( Real result, Index dataX, Index dataY, Index kernelX, Index kernelY, Real data )
      {
         auto functionXPos = dataX * functionSpaceSteps.x() - (functionDomain.x() / 2),
              functionYPos = dataY * functionSpaceSteps.y() - (functionDomain.y() / 2);

         auto kernelXPos = (kernelX - kernelSize.x() / 2) * kernelSpaceSteps.x(),
              kernelYPos = (kernelY - kernelSize.y() / 2) * kernelSpaceSteps.y();

         auto deltaXPos = kernelXPos - functionXPos,
              deltaYPos = kernelYPos - functionYPos;

         auto kernel = kernelSpaceSteps.x() * kernelSpaceSteps.y() * ( (Real)1 / ( (Real)4 * M_PI * time ) ) * exp( - ( pow(deltaXPos, 2.) + pow(deltaYPos, 2.)  ) / ( (Real)4 * time ) );

         return result + data * kernel;
      };

      auto store = [ = ] __cuda_callable__( Index i, Index j, Real resultValue ) mutable
      {
         auto index = i + j * dimensions.x();

         result[ index ] = resultValue;
      };

      ConvolutionLauncher::execute< Index, Real >( dimensions,
                                                   kernelSize,
                                                   std::forward< decltype( fetchData ) >( fetchData ),
                                                   std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
                                                   std::forward< decltype( convolve ) >( convolve ),
                                                   std::forward< decltype( store ) >( store ) );
   }
};