Move kernel launching in kernel definition (3571b27e) · Commits · TNL / tnl-dev

src/Benchmarks/Convolution/kernels/naive.h

+178 −74

Original line number	Diff line number	Diff line
		@@ -11,32 +11,13 @@
		* There are several pitfalls with such configuration.
		*
		* 1. At first we don't use shared memory
		* 2. At second we don't control block size, so we may launch extremely small kernels or otherwise we can launch extremely large kernels.
		* 2. At second we don't control block size, so we may launch extremely small kernels or otherwise we can launch extremely large
		* kernels.
		*/

		/**
		 * Primary Convolution template, declared only. The explicit specializations
		 * for TNL::Devices::Cuda in dimensions 1, 2 and 3 follow in this header.
		 */
		template< int Dimension, typename Device >
		struct Convolution;

		/**
		 * Launch-configuration helper for the naive 1D convolution kernel on CUDA.
		 */
		template<>
		struct Convolution< 1, TNL::Devices::Cuda >
		{
		   template< typename Index >
		   using Vector = TNL::Containers::StaticVector< 1, Index >;

		   /**
		    * Fills \e configuration for a 1D launch.
		    *
		    * No dynamic shared memory is requested. The block width is set to the
		    * kernel width, and the grid width is the value returned by
		    * TNL::Cuda::getNumberOfBlocks for the data extent, capped at
		    * TNL::Cuda::getMaxGridSize().
		    *
		    * \param configuration  Launch configuration to fill in.
		    * \param dimensions     Extent of the data.
		    * \param kernelSize     Extent of the convolution kernel.
		    */
		   template< typename Index, typename Real >
		   static void
		   setup( TNL::Cuda::LaunchConfiguration& configuration,
		          const Vector< Index >& dimensions,
		          const Vector< Index >& kernelSize )
		   {
		      // TODO: - Benchmark the best value
		      configuration.blockSize.x = kernelSize.x();

		      const auto blocksX = TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x );
		      configuration.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), blocksX );

		      configuration.dynamicSharedMemorySize = 0;
		   }
		};

		template< typename Index,
		typename Real,
		typename FetchData,
		@@ -78,29 +59,6 @@ convolution1D( Index kernelWidth,
		store( ix, result );
		}

		/**
		 * Launch-configuration helper for the naive 2D convolution kernel on CUDA.
		 */
		template<>
		struct Convolution< 2, TNL::Devices::Cuda >
		{
		   template< typename Index >
		   using Vector = TNL::Containers::StaticVector< 2, Index >;

		   /**
		    * Fills \e configuration for a 2D launch.
		    *
		    * No dynamic shared memory is requested. The block extent equals the
		    * kernel extent on each axis, and each grid extent is the value returned by
		    * TNL::Cuda::getNumberOfBlocks for that axis, capped at
		    * TNL::Cuda::getMaxGridSize().
		    *
		    * \param configuration  Launch configuration to fill in.
		    * \param dimensions     Extent of the data.
		    * \param kernelSize     Extent of the convolution kernel.
		    */
		   template< typename Index, typename Real >
		   static void
		   setup( TNL::Cuda::LaunchConfiguration& configuration,
		          const Vector< Index >& dimensions,
		          const Vector< Index >& kernelSize )
		   {
		      configuration.blockSize.x = kernelSize.x();
		      configuration.blockSize.y = kernelSize.y();

		      const auto blocksX = TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x );
		      const auto blocksY = TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y );

		      configuration.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), blocksX );
		      configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), blocksY );

		      configuration.dynamicSharedMemorySize = 0;
		   }
		};

		template< typename Index,
		typename Real,
		typename FetchData,
		@@ -152,33 +110,6 @@ convolution2D( Index kernelWidth,
		store( ix, iy, result );
		}

		/**
		 * Launch-configuration helper for the naive 3D convolution kernel on CUDA.
		 */
		template<>
		struct Convolution< 3, TNL::Devices::Cuda >
		{
		public:
		   template< typename Index >
		   using Vector = TNL::Containers::StaticVector< 3, Index >;

		   /**
		    * Fills \e configuration for a 3D launch.
		    *
		    * No dynamic shared memory is requested. The block extent equals the
		    * kernel extent on each axis, and each grid extent is the value returned by
		    * TNL::Cuda::getNumberOfBlocks for that axis, capped at
		    * TNL::Cuda::getMaxGridSize().
		    *
		    * \param configuration  Launch configuration to fill in.
		    * \param dimensions     Extent of the data.
		    * \param kernelSize     Extent of the convolution kernel.
		    */
		   template< typename Index, typename Real >
		   static void
		   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
		   {
		      configuration.dynamicSharedMemorySize = 0;

		      // TODO: - Benchmark the best value
		      configuration.blockSize.x = kernelSize.x();
		      configuration.blockSize.y = kernelSize.y();
		      configuration.blockSize.z = kernelSize.z();

		      configuration.gridSize.x =
		         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
		      configuration.gridSize.y =
		         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
		      // The z-axis block count belongs in gridSize.z; writing it to gridSize.y
		      // would overwrite the y extent and leave z at its default.
		      configuration.gridSize.z =
		         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
		   }
		};

		template< typename Index,
		typename Real,
		typename FetchData,
		@@ -244,4 +175,177 @@ convolution3D( Index kernelWidth,
		store( ix, iy, iz, result );
		}

		/**
		 * Primary Convolution template, declared only. The explicit specializations
		 * for TNL::Devices::Cuda in dimensions 1, 2 and 3 follow below.
		 */
		template< int Dimension, typename Device >
		struct Convolution;

		template<>
		struct Convolution< 1, TNL::Devices::Cuda >
		{
		public:
		template< typename Index >
		using Vector = TNL::Containers::StaticVector< 1, Index >;

		template< typename Index, typename Real >
		static void
		setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
		{
		configuration.dynamicSharedMemorySize = 0;

		// TODO: - Benchmark the best value
		configuration.blockSize.x = kernelSize.x();
		configuration.gridSize.x =
		TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
		}

		template< typename Index,
		typename Real,
		typename FetchData,
		typename FetchBoundary,
		typename FetchKernel,
		typename Convolve,
		typename Store >
		static void
		execute( const Vector< Index >& dimensions,
		const Vector< Index >& kernelSize,
		FetchData&& fetchData,
		FetchBoundary&& fetchBoundary,
		FetchKernel&& fetchKernel,
		Convolve&& convolve,
		Store&& store )
		{
		TNL::Cuda::LaunchConfiguration configuration;

		setup< Index, Real >( configuration, dimensions, kernelSize );

		constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;

		TNL::Cuda::launchKernel< true >(
		kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store );
		};
		};

		template<>
		struct Convolution< 2, TNL::Devices::Cuda >
		{
		public:
		template< typename Index >
		using Vector = TNL::Containers::StaticVector< 2, Index >;

		template< typename Index, typename Real >
		static void
		setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
		{
		configuration.dynamicSharedMemorySize = 0;

		configuration.blockSize.x = kernelSize.x();
		configuration.blockSize.y = kernelSize.y();

		configuration.gridSize.x =
		TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
		configuration.gridSize.y =
		TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
		}

		template< typename Index,
		typename Real,
		typename FetchData,
		typename FetchBoundary,
		typename FetchKernel,
		typename Convolve,
		typename Store >
		static void
		execute( const Vector< Index >& dimensions,
		const Vector< Index >& kernelSize,
		FetchData&& fetchData,
		FetchBoundary&& fetchBoundary,
		FetchKernel&& fetchKernel,
		Convolve&& convolve,
		Store&& store )
		{
		TNL::Cuda::LaunchConfiguration configuration;

		setup< Index, Real >( configuration, dimensions, kernelSize );

		constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;

		TNL::Cuda::launchKernel< true >( kernel,
		0,
		configuration,
		kernelSize.x(),
		kernelSize.y(),
		dimensions.x(),
		dimensions.y(),
		fetchData,
		fetchBoundary,
		fetchKernel,
		convolve,
		store );
		};
		};

		/**
		 * Configuration and launch of the naive 3D convolution kernel on CUDA.
		 */
		template<>
		struct Convolution< 3, TNL::Devices::Cuda >
		{
		public:
		   template< typename Index >
		   using Vector = TNL::Containers::StaticVector< 3, Index >;

		   /**
		    * Fills \e configuration for a 3D launch.
		    *
		    * No dynamic shared memory is requested. The block extent equals the
		    * kernel extent on each axis, and each grid extent is the value returned by
		    * TNL::Cuda::getNumberOfBlocks for that axis, capped at
		    * TNL::Cuda::getMaxGridSize().
		    *
		    * \param configuration  Launch configuration to fill in.
		    * \param dimensions     Extent of the data.
		    * \param kernelSize     Extent of the convolution kernel.
		    */
		   template< typename Index, typename Real >
		   static void
		   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
		   {
		      configuration.dynamicSharedMemorySize = 0;

		      // TODO: - Benchmark the best value
		      configuration.blockSize.x = kernelSize.x();
		      configuration.blockSize.y = kernelSize.y();
		      configuration.blockSize.z = kernelSize.z();

		      configuration.gridSize.x =
		         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
		      configuration.gridSize.y =
		         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
		      // The z-axis block count belongs in gridSize.z; writing it to gridSize.y
		      // would overwrite the y extent and leave z at its default.
		      configuration.gridSize.z =
		         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
		   }

		   /**
		    * Builds the launch configuration via setup() and launches convolution3D.
		    *
		    * \param dimensions  Extent of the data; x(), y() and z() are passed to the kernel.
		    * \param kernelSize  Extent of the convolution kernel; x(), y() and z() are passed to the kernel.
		    * \param fetchData, fetchBoundary, fetchKernel, convolve, store
		    *        Callables handed to the kernel as lvalues.
		    *
		    * NOTE(review): the deduced functor types instantiate the kernel directly, so
		    * an lvalue argument would yield a reference-typed kernel parameter - confirm
		    * callers always pass rvalues.
		    * NOTE(review): `launchKernel< true >` and the literal `0` are presumably the
		    * synchronous flag and the stream - confirm against TNL::Cuda::launchKernel.
		    */
		   template< typename Index,
		             typename Real,
		             typename FetchData,
		             typename FetchBoundary,
		             typename FetchKernel,
		             typename Convolve,
		             typename Store >
		   static void
		   execute( const Vector< Index >& dimensions,
		            const Vector< Index >& kernelSize,
		            FetchData&& fetchData,
		            FetchBoundary&& fetchBoundary,
		            FetchKernel&& fetchKernel,
		            Convolve&& convolve,
		            Store&& store )
		   {
		      TNL::Cuda::LaunchConfiguration configuration;

		      setup< Index, Real >( configuration, dimensions, kernelSize );

		      constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;

		      TNL::Cuda::launchKernel< true >( kernel,
		                                       0,
		                                       configuration,
		                                       kernelSize.x(),
		                                       kernelSize.y(),
		                                       kernelSize.z(),
		                                       dimensions.x(),
		                                       dimensions.y(),
		                                       dimensions.z(),
		                                       fetchData,
		                                       fetchBoundary,
		                                       fetchKernel,
		                                       convolve,
		                                       store );
		   }
		};

		#endif

src/Benchmarks/Convolution/kernels/sharedData.h

+186 −86

Original line number	Diff line number	Diff line
		@@ -14,33 +14,6 @@
		#include <TNL/Cuda/LaunchHelpers.h>
		#include <TNL/Cuda/SharedMemory.h>

		/**
		 * Primary Convolution template, declared only. The explicit specializations
		 * for TNL::Devices::Cuda in dimensions 1, 2 and 3 follow in this header.
		 */
		template< int Dimension, typename Device >
		struct Convolution;

		/**
		 * Launch-configuration helper for the shared-data 1D convolution kernel on CUDA.
		 */
		template<>
		struct Convolution< 1, TNL::Devices::Cuda >
		{
		public:
		   template< typename Index >
		   using Vector = TNL::Containers::StaticVector< 1, Index >;

		   /**
		    * Fills \e configuration for a 1D launch.
		    *
		    * Dynamic shared memory is sized for the product over all axes of
		    * ( 2 * kernelSize[ i ] - 1 ) values of type \e Real (presumably a
		    * block-sized tile plus its halo - confirm against the kernel). The block
		    * width equals the kernel width, and the grid width is the value returned by
		    * TNL::Cuda::getNumberOfBlocks for the data extent, capped at
		    * TNL::Cuda::getMaxGridSize().
		    *
		    * \param configuration  Launch configuration to fill in.
		    * \param dimensions     Extent of the data.
		    * \param kernelSize     Extent of the convolution kernel.
		    */
		   template< typename Index, typename Real >
		   static void
		   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
		   {
		      Index kernelElementCount = 1;

		      // Accumulate the per-axis factor multiplicatively.
		      for( Index i = 0; i < kernelSize.getSize(); i++ )
		         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;

		      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );

		      configuration.blockSize.x = kernelSize.x();
		      configuration.gridSize.x =
		         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
		   }
		};

		template< typename Index,
		typename Real,
		typename FetchData,
		@@ -100,34 +73,6 @@ convolution1D( Index kernelWidth,
		store( ix, result );
		}

		/**
		 * Launch-configuration helper for the shared-data 2D convolution kernel on CUDA.
		 */
		template<>
		struct Convolution< 2, TNL::Devices::Cuda >
		{
		public:
		   template< typename Index >
		   using Vector = TNL::Containers::StaticVector< 2, Index >;

		   /**
		    * Fills \e configuration for a 2D launch.
		    *
		    * Dynamic shared memory is sized for the product over all axes of
		    * ( 2 * kernelSize[ i ] - 1 ) values of type \e Real (presumably a
		    * block-sized tile plus its halo - confirm against the kernel). The block
		    * extent equals the kernel extent on each axis, and each grid extent is the
		    * value returned by TNL::Cuda::getNumberOfBlocks for that axis, capped at
		    * TNL::Cuda::getMaxGridSize().
		    *
		    * \param configuration  Launch configuration to fill in.
		    * \param dimensions     Extent of the data.
		    * \param kernelSize     Extent of the convolution kernel.
		    */
		   template< typename Index, typename Real >
		   static void
		   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
		   {
		      Index kernelElementCount = 1;

		      // Accumulate the per-axis factor multiplicatively so every axis contributes.
		      for( Index i = 0; i < kernelSize.getSize(); i++ )
		         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;

		      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );

		      configuration.blockSize.x = kernelSize.x();
		      configuration.blockSize.y = kernelSize.y();

		      configuration.gridSize.x =
		         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
		      configuration.gridSize.y =
		         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
		   }
		};

		template< typename Index,
		typename Real,
		typename FetchData,
		@@ -229,37 +174,6 @@ convolution2D( Index kernelWidth,
		store( ix, iy, result );
		}

		/**
		 * Launch-configuration helper for the shared-data 3D convolution kernel on CUDA.
		 */
		template<>
		struct Convolution< 3, TNL::Devices::Cuda >
		{
		public:
		   template< typename Index >
		   using Vector = TNL::Containers::StaticVector< 3, Index >;

		   /**
		    * Fills \e configuration for a 3D launch.
		    *
		    * Dynamic shared memory is sized for the product over all axes of
		    * ( 2 * kernelSize[ i ] - 1 ) values of type \e Real (presumably a
		    * block-sized tile plus its halo - confirm against the kernel). The block
		    * extent equals the kernel extent on each axis, and each grid extent is the
		    * value returned by TNL::Cuda::getNumberOfBlocks for that axis, capped at
		    * TNL::Cuda::getMaxGridSize().
		    *
		    * \param configuration  Launch configuration to fill in.
		    * \param dimensions     Extent of the data.
		    * \param kernelSize     Extent of the convolution kernel.
		    */
		   template< typename Index, typename Real >
		   static void
		   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
		   {
		      Index kernelElementCount = 1;

		      // Accumulate the per-axis factor multiplicatively so every axis contributes.
		      for( Index i = 0; i < kernelSize.getSize(); i++ )
		         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;

		      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );

		      configuration.blockSize.x = kernelSize.x();
		      configuration.blockSize.y = kernelSize.y();
		      configuration.blockSize.z = kernelSize.z();

		      configuration.gridSize.x =
		         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
		      configuration.gridSize.y =
		         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
		      // The z-axis block count belongs in gridSize.z; writing it to gridSize.y
		      // would overwrite the y extent and leave z at its default.
		      configuration.gridSize.z =
		         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
		   }
		};

		template< typename Index,
		typename Real,
		typename FetchData,
		@@ -429,4 +343,190 @@ convolution3D( Index kernelWidth,
		store( ix, iy, iz, result );
		}

		/**
		 * Primary Convolution template, declared only. The explicit specializations
		 * for TNL::Devices::Cuda in dimensions 1, 2 and 3 follow below.
		 */
		template< int Dimension, typename Device >
		struct Convolution;

		/**
		 * Configuration and launch of the shared-data 1D convolution kernel on CUDA.
		 */
		template<>
		struct Convolution< 1, TNL::Devices::Cuda >
		{
		public:
		   template< typename Index >
		   using Vector = TNL::Containers::StaticVector< 1, Index >;

		   /**
		    * Fills \e configuration for a 1D launch.
		    *
		    * Dynamic shared memory is sized for the product over all axes of
		    * ( 2 * kernelSize[ i ] - 1 ) values of type \e Real (presumably a
		    * block-sized tile plus its halo - confirm against the kernel). The block
		    * width equals the kernel width, and the grid width is the value returned by
		    * TNL::Cuda::getNumberOfBlocks for the data extent, capped at
		    * TNL::Cuda::getMaxGridSize().
		    *
		    * \param configuration  Launch configuration to fill in.
		    * \param dimensions     Extent of the data.
		    * \param kernelSize     Extent of the convolution kernel.
		    */
		   template< typename Index, typename Real >
		   static void
		   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
		   {
		      Index kernelElementCount = 1;

		      // Accumulate the per-axis factor multiplicatively.
		      for( Index i = 0; i < kernelSize.getSize(); i++ )
		         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;

		      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );

		      configuration.blockSize.x = kernelSize.x();
		      configuration.gridSize.x =
		         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
		   }

		   /**
		    * Builds the launch configuration via setup() and launches convolution1D.
		    *
		    * \param dimensions  Extent of the data; x() is passed to the kernel.
		    * \param kernelSize  Extent of the convolution kernel; x() is passed to the kernel.
		    * \param fetchData, fetchBoundary, fetchKernel, convolve, store
		    *        Callables handed to the kernel as lvalues.
		    *
		    * NOTE(review): the deduced functor types instantiate the kernel directly, so
		    * an lvalue argument would yield a reference-typed kernel parameter - confirm
		    * callers always pass rvalues.
		    * NOTE(review): `launchKernel< true >` and the literal `0` are presumably the
		    * synchronous flag and the stream - confirm against TNL::Cuda::launchKernel.
		    */
		   template< typename Index,
		             typename Real,
		             typename FetchData,
		             typename FetchBoundary,
		             typename FetchKernel,
		             typename Convolve,
		             typename Store >
		   static void
		   execute( const Vector< Index >& dimensions,
		            const Vector< Index >& kernelSize,
		            FetchData&& fetchData,
		            FetchBoundary&& fetchBoundary,
		            FetchKernel&& fetchKernel,
		            Convolve&& convolve,
		            Store&& store )
		   {
		      TNL::Cuda::LaunchConfiguration configuration;

		      setup< Index, Real >( configuration, dimensions, kernelSize );

		      constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;

		      TNL::Cuda::launchKernel< true >(
		         kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store );
		   }
		};

		/**
		 * Configuration and launch of the shared-data 2D convolution kernel on CUDA.
		 */
		template<>
		struct Convolution< 2, TNL::Devices::Cuda >
		{
		public:
		   template< typename Index >
		   using Vector = TNL::Containers::StaticVector< 2, Index >;

		   /**
		    * Fills \e configuration for a 2D launch.
		    *
		    * Dynamic shared memory is sized for the product over all axes of
		    * ( 2 * kernelSize[ i ] - 1 ) values of type \e Real (presumably a
		    * block-sized tile plus its halo - confirm against the kernel). The block
		    * extent equals the kernel extent on each axis, and each grid extent is the
		    * value returned by TNL::Cuda::getNumberOfBlocks for that axis, capped at
		    * TNL::Cuda::getMaxGridSize().
		    *
		    * \param configuration  Launch configuration to fill in.
		    * \param dimensions     Extent of the data.
		    * \param kernelSize     Extent of the convolution kernel.
		    */
		   template< typename Index, typename Real >
		   static void
		   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
		   {
		      Index kernelElementCount = 1;

		      // Accumulate the per-axis factor multiplicatively so every axis contributes.
		      for( Index i = 0; i < kernelSize.getSize(); i++ )
		         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;

		      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );

		      configuration.blockSize.x = kernelSize.x();
		      configuration.blockSize.y = kernelSize.y();

		      configuration.gridSize.x =
		         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
		      configuration.gridSize.y =
		         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
		   }

		   /**
		    * Builds the launch configuration via setup() and launches convolution2D.
		    *
		    * \param dimensions  Extent of the data; x() and y() are passed to the kernel.
		    * \param kernelSize  Extent of the convolution kernel; x() and y() are passed to the kernel.
		    * \param fetchData, fetchBoundary, fetchKernel, convolve, store
		    *        Callables handed to the kernel as lvalues.
		    *
		    * NOTE(review): the deduced functor types instantiate the kernel directly, so
		    * an lvalue argument would yield a reference-typed kernel parameter - confirm
		    * callers always pass rvalues.
		    * NOTE(review): `launchKernel< true >` and the literal `0` are presumably the
		    * synchronous flag and the stream - confirm against TNL::Cuda::launchKernel.
		    */
		   template< typename Index,
		             typename Real,
		             typename FetchData,
		             typename FetchBoundary,
		             typename FetchKernel,
		             typename Convolve,
		             typename Store >
		   static void
		   execute( const Vector< Index >& dimensions,
		            const Vector< Index >& kernelSize,
		            FetchData&& fetchData,
		            FetchBoundary&& fetchBoundary,
		            FetchKernel&& fetchKernel,
		            Convolve&& convolve,
		            Store&& store )
		   {
		      TNL::Cuda::LaunchConfiguration configuration;

		      setup< Index, Real >( configuration, dimensions, kernelSize );

		      constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;

		      TNL::Cuda::launchKernel< true >( kernel,
		                                       0,
		                                       configuration,
		                                       kernelSize.x(),
		                                       kernelSize.y(),
		                                       dimensions.x(),
		                                       dimensions.y(),
		                                       fetchData,
		                                       fetchBoundary,
		                                       fetchKernel,
		                                       convolve,
		                                       store );
		   }
		};

		/**
		 * Configuration and launch of the shared-data 3D convolution kernel on CUDA.
		 */
		template<>
		struct Convolution< 3, TNL::Devices::Cuda >
		{
		public:
		   template< typename Index >
		   using Vector = TNL::Containers::StaticVector< 3, Index >;

		   /**
		    * Fills \e configuration for a 3D launch.
		    *
		    * Dynamic shared memory is sized for the product over all axes of
		    * ( 2 * kernelSize[ i ] - 1 ) values of type \e Real (presumably a
		    * block-sized tile plus its halo - confirm against the kernel). The block
		    * extent equals the kernel extent on each axis, and each grid extent is the
		    * value returned by TNL::Cuda::getNumberOfBlocks for that axis, capped at
		    * TNL::Cuda::getMaxGridSize().
		    *
		    * \param configuration  Launch configuration to fill in.
		    * \param dimensions     Extent of the data.
		    * \param kernelSize     Extent of the convolution kernel.
		    */
		   template< typename Index, typename Real >
		   static void
		   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
		   {
		      Index kernelElementCount = 1;

		      // Accumulate the per-axis factor multiplicatively so every axis contributes.
		      for( Index i = 0; i < kernelSize.getSize(); i++ )
		         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;

		      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );

		      configuration.blockSize.x = kernelSize.x();
		      configuration.blockSize.y = kernelSize.y();
		      configuration.blockSize.z = kernelSize.z();

		      configuration.gridSize.x =
		         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
		      configuration.gridSize.y =
		         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
		      // The z-axis block count belongs in gridSize.z; writing it to gridSize.y
		      // would overwrite the y extent and leave z at its default.
		      configuration.gridSize.z =
		         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
		   }

		   /**
		    * Builds the launch configuration via setup() and launches convolution3D.
		    *
		    * \param dimensions  Extent of the data; x(), y() and z() are passed to the kernel.
		    * \param kernelSize  Extent of the convolution kernel; x(), y() and z() are passed to the kernel.
		    * \param fetchData, fetchBoundary, fetchKernel, convolve, store
		    *        Callables handed to the kernel as lvalues.
		    *
		    * NOTE(review): the deduced functor types instantiate the kernel directly, so
		    * an lvalue argument would yield a reference-typed kernel parameter - confirm
		    * callers always pass rvalues.
		    * NOTE(review): `launchKernel< true >` and the literal `0` are presumably the
		    * synchronous flag and the stream - confirm against TNL::Cuda::launchKernel.
		    */
		   template< typename Index,
		             typename Real,
		             typename FetchData,
		             typename FetchBoundary,
		             typename FetchKernel,
		             typename Convolve,
		             typename Store >
		   static void
		   execute( const Vector< Index >& dimensions,
		            const Vector< Index >& kernelSize,
		            FetchData&& fetchData,
		            FetchBoundary&& fetchBoundary,
		            FetchKernel&& fetchKernel,
		            Convolve&& convolve,
		            Store&& store )
		   {
		      TNL::Cuda::LaunchConfiguration configuration;

		      setup< Index, Real >( configuration, dimensions, kernelSize );

		      constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;

		      TNL::Cuda::launchKernel< true >( kernel,
		                                       0,
		                                       configuration,
		                                       kernelSize.x(),
		                                       kernelSize.y(),
		                                       kernelSize.z(),
		                                       dimensions.x(),
		                                       dimensions.y(),
		                                       dimensions.z(),
		                                       fetchData,
		                                       fetchBoundary,
		                                       fetchKernel,
		                                       convolve,
		                                       store );
		   }
		};

		#endif

src/Benchmarks/Convolution/kernels/sharedKernel.h

+183 −83

File changed.

Preview size limit exceeded, changes collapsed.

src/Benchmarks/Convolution/support/DummyTask.h

+47 −26

Original line number	Diff line number	Diff line

		#pragma once

		#include "Launcher.h"
		/**
		 * Generic convolution launcher interface, parameterised by dimension and device.
		 *
		 * Only the declaration of execute() is given here; no definition is visible.
		 * NOTE(review): the definitions presumably come from explicit specializations
		 * in the kernel headers - confirm that every (Dimension, Device) pair that is
		 * used has one, since a call into this primary template has nothing to link to.
		 */
		template< int Dimension, typename Device >
		struct Convolution
		{
		// Vector of per-axis values with one entry per dimension.
		template< typename Index >
		using Vector = TNL::Containers::StaticVector< Dimension, Index >;

		/**
		 * Runs the convolution.
		 *
		 * \param dimensions  Per-axis extent of the data.
		 * \param kernelSize  Per-axis extent of the convolution kernel.
		 * \param fetchData, fetchBoundary, fetchKernel, convolve, store
		 *        Caller-supplied callables; their exact contracts are not visible here.
		 */
		template< typename Index,
		typename Real,
		typename FetchData,
		typename FetchBoundary,
		typename FetchKernel,
		typename Convolve,
		typename Store >
		static void
		execute( const Vector< Index >& dimensions,
		const Vector< Index >& kernelSize,
		FetchData&& fetchData,
		FetchBoundary&& fetchBoundary,
		FetchKernel&& fetchKernel,
		Convolve&& convolve,
		Store&& store );
		};

		/**
		 * Primary DummyTask template, declared only at this point.
		 */
		template< typename Index, typename Real, int Dimension, typename Device >
		struct DummyTask;
		@@ -14,7 +35,7 @@ public:
		using Device = TNL::Devices::Cuda;
		using Vector = TNL::Containers::StaticVector< Dimension, Index >;
		using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
		using Launcher = Launcher< Dimension, Device >;
		using ConvolutionLauncher = Convolution< Dimension, Device >;

		static void
		exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
		@@ -44,7 +65,7 @@ public:
		result[ i ] = resultValue;
		};

		Launcher::exec< Index, Real >( dimensions,
		ConvolutionLauncher::execute< Index, Real >( dimensions,
		kernelSize,
		std::forward< decltype( fetchData ) >( fetchData ),
		std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
		@@ -62,7 +83,7 @@ public:
		using Device = TNL::Devices::Cuda;
		using Vector = TNL::Containers::StaticVector< Dimension, Index >;
		using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
		using Launcher = Launcher< Dimension, Device >;
		using ConvolutionLauncher = Convolution< Dimension, Device >;

		static void
		exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
		@@ -98,7 +119,7 @@ public:
		result[ index ] = resultValue;
		};

		Launcher::exec< Index, Real >( dimensions,
		ConvolutionLauncher::execute< Index, Real >( dimensions,
		kernelSize,
		std::forward< decltype( fetchData ) >( fetchData ),
		std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
		@@ -116,7 +137,7 @@ public:
		using Device = TNL::Devices::Cuda;
		using Vector = TNL::Containers::StaticVector< Dimension, Index >;
		using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
		using Launcher = Launcher< Dimension, Device >;
		using ConvolutionLauncher = Convolution< Dimension, Device >;

		static void
		exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
		@@ -152,7 +173,7 @@ public:
		result[ index ] = resultValue;
		};

		Launcher::exec< Index, Real >( dimensions,
		ConvolutionLauncher::execute< Index, Real >( dimensions,
		kernelSize,
		std::forward< decltype( fetchData ) >( fetchData ),
		std::forward< decltype( fetchBoundary ) >( fetchBoundary ),

src/Benchmarks/Convolution/support/Launcher.h

deleted100644 → 0

+0 −136

File deleted.

Preview size limit exceeded, changes collapsed.