Move out kernel configuration to convolution task (3781cb67) · Commits · TNL / tnl-dev

src/Benchmarks/Convolution/kernels/naive.h

+64 −21

Original line number	Diff line number	Diff line
		@@ -2,6 +2,7 @@
		#ifdef HAVE_CUDA

		#include <TNL/Devices/Cuda.h>
		#include <TNL/Containers/StaticVector.h>
		#include <TNL/Cuda/LaunchHelpers.h>

		template< int Dimension, typename Device >
		@@ -12,10 +13,18 @@ struct Convolution< 1, TNL::Devices::Cuda >
		{
		public:
		template< typename Index >
		static size_t
		getDynamicSharedMemorySize( Index kernelWidth, Index endX )
		using Vector = TNL::Containers::StaticVector< 1, Index >;

		// Fills in the CUDA launch configuration for the naive 1D convolution kernel.
		//
		// configuration - launch configuration that is written to
		// dimensions    - size handed to getNumberOfBlocks (presumably the extent of the processed data)
		// kernelSize    - size of the convolution kernel; used directly as the block size
		//
		// NOTE(review): the block size is not checked against any device limit here.
		template< typename Index >
		static void
		setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
		{
		   // The naive kernel requests no dynamic shared memory.
		   configuration.dynamicSharedMemorySize = 0;

		   // TODO: - Benchmark the best value
		   configuration.blockSize.x = kernelSize.x();

		   // Number of blocks for the x extent, capped at the maximum grid size.
		   configuration.gridSize.x =
		      TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
		}
		};

		@@ -65,10 +74,22 @@ struct Convolution< 2, TNL::Devices::Cuda >
		{
		public:
		template< typename Index >
		static size_t
		getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index endX, Index endY )
		using Vector = TNL::Containers::StaticVector< 2, Index >;

		// Fills in the CUDA launch configuration for the naive 2D convolution kernel.
		//
		// configuration - launch configuration that is written to
		// dimensions    - sizes handed to getNumberOfBlocks (presumably the extents of the processed data)
		// kernelSize    - sizes of the convolution kernel; used directly as the block size per axis
		//
		// NOTE(review): the resulting block size (x * y) is not checked against any device limit here.
		template< typename Index >
		static void
		setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
		{
		   // The naive kernel requests no dynamic shared memory.
		   configuration.dynamicSharedMemorySize = 0;

		   // TODO: - Benchmark the best value
		   configuration.blockSize.x = kernelSize.x();
		   configuration.blockSize.y = kernelSize.y();

		   // Per axis: number of blocks for the extent, capped at the maximum grid size.
		   configuration.gridSize.x =
		      TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
		   configuration.gridSize.y =
		      TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
		}
		};

		@@ -111,7 +132,8 @@ convolution2D( Index kernelWidth,
		Index kernelIndexX = i + radiusX;

		if( elementIndexX < 0 \|\| elementIndexX >= endX \|\| elementIndexY < 0 \|\| elementIndexY >= endY ) {
		result = convolve( result, fetchBoundary( elementIndexX, elementIndexY ), fetchKernel ( kernelIndexX, kernelIndexY ) );
		result =
		convolve( result, fetchBoundary( elementIndexX, elementIndexY ), fetchKernel( kernelIndexX, kernelIndexY ) );
		}
		else {
		result = convolve( result, fetchData( elementIndexX, elementIndexY ), fetchKernel( kernelIndexX, kernelIndexY ) );
		@@ -127,10 +149,25 @@ struct Convolution< 3, TNL::Devices::Cuda >
		{
		public:
		template< typename Index >
		static size_t
		getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ )
		using Vector = TNL::Containers::StaticVector< 3, Index >;

		// Fills in the CUDA launch configuration for the naive 3D convolution kernel.
		//
		// configuration - launch configuration that is written to
		// dimensions    - sizes handed to getNumberOfBlocks (presumably the extents of the processed data)
		// kernelSize    - sizes of the convolution kernel; used directly as the block size per axis
		//
		// NOTE(review): the resulting block size (x * y * z) is not checked against any device limit here.
		template< typename Index >
		static void
		setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
		{
		   // The naive kernel requests no dynamic shared memory.
		   configuration.dynamicSharedMemorySize = 0;

		   // TODO: - Benchmark the best value
		   configuration.blockSize.x = kernelSize.x();
		   configuration.blockSize.y = kernelSize.y();
		   configuration.blockSize.z = kernelSize.z();

		   // Per axis: number of blocks for the extent, capped at the maximum grid size.
		   configuration.gridSize.x =
		      TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
		   configuration.gridSize.y =
		      TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
		   // The z extent must go to gridSize.z; writing it to gridSize.y would overwrite the y setting
		   // and leave the z grid size untouched.
		   configuration.gridSize.z =
		      TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
		}
		};

		@@ -180,11 +217,17 @@ convolution3D( Index kernelWidth,
		Index elementIndexX = i + ix;
		Index kernelIndexX = i + radiusX;

		if( elementIndexX < 0 \|\| elementIndexX >= endX \|\| elementIndexY < 0 \|\| elementIndexY >= endY \|\| elementIndexZ < 0 \|\| elementIndexZ >= endZ ) {
		result = convolve( result, fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) );
		if( elementIndexX < 0 \|\| elementIndexX >= endX \|\| elementIndexY < 0 \|\| elementIndexY >= endY \|\| elementIndexZ < 0
		\|\| elementIndexZ >= endZ )
		{
		result = convolve( result,
		fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ),
		fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) );
		}
		else {
		result = convolve( result, fetchData( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) );
		result = convolve( result,
		fetchData( elementIndexX, elementIndexY, elementIndexZ ),
		fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) );
		}
		}
		}

src/Benchmarks/Convolution/support/DummyBenchmark.h

+3 −2

Original line number	Diff line number	Diff line
		@@ -32,13 +32,14 @@ public:
		start[ i ] = parameters.getParameter< int >( minDimensionIds[ i ] );
		end[ i ] = parameters.getParameter< int >( maxDimensionIds[ i ] );
		minKernelSize[ i ] = parameters.getParameter< int >( minKernelSizeIds[ i ] );
		maxKernelSizeIds[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] );
		maxKernelSize[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] );

		TNL_ASSERT_GT( start[ i ], 1, "Start dimension must be positive integer" );
		TNL_ASSERT_GT( end[ i ], start[ i ], "End dimension must be greater than start dimension" );

		TNL_ASSERT_GE( minKernelSize[ i ], 1, "Minimal kernel size must be a positive number" );
		TNL_ASSERT_EQ( minKernelSize[ i ] % 2, 1, "Minimal kernel size must be odd" );
		TNL_ASSERT_GT( maxKernelSize[ i ], minKernelSize[ i ], "End dimension must be greater than start dimension" );
		TNL_ASSERT_GT( end[ i ], start[ i ], "End kernel size must be greater than start kernel size" );
		}

		@@ -85,7 +86,7 @@ public:
		for( size_t i = 0; i < currentDimension.getSize() - 1; i++ ) {
		if( currentDimension[ i ] >= maxDimension[ i ] ) {
		currentDimension[ i ] = minDimension[ i ];
		currentDimension[ i ] = maxDimension[ i ];
		currentDimension[ i + 1 ] *= dimensionStep;
		}
		}

src/Benchmarks/Convolution/support/Launcher.h

+3 −83

Original line number	Diff line number	Diff line
		@@ -29,20 +29,7 @@ public:
		{
		TNL::Cuda::LaunchConfiguration launchConfig;

		launchConfig.dynamicSharedMemorySize =
		ConvolutionKernel::getDynamicSharedMemorySize< Index >( kernelSize.x(), dimensions.x() );

		// TODO: - Benchmark the best value
		launchConfig.blockSize.x = 256;
		launchConfig.gridSize.x =
		TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), launchConfig.blockSize.x ) );

		if( (std::size_t) launchConfig.blockSize.x * launchConfig.gridSize.x < (std::size_t) dimensions.x() ) {
		const int desGridSize = 32 * TNL::Cuda::DeviceInfo::getCudaMultiprocessors( TNL::Cuda::DeviceInfo::getActiveDevice() );

		launchConfig.gridSize.x =
		TNL::min( desGridSize, TNL::Cuda::getNumberOfBlocks( dimensions.x(), launchConfig.blockSize.x ) );
		}
		ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize);

		constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;

		@@ -78,29 +65,7 @@ public:
		{
		TNL::Cuda::LaunchConfiguration launchConfig;

		launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >(
		kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y() );

		const Index sizeX = dimensions.x();
		const Index sizeY = dimensions.y();

		if( sizeX >= sizeY * sizeY ) {
		launchConfig.blockSize.x = TNL::min( 256, sizeX );
		launchConfig.blockSize.y = 1;
		}
		else if( sizeY >= sizeX * sizeX ) {
		launchConfig.blockSize.x = 1;
		launchConfig.blockSize.y = TNL::min( 256, sizeY );
		}
		else {
		launchConfig.blockSize.x = TNL::min( 32, sizeX );
		launchConfig.blockSize.y = TNL::min( 8, sizeY );
		}

		launchConfig.gridSize.x =
		TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) );
		launchConfig.gridSize.y =
		TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) );
		ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize);

		constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;

		@@ -142,52 +107,7 @@ public:

		TNL::Cuda::LaunchConfiguration launchConfig;

		launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >(
		kernelSize.x(), kernelSize.y(), kernelSize.z(), dimensions.x(), dimensions.y(), dimensions.z() );

		if( sizeX >= sizeY * sizeY * sizeZ * sizeZ ) {
		launchConfig.blockSize.x = TNL::min( 256, sizeX );
		launchConfig.blockSize.y = 1;
		launchConfig.blockSize.z = 1;
		}
		else if( sizeY >= sizeX * sizeX * sizeZ * sizeZ ) {
		launchConfig.blockSize.x = 1;
		launchConfig.blockSize.y = TNL::min( 256, sizeY );
		launchConfig.blockSize.z = 1;
		}
		else if( sizeZ >= sizeX * sizeX * sizeY * sizeY ) {
		launchConfig.blockSize.x = TNL::min( 2, sizeX );
		launchConfig.blockSize.y = TNL::min( 2, sizeY );
		// CUDA allows max 64 for launchConfig.blockSize.z
		launchConfig.blockSize.z = TNL::min( 64, sizeZ );
		}
		else if( sizeX >= sizeZ * sizeZ && sizeY >= sizeZ * sizeZ ) {
		launchConfig.blockSize.x = TNL::min( 32, sizeX );
		launchConfig.blockSize.y = TNL::min( 8, sizeY );
		launchConfig.blockSize.z = 1;
		}
		else if( sizeX >= sizeY * sizeY && sizeZ >= sizeY * sizeY ) {
		launchConfig.blockSize.x = TNL::min( 32, sizeX );
		launchConfig.blockSize.y = 1;
		launchConfig.blockSize.z = TNL::min( 8, sizeZ );
		}
		else if( sizeY >= sizeX * sizeX && sizeZ >= sizeX * sizeX ) {
		launchConfig.blockSize.x = 1;
		launchConfig.blockSize.y = TNL::min( 32, sizeY );
		launchConfig.blockSize.z = TNL::min( 8, sizeZ );
		}
		else {
		launchConfig.blockSize.x = TNL::min( 16, sizeX );
		launchConfig.blockSize.y = TNL::min( 4, sizeY );
		launchConfig.blockSize.z = TNL::min( 4, sizeZ );
		}

		launchConfig.gridSize.x =
		TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) );
		launchConfig.gridSize.y =
		TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) );
		launchConfig.gridSize.z =
		TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeZ, launchConfig.blockSize.z ) );
		ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize);

		constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;