Implement naive 2D kernel (5a5a294c) · Commits · TNL / tnl-dev

src/Benchmarks/Convolution/CMakeLists.txt

+1 −0

Original line number	Diff line number	Diff line
		@@ -23,3 +23,4 @@ endif()
		endfunction()

		GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/naive.h")
		GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/naive.h")

src/Benchmarks/Convolution/kernels/naive.h

+54 −45

Original line number	Diff line number	Diff line
		@@ -56,55 +56,64 @@ convolution1D( Index kernelWidth,
		store( ix, result );
		}

		// template<>
		// struct Convolution< 2, TNL::Devices::Cuda >
		// {
		// public:
		// template< typename Index >
		// static size_t
		// getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index endX, Index endY )
		// {
		// return 0;
		// }
		// };
		template<>
		struct Convolution< 2, TNL::Devices::Cuda >
		{
		public:
		template< typename Index >
		static size_t
		getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index endX, Index endY )
		{
		return 0;
		}
		};

		// template< typename Index,
		// typename Real,
		// typename FetchData,
		// typename FetchBoundary,
		// typename FetchKernel,
		// typename Convolve,
		// typename Store >
		// __global__
		// static void
		// convolution2D( Index kernelWidth,
		// Index kernelHeight,
		// Index endX,
		// Index endY,
		// FetchData& fetchData,
		// FetchBoundary& fetchBoundary,
		// FetchKernel& fetchKernel,
		// Convolve& convolve,
		// Store& store )
		// {
		// int iy = threadIdx.y + blockIdx.y * blockDim.y;
		// int ix = threadIdx.x + blockIdx.x * blockDim.x;
		template< typename Index,
		typename Real,
		typename FetchData,
		typename FetchBoundary,
		typename FetchKernel,
		typename Convolve,
		typename Store >
		__global__
		static void
		convolution2D( Index kernelWidth,
		Index kernelHeight,
		Index endX,
		Index endY,
		FetchData fetchData,
		FetchBoundary fetchBoundary,
		FetchKernel fetchKernel,
		Convolve convolve,
		Store store )
		{
		Index iy = threadIdx.y + blockIdx.y * blockDim.y;
		Index ix = threadIdx.x + blockIdx.x * blockDim.x;

		// Real result = 0;
		Index radiusY = kernelHeight >> 1;
		Index radiusX = kernelHeight >> 1;

		// for( Index j = iy - kernelHeight; j <= iy + kernelHeight; j++ ) {
		// for( Index i = ix - kernelWidth; i <= ix + kernelWidth; i++ ) {
		// if( i < 0 \|\| i >= endX \|\| j < 0 \|\| j >= endY ) {
		// result = convolve( result, fetchBoundary( i, j ) );
		// }
		// else {
		// result = convolve( result, fetchData( i, j ), fetchKernel( i, j ) );
		// }
		// }
		// }
		Real result = 0;

		// store( ix, iy, result );
		// }
		for( Index j = - radiusY; j <= radiusY; j++ ) {
		Index elementIndexY = j + iy;
		Index kernelIndexY = j + radiusY;

		for( Index i = - radiusX; i <= radiusX; i++ ) {
		Index elementIndexX = i + ix;
		Index kernelIndexX = i + radiusX;

		if( elementIndexX < 0 \|\| elementIndexX >= endX \|\| elementIndexY < 0 \|\| elementIndexY >= endY ) {
		result = convolve( result, fetchBoundary( elementIndexX, elementIndexY ), fetchKernel ( kernelIndexX, kernelIndexY ) );
		}
		else {
		result = convolve( result, fetchData( elementIndexX, elementIndexY ), fetchKernel( kernelIndexX, kernelIndexY ) );
		}
		}
		}

		store( ix, iy, result );
		}

		// template<>
		// struct Convolution< 3, TNL::Devices::Cuda >

src/Benchmarks/Convolution/support/DummyTask.h

+44 −44

Original line number	Diff line number	Diff line
		@@ -54,59 +54,59 @@ public:
		}
		};

		// template< typename Index, typename Real >
		// struct DummyTask< Index, Real, 2, TNL::Devices::Cuda >
		// {
		// public:
		// static constexpr int Dimension = 2;
		// using Device = TNL::Devices::Cuda;
		// using Vector = TNL::Containers::StaticVector< Dimension, Index >;
		// using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
		// using Launcher = Launcher< Dimension, Device >;
		template< typename Index, typename Real >
		struct DummyTask< Index, Real, 2, TNL::Devices::Cuda >
		{
		public:
		static constexpr int Dimension = 2;
		using Device = TNL::Devices::Cuda;
		using Vector = TNL::Containers::StaticVector< Dimension, Index >;
		using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
		using Launcher = Launcher< Dimension, Device >;

		// static void
		// exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
		// {
		// auto fetchData = [ = ] __cuda_callable__( Index i, Index j )
		// {
		// auto index = i + j * dimensions.x();
		static void
		exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
		{
		auto fetchData = [ = ] __cuda_callable__( Index i, Index j )
		{
		auto index = i + j * dimensions.x();

		// return input[ index ];
		// };
		return input[ index ];
		};

		// auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j )
		// {
		// return -1;
		// };
		auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j )
		{
		return -1;
		};

		// auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j )
		// {
		// auto index = i + j * kernel.x();
		auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j )
		{
		auto index = i + j * kernelSize.x();

		// return kernel[ index ];
		// };
		return kernel[ index ];
		};

		// auto convolve = [ = ] __cuda_callable__( Real result, Index data, Index kernel )
		// {
		// return result + data * kernel;
		// };
		auto convolve = [ = ] __cuda_callable__( Real result, Index data, Index kernel )
		{
		return result + data * kernel;
		};

		// auto store = [ = ] __cuda_callable__( Index i, Index j, Real resultValue )
		// {
		// auto index = i + j * dimensions.x();
		auto store = [ = ] __cuda_callable__( Index i, Index j, Real resultValue ) mutable
		{
		auto index = i + j * dimensions.x();

		// result[ index ] = resultValue;
		// };
		result[ index ] = resultValue;
		};

		// Launcher::exec< Index >( dimensions,
		// kernelSize,
		// std::forward< decltype( fetchData ) >( fetchData ),
		// std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
		// std::forward< decltype( fetchKernel ) >( fetchKernel ),
		// std::forward< decltype( convolve ) >( convolve ),
		// std::forward< decltype( store ) >( store ) );
		// }
		// };
		Launcher::exec< Index, Real >( dimensions,
		kernelSize,
		std::forward< decltype( fetchData ) >( fetchData ),
		std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
		std::forward< decltype( fetchKernel ) >( fetchKernel ),
		std::forward< decltype( convolve ) >( convolve ),
		std::forward< decltype( store ) >( store ) );
		}
		};

		// template< typename Index, typename Real >
		// struct DummyTask< Index, Real, 3, TNL::Devices::Cuda >

src/Benchmarks/Convolution/support/Launcher.h

+52 −57

Original line number	Diff line number	Diff line
		@@ -59,70 +59,65 @@ public:
		}
		};

		// template<>
		// struct Launcher< 2, TNL::Devices::Cuda >
		// {
		// public:
		// using Vector = TNL::Containers::StaticVector< 2, int >;
		// using ConvolutionKernel = Convolution< 2, TNL::Devices::Cuda >;

		// template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store >
		// static inline void
		// exec( const Vector& dimensions,
		// const Vector& kernelSize,
		// FetchData&& fetchData,
		// FetchBoundary&& fetchBoundary,
		// FetchKernel&& fetchKernel,
		// Convolve&& convolve,
		// Store&& store )
		// {
		// TNL::Cuda::LaunchConfiguration launchConfig;

		// launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >(
		// kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y() );
		template<>
		struct Launcher< 2, TNL::Devices::Cuda >
		{
		public:
		using Vector = TNL::Containers::StaticVector< 2, int >;
		using ConvolutionKernel = Convolution< 2, TNL::Devices::Cuda >;

		// const Index sizeX = dimensions.x();
		// const Index sizeY = dimensions.y();
		template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store >
		static inline void
		exec( const Vector& dimensions,
		const Vector& kernelSize,
		FetchData&& fetchData,
		FetchBoundary&& fetchBoundary,
		FetchKernel&& fetchKernel,
		Convolve&& convolve,
		Store&& store )
		{
		TNL::Cuda::LaunchConfiguration launchConfig;

		// if( sizeX >= sizeY * sizeY ) {
		// launchConfig.blockSize.x = TNL::min( 256, sizeX );
		// launchConfig.blockSize.y = 1;
		// }
		// else if( sizeY >= sizeX * sizeX ) {
		// launchConfig.blockSize.x = 1;
		// launchConfig.blockSize.y = TNL::min( 256, sizeY );
		// }
		// else {
		// launchConfig.blockSize.x = TNL::min( 32, sizeX );
		// launchConfig.blockSize.y = TNL::min( 8, sizeY );
		// }
		launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >(
		kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y() );

		// launchConfig.gridSize.x =
		// TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) );
		// launchConfig.gridSize.y =
		// TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) );
		const Index sizeX = dimensions.x();
		const Index sizeY = dimensions.y();

		// dim3 gridCount;
		if( sizeX >= sizeY * sizeY ) {
		launchConfig.blockSize.x = TNL::min( 256, sizeX );
		launchConfig.blockSize.y = 1;
		}
		else if( sizeY >= sizeX * sizeX ) {
		launchConfig.blockSize.x = 1;
		launchConfig.blockSize.y = TNL::min( 256, sizeY );
		}
		else {
		launchConfig.blockSize.x = TNL::min( 32, sizeX );
		launchConfig.blockSize.y = TNL::min( 8, sizeY );
		}

		// gridCount.x = roundUpDivision( sizeX, launchConfig.blockSize.x * launchConfig.gridSize.x );
		// gridCount.y = roundUpDivision( sizeY, launchConfig.blockSize.y * launchConfig.gridSize.y );
		launchConfig.gridSize.x =
		TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) );
		launchConfig.gridSize.y =
		TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) );

		// constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
		constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;

		// TNL::Cuda::launchKernel< true >( kernel,
		// 0,
		// launchConfig,
		// kernelSize.x(),
		// kernelSize.y(),
		// dimensions.x(),
		// dimensions.y(),
		// std::forward< FetchData >( fetchData ),
		// std::forward< FetchBoundary >( fetchBoundary ),
		// std::forward< FetchKernel >( fetchKernel ),
		// std::forward< Convolve >( convolve ),
		// std::forward< Store >( store ) );
		// }
		// };
		TNL::Cuda::launchKernel< true >( kernel,
		0,
		launchConfig,
		kernelSize.x(),
		kernelSize.y(),
		dimensions.x(),
		dimensions.y(),
		fetchData,
		fetchBoundary,
		fetchKernel,
		convolve,
		store );
		}
		};

		// template<>
		// struct Launcher< 3, TNL::Devices::Cuda >