Commit be08382d authored by Yury Hayeu's avatar Yury Hayeu
Browse files

Implement convolution with storing kernel in shared memory

parent 3781cb67
Loading
Loading
Loading
Loading
+9 −0
Original line number Diff line number Diff line
@@ -10,6 +10,7 @@ if (${BUILD_CUDA})
   FILE(READ ${TEMPLATE} TEMPLATE_CONTENT)

   STRING(REGEX REPLACE "DIMENSION_VALUE" ${DIMENSION} TEMPLATE_CONTENT "${TEMPLATE_CONTENT}")
   STRING(REGEX REPLACE "KERNEL_VALUE" "\"../${KERNEL_HEADER}\"" TEMPLATE_CONTENT "${TEMPLATE_CONTENT}")

   FILE(WRITE ${SOURCE_FILE} "${TEMPLATE_CONTENT}")

@@ -29,3 +30,11 @@ GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/naiv
GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/naive.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/naive.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/naive.h")

GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/sharedKernel.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/sharedKernel.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/sharedKernel.h")

GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedKernel.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedKernel.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedKernel.h")
+15 −7
Original line number Diff line number Diff line

#pragma once

#ifdef HAVE_CUDA

#include <TNL/Devices/Cuda.h>
#include <TNL/Containers/StaticVector.h>
#include <TNL/Cuda/LaunchHelpers.h>

/**
 * There are several pitfalls with such configuration.
 *
 * 1. First, we do not use shared memory.
 * 2. Second, we do not control the block size, so we may launch extremely small kernels or, conversely, extremely large ones.
 */

template< int Dimension, typename Device >
struct Convolution;

@@ -15,7 +24,7 @@ public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 1, Index >;

   template< typename Index >
   template< typename Index, typename Real >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
@@ -76,13 +85,12 @@ public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 2, Index >;

   template< typename Index >
   template< typename Index, typename Real >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
      configuration.dynamicSharedMemorySize = 0;

      // TODO: - Benchmark the best value
      configuration.blockSize.x = kernelSize.x();
      configuration.blockSize.y = kernelSize.y();

@@ -151,7 +159,7 @@ public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 3, Index >;

   template< typename Index >
   template< typename Index, typename Real >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
+274 −0
Original line number Diff line number Diff line

#pragma once

#ifdef HAVE_CUDA

   #include <TNL/Devices/Cuda.h>
   #include <TNL/Containers/StaticVector.h>
   #include <TNL/Cuda/LaunchHelpers.h>
   #include <TNL/Cuda/SharedMemory.h>

template< int Dimension, typename Device >
struct Convolution;

template<>
struct Convolution< 1, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 1, Index >;

   /**
    * Configures a 1D launch: the thread block mirrors the convolution-kernel
    * extent and dynamic shared memory holds one Real per kernel element, so
    * that each thread of a block can cache exactly one kernel coefficient.
    */
   template< typename Index, typename Real >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
      // Shared-memory budget: one Real per convolution-kernel element.
      Index elements = 1;
      for( Index axis = 0; axis < kernelSize.getSize(); axis++ )
         elements *= kernelSize[ axis ];
      configuration.dynamicSharedMemorySize = elements * sizeof( Real );

      // Block size equals the kernel size; the grid tiles the data domain
      // (capped by the hardware grid-size limit).
      configuration.blockSize.x = kernelSize.x();
      const auto blocksX = TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x );
      configuration.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), blocksX );
   }
};

template< typename Index,
          typename Real,
          typename FetchData,
          typename FetchBoundary,
          typename FetchKernel,
          typename Convolve,
          typename Store >
__global__
static void
convolution1D( Index kernelWidth,
               Index endX,
               FetchData fetchData,
               FetchBoundary fetchBoundary,
               FetchKernel fetchKernel,
               Convolve convolve,
               Store store )
{
   // Expects: blockDim.x == kernelWidth and dynamicSharedMemorySize ==
   // kernelWidth * sizeof( Real ) — see Convolution< 1, Cuda >::setup().
   Real* shared = TNL::Cuda::getSharedMemory< Real >();

   const Index radius = kernelWidth >> 1;
   const Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   // The size of the block is equal to the kernel size, so every thread
   // caches exactly one convolution-kernel coefficient.
   shared[ threadIdx.x ] = fetchKernel( threadIdx.x );

   __syncthreads();

   // FIX: the grid is sized with a ceiling division, so the last block can
   // contain threads past the data extent; without this guard they would call
   // store() with an out-of-range index. The guard sits AFTER the barrier
   // because those threads must still participate in the shared-memory load.
   if( ix >= endX )
      return;

   Real result = 0;

   for( Index i = -radius; i <= radius; i++ ) {
      const Index elementIndex = i + ix;
      const Index kernelIndex = i + radius;

      // Out-of-domain neighbours are resolved through the boundary fetcher.
      if( elementIndex < 0 || elementIndex >= endX ) {
         result = convolve( result, fetchBoundary( elementIndex ), shared[ kernelIndex ] );
      }
      else {
         result = convolve( result, fetchData( elementIndex ), shared[ kernelIndex ] );
      }
   }

   store( ix, result );
}

template<>
struct Convolution< 2, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 2, Index >;

   /**
    * Configures a 2D launch: the thread block mirrors the convolution-kernel
    * extent in both axes and dynamic shared memory holds one Real per kernel
    * element, so that each thread caches exactly one kernel coefficient.
    */
   template< typename Index, typename Real >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
      // Shared-memory budget: one Real per convolution-kernel element.
      Index elements = 1;
      for( Index axis = 0; axis < kernelSize.getSize(); axis++ )
         elements *= kernelSize[ axis ];
      configuration.dynamicSharedMemorySize = elements * sizeof( Real );

      // Block shape equals the kernel shape; the grid tiles the data domain
      // per axis (each capped by the hardware grid-size limit).
      configuration.blockSize.x = kernelSize.x();
      configuration.blockSize.y = kernelSize.y();

      const auto blocksX = TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x );
      const auto blocksY = TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y );
      configuration.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), blocksX );
      configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), blocksY );
   }
};

template< typename Index,
          typename Real,
          typename FetchData,
          typename FetchBoundary,
          typename FetchKernel,
          typename Convolve,
          typename Store >
__global__
static void
convolution2D( Index kernelWidth,
               Index kernelHeight,
               Index endX,
               Index endY,
               FetchData fetchData,
               FetchBoundary fetchBoundary,
               FetchKernel fetchKernel,
               Convolve convolve,
               Store store )
{
   // Expects: blockDim == (kernelWidth, kernelHeight) and
   // dynamicSharedMemorySize == kernelWidth * kernelHeight * sizeof( Real )
   // — see Convolution< 2, Cuda >::setup().
   Real* shared = TNL::Cuda::getSharedMemory< Real >();

   const Index radiusY = kernelHeight >> 1;
   const Index radiusX = kernelWidth >> 1;

   const Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   const Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   // Flat index of this thread inside the block; because the block shape
   // equals the kernel shape, it is also the flat kernel-element index.
   const Index threadIndex = threadIdx.x + blockDim.x * threadIdx.y;

   // The size of the block is equal to the kernel size, so every thread
   // caches exactly one convolution-kernel coefficient.
   shared[ threadIndex ] = fetchKernel( threadIdx.x, threadIdx.y );

   __syncthreads();

   // FIX: the grid is sized with a ceiling division, so the last blocks can
   // contain threads past the data extent; without this guard they would call
   // store() with out-of-range indices. The guard sits AFTER the barrier
   // because those threads must still participate in the shared-memory load.
   if( ix >= endX || iy >= endY )
      return;

   Real result = 0;

   for( Index j = -radiusY; j <= radiusY; j++ ) {
      const Index elementIndexY = j + iy;
      const Index kernelIndexY = j + radiusY;

      for( Index i = -radiusX; i <= radiusX; i++ ) {
         const Index elementIndexX = i + ix;
         const Index kernelIndexX = i + radiusX;

         // FIX: renamed from `threadIndex`, which shadowed the outer variable
         // with a different meaning (kernel flat index vs. block flat index).
         const Index kernelFlatIndex = kernelIndexX + kernelWidth * kernelIndexY;

         // Out-of-domain neighbours are resolved through the boundary fetcher.
         if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY ) {
            result = convolve( result, fetchBoundary( elementIndexX, elementIndexY ), shared[ kernelFlatIndex ] );
         }
         else {
            result = convolve( result, fetchData( elementIndexX, elementIndexY ), shared[ kernelFlatIndex ] );
         }
      }
   }

   store( ix, iy, result );
}

template<>
struct Convolution< 3, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 3, Index >;

   /**
    * Configures a 3D launch: the thread block mirrors the convolution-kernel
    * extent in all three axes and dynamic shared memory holds one Real per
    * kernel element, so that each thread caches exactly one kernel coefficient.
    */
   template< typename Index, typename Real >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
      // Shared-memory budget: one Real per convolution-kernel element.
      Index kernelElementCount = 1;

      for( Index i = 0; i < kernelSize.getSize(); i++ )
         kernelElementCount *= kernelSize[ i ];

      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );

      configuration.blockSize.x = kernelSize.x();
      configuration.blockSize.y = kernelSize.y();
      configuration.blockSize.z = kernelSize.z();

      configuration.gridSize.x =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
      configuration.gridSize.y =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
      // FIX: this assignment targeted gridSize.y a second time, clobbering the
      // y extent and leaving gridSize.z at its default — the z axis of the
      // domain was never covered. It must set gridSize.z.
      configuration.gridSize.z =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
   }
};

template< typename Index,
          typename Real,
          typename FetchData,
          typename FetchBoundary,
          typename FetchKernel,
          typename Convolve,
          typename Store >
__global__
static void
convolution3D( Index kernelWidth,
               Index kernelHeight,
               Index kernelDepth,
               Index endX,
               Index endY,
               Index endZ,
               FetchData fetchData,
               FetchBoundary fetchBoundary,
               FetchKernel fetchKernel,
               Convolve convolve,
               Store store )
{
   // Expects: blockDim == (kernelWidth, kernelHeight, kernelDepth) and
   // dynamicSharedMemorySize == kernelWidth * kernelHeight * kernelDepth *
   // sizeof( Real ) — see Convolution< 3, Cuda >::setup().
   Real* shared = TNL::Cuda::getSharedMemory< Real >();

   const Index iz = threadIdx.z + blockIdx.z * blockDim.z;
   const Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   const Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   const Index radiusZ = kernelDepth >> 1;
   const Index radiusY = kernelHeight >> 1;
   const Index radiusX = kernelWidth >> 1;

   // Flat index of this thread inside the block; because the block shape
   // equals the kernel shape, it is also the flat kernel-element index.
   const Index threadIndex = threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z;

   // FIX: removed leftover debug `printf( "%d\n", threadIndex )` — device
   // printf serializes every thread and floods stdout in production runs.

   // The size of the block is equal to the kernel size, so every thread
   // caches exactly one convolution-kernel coefficient.
   shared[ threadIndex ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z );

   __syncthreads();

   // FIX: the grid is sized with a ceiling division, so the last blocks can
   // contain threads past the data extent; without this guard they would call
   // store() with out-of-range indices. The guard sits AFTER the barrier
   // because those threads must still participate in the shared-memory load.
   if( ix >= endX || iy >= endY || iz >= endZ )
      return;

   Real result = 0;

   for( Index k = -radiusZ; k <= radiusZ; k++ ) {
      const Index elementIndexZ = k + iz;
      const Index kernelIndexZ = k + radiusZ;

      for( Index j = -radiusY; j <= radiusY; j++ ) {
         const Index elementIndexY = j + iy;
         const Index kernelIndexY = j + radiusY;

         for( Index i = -radiusX; i <= radiusX; i++ ) {
            const Index elementIndexX = i + ix;
            const Index kernelIndexX = i + radiusX;

            // FIX: renamed from `threadIndex`, which shadowed the outer
            // variable with a different meaning (kernel vs. block flat index).
            const Index kernelFlatIndex = kernelIndexX + kernelWidth * kernelIndexY + kernelWidth * kernelHeight * kernelIndexZ;

            // Out-of-domain neighbours are resolved through the boundary fetcher.
            if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0
                || elementIndexZ >= endZ )
            {
               result = convolve( result,
                                  fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ),
                                  shared[ kernelFlatIndex ] );
            }
            else {
               result = convolve( result,
                                  fetchData( elementIndexX, elementIndexY, elementIndexZ ),
                                  shared[ kernelFlatIndex ] );
            }
         }
      }
   }

   store( ix, iy, iz, result );
}

#endif
+11 −4
Original line number Diff line number Diff line
@@ -5,7 +5,14 @@
#include <TNL/Cuda/KernelLaunch.h>

template< int Dimension, typename Device >
struct Convolution;
struct Convolution {
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 1, Index >;

   template< typename Index, typename Real >
   static void
   setup(TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize);
};

template< int Dimension, typename Device >
struct Launcher;
@@ -29,7 +36,7 @@ public:
   {
      TNL::Cuda::LaunchConfiguration launchConfig;

      ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize);
      ConvolutionKernel::setup<Index, Real>(launchConfig, dimensions, kernelSize);

      constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;

@@ -65,7 +72,7 @@ public:
   {
      TNL::Cuda::LaunchConfiguration launchConfig;

      ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize);
      ConvolutionKernel::setup<Index, Real>(launchConfig, dimensions, kernelSize);

      constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;

@@ -107,7 +114,7 @@ public:

      TNL::Cuda::LaunchConfiguration launchConfig;

      ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize);
      ConvolutionKernel::setup<Index, Real>(launchConfig, dimensions, kernelSize);

      constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;

+4 −3
Original line number Diff line number Diff line

#include "../kernels/naive.h"
#define KERNEL KERNEL_VALUE
#define DIMENSION DIMENSION_VALUE

#include KERNEL_VALUE
#include "../support/DummyBenchmark.h"

#include <TNL/Config/parseCommandLine.h>

#define DIMENSION DIMENSION_VALUE

using TaskBenchmark = DummyBenchmark< DIMENSION, TNL::Devices::Cuda >;

int main(int argc, char* argv[])
Loading