Commit 97284805 authored by Yury Hayeu's avatar Yury Hayeu
Browse files

Implement kernel, which loads data in shared memory

parent d3b6676e
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -38,3 +38,11 @@ GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/shar
GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedKernel.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedKernel.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedKernel.h")

# Solver and benchmark executables for the shared-data kernel variant
# (kernels/sharedData.h) in 1D, 2D and 3D.
GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/sharedData.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/sharedData.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/sharedData.h")

GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedData.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedData.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedData.h")
+432 −0
Original line number Diff line number Diff line
#pragma once

#ifdef HAVE_CUDA

/**
 * The kernels in this file stage an image tile in shared memory
 * and then compute the convolution from it.
 *
 * Thanks for the idea: https://www.evl.uic.edu/sjames/cs525/final.html
 */

   #include <TNL/Devices/Cuda.h>
   #include <TNL/Containers/StaticVector.h>
   #include <TNL/Cuda/LaunchHelpers.h>
   #include <TNL/Cuda/SharedMemory.h>

// Primary template; specialized below per dimension for TNL::Devices::Cuda.
template< int Dimension, typename Device >
struct Convolution;

template<>
struct Convolution< 1, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 1, Index >;

   /**
    * Prepares the CUDA launch configuration for the 1D shared-data kernel.
    *
    * The block size is set equal to the kernel size and the grid covers the
    * data extent (capped at the device's maximum grid size).
    *
    * \param configuration launch configuration to fill in
    * \param dimensions    extent of the input data
    * \param kernelSize    extent of the convolution kernel (= block size)
    */
   template< typename Index, typename Real >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
      // convolution1D stores a left halo value at shared[ threadIdx.x ] and a
      // right halo value at shared[ threadIdx.x + blockDim.x ]; with
      // blockDim.x == kernelSize.x() the highest written slot is
      // 2 * kernelSize.x() - 1, so 2 * kernelSize.x() elements are required.
      // The previous sizing ( 2 * k - 1 ) was one element short, letting the
      // last thread of each block write past the allocation.
      const Index sharedElementCount = 2 * kernelSize.x();

      configuration.dynamicSharedMemorySize = sharedElementCount * sizeof( Real );

      configuration.blockSize.x = kernelSize.x();
      configuration.gridSize.x =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
   }
};

/**
 * 1D convolution kernel that caches the input tile plus halos in dynamic
 * shared memory before accumulating the result.
 *
 * Expected launch: blockDim.x == kernelWidth, 1D grid covering endX,
 * dynamic shared memory of at least 2 * blockDim.x Real elements
 * (see the matching Convolution< 1, Cuda >::setup()).
 *
 * fetchData( i )      — input value at in-range index i
 * fetchBoundary( i )  — input value for an out-of-range index i
 * fetchKernel( i )    — i-th kernel coefficient
 * convolve( acc, v, k ) — accumulation step; store( i, result ) — output.
 */
template< typename Index,
          typename Real,
          typename FetchData,
          typename FetchBoundary,
          typename FetchKernel,
          typename Convolve,
          typename Store >
__global__
static void
convolution1D( Index kernelWidth,
               Index endX,
               FetchData fetchData,
               FetchBoundary fetchBoundary,
               FetchKernel fetchKernel,
               Convolve convolve,
               Store store )
{
   // Global index of the output element handled by this thread.
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   // NOTE(review): threads that return here never reach the __syncthreads()
   // below; in a partially filled last block this is a divergent barrier
   // (undefined behavior) and their shared slots stay unwritten — verify
   // that blocks are always full or restructure the guard.
   if( ix >= endX )
      return;

   Real* shared = TNL::Cuda::getSharedMemory< Real >();
   // Half-width of the kernel (floor for even widths).
   Index radius = kernelWidth >> 1;

   // Left
   // Each thread loads the element `radius` positions to its left into its
   // own slot, so shared[ p ] holds the input at blockStart + p - radius.
   Index lhs = ix - radius;

   if( lhs < 0 ) {
      shared[ threadIdx.x ] = fetchBoundary( lhs );
   }
   else {
      shared[ threadIdx.x ] = fetchData( lhs );
   }

   // Right
   // NOTE(review): the last thread writes slot 2 * blockDim.x - 1, so the
   // dynamic shared memory must provide at least 2 * blockDim.x elements
   // (the original setup() allocated only 2 * kernelWidth - 1) — verify.
   // NOTE(review): slot threadIdx.x + blockDim.x should hold the input at
   // blockStart + threadIdx.x + blockDim.x - radius to match the read loop
   // below, but fetchData( ix + radius ) is one element earlier for odd
   // kernel widths — suspected off-by-one in the right halo; confirm.
   Index rhs = ix + radius;

   if( rhs >= endX ) {
      shared[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs );
   }
   else {
      shared[ threadIdx.x + blockDim.x ] = fetchData( rhs );
   }

   // All halo loads must be visible before any thread starts reading.
   __syncthreads();

   Real result = 0;

   // Accumulate the full kernel width; thread t reads slots t .. t+kernelWidth-1.
   #pragma unroll
   for( Index i = 0; i < kernelWidth; i++ ) {
      Index elementIndex = i + threadIdx.x;

      result = convolve( result, shared[ elementIndex ], fetchKernel( i ) );
   }

   store( ix, result );
}

template<>
struct Convolution< 2, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 2, Index >;

   /**
    * Prepares the CUDA launch configuration for the 2D shared-data kernel.
    *
    * The block size copies the kernel size in both dimensions, the grid
    * covers the data extent (capped at the device's maximum grid size) and
    * the dynamic shared memory holds ( 2 * k - 1 ) elements per dimension.
    *
    * \param configuration launch configuration to fill in
    * \param dimensions    extent of the input data
    * \param kernelSize    extent of the convolution kernel (= block size)
    */
   template< typename Index, typename Real >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
      // Shared-memory element count: product of ( 2 * k - 1 ) over all axes.
      Index sharedElements = 1;
      for( Index axis = 0; axis < kernelSize.getSize(); axis++ )
         sharedElements *= 2 * kernelSize[ axis ] - 1;

      configuration.dynamicSharedMemorySize = sharedElements * sizeof( Real );

      configuration.blockSize.x = kernelSize.x();
      configuration.blockSize.y = kernelSize.y();

      const auto gridLimit = TNL::Cuda::getMaxGridSize();
      configuration.gridSize.x =
         TNL::min( gridLimit, TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
      configuration.gridSize.y =
         TNL::min( gridLimit, TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
   }
};

/**
 * 2D convolution kernel that stages four overlapping corner regions of the
 * input in dynamic shared memory before accumulating the result.
 *
 * Expected launch: blockDim == ( kernelWidth, kernelHeight ), 2D grid
 * covering ( endX, endY ), dynamic shared memory sized by
 * Convolution< 2, Cuda >::setup().
 *
 * NOTE(review): the allocation is ( 2k - 1 ) x ( 2k - 1 ) elements but the
 * indexing below uses blockDim.x as the row stride, never the allocated row
 * width — the intended tile layout should be re-checked.
 */
template< typename Index,
          typename Real,
          typename FetchData,
          typename FetchBoundary,
          typename FetchKernel,
          typename Convolve,
          typename Store >
__global__
static void
convolution2D( Index kernelWidth,
               Index kernelHeight,
               Index endX,
               Index endY,
               FetchData fetchData,
               FetchBoundary fetchBoundary,
               FetchKernel fetchKernel,
               Convolve convolve,
               Store store )
{
   // Global coordinates of the output element handled by this thread.
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   // NOTE(review): threads that return here never reach __syncthreads()
   // below — divergent barrier in partially filled edge blocks; verify.
   if( ix >= endX || iy >= endY )
      return;

   Real* shared = TNL::Cuda::getSharedMemory< Real >();

   // Half-widths of the kernel (floor for even sizes).
   Index radiusY = kernelHeight >> 1;
   Index radiusX = kernelWidth >> 1;

   Index x, y, index;

   // Top Left
   // Stores the element ( radiusX, radiusY ) up-left of this thread.
   x = ix - radiusX;
   y = iy - radiusY;

   index = threadIdx.x + threadIdx.y * blockDim.x;

   if( x < 0 || y < 0 ) {
      shared[ index ] = fetchBoundary( x, y );
   }
   else {
      shared[ index ] = fetchData( x, y );
   }

   // Top right
   // NOTE(review): for columns in [ radiusX, blockDim.x ) this slot is also
   // written by the Top-Left store of another thread with a different value
   // — suspected shared-memory race / wrong tile offsets; confirm.
   x = ix + radiusX;
   y = iy - radiusY;

   index = radiusX + threadIdx.x + threadIdx.y * blockDim.x;

   if( x >= endX || y < 0 ) {
      shared[ index ] = fetchBoundary( x, y );
   }
   else {
      shared[ index ] = fetchData( x, y );
   }

   // Bottom Left
   x = ix - radiusX;
   y = iy + radiusY;

   index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x;

   if( x < 0 || y >= endY ) {
      shared[ index ] = fetchBoundary( x, y );
   }
   else {
      shared[ index ] = fetchData( x, y );
   }

   // Bottom Right
   x = ix + radiusX;
   y = iy + radiusY;

   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x;

   if( x >= endX || y >= endY ) {
      shared[ index ] = fetchBoundary( x, y );
   }
   else {
      shared[ index ] = fetchData( x, y );
   }

   // All tile loads must be visible before any thread starts reading.
   __syncthreads();

   Real result = 0;

   // NOTE(review): the stores above use blockDim.x as the row stride, but
   // this read loop uses blockDim.y — inconsistent whenever
   // kernelWidth != kernelHeight; suspected typo for blockDim.x.
   // NOTE(review): the loops cover only ( radius + 1 ) taps per axis, not the
   // full kernelWidth x kernelHeight as the 1D kernel does — confirm intent.
   for( Index j = 0; j <= radiusY; j++ ) {
      Index align = ( j + threadIdx.y ) * blockDim.y;

      for( Index i = 0; i <= radiusX; i++ ) {
         Index index = i + threadIdx.x + align;

         result = convolve( result, shared[ index ], fetchKernel( i, j ) );
      }
   }

   store( ix, iy, result );
}

template<>
struct Convolution< 3, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 3, Index >;

   /**
    * Prepares the CUDA launch configuration for the 3D shared-data kernel.
    *
    * The block size copies the kernel size in all three dimensions, the grid
    * covers the data extent (capped at the device's maximum grid size) and
    * the dynamic shared memory holds ( 2 * k - 1 ) elements per dimension.
    *
    * \param configuration launch configuration to fill in
    * \param dimensions    extent of the input data
    * \param kernelSize    extent of the convolution kernel (= block size)
    */
   template< typename Index, typename Real >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
      Index kernelElementCount = 1;

      for( Index i = 0; i < kernelSize.getSize(); i++ )
         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;

      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );

      configuration.blockSize.x = kernelSize.x();
      configuration.blockSize.y = kernelSize.y();
      configuration.blockSize.z = kernelSize.z();

      configuration.gridSize.x =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
      configuration.gridSize.y =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
      // Fix: the z extent was previously assigned to gridSize.y (copy-paste
      // error), which both overwrote the y grid size and left gridSize.z at
      // its default of 1 — the grid never covered the z dimension.
      configuration.gridSize.z =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
   }
};

/**
 * 3D convolution kernel that stages eight overlapping corner regions of the
 * input in dynamic shared memory before accumulating the result.
 *
 * Expected launch: blockDim == ( kernelWidth, kernelHeight, kernelDepth ),
 * 3D grid covering ( endX, endY, endZ ), dynamic shared memory sized by
 * Convolution< 3, Cuda >::setup().
 *
 * NOTE(review): the corner stores below use blockDim.y as the row (y) stride
 * while the slice (z) stride is blockDim.x * blockDim.y; a linear layout is
 * only self-consistent when blockDim.x == blockDim.y, and the 2D kernel uses
 * blockDim.x for the row stride — suspected typo throughout; confirm.
 */
template< typename Index,
          typename Real,
          typename FetchData,
          typename FetchBoundary,
          typename FetchKernel,
          typename Convolve,
          typename Store >
__global__
static void
convolution3D( Index kernelWidth,
               Index kernelHeight,
               Index kernelDepth,
               Index endX,
               Index endY,
               Index endZ,
               FetchData fetchData,
               FetchBoundary fetchBoundary,
               FetchKernel fetchKernel,
               Convolve convolve,
               Store store )
{
   // Global coordinates of the output element handled by this thread.
   Index iz = threadIdx.z + blockIdx.z * blockDim.z;
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   // NOTE(review): threads that return here never reach __syncthreads()
   // below — divergent barrier in partially filled edge blocks; verify.
   if( ix >= endX || iy >= endY || iz >= endZ )
      return;

   Real* shared = TNL::Cuda::getSharedMemory< Real >();

   // Half-widths of the kernel (floor for even sizes).
   Index radiusZ = kernelDepth >> 1;
   Index radiusY = kernelHeight >> 1;
   Index radiusX = kernelWidth >> 1;

   Index x, y, z, index;

   // Z: 0 Y: 0 X: 0
   // Each of the eight sections loads one corner of the halo region; out of
   // range coordinates are redirected to fetchBoundary.
   x = ix - radiusX;
   y = iy - radiusY;
   z = iz - radiusZ;

   index = threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;

   if( x < 0 || y < 0 || z < 0 ) {
      shared[ index ] = fetchBoundary( x, y, z );
   }
   else {
      shared[ index ] = fetchData( x, y, z );
   }

   // Z: 0 Y: 0 X: 1
   // NOTE(review): as in the 2D kernel, corner regions offset by only
   // radius overlap the unshifted region of neighbouring threads with
   // different values — suspected shared-memory race; confirm layout.
   x = ix + radiusX;
   y = iy - radiusY;
   z = iz - radiusZ;

   index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;

   if( x >= endX || y < 0 || z < 0 ) {
      shared[ index ] = fetchBoundary( x, y, z );
   }
   else {
      shared[ index ] = fetchData( x, y, z );
   }

   // Z: 0 Y: 1 X: 0
   // NOTE(review): this is the X-left corner ( x = ix - radiusX ) yet the
   // slot includes a `radiusX +` offset, making it identical to the
   // "Z: 0 Y: 1 X: 1" slot below — suspected copy-paste error; the offset
   // should likely be dropped here.
   x = ix - radiusX;
   y = iy + radiusY;
   z = iz - radiusZ;

   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;

   if( x < 0 || y >= endY || z < 0 ) {
      shared[ index ] = fetchBoundary( x, y, z );
   }
   else {
      shared[ index ] = fetchData( x, y, z );
   }

   // Z: 1 Y: 0 X: 0
   x = ix - radiusX;
   y = iy - radiusY;
   z = iz + radiusZ;

   index = threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;

   if( x < 0 || y < 0 || z >= endZ ) {
      shared[ index ] = fetchBoundary( x, y, z );
   }
   else {
      shared[ index ] = fetchData( x, y, z );
   }

   // Z: 0 Y: 1 X: 1
   x = ix + radiusX;
   y = iy + radiusY;
   z = iz - radiusZ;

   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;

   if( x >= endX || y >= endY || z < 0 ) {
      shared[ index ] = fetchBoundary( x, y, z );
   }
   else {
      shared[ index ] = fetchData( x, y, z );
   }

   // Z: 1 Y: 0 X: 1
   x = ix + radiusX;
   y = iy - radiusY;
   z = iz + radiusZ;

   index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;

   if( x >= endX || y < 0 || z >= endZ ) {
      shared[ index ] = fetchBoundary( x, y, z );
   }
   else {
      shared[ index ] = fetchData( x, y, z );
   }

   // Z: 1 Y: 1 X: 0
   x = ix - radiusX;
   y = iy + radiusY;
   z = iz + radiusZ;

   index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;

   if( x < 0 || y >= endY || z >= endZ ) {
      shared[ index ] = fetchBoundary( x, y, z );
   }
   else {
      shared[ index ] = fetchData( x, y, z );
   }

   // Z: 1 Y: 1 X: 1
   x = ix + radiusX;
   y = iy + radiusY;
   z = iz + radiusZ;

   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;

   if( x >= endX || y >= endY || z >= endZ ) {
      shared[ index ] = fetchBoundary( x, y, z );
   }
   else {
      shared[ index ] = fetchData( x, y, z );
   }

   // All tile loads must be visible before any thread starts reading.
   __syncthreads();

   Real result = 0;

   // NOTE(review): the loops cover only ( radius + 1 ) taps per axis, not
   // the full kernel extent as the 1D kernel does — confirm intent.
   // NOTE(review): xAlign uses blockDim.y as the row stride (matching the
   // stores above), which differs from the 2D kernel's blockDim.x — verify.
   for( Index k = 0; k <= radiusZ; k++ ) {
      Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x;

      for( Index j = 0; j <= radiusY; j++ ) {
         Index xAlign = ( j + threadIdx.y ) * blockDim.y;

         for( Index i = 0; i <= radiusX; i++ ) {
            Index index = i + threadIdx.x + xAlign + xyAlign;

            result = convolve( result, shared[ index ], fetchKernel( i, j, k ) );
         }
      }
   }

   store( ix, iy, iz, result );
}

#endif
+26 −16
Original line number Diff line number Diff line
@@ -8,6 +8,10 @@
   #include <TNL/Cuda/LaunchHelpers.h>
   #include <TNL/Cuda/SharedMemory.h>

/**
 * This method stores kernel in the shared memory to reduce amount of loads.
 */

template< int Dimension, typename Device >
struct Convolution;

@@ -52,10 +56,14 @@ convolution1D( Index kernelWidth,
               Convolve convolve,
               Store store )
{
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   if( ix >= endX )
      return;

   Real* shared = TNL::Cuda::getSharedMemory< Real >();

   Index radius = kernelWidth >> 1;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   // The size of the block is equal to the kernel size
   shared[ threadIdx.x ] = fetchKernel( threadIdx.x );
@@ -126,14 +134,17 @@ convolution2D( Index kernelWidth,
               Convolve convolve,
               Store store )
{
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   if( ix >= endX || iy >= endY )
      return;

   Real* shared = TNL::Cuda::getSharedMemory< Real >();

   Index radiusY = kernelHeight >> 1;
   Index radiusX = kernelWidth >> 1;

   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   Index threadIndex = threadIdx.x + blockDim.x * threadIdx.y;

   // The size of the block is equal to the kernel size
@@ -217,12 +228,15 @@ convolution3D( Index kernelWidth,
               Convolve convolve,
               Store store )
{
   Real* shared = TNL::Cuda::getSharedMemory< Real >();

   Index iz = threadIdx.z + blockIdx.z * blockDim.z;
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   if( ix >= endX || iy >= endY || iz >= endZ )
      return;

   Real* shared = TNL::Cuda::getSharedMemory< Real >();

   Index radiusZ = kernelDepth >> 1;
   Index radiusY = kernelHeight >> 1;
   Index radiusX = kernelWidth >> 1;
@@ -255,14 +269,10 @@ convolution3D( Index kernelWidth,
            if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0
                || elementIndexZ >= endZ )
            {
               result = convolve( result,
                                  fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ),
                                  shared[threadIndex] );
               result = convolve( result, fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), shared[ threadIndex ] );
            }
            else {
               result = convolve( result,
                                  fetchData( elementIndexX, elementIndexY, elementIndexZ ),
                                  shared[threadIndex] );
               result = convolve( result, fetchData( elementIndexX, elementIndexY, elementIndexZ ), shared[ threadIndex ] );
            }
         }
      }