Fix convolution kernel execution (6860696c) · Commits · TNL / tnl-dev

src/Benchmarks/Convolution/kernels/naive.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -305,7 +305,7 @@ public:
		TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
		configuration.gridSize.y =
		TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
		configuration.gridSize.y =
		configuration.gridSize.z =
		TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
		}

src/Benchmarks/Convolution/kernels/sharedData.h

+44 −41

Original line number	Diff line number	Diff line
		@@ -33,16 +33,13 @@ convolution1D( Index kernelWidth,
		{
		Index ix = threadIdx.x + blockIdx.x * blockDim.x;

		if( ix >= endX )
		return;

		Real* shared = TNL::Cuda::getSharedMemory< Real >();
		Index radius = kernelWidth >> 1;

		// Left
		Index lhs = ix - radius;

		if( lhs < 0 ) {
		if( lhs < 0 \|\| lhs >= endX ) {
		shared[ threadIdx.x ] = fetchBoundary( lhs );
		}
		else {
		@@ -52,7 +49,7 @@ convolution1D( Index kernelWidth,
		// Right
		Index rhs = ix + radius;

		if( rhs >= endX ) {
		if( rhs < 0 \|\| rhs >= endX ) {
		shared[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs );
		}
		else {
		@@ -61,6 +58,9 @@ convolution1D( Index kernelWidth,

		__syncthreads();

		if( ix >= endX )
		return;

		Real result = 0;

		#pragma unroll
		@@ -95,9 +95,6 @@ convolution2D( Index kernelWidth,
		Index iy = threadIdx.y + blockIdx.y * blockDim.y;
		Index ix = threadIdx.x + blockIdx.x * blockDim.x;

		if( ix >= endX \|\| iy >= endY )
		return;

		Real* shared = TNL::Cuda::getSharedMemory< Real >();

		Index radiusY = kernelHeight >> 1;
		@@ -105,13 +102,16 @@ convolution2D( Index kernelWidth,

		Index x, y, index;

		Index kernelHorizontalPadding = kernelWidth == 1 ? 0 : kernelWidth;
		Index kernelVerticalPadding = kernelHeight == 1 ? 0 : kernelHeight;

		// Top Left
		x = ix - radiusX;
		y = iy - radiusY;

		index = threadIdx.x + threadIdx.y * blockDim.x;

		if( x < 0 \|\| y < 0 ) {
		if( x < 0 \|\| y < 0 \|\| x >= endX \|\| y >= endY ) {
		shared[ index ] = fetchBoundary( x, y );
		}
		else {
		@@ -122,9 +122,9 @@ convolution2D( Index kernelWidth,
		x = ix + radiusX;
		y = iy - radiusY;

		index = radiusX + threadIdx.x + threadIdx.y * blockDim.x;
		index = kernelHorizontalPadding + threadIdx.x + threadIdx.y * blockDim.x;

		if( x >= endX \|\| y < 0 ) {
		if( x < 0 \|\| y < 0 \|\| x >= endX \|\| y >= endY ) {
		shared[ index ] = fetchBoundary( x, y );
		}
		else {
		@@ -135,9 +135,9 @@ convolution2D( Index kernelWidth,
		x = ix - radiusX;
		y = iy + radiusY;

		index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x;
		index = threadIdx.x + ( kernelVerticalPadding + threadIdx.y ) * blockDim.x;

		if( x < 0 \|\| y >= endY ) {
		if(x < 0 \|\| y < 0 \|\| x >= endX \|\| y >= endY ) {
		shared[ index ] = fetchBoundary( x, y );
		}
		else {
		@@ -148,9 +148,9 @@ convolution2D( Index kernelWidth,
		x = ix + radiusX;
		y = iy + radiusY;

		index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x;
		index = kernelHorizontalPadding + threadIdx.x + ( kernelVerticalPadding + threadIdx.y ) * blockDim.x;

		if( x >= endX \|\| y >= endY ) {
		if( x < 0 \|\| y < 0 \|\| x >= endX \|\| y >= endY ) {
		shared[ index ] = fetchBoundary( x, y );
		}
		else {
		@@ -159,12 +159,15 @@ convolution2D( Index kernelWidth,

		__syncthreads();

		if( ix >= endX \|\| iy >= endY )
		return;

		Real result = 0;

		for( Index j = 0; j <= radiusY; j++ ) {
		for( Index j = 0; j < kernelHeight; j++ ) {
		Index align = ( j + threadIdx.y ) * blockDim.x;

		for( Index i = 0; i <= radiusX; i++ ) {
		for( Index i = 0; i < kernelWidth; i++ ) {
		Index index = i + threadIdx.x + align;

		result = convolve( result, shared[ index ], fetchKernel( i, j ) );
		@@ -199,9 +202,6 @@ convolution3D( Index kernelWidth,
		Index iy = threadIdx.y + blockIdx.y * blockDim.y;
		Index ix = threadIdx.x + blockIdx.x * blockDim.x;

		if( ix >= endX \|\| iy >= endY \|\| iz >= endZ )
		return;

		Real* shared = TNL::Cuda::getSharedMemory< Real >();

		Index radiusZ = kernelDepth >> 1;
		@@ -215,9 +215,9 @@ convolution3D( Index kernelWidth,
		y = iy - radiusY;
		z = iz - radiusZ;

		index = threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
		index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;

		if( x < 0 \|\| y < 0 \|\| z < 0 ) {
		if( x < 0 \|\| y < 0 \|\| z < 0 \|\| x >= endX \|\| y >= endY \|\| z >= endZ ) {
		shared[ index ] = fetchBoundary( x, y, z );
		}
		else {
		@@ -229,9 +229,9 @@ convolution3D( Index kernelWidth,
		y = iy - radiusY;
		z = iz - radiusZ;

		index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
		index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;

		if( x >= endX \|\| y < 0 \|\| z < 0 ) {
		if( x < 0 \|\| y < 0 \|\| z < 0 \|\| x >= endX \|\| y >= endY \|\| z >= endZ ) {
		shared[ index ] = fetchBoundary( x, y, z );
		}
		else {
		@@ -243,9 +243,9 @@ convolution3D( Index kernelWidth,
		y = iy + radiusY;
		z = iz - radiusZ;

		index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
		index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;

		if( x < 0 \|\| y >= endY \|\| z < 0 ) {
		if( x < 0 \|\| y < 0 \|\| z < 0 \|\| x >= endX \|\| y >= endY \|\| z >= endZ ) {
		shared[ index ] = fetchBoundary( x, y, z );
		}
		else {
		@@ -257,9 +257,9 @@ convolution3D( Index kernelWidth,
		y = iy - radiusY;
		z = iz + radiusZ;

		index = threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
		index = threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;

		if( x < 0 \|\| y < 0 \|\| z >= endZ ) {
		if( x < 0 \|\| y < 0 \|\| z < 0 \|\| x >= endX \|\| y >= endY \|\| z >= endZ ) {
		shared[ index ] = fetchBoundary( x, y, z );
		}
		else {
		@@ -271,9 +271,9 @@ convolution3D( Index kernelWidth,
		y = iy + radiusY;
		z = iz - radiusZ;

		index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
		index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;

		if( x >= endX \|\| y >= endY \|\| z < 0 ) {
		if( x < 0 \|\| y < 0 \|\| z < 0 \|\| x >= endX \|\| y >= endY \|\| z >= endZ ) {
		shared[ index ] = fetchBoundary( x, y, z );
		}
		else {
		@@ -285,9 +285,9 @@ convolution3D( Index kernelWidth,
		y = iy - radiusY;
		z = iz + radiusZ;

		index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
		index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;

		if( x >= endX \|\| y < 0 \|\| z >= endZ ) {
		if( x < 0 \|\| y < 0 \|\| z < 0 \|\| x >= endX \|\| y >= endY \|\| z >= endZ ) {
		shared[ index ] = fetchBoundary( x, y, z );
		}
		else {
		@@ -299,9 +299,9 @@ convolution3D( Index kernelWidth,
		y = iy + radiusY;
		z = iz + radiusZ;

		index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
		index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;

		if( x < 0 \|\| y >= endY \|\| z >= endZ ) {
		if( x < 0 \|\| y < 0 \|\| z < 0 \|\| x >= endX \|\| y >= endY \|\| z >= endZ ) {
		shared[ index ] = fetchBoundary( x, y, z );
		}
		else {
		@@ -313,9 +313,9 @@ convolution3D( Index kernelWidth,
		y = iy + radiusY;
		z = iz + radiusZ;

		index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
		index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;

		if( x >= endX \|\| y >= endY \|\| z >= endZ ) {
		if( x < 0 \|\| y < 0 \|\| z < 0 \|\| x >= endX \|\| y >= endY \|\| z >= endZ ) {
		shared[ index ] = fetchBoundary( x, y, z );
		}
		else {
		@@ -324,15 +324,18 @@ convolution3D( Index kernelWidth,

		__syncthreads();

		if( ix >= endX \|\| iy >= endY \|\| iz >= endZ )
		return;

		Real result = 0;

		for( Index k = 0; k <= radiusZ; k++ ) {
		for( Index k = 0; k < kernelDepth; k++ ) {
		Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x;

		for( Index j = 0; j <= radiusY; j++ ) {
		for( Index j = 0; j < kernelHeight; j++ ) {
		Index xAlign = ( j + threadIdx.y ) * blockDim.x;

		for( Index i = 0; i <= radiusX; i++ ) {
		for( Index i = 0; i < kernelWidth; i++ ) {
		Index index = i + threadIdx.x + xAlign + xyAlign;

		result = convolve( result, shared[ index ], fetchKernel( i, j, k ) );
		@@ -486,7 +489,7 @@ public:
		TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
		configuration.gridSize.y =
		TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
		configuration.gridSize.y =
		configuration.gridSize.z =
		TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
		}

src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h

+44 −43

Original line number	Diff line number	Diff line
		@@ -49,9 +49,6 @@ convolution1D( Index kernelWidth,
		{
		Index ix = threadIdx.x + blockIdx.x * blockDim.x;

		if( ix >= endX )
		return;

		Index kernelOffset = 2 * kernelWidth;

		Real* data = TNL::Cuda::getSharedMemory< Real >();
		@@ -62,7 +59,7 @@ convolution1D( Index kernelWidth,
		// Left
		Index lhs = ix - radius;

		if( lhs < 0 ) {
		if( lhs < 0 \|\| lhs >= endX ) {
		data[ threadIdx.x ] = fetchBoundary( lhs );
		}
		else {
		@@ -72,7 +69,7 @@ convolution1D( Index kernelWidth,
		// Right
		Index rhs = ix + radius;

		if( rhs >= endX ) {
		if( rhs < 0 \|\| rhs >= endX ) {
		data[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs );
		}
		else {
		@@ -83,6 +80,9 @@ convolution1D( Index kernelWidth,

		__syncthreads();

		if( ix >= endX )
		return;

		Real result = 0;

		#pragma unroll
		@@ -117,9 +117,6 @@ convolution2D( Index kernelWidth,
		Index iy = threadIdx.y + blockIdx.y * blockDim.y;
		Index ix = threadIdx.x + blockIdx.x * blockDim.x;

		if( ix >= endX \|\| iy >= endY )
		return;

		Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 );

		Real* data = TNL::Cuda::getSharedMemory< Real >();
		@@ -138,7 +135,7 @@ convolution2D( Index kernelWidth,

		kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y );

		if( x < 0 \|\| y < 0 ) {
		if( x < 0 \|\| y < 0 \|\| x >= endX \|\| y >= endY ) {
		data[ index ] = fetchBoundary( x, y );
		}
		else {
		@@ -149,9 +146,9 @@ convolution2D( Index kernelWidth,
		x = ix + radiusX;
		y = iy - radiusY;

		index = radiusX + threadIdx.x + threadIdx.y * blockDim.x;
		index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x;

		if( x >= endX \|\| y < 0 ) {
		if( x < 0 \|\| y < 0 \|\| x >= endX \|\| y >= endY ) {
		data[ index ] = fetchBoundary( x, y );
		}
		else {
		@@ -162,9 +159,9 @@ convolution2D( Index kernelWidth,
		x = ix - radiusX;
		y = iy + radiusY;

		index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x;
		index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x;

		if( x < 0 \|\| y >= endY ) {
		if( x < 0 \|\| y < 0 \|\| x >= endX \|\| y >= endY ) {
		data[ index ] = fetchBoundary( x, y );
		}
		else {
		@@ -175,9 +172,9 @@ convolution2D( Index kernelWidth,
		x = ix + radiusX;
		y = iy + radiusY;

		index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x;
		index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x;

		if( x >= endX \|\| y >= endY ) {
		if( x < 0 \|\| y < 0 \|\| x >= endX \|\| y >= endY ) {
		data[ index ] = fetchBoundary( x, y );
		}
		else {
		@@ -186,15 +183,18 @@ convolution2D( Index kernelWidth,

		__syncthreads();

		if( ix >= endX \|\| iy >= endY )
		return;

		Real result = 0;

		#pragma unroll
		for( Index j = 0; j <= radiusY; j++ ) {
		for( Index j = 0; j < kernelHeight; j++ ) {
		Index elementAlign = ( j + threadIdx.y ) * blockDim.x;
		Index kernelAlign = j * blockDim.x;

		#pragma unroll
		for( Index i = 0; i <= radiusX; i++ ) {
		for( Index i = 0; i < kernelWidth; i++ ) {
		Index elementIndex = i + threadIdx.x + elementAlign;
		Index kernelIndex = i + kernelAlign;

		@@ -230,9 +230,6 @@ convolution3D( Index kernelWidth,
		Index iy = threadIdx.y + blockIdx.y * blockDim.y;
		Index ix = threadIdx.x + blockIdx.x * blockDim.x;

		if( ix >= endX \|\| iy >= endY \|\| iz >= endZ )
		return;

		Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ) * ( 2 * kernelDepth - 1 );

		Real* data = TNL::Cuda::getSharedMemory< Real >();
		@@ -249,11 +246,11 @@ convolution3D( Index kernelWidth,
		y = iy - radiusY;
		z = iz - radiusZ;

		index = threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
		index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;

		kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z );

		if( x < 0 \|\| y < 0 \|\| z < 0 ) {
		if( x < 0 \|\| y < 0 \|\| z < 0 \|\| x >= endX \|\| y >= endY \|\| z >= endZ ) {
		data[ index ] = fetchBoundary( x, y, z );
		}
		else {
		@@ -265,9 +262,9 @@ convolution3D( Index kernelWidth,
		y = iy - radiusY;
		z = iz - radiusZ;

		index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
		index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;

		if( x >= endX \|\| y < 0 \|\| z < 0 ) {
		if( x < 0 \|\| y < 0 \|\| z < 0 \|\| x >= endX \|\| y >= endY \|\| z >= endZ ) {
		data[ index ] = fetchBoundary( x, y, z );
		}
		else {
		@@ -279,9 +276,9 @@ convolution3D( Index kernelWidth,
		y = iy + radiusY;
		z = iz - radiusZ;

		index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
		index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;

		if( x < 0 \|\| y >= endY \|\| z < 0 ) {
		if( x < 0 \|\| y < 0 \|\| z < 0 \|\| x >= endX \|\| y >= endY \|\| z >= endZ ) {
		data[ index ] = fetchBoundary( x, y, z );
		}
		else {
		@@ -293,9 +290,9 @@ convolution3D( Index kernelWidth,
		y = iy - radiusY;
		z = iz + radiusZ;

		index = threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
		index = threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;

		if( x < 0 \|\| y < 0 \|\| z >= endZ ) {
		if( x < 0 \|\| y < 0 \|\| z < 0 \|\| x >= endX \|\| y >= endY \|\| z >= endZ ) {
		data[ index ] = fetchBoundary( x, y, z );
		}
		else {
		@@ -307,9 +304,9 @@ convolution3D( Index kernelWidth,
		y = iy + radiusY;
		z = iz - radiusZ;

		index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
		index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;

		if( x >= endX \|\| y >= endY \|\| z < 0 ) {
		if( x < 0 \|\| y < 0 \|\| z < 0 \|\| x >= endX \|\| y >= endY \|\| z >= endZ) {
		data[ index ] = fetchBoundary( x, y, z );
		}
		else {
		@@ -321,9 +318,9 @@ convolution3D( Index kernelWidth,
		y = iy - radiusY;
		z = iz + radiusZ;

		index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
		index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;

		if( x >= endX \|\| y < 0 \|\| z >= endZ ) {
		if( x < 0 \|\| y < 0 \|\| z < 0 \|\| x >= endX \|\| y >= endY \|\| z >= endZ ) {
		data[ index ] = fetchBoundary( x, y, z );
		}
		else {
		@@ -335,9 +332,9 @@ convolution3D( Index kernelWidth,
		y = iy + radiusY;
		z = iz + radiusZ;

		index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
		index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;

		if( x < 0 \|\| y >= endY \|\| z >= endZ ) {
		if(x < 0 \|\| y < 0 \|\| z < 0 \|\| x >= endX \|\| y >= endY \|\| z >= endZ ) {
		data[ index ] = fetchBoundary( x, y, z );
		}
		else {
		@@ -349,9 +346,9 @@ convolution3D( Index kernelWidth,
		y = iy + radiusY;
		z = iz + radiusZ;

		index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
		index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;

		if( x >= endX \|\| y >= endY \|\| z >= endZ ) {
		if( x < 0 \|\| y < 0 \|\| z < 0 \|\| x >= endX \|\| y >= endY \|\| z >= endZ ) {
		data[ index ] = fetchBoundary( x, y, z );
		}
		else {
		@@ -360,21 +357,25 @@ convolution3D( Index kernelWidth,

		__syncthreads();

		if( ix >= endX \|\| iy >= endY \|\| iz >= endZ )
		return;

		Real result = 0;

		for( Index k = 0; k <= radiusZ; k++ ) {
		#pragma unroll
		for( Index k = 0; k < kernelDepth; k++ ) {
		Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x;
		Index xyKernelAlign = k * blockDim.x * blockDim.y;

		for( Index j = 0; j <= radiusY; j++ ) {
		#pragma unroll
		for( Index j = 0; j < kernelHeight; j++ ) {
		Index xAlign = ( j + threadIdx.y ) * blockDim.x;
		Index xKernelAlign = j * blockDim.x;

		for( Index i = 0; i <= radiusX; i++ ) {
		#pragma unroll
		for( Index i = 0; i < kernelWidth; i++ ) {
		Index elementIndex = i + threadIdx.x + xAlign + xyAlign;
		Index kernelIndex = i + xKernelAlign + xyKernelAlign;

		result = convolve( result, data[ index ], kernel[ kernelIndex ] );
		result = convolve( result, data[ elementIndex ], kernel[ kernelIndex ] );
		}
		}
		}
		@@ -531,7 +532,7 @@ public:
		TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
		configuration.gridSize.y =
		TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
		configuration.gridSize.y =
		configuration.gridSize.z =
		TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
		}

src/Benchmarks/Convolution/kernels/sharedKernel.h

+16 −12

Original line number	Diff line number	Diff line
		@@ -34,9 +34,6 @@ convolution1D( Index kernelWidth,
		{
		Index ix = threadIdx.x + blockIdx.x * blockDim.x;

		if( ix >= endX )
		return;

		Real* shared = TNL::Cuda::getSharedMemory< Real >();

		Index radius = kernelWidth >> 1;
		@@ -46,8 +43,12 @@ convolution1D( Index kernelWidth,

		__syncthreads();

		if( ix >= endX )
		return;

		Real result = 0;

		#pragma unroll
		for( Index i = -radius; i <= radius; i++ ) {
		Index elementIndex = i + ix;
		Index kernelIndex = i + radius;
		@@ -85,9 +86,6 @@ convolution2D( Index kernelWidth,
		Index iy = threadIdx.y + blockIdx.y * blockDim.y;
		Index ix = threadIdx.x + blockIdx.x * blockDim.x;

		if( ix >= endX \|\| iy >= endY )
		return;

		Real* shared = TNL::Cuda::getSharedMemory< Real >();

		Index radiusY = kernelHeight >> 1;
		@@ -100,12 +98,17 @@ convolution2D( Index kernelWidth,

		__syncthreads();

		if( ix >= endX \|\| iy >= endY )
		return;

		Real result = 0;

		#pragma unroll
		for( Index j = -radiusY; j <= radiusY; j++ ) {
		Index elementIndexY = j + iy;
		Index kernelIndexY = j + radiusY;

		#pragma unroll
		for( Index i = -radiusX; i <= radiusX; i++ ) {
		Index elementIndexX = i + ix;
		Index kernelIndexX = i + radiusX;
		@@ -149,9 +152,6 @@ convolution3D( Index kernelWidth,
		Index iy = threadIdx.y + blockIdx.y * blockDim.y;
		Index ix = threadIdx.x + blockIdx.x * blockDim.x;

		if( ix >= endX \|\| iy >= endY \|\| iz >= endZ )
		return;

		Real* shared = TNL::Cuda::getSharedMemory< Real >();

		Index radiusZ = kernelDepth >> 1;
		@@ -160,23 +160,27 @@ convolution3D( Index kernelWidth,

		Index threadIndex = threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z;

		printf( "%d\n", threadIndex );

		// The size of the block is equal to the kernel size
		shared[ threadIndex ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z );

		__syncthreads();

		if( ix >= endX \|\| iy >= endY \|\| iz >= endZ )
		return;

		Real result = 0;

		#pragma unroll
		for( Index k = -radiusZ; k <= radiusZ; k++ ) {
		Index elementIndexZ = k + iz;
		Index kernelIndexZ = k + radiusZ;

		#pragma unroll
		for( Index j = -radiusY; j <= radiusY; j++ ) {
		Index elementIndexY = j + iy;
		Index kernelIndexY = j + radiusY;

		#pragma unroll
		for( Index i = -radiusX; i <= radiusX; i++ ) {
		Index elementIndexX = i + ix;
		Index kernelIndexX = i + radiusX;
		@@ -338,7 +342,7 @@ public:
		TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
		configuration.gridSize.y =
		TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
		configuration.gridSize.y =
		configuration.gridSize.z =
		TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
		}

src/Benchmarks/Convolution/support/DummySolver.h

+7 −2

Original line number	Diff line number	Diff line
		@@ -61,6 +61,11 @@ public:

		DummyTask<int, float, Dimension, Device>::exec(dimension, kernelSize, inputView, resultView, kernelView);

		TNL::Containers::Array< float, TNL::Devices::Host, int > host(result);

		for (int i = 0; i < host.getSize(); i++)
		TNL_ASSERT_EQ(host[i], kernelElementsCount, "Dummy task always sets volume of kernel");

		std::cout << "Everything is fine" << std::endl;
		}

		@@ -72,12 +77,12 @@ public:
		config.addDelimiter( "Grid dimension settings:" );

		for( int i = 0; i < Dimension; i++ )
		config.addEntry< int >( dimensionIds[ i ], dimensionIds[ i ], 512 );
		config.addEntry< int >( dimensionIds[ i ], dimensionIds[ i ], 64 );

		config.addDelimiter( "Kernel settings:" );

		for( int i = 0; i < Dimension; i++ )
		config.addEntry< int >( kernelSizeIds[ i ], kernelSizeIds[ i ] + " (odd) :", 11 );
		config.addEntry< int >( kernelSizeIds[ i ], kernelSizeIds[ i ] + " (odd) :", 9 );

		return config;
		}