Loading src/Benchmarks/Convolution/kernels/naive.h +1 −1 Original line number Diff line number Diff line Loading @@ -305,7 +305,7 @@ public: TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); configuration.gridSize.y = configuration.gridSize.z = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); } Loading src/Benchmarks/Convolution/kernels/sharedData.h +44 −41 Original line number Diff line number Diff line Loading @@ -33,16 +33,13 @@ convolution1D( Index kernelWidth, { Index ix = threadIdx.x + blockIdx.x * blockDim.x; if( ix >= endX ) return; Real* shared = TNL::Cuda::getSharedMemory< Real >(); Index radius = kernelWidth >> 1; // Left Index lhs = ix - radius; if( lhs < 0 ) { if( lhs < 0 || lhs >= endX ) { shared[ threadIdx.x ] = fetchBoundary( lhs ); } else { Loading @@ -52,7 +49,7 @@ convolution1D( Index kernelWidth, // Right Index rhs = ix + radius; if( rhs >= endX ) { if( rhs < 0 || rhs >= endX ) { shared[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs ); } else { Loading @@ -61,6 +58,9 @@ convolution1D( Index kernelWidth, __syncthreads(); if( ix >= endX ) return; Real result = 0; #pragma unroll Loading Loading @@ -95,9 +95,6 @@ convolution2D( Index kernelWidth, Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; if( ix >= endX || iy >= endY ) return; Real* shared = TNL::Cuda::getSharedMemory< Real >(); Index radiusY = kernelHeight >> 1; Loading @@ -105,13 +102,16 @@ convolution2D( Index kernelWidth, Index x, y, index; Index kernelHorizontalPadding = kernelWidth == 1 ? 0 : kernelWidth; Index kernelVerticalPadding = kernelHeight == 1 ? 0 : kernelHeight; // Top Left x = ix - radiusX; y = iy - radiusY; index = threadIdx.x + threadIdx.y * blockDim.x; if( x < 0 || y < 0 ) { if( x < 0 || y < 0 || x >= endX || y >= endY ) { shared[ index ] = fetchBoundary( x, y ); } else { Loading @@ -122,9 +122,9 @@ convolution2D( Index kernelWidth, x = ix + radiusX; y = iy - radiusY; index = radiusX + threadIdx.x + threadIdx.y * blockDim.x; index = kernelHorizontalPadding + threadIdx.x + threadIdx.y * blockDim.x; if( x >= endX || y < 0 ) { if( x < 0 || y < 0 || x >= endX || y >= endY ) { shared[ index ] = fetchBoundary( x, y ); } else { Loading @@ -135,9 +135,9 @@ convolution2D( Index kernelWidth, x = ix - radiusX; y = iy + radiusY; index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x; index = threadIdx.x + ( kernelVerticalPadding + threadIdx.y ) * blockDim.x; if( x < 0 || y >= endY ) { if(x < 0 || y < 0 || x >= endX || y >= endY ) { shared[ index ] = fetchBoundary( x, y ); } else { Loading @@ -148,9 +148,9 @@ convolution2D( Index kernelWidth, x = ix + radiusX; y = iy + radiusY; index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x; index = kernelHorizontalPadding + threadIdx.x + ( kernelVerticalPadding + threadIdx.y ) * blockDim.x; if( x >= endX || y >= endY ) { if( x < 0 || y < 0 || x >= endX || y >= endY ) { shared[ index ] = fetchBoundary( x, y ); } else { Loading @@ -159,12 +159,15 @@ convolution2D( Index kernelWidth, __syncthreads(); if( ix >= endX || iy >= endY ) return; Real result = 0; for( Index j = 0; j <= radiusY; j++ ) { for( Index j = 0; j < kernelHeight; j++ ) { Index align = ( j + threadIdx.y ) * blockDim.x; for( Index i = 0; i <= radiusX; i++ ) { for( Index i = 0; i < kernelWidth; i++ ) { Index index = i + threadIdx.x + align; result = convolve( result, shared[ index ], fetchKernel( i, j ) ); Loading Loading @@ -199,9 +202,6 @@ convolution3D( Index kernelWidth, Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; if( ix >= endX || iy >= endY || iz >= endZ ) return; Real* shared = TNL::Cuda::getSharedMemory< Real >(); Index radiusZ = kernelDepth >> 1; Loading @@ -215,9 +215,9 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz - radiusZ; index = threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; if( x < 0 || y < 0 || z < 0 ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -229,9 +229,9 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz - radiusZ; index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; if( x >= endX || y < 0 || z < 0 ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -243,9 +243,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz - radiusZ; index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; if( x < 0 || y >= endY || z < 0 ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -257,9 +257,9 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz + radiusZ; index = threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; index = threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; if( x < 0 || y < 0 || z >= endZ ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -271,9 +271,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz - radiusZ; index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; if( x >= endX || y >= endY || z < 0 ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -285,9 +285,9 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz + radiusZ; index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; if( x >= endX || y < 0 || z >= endZ ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -299,9 +299,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz + radiusZ; index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; if( x < 0 || y >= endY || z >= endZ ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -313,9 +313,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz + radiusZ; index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; if( x >= endX || y >= endY || z >= endZ ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -324,15 +324,18 @@ convolution3D( Index kernelWidth, __syncthreads(); if( ix >= endX || iy >= endY || iz >= endZ ) return; Real result = 0; for( Index k = 0; k <= radiusZ; k++ ) { for( Index k = 0; k < kernelDepth; k++ ) { Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x; for( Index j = 0; j <= radiusY; j++ ) { for( Index j = 0; j < kernelHeight; j++ ) { Index xAlign = ( j + threadIdx.y ) * blockDim.x; for( Index i = 0; i <= radiusX; i++ ) { for( Index i = 0; i < kernelWidth; i++ ) { Index index = i + threadIdx.x + xAlign + xyAlign; result = convolve( result, shared[ index ], fetchKernel( i, j, k ) ); Loading Loading @@ -486,7 +489,7 @@ public: TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); configuration.gridSize.y = configuration.gridSize.z = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); } Loading src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h +44 −43 Original line number Diff line number Diff line Loading @@ -49,9 +49,6 @@ convolution1D( Index kernelWidth, { Index ix = threadIdx.x + blockIdx.x * blockDim.x; if( ix >= endX ) return; Index kernelOffset = 2 * kernelWidth; Real* data = TNL::Cuda::getSharedMemory< Real >(); Loading @@ -62,7 +59,7 @@ convolution1D( Index kernelWidth, // Left Index lhs = ix - radius; if( lhs < 0 ) { if( lhs < 0 || lhs >= endX ) { data[ threadIdx.x ] = fetchBoundary( lhs ); } else { Loading @@ -72,7 +69,7 @@ convolution1D( Index kernelWidth, // Right Index rhs = ix + radius; if( rhs >= endX ) { if( rhs < 0 || rhs >= endX ) { data[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs ); } else { Loading @@ -83,6 +80,9 @@ convolution1D( Index kernelWidth, __syncthreads(); if( ix >= endX ) return; Real result = 0; #pragma unroll Loading Loading @@ -117,9 +117,6 @@ convolution2D( Index kernelWidth, Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; if( ix >= endX || iy >= endY ) return; Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ); Real* data = TNL::Cuda::getSharedMemory< Real >(); Loading @@ -138,7 +135,7 @@ convolution2D( Index kernelWidth, kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y ); if( x < 0 || y < 0 ) { if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { Loading @@ -149,9 +146,9 @@ convolution2D( Index kernelWidth, x = ix + radiusX; y = iy - radiusY; index = radiusX + threadIdx.x + threadIdx.y * blockDim.x; index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x; if( x >= endX || y < 0 ) { if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { Loading @@ -162,9 +159,9 @@ convolution2D( Index kernelWidth, x = ix - radiusX; y = iy + radiusY; index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x; index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x; if( x < 0 || y >= endY ) { if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { Loading @@ -175,9 +172,9 @@ convolution2D( Index kernelWidth, x = ix + radiusX; y = iy + radiusY; index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x; if( x >= endX || y >= endY ) { if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { Loading @@ -186,15 +183,18 @@ convolution2D( Index kernelWidth, __syncthreads(); if( ix >= endX || iy >= endY ) return; Real result = 0; #pragma unroll for( Index j = 0; j <= radiusY; j++ ) { for( Index j = 0; j < kernelHeight; j++ ) { Index elementAlign = ( j + threadIdx.y ) * blockDim.x; Index kernelAlign = j * blockDim.x; #pragma unroll for( Index i = 0; i <= radiusX; i++ ) { for( Index i = 0; i < kernelWidth; i++ ) { Index elementIndex = i + threadIdx.x + elementAlign; Index kernelIndex = i + kernelAlign; Loading Loading @@ -230,9 +230,6 @@ convolution3D( Index kernelWidth, Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; if( ix >= endX || iy >= endY || iz >= endZ ) return; Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ) * ( 2 * kernelDepth - 1 ); Real* data = TNL::Cuda::getSharedMemory< Real >(); Loading @@ -249,11 +246,11 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz - radiusZ; index = threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z ); if( x < 0 || y < 0 || z < 0 ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -265,9 +262,9 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz - radiusZ; index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; if( x >= endX || y < 0 || z < 0 ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -279,9 +276,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz - radiusZ; index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; if( x < 0 || y >= endY || z < 0 ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -293,9 +290,9 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz + radiusZ; index = threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; index = threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; if( x < 0 || y < 0 || z >= endZ ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -307,9 +304,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz - radiusZ; index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; if( x >= endX || y >= endY || z < 0 ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ) { data[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -321,9 +318,9 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz + radiusZ; index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; if( x >= endX || y < 0 || z >= endZ ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -335,9 +332,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz + radiusZ; index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; if( x < 0 || y >= endY || z >= endZ ) { if(x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -349,9 +346,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz + radiusZ; index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; if( x >= endX || y >= endY || z >= endZ ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -360,21 +357,25 @@ convolution3D( Index kernelWidth, __syncthreads(); if( ix >= endX || iy >= endY || iz >= endZ ) return; Real result = 0; for( Index k = 0; k <= radiusZ; k++ ) { #pragma unroll for( Index k = 0; k < kernelDepth; k++ ) { Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x; Index xyKernelAlign = k * blockDim.x * blockDim.y; for( Index j = 0; j <= radiusY; j++ ) { #pragma unroll for( Index j = 0; j < kernelHeight; j++ ) { Index xAlign = ( j + threadIdx.y ) * blockDim.x; Index xKernelAlign = j * blockDim.x; for( Index i = 0; i <= radiusX; i++ ) { #pragma unroll for( Index i = 0; i < kernelWidth; i++ ) { Index elementIndex = i + threadIdx.x + xAlign + xyAlign; Index kernelIndex = i + xKernelAlign + xyKernelAlign; result = convolve( result, data[ index ], kernel[ kernelIndex ] ); result = convolve( result, data[ elementIndex ], kernel[ kernelIndex ] ); } } } Loading Loading @@ -531,7 +532,7 @@ public: TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); configuration.gridSize.y = configuration.gridSize.z = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); } Loading src/Benchmarks/Convolution/kernels/sharedKernel.h +16 −12 Original line number Diff line number Diff line Loading @@ -34,9 +34,6 @@ convolution1D( Index kernelWidth, { Index ix = threadIdx.x + blockIdx.x * blockDim.x; if( ix >= endX ) return; Real* shared = TNL::Cuda::getSharedMemory< Real >(); Index radius = kernelWidth >> 1; Loading @@ -46,8 +43,12 @@ convolution1D( Index kernelWidth, __syncthreads(); if( ix >= endX ) return; Real result = 0; #pragma unroll for( Index i = -radius; i <= radius; i++ ) { Index elementIndex = i + ix; Index kernelIndex = i + radius; Loading Loading @@ -85,9 +86,6 @@ convolution2D( Index kernelWidth, Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; if( ix >= endX || iy >= endY ) return; Real* shared = TNL::Cuda::getSharedMemory< Real >(); Index radiusY = kernelHeight >> 1; Loading @@ -100,12 +98,17 @@ convolution2D( Index kernelWidth, __syncthreads(); if( ix >= endX || iy >= endY ) return; Real result = 0; #pragma unroll for( Index j = -radiusY; j <= radiusY; j++ ) { Index elementIndexY = j + iy; Index kernelIndexY = j + radiusY; #pragma unroll for( Index i = -radiusX; i <= radiusX; i++ ) { Index elementIndexX = i + ix; Index kernelIndexX = i + radiusX; Loading Loading @@ -149,9 +152,6 @@ convolution3D( Index kernelWidth, Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; if( ix >= endX || iy >= endY || iz >= endZ ) return; Real* shared = TNL::Cuda::getSharedMemory< Real >(); Index radiusZ = kernelDepth >> 1; Loading @@ -160,23 +160,27 @@ convolution3D( Index kernelWidth, Index threadIndex = threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z; printf( "%d\n", threadIndex ); // The size of the block is equal to the kernel size shared[ threadIndex ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z ); __syncthreads(); if( ix >= endX || iy >= endY || iz >= endZ ) return; Real result = 0; #pragma unroll for( Index k = -radiusZ; k <= radiusZ; k++ ) { Index elementIndexZ = k + iz; Index kernelIndexZ = k + radiusZ; #pragma unroll for( Index j = -radiusY; j <= radiusY; j++ ) { Index elementIndexY = j + iy; Index kernelIndexY = j + radiusY; #pragma unroll for( Index i = -radiusX; i <= radiusX; i++ ) { Index elementIndexX = i + ix; Index kernelIndexX = i + radiusX; Loading Loading @@ -338,7 +342,7 @@ public: TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); configuration.gridSize.y = configuration.gridSize.z = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); } Loading src/Benchmarks/Convolution/support/DummySolver.h +7 −2 Original line number Diff line number Diff line Loading @@ -61,6 +61,11 @@ public: DummyTask<int, float, Dimension, Device>::exec(dimension, kernelSize, inputView, resultView, kernelView); TNL::Containers::Array< float, TNL::Devices::Host, int > host(result); for (int i = 0; i < host.getSize(); i++) TNL_ASSERT_EQ(host[i], kernelElementsCount, "Dummy task always sets volume of kernel"); std::cout << "Everything is fine" << std::endl; } Loading @@ -72,12 +77,12 @@ public: config.addDelimiter( "Grid dimension settings:" ); for( int i = 0; i < Dimension; i++ ) config.addEntry< int >( dimensionIds[ i ], dimensionIds[ i ], 512 ); config.addEntry< int >( dimensionIds[ i ], dimensionIds[ i ], 64 ); config.addDelimiter( "Kernel settings:" ); for( int i = 0; i < Dimension; i++ ) config.addEntry< int >( kernelSizeIds[ i ], kernelSizeIds[ i ] + " (odd) :", 11 ); config.addEntry< int >( kernelSizeIds[ i ], kernelSizeIds[ i ] + " (odd) :", 9 ); return config; } Loading Loading
src/Benchmarks/Convolution/kernels/naive.h +1 −1 Original line number Diff line number Diff line Loading @@ -305,7 +305,7 @@ public: TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); configuration.gridSize.y = configuration.gridSize.z = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); } Loading
src/Benchmarks/Convolution/kernels/sharedData.h +44 −41 Original line number Diff line number Diff line Loading @@ -33,16 +33,13 @@ convolution1D( Index kernelWidth, { Index ix = threadIdx.x + blockIdx.x * blockDim.x; if( ix >= endX ) return; Real* shared = TNL::Cuda::getSharedMemory< Real >(); Index radius = kernelWidth >> 1; // Left Index lhs = ix - radius; if( lhs < 0 ) { if( lhs < 0 || lhs >= endX ) { shared[ threadIdx.x ] = fetchBoundary( lhs ); } else { Loading @@ -52,7 +49,7 @@ convolution1D( Index kernelWidth, // Right Index rhs = ix + radius; if( rhs >= endX ) { if( rhs < 0 || rhs >= endX ) { shared[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs ); } else { Loading @@ -61,6 +58,9 @@ convolution1D( Index kernelWidth, __syncthreads(); if( ix >= endX ) return; Real result = 0; #pragma unroll Loading Loading @@ -95,9 +95,6 @@ convolution2D( Index kernelWidth, Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; if( ix >= endX || iy >= endY ) return; Real* shared = TNL::Cuda::getSharedMemory< Real >(); Index radiusY = kernelHeight >> 1; Loading @@ -105,13 +102,16 @@ convolution2D( Index kernelWidth, Index x, y, index; Index kernelHorizontalPadding = kernelWidth == 1 ? 0 : kernelWidth; Index kernelVerticalPadding = kernelHeight == 1 ? 0 : kernelHeight; // Top Left x = ix - radiusX; y = iy - radiusY; index = threadIdx.x + threadIdx.y * blockDim.x; if( x < 0 || y < 0 ) { if( x < 0 || y < 0 || x >= endX || y >= endY ) { shared[ index ] = fetchBoundary( x, y ); } else { Loading @@ -122,9 +122,9 @@ convolution2D( Index kernelWidth, x = ix + radiusX; y = iy - radiusY; index = radiusX + threadIdx.x + threadIdx.y * blockDim.x; index = kernelHorizontalPadding + threadIdx.x + threadIdx.y * blockDim.x; if( x >= endX || y < 0 ) { if( x < 0 || y < 0 || x >= endX || y >= endY ) { shared[ index ] = fetchBoundary( x, y ); } else { Loading @@ -135,9 +135,9 @@ convolution2D( Index kernelWidth, x = ix - radiusX; y = iy + radiusY; index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x; index = threadIdx.x + ( kernelVerticalPadding + threadIdx.y ) * blockDim.x; if( x < 0 || y >= endY ) { if(x < 0 || y < 0 || x >= endX || y >= endY ) { shared[ index ] = fetchBoundary( x, y ); } else { Loading @@ -148,9 +148,9 @@ convolution2D( Index kernelWidth, x = ix + radiusX; y = iy + radiusY; index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x; index = kernelHorizontalPadding + threadIdx.x + ( kernelVerticalPadding + threadIdx.y ) * blockDim.x; if( x >= endX || y >= endY ) { if( x < 0 || y < 0 || x >= endX || y >= endY ) { shared[ index ] = fetchBoundary( x, y ); } else { Loading @@ -159,12 +159,15 @@ convolution2D( Index kernelWidth, __syncthreads(); if( ix >= endX || iy >= endY ) return; Real result = 0; for( Index j = 0; j <= radiusY; j++ ) { for( Index j = 0; j < kernelHeight; j++ ) { Index align = ( j + threadIdx.y ) * blockDim.x; for( Index i = 0; i <= radiusX; i++ ) { for( Index i = 0; i < kernelWidth; i++ ) { Index index = i + threadIdx.x + align; result = convolve( result, shared[ index ], fetchKernel( i, j ) ); Loading Loading @@ -199,9 +202,6 @@ convolution3D( Index kernelWidth, Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; if( ix >= endX || iy >= endY || iz >= endZ ) return; Real* shared = TNL::Cuda::getSharedMemory< Real >(); Index radiusZ = kernelDepth >> 1; Loading @@ -215,9 +215,9 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz - radiusZ; index = threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; if( x < 0 || y < 0 || z < 0 ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -229,9 +229,9 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz - radiusZ; index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; if( x >= endX || y < 0 || z < 0 ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -243,9 +243,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz - radiusZ; index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; if( x < 0 || y >= endY || z < 0 ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -257,9 +257,9 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz + radiusZ; index = threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; index = threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; if( x < 0 || y < 0 || z >= endZ ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -271,9 +271,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz - radiusZ; index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; if( x >= endX || y >= endY || z < 0 ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -285,9 +285,9 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz + radiusZ; index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; if( x >= endX || y < 0 || z >= endZ ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -299,9 +299,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz + radiusZ; index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; if( x < 0 || y >= endY || z >= endZ ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -313,9 +313,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz + radiusZ; index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; if( x >= endX || y >= endY || z >= endZ ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -324,15 +324,18 @@ convolution3D( Index kernelWidth, __syncthreads(); if( ix >= endX || iy >= endY || iz >= endZ ) return; Real result = 0; for( Index k = 0; k <= radiusZ; k++ ) { for( Index k = 0; k < kernelDepth; k++ ) { Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x; for( Index j = 0; j <= radiusY; j++ ) { for( Index j = 0; j < kernelHeight; j++ ) { Index xAlign = ( j + threadIdx.y ) * blockDim.x; for( Index i = 0; i <= radiusX; i++ ) { for( Index i = 0; i < kernelWidth; i++ ) { Index index = i + threadIdx.x + xAlign + xyAlign; result = convolve( result, shared[ index ], fetchKernel( i, j, k ) ); Loading Loading @@ -486,7 +489,7 @@ public: TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); configuration.gridSize.y = configuration.gridSize.z = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); } Loading
src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h +44 −43 Original line number Diff line number Diff line Loading @@ -49,9 +49,6 @@ convolution1D( Index kernelWidth, { Index ix = threadIdx.x + blockIdx.x * blockDim.x; if( ix >= endX ) return; Index kernelOffset = 2 * kernelWidth; Real* data = TNL::Cuda::getSharedMemory< Real >(); Loading @@ -62,7 +59,7 @@ convolution1D( Index kernelWidth, // Left Index lhs = ix - radius; if( lhs < 0 ) { if( lhs < 0 || lhs >= endX ) { data[ threadIdx.x ] = fetchBoundary( lhs ); } else { Loading @@ -72,7 +69,7 @@ convolution1D( Index kernelWidth, // Right Index rhs = ix + radius; if( rhs >= endX ) { if( rhs < 0 || rhs >= endX ) { data[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs ); } else { Loading @@ -83,6 +80,9 @@ convolution1D( Index kernelWidth, __syncthreads(); if( ix >= endX ) return; Real result = 0; #pragma unroll Loading Loading @@ -117,9 +117,6 @@ convolution2D( Index kernelWidth, Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; if( ix >= endX || iy >= endY ) return; Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ); Real* data = TNL::Cuda::getSharedMemory< Real >(); Loading @@ -138,7 +135,7 @@ convolution2D( Index kernelWidth, kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y ); if( x < 0 || y < 0 ) { if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { Loading @@ -149,9 +146,9 @@ convolution2D( Index kernelWidth, x = ix + radiusX; y = iy - radiusY; index = radiusX + threadIdx.x + threadIdx.y * blockDim.x; index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x; if( x >= endX || y < 0 ) { if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { Loading @@ -162,9 +159,9 @@ convolution2D( Index kernelWidth, x = ix - radiusX; y = iy + radiusY; index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x; index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x; if( x < 0 || y >= endY ) { if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { Loading @@ -175,9 +172,9 @@ convolution2D( Index kernelWidth, x = ix + radiusX; y = iy + radiusY; index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x; if( x >= endX || y >= endY ) { if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { Loading @@ -186,15 +183,18 @@ convolution2D( Index kernelWidth, __syncthreads(); if( ix >= endX || iy >= endY ) return; Real result = 0; #pragma unroll for( Index j = 0; j <= radiusY; j++ ) { for( Index j = 0; j < kernelHeight; j++ ) { Index elementAlign = ( j + threadIdx.y ) * blockDim.x; Index kernelAlign = j * blockDim.x; #pragma unroll for( Index i = 0; i <= radiusX; i++ ) { for( Index i = 0; i < kernelWidth; i++ ) { Index elementIndex = i + threadIdx.x + elementAlign; Index kernelIndex = i + kernelAlign; Loading Loading @@ -230,9 +230,6 @@ convolution3D( Index kernelWidth, Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; if( ix >= endX || iy >= endY || iz >= endZ ) return; Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ) * ( 2 * kernelDepth - 1 ); Real* data = TNL::Cuda::getSharedMemory< Real >(); Loading @@ -249,11 +246,11 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz - radiusZ; index = threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z ); if( x < 0 || y < 0 || z < 0 ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -265,9 +262,9 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz - radiusZ; index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; if( x >= endX || y < 0 || z < 0 ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -279,9 +276,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz - radiusZ; index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; if( x < 0 || y >= endY || z < 0 ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -293,9 +290,9 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz + radiusZ; index = threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; index = threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; if( x < 0 || y < 0 || z >= endZ ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -307,9 +304,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz - radiusZ; index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; if( x >= endX || y >= endY || z < 0 ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ) { data[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -321,9 +318,9 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz + radiusZ; index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; if( x >= endX || y < 0 || z >= endZ ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -335,9 +332,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz + radiusZ; index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; if( x < 0 || y >= endY || z >= endZ ) { if(x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -349,9 +346,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz + radiusZ; index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; if( x >= endX || y >= endY || z >= endZ ) { if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { Loading @@ -360,21 +357,25 @@ convolution3D( Index kernelWidth, __syncthreads(); if( ix >= endX || iy >= endY || iz >= endZ ) return; Real result = 0; for( Index k = 0; k <= radiusZ; k++ ) { #pragma unroll for( Index k = 0; k < kernelDepth; k++ ) { Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x; Index xyKernelAlign = k * blockDim.x * blockDim.y; for( Index j = 0; j <= radiusY; j++ ) { #pragma unroll for( Index j = 0; j < kernelHeight; j++ ) { Index xAlign = ( j + threadIdx.y ) * blockDim.x; Index xKernelAlign = j * blockDim.x; for( Index i = 0; i <= radiusX; i++ ) { #pragma unroll for( Index i = 0; i < kernelWidth; i++ ) { Index elementIndex = i + threadIdx.x + xAlign + xyAlign; Index kernelIndex = i + xKernelAlign + xyKernelAlign; result = convolve( result, data[ index ], kernel[ kernelIndex ] ); result = convolve( result, data[ elementIndex ], kernel[ kernelIndex ] ); } } } Loading Loading @@ -531,7 +532,7 @@ public: TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); configuration.gridSize.y = configuration.gridSize.z = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); } Loading
src/Benchmarks/Convolution/kernels/sharedKernel.h +16 −12 Original line number Diff line number Diff line Loading @@ -34,9 +34,6 @@ convolution1D( Index kernelWidth, { Index ix = threadIdx.x + blockIdx.x * blockDim.x; if( ix >= endX ) return; Real* shared = TNL::Cuda::getSharedMemory< Real >(); Index radius = kernelWidth >> 1; Loading @@ -46,8 +43,12 @@ convolution1D( Index kernelWidth, __syncthreads(); if( ix >= endX ) return; Real result = 0; #pragma unroll for( Index i = -radius; i <= radius; i++ ) { Index elementIndex = i + ix; Index kernelIndex = i + radius; Loading Loading @@ -85,9 +86,6 @@ convolution2D( Index kernelWidth, Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; if( ix >= endX || iy >= endY ) return; Real* shared = TNL::Cuda::getSharedMemory< Real >(); Index radiusY = kernelHeight >> 1; Loading @@ -100,12 +98,17 @@ convolution2D( Index kernelWidth, __syncthreads(); if( ix >= endX || iy >= endY ) return; Real result = 0; #pragma unroll for( Index j = -radiusY; j <= radiusY; j++ ) { Index elementIndexY = j + iy; Index kernelIndexY = j + radiusY; #pragma unroll for( Index i = -radiusX; i <= radiusX; i++ ) { Index elementIndexX = i + ix; Index kernelIndexX = i + radiusX; Loading Loading @@ -149,9 +152,6 @@ convolution3D( Index kernelWidth, Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; if( ix >= endX || iy >= endY || iz >= endZ ) return; Real* shared = TNL::Cuda::getSharedMemory< Real >(); Index radiusZ = kernelDepth >> 1; Loading @@ -160,23 +160,27 @@ convolution3D( Index kernelWidth, Index threadIndex = threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z; printf( "%d\n", threadIndex ); // The size of the block is equal to the kernel size shared[ threadIndex ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z ); __syncthreads(); if( ix >= endX || iy >= endY || iz >= endZ ) return; Real result = 0; #pragma unroll for( Index k = -radiusZ; k <= radiusZ; k++ ) { Index elementIndexZ = k + iz; Index kernelIndexZ = k + radiusZ; #pragma unroll for( Index j = -radiusY; j <= radiusY; j++ ) { Index elementIndexY = j + iy; Index kernelIndexY = j + radiusY; #pragma unroll for( Index i = -radiusX; i <= radiusX; i++ ) { Index elementIndexX = i + ix; Index kernelIndexX = i + radiusX; Loading Loading @@ -338,7 +342,7 @@ public: TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); configuration.gridSize.y = configuration.gridSize.z = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); } Loading
src/Benchmarks/Convolution/support/DummySolver.h +7 −2 Original line number Diff line number Diff line Loading @@ -61,6 +61,11 @@ public: DummyTask<int, float, Dimension, Device>::exec(dimension, kernelSize, inputView, resultView, kernelView); TNL::Containers::Array< float, TNL::Devices::Host, int > host(result); for (int i = 0; i < host.getSize(); i++) TNL_ASSERT_EQ(host[i], kernelElementsCount, "Dummy task always sets volume of kernel"); std::cout << "Everything is fine" << std::endl; } Loading @@ -72,12 +77,12 @@ public: config.addDelimiter( "Grid dimension settings:" ); for( int i = 0; i < Dimension; i++ ) config.addEntry< int >( dimensionIds[ i ], dimensionIds[ i ], 512 ); config.addEntry< int >( dimensionIds[ i ], dimensionIds[ i ], 64 ); config.addDelimiter( "Kernel settings:" ); for( int i = 0; i < Dimension; i++ ) config.addEntry< int >( kernelSizeIds[ i ], kernelSizeIds[ i ] + " (odd) :", 11 ); config.addEntry< int >( kernelSizeIds[ i ], kernelSizeIds[ i ] + " (odd) :", 9 ); return config; } Loading