Commit 6860696c authored by Yury Hayeu's avatar Yury Hayeu
Browse files

Fix convolution kernel execution

parent 5d0dac39
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -305,7 +305,7 @@ public:
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
      configuration.gridSize.y =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
      configuration.gridSize.y =
      configuration.gridSize.z =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
   }

+44 −41
Original line number Diff line number Diff line
@@ -33,16 +33,13 @@ convolution1D( Index kernelWidth,
{
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   if( ix >= endX )
      return;

   Real* shared = TNL::Cuda::getSharedMemory< Real >();
   Index radius = kernelWidth >> 1;

   // Left
   Index lhs = ix - radius;

   if( lhs < 0 ) {
   if( lhs < 0 || lhs >= endX ) {
      shared[ threadIdx.x ] = fetchBoundary( lhs );
   }
   else {
@@ -52,7 +49,7 @@ convolution1D( Index kernelWidth,
   // Right
   Index rhs = ix + radius;

   if( rhs >= endX ) {
   if( rhs < 0 || rhs >= endX ) {
      shared[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs );
   }
   else {
@@ -61,6 +58,9 @@ convolution1D( Index kernelWidth,

   __syncthreads();

   if( ix >= endX )
      return;

   Real result = 0;

   #pragma unroll
@@ -95,9 +95,6 @@ convolution2D( Index kernelWidth,
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   if( ix >= endX || iy >= endY )
      return;

   Real* shared = TNL::Cuda::getSharedMemory< Real >();

   Index radiusY = kernelHeight >> 1;
@@ -105,13 +102,16 @@ convolution2D( Index kernelWidth,

   Index x, y, index;

   Index kernelHorizontalPadding = kernelWidth == 1 ? 0 : kernelWidth;
   Index kernelVerticalPadding = kernelHeight == 1 ? 0 : kernelHeight;

   // Top Left
   x = ix - radiusX;
   y = iy - radiusY;

   index = threadIdx.x + threadIdx.y * blockDim.x;

   if( x < 0 || y < 0 ) {
   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
      shared[ index ] = fetchBoundary( x, y );
   }
   else {
@@ -122,9 +122,9 @@ convolution2D( Index kernelWidth,
   x = ix + radiusX;
   y = iy - radiusY;

   index = radiusX + threadIdx.x + threadIdx.y * blockDim.x;
   index = kernelHorizontalPadding + threadIdx.x + threadIdx.y * blockDim.x;

   if( x >= endX || y < 0 ) {
   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
      shared[ index ] = fetchBoundary( x, y );
   }
   else {
@@ -135,9 +135,9 @@ convolution2D( Index kernelWidth,
   x = ix - radiusX;
   y = iy + radiusY;

   index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x;
   index = threadIdx.x + ( kernelVerticalPadding + threadIdx.y ) * blockDim.x;

   if( x < 0 || y >= endY ) {
   if(x < 0 || y < 0 || x >= endX || y >= endY ) {
      shared[ index ] = fetchBoundary( x, y );
   }
   else {
@@ -148,9 +148,9 @@ convolution2D( Index kernelWidth,
   x = ix + radiusX;
   y = iy + radiusY;

   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x;
   index = kernelHorizontalPadding + threadIdx.x + ( kernelVerticalPadding + threadIdx.y ) * blockDim.x;

   if( x >= endX || y >= endY ) {
   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
      shared[ index ] = fetchBoundary( x, y );
   }
   else {
@@ -159,12 +159,15 @@ convolution2D( Index kernelWidth,

   __syncthreads();

   if( ix >= endX || iy >= endY )
      return;

   Real result = 0;

   for( Index j = 0; j <= radiusY; j++ ) {
   for( Index j = 0; j < kernelHeight; j++ ) {
      Index align = ( j + threadIdx.y ) * blockDim.x;

      for( Index i = 0; i <= radiusX; i++ ) {
      for( Index i = 0; i < kernelWidth; i++ ) {
         Index index = i + threadIdx.x + align;

         result = convolve( result, shared[ index ], fetchKernel( i, j ) );
@@ -199,9 +202,6 @@ convolution3D( Index kernelWidth,
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   if( ix >= endX || iy >= endY || iz >= endZ )
      return;

   Real* shared = TNL::Cuda::getSharedMemory< Real >();

   Index radiusZ = kernelDepth >> 1;
@@ -215,9 +215,9 @@ convolution3D( Index kernelWidth,
   y = iy - radiusY;
   z = iz - radiusZ;

   index = threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
   index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;

   if( x < 0 || y < 0 || z < 0 ) {
   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      shared[ index ] = fetchBoundary( x, y, z );
   }
   else {
@@ -229,9 +229,9 @@ convolution3D( Index kernelWidth,
   y = iy - radiusY;
   z = iz - radiusZ;

   index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
   index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;

   if( x >= endX || y < 0 || z < 0 ) {
   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      shared[ index ] = fetchBoundary( x, y, z );
   }
   else {
@@ -243,9 +243,9 @@ convolution3D( Index kernelWidth,
   y = iy + radiusY;
   z = iz - radiusZ;

   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;

   if( x < 0 || y >= endY || z < 0 ) {
   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      shared[ index ] = fetchBoundary( x, y, z );
   }
   else {
@@ -257,9 +257,9 @@ convolution3D( Index kernelWidth,
   y = iy - radiusY;
   z = iz + radiusZ;

   index = threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
   index = threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;

   if( x < 0 || y < 0 || z >= endZ ) {
   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      shared[ index ] = fetchBoundary( x, y, z );
   }
   else {
@@ -271,9 +271,9 @@ convolution3D( Index kernelWidth,
   y = iy + radiusY;
   z = iz - radiusZ;

   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;

   if( x >= endX || y >= endY || z < 0 ) {
   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      shared[ index ] = fetchBoundary( x, y, z );
   }
   else {
@@ -285,9 +285,9 @@ convolution3D( Index kernelWidth,
   y = iy - radiusY;
   z = iz + radiusZ;

   index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
   index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;

   if( x >= endX || y < 0 || z >= endZ ) {
   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      shared[ index ] = fetchBoundary( x, y, z );
   }
   else {
@@ -299,9 +299,9 @@ convolution3D( Index kernelWidth,
   y = iy + radiusY;
   z = iz + radiusZ;

   index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
   index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;

   if( x < 0 || y >= endY || z >= endZ ) {
   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      shared[ index ] = fetchBoundary( x, y, z );
   }
   else {
@@ -313,9 +313,9 @@ convolution3D( Index kernelWidth,
   y = iy + radiusY;
   z = iz + radiusZ;

   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;

   if( x >= endX || y >= endY || z >= endZ ) {
   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      shared[ index ] = fetchBoundary( x, y, z );
   }
   else {
@@ -324,15 +324,18 @@ convolution3D( Index kernelWidth,

   __syncthreads();

   if( ix >= endX || iy >= endY || iz >= endZ )
      return;

   Real result = 0;

   for( Index k = 0; k <= radiusZ; k++ ) {
   for( Index k = 0; k < kernelDepth; k++ ) {
      Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x;

      for( Index j = 0; j <= radiusY; j++ ) {
      for( Index j = 0; j < kernelHeight; j++ ) {
         Index xAlign = ( j + threadIdx.y ) * blockDim.x;

         for( Index i = 0; i <= radiusX; i++ ) {
         for( Index i = 0; i < kernelWidth; i++ ) {
            Index index = i + threadIdx.x + xAlign + xyAlign;

            result = convolve( result, shared[ index ], fetchKernel( i, j, k ) );
@@ -486,7 +489,7 @@ public:
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
      configuration.gridSize.y =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
      configuration.gridSize.y =
      configuration.gridSize.z =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
   }

+44 −43
Original line number Diff line number Diff line
@@ -49,9 +49,6 @@ convolution1D( Index kernelWidth,
{
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   if( ix >= endX )
      return;

   Index kernelOffset = 2 * kernelWidth;

   Real* data = TNL::Cuda::getSharedMemory< Real >();
@@ -62,7 +59,7 @@ convolution1D( Index kernelWidth,
   // Left
   Index lhs = ix - radius;

   if( lhs < 0 ) {
   if( lhs < 0 || lhs >= endX ) {
      data[ threadIdx.x ] = fetchBoundary( lhs );
   }
   else {
@@ -72,7 +69,7 @@ convolution1D( Index kernelWidth,
   // Right
   Index rhs = ix + radius;

   if( rhs >= endX ) {
   if( rhs < 0 || rhs >= endX ) {
      data[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs );
   }
   else {
@@ -83,6 +80,9 @@ convolution1D( Index kernelWidth,

   __syncthreads();

   if( ix >= endX )
      return;

   Real result = 0;

   #pragma unroll
@@ -117,9 +117,6 @@ convolution2D( Index kernelWidth,
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   if( ix >= endX || iy >= endY )
      return;

   Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 );

   Real* data = TNL::Cuda::getSharedMemory< Real >();
@@ -138,7 +135,7 @@ convolution2D( Index kernelWidth,

   kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y );

   if( x < 0 || y < 0 ) {
   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
      data[ index ] = fetchBoundary( x, y );
   }
   else {
@@ -149,9 +146,9 @@ convolution2D( Index kernelWidth,
   x = ix + radiusX;
   y = iy - radiusY;

   index = radiusX + threadIdx.x + threadIdx.y * blockDim.x;
   index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x;

   if( x >= endX || y < 0 ) {
   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
      data[ index ] = fetchBoundary( x, y );
   }
   else {
@@ -162,9 +159,9 @@ convolution2D( Index kernelWidth,
   x = ix - radiusX;
   y = iy + radiusY;

   index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x;
   index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x;

   if( x < 0 || y >= endY ) {
   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
      data[ index ] = fetchBoundary( x, y );
   }
   else {
@@ -175,9 +172,9 @@ convolution2D( Index kernelWidth,
   x = ix + radiusX;
   y = iy + radiusY;

   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x;
   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x;

   if( x >= endX || y >= endY ) {
   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
      data[ index ] = fetchBoundary( x, y );
   }
   else {
@@ -186,15 +183,18 @@ convolution2D( Index kernelWidth,

   __syncthreads();

    if( ix >= endX || iy >= endY )
      return;

   Real result = 0;

   #pragma unroll
   for( Index j = 0; j <= radiusY; j++ ) {
   for( Index j = 0; j < kernelHeight; j++ ) {
      Index elementAlign = ( j + threadIdx.y ) * blockDim.x;
      Index kernelAlign = j * blockDim.x;

   #pragma unroll
      for( Index i = 0; i <= radiusX; i++ ) {
      for( Index i = 0; i < kernelWidth; i++ ) {
         Index elementIndex = i + threadIdx.x + elementAlign;
         Index kernelIndex = i + kernelAlign;

@@ -230,9 +230,6 @@ convolution3D( Index kernelWidth,
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   if( ix >= endX || iy >= endY || iz >= endZ )
      return;

   Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ) * ( 2 * kernelDepth - 1 );

   Real* data = TNL::Cuda::getSharedMemory< Real >();
@@ -249,11 +246,11 @@ convolution3D( Index kernelWidth,
   y = iy - radiusY;
   z = iz - radiusZ;

   index = threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
   index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;

   kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z );

   if( x < 0 || y < 0 || z < 0 ) {
   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      data[ index ] = fetchBoundary( x, y, z );
   }
   else {
@@ -265,9 +262,9 @@ convolution3D( Index kernelWidth,
   y = iy - radiusY;
   z = iz - radiusZ;

   index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
   index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;

   if( x >= endX || y < 0 || z < 0 ) {
   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      data[ index ] = fetchBoundary( x, y, z );
   }
   else {
@@ -279,9 +276,9 @@ convolution3D( Index kernelWidth,
   y = iy + radiusY;
   z = iz - radiusZ;

   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;

   if( x < 0 || y >= endY || z < 0 ) {
   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      data[ index ] = fetchBoundary( x, y, z );
   }
   else {
@@ -293,9 +290,9 @@ convolution3D( Index kernelWidth,
   y = iy - radiusY;
   z = iz + radiusZ;

   index = threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
   index = threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;

   if( x < 0 || y < 0 || z >= endZ ) {
   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      data[ index ] = fetchBoundary( x, y, z );
   }
   else {
@@ -307,9 +304,9 @@ convolution3D( Index kernelWidth,
   y = iy + radiusY;
   z = iz - radiusZ;

   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;

   if( x >= endX || y >= endY || z < 0 ) {
   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ) {
      data[ index ] = fetchBoundary( x, y, z );
   }
   else {
@@ -321,9 +318,9 @@ convolution3D( Index kernelWidth,
   y = iy - radiusY;
   z = iz + radiusZ;

   index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
   index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;

   if( x >= endX || y < 0 || z >= endZ ) {
   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      data[ index ] = fetchBoundary( x, y, z );
   }
   else {
@@ -335,9 +332,9 @@ convolution3D( Index kernelWidth,
   y = iy + radiusY;
   z = iz + radiusZ;

   index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
   index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;

   if( x < 0 || y >= endY || z >= endZ ) {
   if(x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      data[ index ] = fetchBoundary( x, y, z );
   }
   else {
@@ -349,9 +346,9 @@ convolution3D( Index kernelWidth,
   y = iy + radiusY;
   z = iz + radiusZ;

   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;

   if( x >= endX || y >= endY || z >= endZ ) {
   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      data[ index ] = fetchBoundary( x, y, z );
   }
   else {
@@ -360,21 +357,25 @@ convolution3D( Index kernelWidth,

   __syncthreads();

   if( ix >= endX || iy >= endY || iz >= endZ )
      return;

   Real result = 0;

   for( Index k = 0; k <= radiusZ; k++ ) {
   #pragma unroll
   for( Index k = 0; k < kernelDepth; k++ ) {
      Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x;
      Index xyKernelAlign = k * blockDim.x * blockDim.y;

      for( Index j = 0; j <= radiusY; j++ ) {
   #pragma unroll
      for( Index j = 0; j < kernelHeight; j++ ) {
         Index xAlign = ( j + threadIdx.y ) * blockDim.x;
         Index xKernelAlign = j * blockDim.x;

         for( Index i = 0; i <= radiusX; i++ ) {
   #pragma unroll
         for( Index i = 0; i < kernelWidth; i++ ) {
            Index elementIndex = i + threadIdx.x + xAlign + xyAlign;
            Index kernelIndex = i + xKernelAlign + xyKernelAlign;

            result = convolve( result, data[ index ], kernel[ kernelIndex ] );
            result = convolve( result, data[ elementIndex ], kernel[ kernelIndex ] );
         }
      }
   }
@@ -531,7 +532,7 @@ public:
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
      configuration.gridSize.y =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
      configuration.gridSize.y =
      configuration.gridSize.z =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
   }

+16 −12
Original line number Diff line number Diff line
@@ -34,9 +34,6 @@ convolution1D( Index kernelWidth,
{
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   if( ix >= endX )
      return;

   Real* shared = TNL::Cuda::getSharedMemory< Real >();

   Index radius = kernelWidth >> 1;
@@ -46,8 +43,12 @@ convolution1D( Index kernelWidth,

   __syncthreads();

   if( ix >= endX )
      return;

   Real result = 0;

   #pragma unroll
   for( Index i = -radius; i <= radius; i++ ) {
      Index elementIndex = i + ix;
      Index kernelIndex = i + radius;
@@ -85,9 +86,6 @@ convolution2D( Index kernelWidth,
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   if( ix >= endX || iy >= endY )
      return;

   Real* shared = TNL::Cuda::getSharedMemory< Real >();

   Index radiusY = kernelHeight >> 1;
@@ -100,12 +98,17 @@ convolution2D( Index kernelWidth,

   __syncthreads();

   if( ix >= endX || iy >= endY )
      return;

   Real result = 0;

   #pragma unroll
   for( Index j = -radiusY; j <= radiusY; j++ ) {
      Index elementIndexY = j + iy;
      Index kernelIndexY = j + radiusY;

      #pragma unroll
      for( Index i = -radiusX; i <= radiusX; i++ ) {
         Index elementIndexX = i + ix;
         Index kernelIndexX = i + radiusX;
@@ -149,9 +152,6 @@ convolution3D( Index kernelWidth,
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   if( ix >= endX || iy >= endY || iz >= endZ )
      return;

   Real* shared = TNL::Cuda::getSharedMemory< Real >();

   Index radiusZ = kernelDepth >> 1;
@@ -160,23 +160,27 @@ convolution3D( Index kernelWidth,

   Index threadIndex = threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z;

   printf( "%d\n", threadIndex );

   // The size of the block is equal to the kernel size
   shared[ threadIndex ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z );

   __syncthreads();

   if( ix >= endX || iy >= endY || iz >= endZ )
      return;

   Real result = 0;

   #pragma unroll
   for( Index k = -radiusZ; k <= radiusZ; k++ ) {
      Index elementIndexZ = k + iz;
      Index kernelIndexZ = k + radiusZ;

      #pragma unroll
      for( Index j = -radiusY; j <= radiusY; j++ ) {
         Index elementIndexY = j + iy;
         Index kernelIndexY = j + radiusY;

         #pragma unroll
         for( Index i = -radiusX; i <= radiusX; i++ ) {
            Index elementIndexX = i + ix;
            Index kernelIndexX = i + radiusX;
@@ -338,7 +342,7 @@ public:
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
      configuration.gridSize.y =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
      configuration.gridSize.y =
      configuration.gridSize.z =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
   }

+7 −2
Original line number Diff line number Diff line
@@ -61,6 +61,11 @@ public:

      DummyTask<int, float, Dimension, Device>::exec(dimension, kernelSize, inputView, resultView, kernelView);

      TNL::Containers::Array< float, TNL::Devices::Host, int > host(result);

      for (int i = 0; i < host.getSize(); i++)
         TNL_ASSERT_EQ(host[i], kernelElementsCount, "Dummy task always sets volume of kernel");

      std::cout << "Everything is fine" << std::endl;
   }

@@ -72,12 +77,12 @@ public:
      config.addDelimiter( "Grid dimension settings:" );

      for( int i = 0; i < Dimension; i++ )
         config.addEntry< int >( dimensionIds[ i ], dimensionIds[ i ], 512 );
         config.addEntry< int >( dimensionIds[ i ], dimensionIds[ i ], 64 );

      config.addDelimiter( "Kernel settings:" );

      for( int i = 0; i < Dimension; i++ )
         config.addEntry< int >( kernelSizeIds[ i ], kernelSizeIds[ i ] + " (odd) :", 11 );
         config.addEntry< int >( kernelSizeIds[ i ], kernelSizeIds[ i ] + " (odd) :", 9 );

      return config;
   }
Loading