Commit 231c69bf authored by Yury Hayeu's avatar Yury Hayeu
Browse files

Add image convolution solver

parent 12827c2d
Loading
Loading
Loading
Loading
+9 −0
Original line number Diff line number Diff line
@@ -17,6 +17,10 @@ if (${BUILD_CUDA})
   SET(EXECUTABLE_NAME "${PREFIX}_${DIMENSION}_${MODULE_NAME}_${TEMPLATE_NAME}")

   CUDA_ADD_EXECUTABLE(${EXECUTABLE_NAME} ${SOURCE_FILE})

   if( PNG_FOUND )
      target_link_libraries( ${EXECUTABLE_NAME} ${PNG_LIBRARIES} )
   endif()
else()
   MESSAGE(WARNING "Convolutions are not supported on CPU")
endif()
@@ -54,3 +58,8 @@ GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/shar
GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h")

GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/naive.h")
GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedData.h")
GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedKernel.h")
GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedDataAndKernel.h")
+75 −66
Original line number Diff line number Diff line
@@ -33,27 +33,27 @@ convolution1D( Index kernelWidth,
{
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   Real* shared = TNL::Cuda::getSharedMemory< Real >();
   Real* data = TNL::Cuda::getSharedMemory< Real >();
   Index radius = kernelWidth >> 1;

   // Left
   Index lhs = ix - radius;

   if( lhs < 0 || lhs >= endX ) {
      shared[ threadIdx.x ] = fetchBoundary( lhs );
      data[ threadIdx.x ] = fetchBoundary( lhs );
   }
   else {
      shared[ threadIdx.x ] = fetchData( lhs );
      data[ threadIdx.x ] = fetchData( lhs );
   }

   // Right
   Index rhs = ix + radius;

   if( rhs < 0 || rhs >= endX ) {
      shared[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs );
      data[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs );
   }
   else {
      shared[ threadIdx.x + blockDim.x ] = fetchData( rhs );
      data[ threadIdx.x + blockDim.x ] = fetchData( rhs );
   }

   __syncthreads();
@@ -67,7 +67,7 @@ convolution1D( Index kernelWidth,
   for( Index i = 0; i < kernelWidth; i++ ) {
      Index elementIndex = i + threadIdx.x;

      result = convolve( result, shared[ elementIndex ], fetchKernel( i ) );
      result = convolve( result, data[ elementIndex ], fetchKernel( i ) );
   }

   store( ix, result );
@@ -92,69 +92,68 @@ convolution2D( Index kernelWidth,
               Convolve convolve,
               Store store )
{
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
   Real* data = TNL::Cuda::getSharedMemory< Real >();

   Real* shared = TNL::Cuda::getSharedMemory< Real >();
   const Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   const Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   Index radiusY = kernelHeight >> 1;
   Index radiusX = kernelWidth >> 1;
   const Index radiusY = kernelHeight >> 1;
   const Index radiusX = kernelWidth >> 1;

   Index x, y, index;
   const Index dataBlockWidth = 2 * kernelWidth - 1;
   const Index dataBlockHeight = 2 * kernelHeight - 1;

   const Index dataBlockRadiusX = dataBlockWidth >> 1;
   const Index dataBlockRadiusY = dataBlockHeight >> 1;

   Index kernelHorizontalPadding = kernelWidth == 1 ? 0 : kernelWidth;
   Index kernelVerticalPadding = kernelHeight == 1 ? 0 : kernelHeight;
   Index x, y, index;

   // Top Left
   x = ix - radiusX;
   y = iy - radiusY;

   index = threadIdx.x + threadIdx.y * blockDim.x;
   index = threadIdx.x + threadIdx.y * dataBlockWidth;

   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
      shared[ index ] = fetchBoundary( x, y );
      data[ index ] = fetchBoundary( x, y );
   }
   else {
      shared[ index ] = fetchData( x, y );
      data[ index ] = fetchData( x, y );
   }

   // Top right
   x = ix + radiusX;
   y = iy - radiusY;

   index = kernelHorizontalPadding + threadIdx.x + threadIdx.y * blockDim.x;
   index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth;

   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
      shared[ index ] = fetchBoundary( x, y );
      data[ index ] = fetchBoundary( x, y );
   }
   else {
      shared[ index ] = fetchData( x, y );
      data[ index ] = fetchData( x, y );
   }

   // Bottom Left
   x = ix - radiusX;
   y = iy + radiusY;

   index = threadIdx.x + ( kernelVerticalPadding + threadIdx.y ) * blockDim.x;
   index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth;

   if(x < 0 || y < 0 || x >= endX || y >= endY ) {
      shared[ index ] = fetchBoundary( x, y );
      data[ index ] = fetchBoundary( x, y );
   }
   else {
      shared[ index ] = fetchData( x, y );
      data[ index ] = fetchData( x, y );
   }

   // Bottom Right
   x = ix + radiusX;
   y = iy + radiusY;

   index = kernelHorizontalPadding + threadIdx.x + ( kernelVerticalPadding + threadIdx.y ) * blockDim.x;
   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth;

   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
      shared[ index ] = fetchBoundary( x, y );
      data[ index ] = fetchBoundary( x, y );
   }
   else {
      shared[ index ] = fetchData( x, y );
      data[ index ] = fetchData( x, y );
   }

   __syncthreads();
@@ -165,12 +164,12 @@ convolution2D( Index kernelWidth,
   Real result = 0;

   for( Index j = 0; j < kernelHeight; j++ ) {
      Index align = ( j + threadIdx.y ) * blockDim.x;
      Index align = ( j + threadIdx.y ) * dataBlockWidth;

      for( Index i = 0; i < kernelWidth; i++ ) {
         Index index = i + threadIdx.x + align;

         result = convolve( result, shared[ index ], fetchKernel( i, j ) );
         result = convolve( result, data[ index ], fetchKernel( i, j ) );
      }
   }

@@ -198,15 +197,25 @@ convolution3D( Index kernelWidth,
               Convolve convolve,
               Store store )
{
   Index iz = threadIdx.z + blockIdx.z * blockDim.z;
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
   Real* data = TNL::Cuda::getSharedMemory< Real >();

   const Index ix = threadIdx.x + blockIdx.x * blockDim.x;
   const Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   const Index iz = threadIdx.z + blockIdx.z * blockDim.z;

   const Index radiusX = kernelWidth >> 1;
   const Index radiusY = kernelHeight >> 1;
   const Index radiusZ = kernelDepth >> 1;

   const Index dataBlockWidth = 2 * kernelWidth - 1;
   const Index dataBlockHeight = 2 * kernelHeight - 1;
   const Index dataBlockDepth = 2 * kernelDepth - 1;

   Real* shared = TNL::Cuda::getSharedMemory< Real >();
   const Index dataBlockXYVolume = dataBlockWidth * dataBlockHeight;

   Index radiusZ = kernelDepth >> 1;
   Index radiusY = kernelHeight >> 1;
   Index radiusX = kernelWidth >> 1;
   const Index dataBlockRadiusX = dataBlockWidth >> 1;
   const Index dataBlockRadiusY = dataBlockHeight >> 1;
   const Index dataBlockRadiusZ = dataBlockDepth >> 1;

   Index x, y, z, index;

@@ -215,13 +224,13 @@ convolution3D( Index kernelWidth,
   y = iy - radiusY;
   z = iz - radiusZ;

   index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
   index = threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume;

   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      shared[ index ] = fetchBoundary( x, y, z );
      data[ index ] = fetchBoundary( x, y, z );
   }
   else {
      shared[ index ] = fetchData( x, y, z );
      data[ index ] = fetchData( x, y, z );
   }

   // Z: 0 Y: 0 X: 1
@@ -229,13 +238,13 @@ convolution3D( Index kernelWidth,
   y = iy - radiusY;
   z = iz - radiusZ;

   index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
   index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume;

   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      shared[ index ] = fetchBoundary( x, y, z );
      data[ index ] = fetchBoundary( x, y, z );
   }
   else {
      shared[ index ] = fetchData( x, y, z );
      data[ index ] = fetchData( x, y, z );
   }

   // Z: 0 Y: 1 X: 0
@@ -243,13 +252,13 @@ convolution3D( Index kernelWidth,
   y = iy + radiusY;
   z = iz - radiusZ;

   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume;

   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      shared[ index ] = fetchBoundary( x, y, z );
      data[ index ] = fetchBoundary( x, y, z );
   }
   else {
      shared[ index ] = fetchData( x, y, z );
      data[ index ] = fetchData( x, y, z );
   }

   // Z: 1 Y: 0 X: 0
@@ -257,13 +266,13 @@ convolution3D( Index kernelWidth,
   y = iy - radiusY;
   z = iz + radiusZ;

   index = threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;
   index = threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume;

   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      shared[ index ] = fetchBoundary( x, y, z );
      data[ index ] = fetchBoundary( x, y, z );
   }
   else {
      shared[ index ] = fetchData( x, y, z );
      data[ index ] = fetchData( x, y, z );
   }

   // Z: 0 Y: 1 X: 1
@@ -271,13 +280,13 @@ convolution3D( Index kernelWidth,
   y = iy + radiusY;
   z = iz - radiusZ;

   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume;

   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      shared[ index ] = fetchBoundary( x, y, z );
      data[ index ] = fetchBoundary( x, y, z );
   }
   else {
      shared[ index ] = fetchData( x, y, z );
      data[ index ] = fetchData( x, y, z );
   }

   // Z: 1 Y: 0 X: 1
@@ -285,13 +294,13 @@ convolution3D( Index kernelWidth,
   y = iy - radiusY;
   z = iz + radiusZ;

   index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;
   index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume;

   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      shared[ index ] = fetchBoundary( x, y, z );
      data[ index ] = fetchBoundary( x, y, z );
   }
   else {
      shared[ index ] = fetchData( x, y, z );
      data[ index ] = fetchData( x, y, z );
   }

   // Z: 1 Y: 1 X: 0
@@ -299,13 +308,13 @@ convolution3D( Index kernelWidth,
   y = iy + radiusY;
   z = iz + radiusZ;

   index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;
   index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume;

   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      shared[ index ] = fetchBoundary( x, y, z );
      data[ index ] = fetchBoundary( x, y, z );
   }
   else {
      shared[ index ] = fetchData( x, y, z );
      data[ index ] = fetchData( x, y, z );
   }

   // Z: 1 Y: 1 X: 1
@@ -313,13 +322,13 @@ convolution3D( Index kernelWidth,
   y = iy + radiusY;
   z = iz + radiusZ;

   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;
   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume;

   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      shared[ index ] = fetchBoundary( x, y, z );
      data[ index ] = fetchBoundary( x, y, z );
   }
   else {
      shared[ index ] = fetchData( x, y, z );
      data[ index ] = fetchData( x, y, z );
   }

   __syncthreads();
@@ -330,15 +339,15 @@ convolution3D( Index kernelWidth,
   Real result = 0;

   for( Index k = 0; k < kernelDepth; k++ ) {
      Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x;
      Index xyAlign = ( k + threadIdx.z ) * dataBlockXYVolume;

      for( Index j = 0; j < kernelHeight; j++ ) {
         Index xAlign = ( j + threadIdx.y ) * blockDim.x;
         Index xAlign = ( j + threadIdx.y ) * dataBlockWidth;

         for( Index i = 0; i < kernelWidth; i++ ) {
            Index index = i + threadIdx.x + xAlign + xyAlign;

            result = convolve( result, shared[ index ], fetchKernel( i, j, k ) );
            result = convolve( result, data[ index ], fetchKernel( i, j, k ) );
         }
      }
   }
+58 −42
Original line number Diff line number Diff line
@@ -8,7 +8,7 @@
   #include <TNL/Cuda/SharedMemory.h>

/**
 * This method stores kernel and data in the shared memory to reduce amount of loads.
 * This method stores kernel and data in the data memory to reduce amount of loads.
 *
 * We can calculate the size of shared memory needed the next way:
 * 1. We need to store in shared memory:
@@ -49,7 +49,7 @@ convolution1D( Index kernelWidth,
{
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   Index kernelOffset = 2 * kernelWidth;
   Index kernelOffset = 2 * kernelWidth - 1;

   Real* data = TNL::Cuda::getSharedMemory< Real >();
   Real* kernel = data + kernelOffset;
@@ -114,26 +114,29 @@ convolution2D( Index kernelWidth,
               Convolve convolve,
               Store store )
{
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
   const Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   const Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   const Index radiusY = kernelHeight >> 1;
   const Index radiusX = kernelWidth >> 1;

   const Index dataBlockWidth = 2 * kernelWidth - 1;
   const Index dataBlockHeight = 2 * kernelHeight - 1;

   const Index dataBlockRadiusX = dataBlockWidth >> 1;
   const Index dataBlockRadiusY = dataBlockHeight >> 1;

   Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 );
   const Index kernelOffset = dataBlockWidth * dataBlockHeight;

   Real* data = TNL::Cuda::getSharedMemory< Real >();
   Real* kernel = data + kernelOffset;

   Index radiusY = kernelHeight >> 1;
   Index radiusX = kernelWidth >> 1;

   Index x, y, index;

   // Top Left
   x = ix - radiusX;
   y = iy - radiusY;

   index = threadIdx.x + threadIdx.y * blockDim.x;

   kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y );
   index = threadIdx.x + threadIdx.y * dataBlockWidth;

   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
      data[ index ] = fetchBoundary( x, y );
@@ -145,8 +148,7 @@ convolution2D( Index kernelWidth,
   // Top right
   x = ix + radiusX;
   y = iy - radiusY;

   index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x;
   index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth;

   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
      data[ index ] = fetchBoundary( x, y );
@@ -158,8 +160,7 @@ convolution2D( Index kernelWidth,
   // Bottom Left
   x = ix - radiusX;
   y = iy + radiusY;

   index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x;
   index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth;

   if(x < 0 || y < 0 || x >= endX || y >= endY ) {
      data[ index ] = fetchBoundary( x, y );
@@ -171,8 +172,7 @@ convolution2D( Index kernelWidth,
   // Bottom Right
   x = ix + radiusX;
   y = iy + radiusY;

   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x;
   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth;

   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
      data[ index ] = fetchBoundary( x, y );
@@ -181,6 +181,10 @@ convolution2D( Index kernelWidth,
      data[ index ] = fetchData( x, y );
   }

   index = threadIdx.x + threadIdx.y * blockDim.x;

   kernel[index] = fetchKernel( threadIdx.x, threadIdx.y );

   __syncthreads();

    if( ix >= endX || iy >= endY )
@@ -190,7 +194,7 @@ convolution2D( Index kernelWidth,

   #pragma unroll
   for( Index j = 0; j < kernelHeight; j++ ) {
      Index elementAlign = ( j + threadIdx.y ) * blockDim.x;
      Index elementAlign = ( j + threadIdx.y ) * dataBlockWidth;
      Index kernelAlign = j * blockDim.x;

   #pragma unroll
@@ -226,19 +230,29 @@ convolution3D( Index kernelWidth,
               Convolve convolve,
               Store store )
{
   Index iz = threadIdx.z + blockIdx.z * blockDim.z;
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
   const Index ix = threadIdx.x + blockIdx.x * blockDim.x;
   const Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   const Index iz = threadIdx.z + blockIdx.z * blockDim.z;

   const Index radiusX = kernelWidth >> 1;
   const Index radiusY = kernelHeight >> 1;
   const Index radiusZ = kernelDepth >> 1;

   const Index dataBlockWidth = 2 * kernelWidth - 1;
   const Index dataBlockHeight = 2 * kernelHeight - 1;
   const Index dataBlockDepth = 2 * kernelDepth - 1;

   Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ) * ( 2 * kernelDepth - 1 );
   const Index dataBlockXYVolume = dataBlockWidth * dataBlockHeight;

   const Index dataBlockRadiusX = dataBlockWidth >> 1;
   const Index dataBlockRadiusY = dataBlockHeight >> 1;
   const Index dataBlockRadiusZ = dataBlockDepth >> 1;

   const Index kernelOffset = dataBlockWidth * dataBlockHeight * dataBlockDepth;

   Real* data = TNL::Cuda::getSharedMemory< Real >();
   Real* kernel = data + kernelOffset;

   Index radiusZ = kernelDepth >> 1;
   Index radiusY = kernelHeight >> 1;
   Index radiusX = kernelWidth >> 1;

   Index x, y, z, index;

   // Z: 0 Y: 0 X: 0
@@ -246,9 +260,7 @@ convolution3D( Index kernelWidth,
   y = iy - radiusY;
   z = iz - radiusZ;

   index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;

   kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z );
   index = threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume;

   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      data[ index ] = fetchBoundary( x, y, z );
@@ -262,7 +274,7 @@ convolution3D( Index kernelWidth,
   y = iy - radiusY;
   z = iz - radiusZ;

   index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
   index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume;

   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      data[ index ] = fetchBoundary( x, y, z );
@@ -276,7 +288,7 @@ convolution3D( Index kernelWidth,
   y = iy + radiusY;
   z = iz - radiusZ;

   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume;

   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      data[ index ] = fetchBoundary( x, y, z );
@@ -290,7 +302,7 @@ convolution3D( Index kernelWidth,
   y = iy - radiusY;
   z = iz + radiusZ;

   index = threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;
   index = threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume;

   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      data[ index ] = fetchBoundary( x, y, z );
@@ -304,7 +316,7 @@ convolution3D( Index kernelWidth,
   y = iy + radiusY;
   z = iz - radiusZ;

   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume;

   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      data[ index ] = fetchBoundary( x, y, z );
@@ -318,7 +330,7 @@ convolution3D( Index kernelWidth,
   y = iy - radiusY;
   z = iz + radiusZ;

   index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;
   index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume;

   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      data[ index ] = fetchBoundary( x, y, z );
@@ -332,7 +344,7 @@ convolution3D( Index kernelWidth,
   y = iy + radiusY;
   z = iz + radiusZ;

   index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;
   index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume;

   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      data[ index ] = fetchBoundary( x, y, z );
@@ -346,7 +358,7 @@ convolution3D( Index kernelWidth,
   y = iy + radiusY;
   z = iz + radiusZ;

   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;
   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume;

   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
      data[ index ] = fetchBoundary( x, y, z );
@@ -355,6 +367,10 @@ convolution3D( Index kernelWidth,
      data[ index ] = fetchData( x, y, z );
   }

   index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;

   kernel[index] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z );

   __syncthreads();

   if( ix >= endX || iy >= endY || iz >= endZ )
@@ -364,11 +380,11 @@ convolution3D( Index kernelWidth,

   #pragma unroll
   for( Index k = 0; k < kernelDepth; k++ ) {
      Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x;
      Index xyAlign = ( k + threadIdx.z ) * dataBlockXYVolume;
      Index xyKernelAlign = k * blockDim.x * blockDim.y;
   #pragma unroll
      for( Index j = 0; j < kernelHeight; j++ ) {
         Index xAlign = ( j + threadIdx.y ) * blockDim.x;
         Index xAlign = ( j + threadIdx.y ) * dataBlockWidth;
         Index xKernelAlign = j * blockDim.x;
   #pragma unroll
         for( Index i = 0; i < kernelWidth; i++ ) {
+3 −3
Original line number Diff line number Diff line
@@ -16,7 +16,7 @@ class DummyBenchmark : public Benchmark< Dimension, Device >
{
public:
   using Vector = TNL::Containers::StaticVector< Dimension, int >;
   using DataStore = TNL::Containers::Array< float, Device, int >;
   using DataStore = TNL::Containers::Vector< float, Device, int >;
   using Base = Benchmark< Dimension, Device >;
   using TNLBenchmark = typename Base::TNLBenchmark;

@@ -103,9 +103,9 @@ public:
      result = 1;
      kernel = 1;

      auto inputView = input.getView();
      auto inputView = input.getConstView();
      auto kernelView = kernel.getConstView();
      auto resultView = result.getView();
      auto kernelView = kernel.getView();

      auto measure = [ & ]()
      {
+3 −3
Original line number Diff line number Diff line
@@ -13,7 +13,7 @@ class DummySolver : public Solver< Dimension, Device >
public:
   using Base = Solver< Dimension, Device >;
   using Vector = TNL::Containers::StaticVector< Dimension, int >;
   using DataStore = TNL::Containers::Array< float, Device, int >;
   using DataStore = TNL::Containers::Vector< float, Device, int >;

   virtual void
   start( const TNL::Config::ParameterContainer& parameters ) const override
@@ -55,9 +55,9 @@ public:
      result = 1;
      kernel = 1;

      auto inputView = input.getView();
      auto inputView = input.getConstView();
      auto kernelView = kernel.getConstView();
      auto resultView = result.getView();
      auto kernelView = kernel.getView();

      DummyTask<int, float, Dimension, Device>::exec(dimension, kernelSize, inputView, resultView, kernelView);

Loading