Loading src/Benchmarks/Convolution/CMakeLists.txt +9 −0 Original line number Diff line number Diff line Loading @@ -17,6 +17,10 @@ if (${BUILD_CUDA}) SET(EXECUTABLE_NAME "${PREFIX}_${DIMENSION}_${MODULE_NAME}_${TEMPLATE_NAME}") CUDA_ADD_EXECUTABLE(${EXECUTABLE_NAME} ${SOURCE_FILE}) if( PNG_FOUND ) target_link_libraries( ${EXECUTABLE_NAME} ${PNG_LIBRARIES} ) endif() else() MESSAGE(WARNING "Convolutions are not supported on CPU") endif() Loading Loading @@ -54,3 +58,8 @@ GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/shar GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h") GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/naive.h") GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedData.h") GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedKernel.h") GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedDataAndKernel.h") src/Benchmarks/Convolution/kernels/sharedData.h +75 −66 Original line number Diff line number Diff line Loading @@ -33,27 +33,27 @@ convolution1D( Index kernelWidth, { Index ix = threadIdx.x + blockIdx.x * blockDim.x; Real* shared = TNL::Cuda::getSharedMemory< Real >(); Real* data = TNL::Cuda::getSharedMemory< Real >(); Index radius = kernelWidth >> 1; // Left Index lhs = ix - radius; if( lhs < 0 || lhs >= endX ) { shared[ threadIdx.x ] = fetchBoundary( lhs ); data[ threadIdx.x ] = fetchBoundary( lhs ); } else { shared[ threadIdx.x ] = fetchData( lhs ); data[ threadIdx.x ] = fetchData( lhs ); } // Right Index rhs = ix + radius; if( rhs < 0 || rhs >= endX ) { shared[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs ); data[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs ); } else { shared[ threadIdx.x + blockDim.x ] = fetchData( rhs ); data[ threadIdx.x + blockDim.x ] = fetchData( rhs ); } __syncthreads(); Loading @@ -67,7 +67,7 @@ convolution1D( Index kernelWidth, for( Index i = 0; i < kernelWidth; i++ ) { Index elementIndex = i + threadIdx.x; result = convolve( result, shared[ elementIndex ], fetchKernel( i ) ); result = convolve( result, data[ elementIndex ], fetchKernel( i ) ); } store( ix, result ); Loading @@ -92,69 +92,68 @@ convolution2D( Index kernelWidth, Convolve convolve, Store store ) { Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; Real* data = TNL::Cuda::getSharedMemory< Real >(); Real* shared = TNL::Cuda::getSharedMemory< Real >(); const Index iy = threadIdx.y + blockIdx.y * blockDim.y; const Index ix = threadIdx.x + blockIdx.x * blockDim.x; Index radiusY = kernelHeight >> 1; Index radiusX = kernelWidth >> 1; const Index radiusY = kernelHeight >> 1; const Index radiusX = kernelWidth >> 1; Index x, y, index; const Index dataBlockWidth = 2 * kernelWidth - 1; const Index dataBlockHeight = 2 * kernelHeight - 1; const Index dataBlockRadiusX = dataBlockWidth >> 1; const Index dataBlockRadiusY = dataBlockHeight >> 1; Index kernelHorizontalPadding = kernelWidth == 1 ? 0 : kernelWidth; Index kernelVerticalPadding = kernelHeight == 1 ? 0 : kernelHeight; Index x, y, index; // Top Left x = ix - radiusX; y = iy - radiusY; index = threadIdx.x + threadIdx.y * blockDim.x; index = threadIdx.x + threadIdx.y * dataBlockWidth; if( x < 0 || y < 0 || x >= endX || y >= endY ) { shared[ index ] = fetchBoundary( x, y ); data[ index ] = fetchBoundary( x, y ); } else { shared[ index ] = fetchData( x, y ); data[ index ] = fetchData( x, y ); } // Top right x = ix + radiusX; y = iy - radiusY; index = kernelHorizontalPadding + threadIdx.x + threadIdx.y * blockDim.x; index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth; if( x < 0 || y < 0 || x >= endX || y >= endY ) { shared[ index ] = fetchBoundary( x, y ); data[ index ] = fetchBoundary( x, y ); } else { shared[ index ] = fetchData( x, y ); data[ index ] = fetchData( x, y ); } // Bottom Left x = ix - radiusX; y = iy + radiusY; index = threadIdx.x + ( kernelVerticalPadding + threadIdx.y ) * blockDim.x; index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth; if(x < 0 || y < 0 || x >= endX || y >= endY ) { shared[ index ] = fetchBoundary( x, y ); data[ index ] = fetchBoundary( x, y ); } else { shared[ index ] = fetchData( x, y ); data[ index ] = fetchData( x, y ); } // Bottom Right x = ix + radiusX; y = iy + radiusY; index = kernelHorizontalPadding + threadIdx.x + ( kernelVerticalPadding + threadIdx.y ) * blockDim.x; index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth; if( x < 0 || y < 0 || x >= endX || y >= endY ) { shared[ index ] = fetchBoundary( x, y ); data[ index ] = fetchBoundary( x, y ); } else { shared[ index ] = fetchData( x, y ); data[ index ] = fetchData( x, y ); } __syncthreads(); Loading @@ -165,12 +164,12 @@ convolution2D( Index kernelWidth, Real result = 0; for( Index j = 0; j < kernelHeight; j++ ) { Index align = ( j + threadIdx.y ) * blockDim.x; Index align = ( j + threadIdx.y ) * dataBlockWidth; for( Index i = 0; i < kernelWidth; i++ ) { Index index = i + threadIdx.x + align; result = convolve( result, shared[ index ], fetchKernel( i, j ) ); result = convolve( result, data[ index ], fetchKernel( i, j ) ); } } Loading Loading @@ -198,15 +197,25 @@ convolution3D( Index kernelWidth, Convolve convolve, Store store ) { Index iz = threadIdx.z + blockIdx.z * blockDim.z; Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; Real* data = TNL::Cuda::getSharedMemory< Real >(); const Index ix = threadIdx.x + blockIdx.x * blockDim.x; const Index iy = threadIdx.y + blockIdx.y * blockDim.y; const Index iz = threadIdx.z + blockIdx.z * blockDim.z; const Index radiusX = kernelWidth >> 1; const Index radiusY = kernelHeight >> 1; const Index radiusZ = kernelDepth >> 1; const Index dataBlockWidth = 2 * kernelWidth - 1; const Index dataBlockHeight = 2 * kernelHeight - 1; const Index dataBlockDepth = 2 * kernelDepth - 1; Real* shared = TNL::Cuda::getSharedMemory< Real >(); const Index dataBlockXYVolume = dataBlockWidth * dataBlockHeight; Index radiusZ = kernelDepth >> 1; Index radiusY = kernelHeight >> 1; Index radiusX = kernelWidth >> 1; const Index dataBlockRadiusX = dataBlockWidth >> 1; const Index dataBlockRadiusY = dataBlockHeight >> 1; const Index dataBlockRadiusZ = dataBlockDepth >> 1; Index x, y, z, index; Loading @@ -215,13 +224,13 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz - radiusZ; index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; index = threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); data[ index ] = fetchBoundary( x, y, z ); } else { shared[ index ] = fetchData( x, y, z ); data[ index ] = fetchData( x, y, z ); } // Z: 0 Y: 0 X: 1 Loading @@ -229,13 +238,13 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz - radiusZ; index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); data[ index ] = fetchBoundary( x, y, z ); } else { shared[ index ] = fetchData( x, y, z ); data[ index ] = fetchData( x, y, z ); } // Z: 0 Y: 1 X: 0 Loading @@ -243,13 +252,13 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz - radiusZ; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); data[ index ] = fetchBoundary( x, y, z ); } else { shared[ index ] = fetchData( x, y, z ); data[ index ] = fetchData( x, y, z ); } // Z: 1 Y: 0 X: 0 Loading @@ -257,13 +266,13 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz + radiusZ; index = threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; index = threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); data[ index ] = fetchBoundary( x, y, z ); } else { shared[ index ] = fetchData( x, y, z ); data[ index ] = fetchData( x, y, z ); } // Z: 0 Y: 1 X: 1 Loading @@ -271,13 +280,13 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz - radiusZ; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); data[ index ] = fetchBoundary( x, y, z ); } else { shared[ index ] = fetchData( x, y, z ); data[ index ] = fetchData( x, y, z ); } // Z: 1 Y: 0 X: 1 Loading @@ -285,13 +294,13 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz + radiusZ; index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); data[ index ] = fetchBoundary( x, y, z ); } else { shared[ index ] = fetchData( x, y, z ); data[ index ] = fetchData( x, y, z ); } // Z: 1 Y: 1 X: 0 Loading @@ -299,13 +308,13 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz + radiusZ; index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); data[ index ] = fetchBoundary( x, y, z ); } else { shared[ index ] = fetchData( x, y, z ); data[ index ] = fetchData( x, y, z ); } // Z: 1 Y: 1 X: 1 Loading @@ -313,13 +322,13 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz + radiusZ; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); data[ index ] = fetchBoundary( x, y, z ); } else { shared[ index ] = fetchData( x, y, z ); data[ index ] = fetchData( x, y, z ); } __syncthreads(); Loading @@ -330,15 +339,15 @@ convolution3D( Index kernelWidth, Real result = 0; for( Index k = 0; k < kernelDepth; k++ ) { Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x; Index xyAlign = ( k + threadIdx.z ) * dataBlockXYVolume; for( Index j = 0; j < kernelHeight; j++ ) { Index xAlign = ( j + threadIdx.y ) * blockDim.x; Index xAlign = ( j + threadIdx.y ) * dataBlockWidth; for( Index i = 0; i < kernelWidth; i++ ) { Index index = i + threadIdx.x + xAlign + xyAlign; result = convolve( result, shared[ index ], fetchKernel( i, j, k ) ); result = convolve( result, data[ index ], fetchKernel( i, j, k ) ); } } } Loading src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h +58 −42 Original line number Diff line number Diff line Loading @@ -8,7 +8,7 @@ #include <TNL/Cuda/SharedMemory.h> /** * This method stores kernel and data in the shared memory to reduce amount of loads. * This method stores kernel and data in the data memory to reduce amount of loads. * * We can calculate the size of shared memory needed the next way: * 1. We need to store in shared memory: Loading Loading @@ -49,7 +49,7 @@ convolution1D( Index kernelWidth, { Index ix = threadIdx.x + blockIdx.x * blockDim.x; Index kernelOffset = 2 * kernelWidth; Index kernelOffset = 2 * kernelWidth - 1; Real* data = TNL::Cuda::getSharedMemory< Real >(); Real* kernel = data + kernelOffset; Loading Loading @@ -114,26 +114,29 @@ convolution2D( Index kernelWidth, Convolve convolve, Store store ) { Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; const Index iy = threadIdx.y + blockIdx.y * blockDim.y; const Index ix = threadIdx.x + blockIdx.x * blockDim.x; const Index radiusY = kernelHeight >> 1; const Index radiusX = kernelWidth >> 1; const Index dataBlockWidth = 2 * kernelWidth - 1; const Index dataBlockHeight = 2 * kernelHeight - 1; const Index dataBlockRadiusX = dataBlockWidth >> 1; const Index dataBlockRadiusY = dataBlockHeight >> 1; Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ); const Index kernelOffset = dataBlockWidth * dataBlockHeight; Real* data = TNL::Cuda::getSharedMemory< Real >(); Real* kernel = data + kernelOffset; Index radiusY = kernelHeight >> 1; Index radiusX = kernelWidth >> 1; Index x, y, index; // Top Left x = ix - radiusX; y = iy - radiusY; index = threadIdx.x + threadIdx.y * blockDim.x; kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y ); index = threadIdx.x + threadIdx.y * dataBlockWidth; if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); Loading @@ -145,8 +148,7 @@ convolution2D( Index kernelWidth, // Top right x = ix + radiusX; y = iy - radiusY; index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x; index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth; if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); Loading @@ -158,8 +160,7 @@ convolution2D( Index kernelWidth, // Bottom Left x = ix - radiusX; y = iy + radiusY; index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x; index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth; if(x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); Loading @@ -171,8 +172,7 @@ convolution2D( Index kernelWidth, // Bottom Right x = ix + radiusX; y = iy + radiusY; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x; index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth; if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); Loading @@ -181,6 +181,10 @@ convolution2D( Index kernelWidth, data[ index ] = fetchData( x, y ); } index = threadIdx.x + threadIdx.y * blockDim.x; kernel[index] = fetchKernel( threadIdx.x, threadIdx.y ); __syncthreads(); if( ix >= endX || iy >= endY ) Loading @@ -190,7 +194,7 @@ convolution2D( Index kernelWidth, #pragma unroll for( Index j = 0; j < kernelHeight; j++ ) { Index elementAlign = ( j + threadIdx.y ) * blockDim.x; Index elementAlign = ( j + threadIdx.y ) * dataBlockWidth; Index kernelAlign = j * blockDim.x; #pragma unroll Loading Loading @@ -226,19 +230,29 @@ convolution3D( Index kernelWidth, Convolve convolve, Store store ) { Index iz = threadIdx.z + blockIdx.z * blockDim.z; Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; const Index ix = threadIdx.x + blockIdx.x * blockDim.x; const Index iy = threadIdx.y + blockIdx.y * blockDim.y; const Index iz = threadIdx.z + blockIdx.z * blockDim.z; const Index radiusX = kernelWidth >> 1; const Index radiusY = kernelHeight >> 1; const Index radiusZ = kernelDepth >> 1; const Index dataBlockWidth = 2 * kernelWidth - 1; const Index dataBlockHeight = 2 * kernelHeight - 1; const Index dataBlockDepth = 2 * kernelDepth - 1; Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ) * ( 2 * kernelDepth - 1 ); const Index dataBlockXYVolume = dataBlockWidth * dataBlockHeight; const Index dataBlockRadiusX = dataBlockWidth >> 1; const Index dataBlockRadiusY = dataBlockHeight >> 1; const Index dataBlockRadiusZ = dataBlockDepth >> 1; const Index kernelOffset = dataBlockWidth * dataBlockHeight * dataBlockDepth; Real* data = TNL::Cuda::getSharedMemory< Real >(); Real* kernel = data + kernelOffset; Index radiusZ = kernelDepth >> 1; Index radiusY = kernelHeight >> 1; Index radiusX = kernelWidth >> 1; Index x, y, z, index; // Z: 0 Y: 0 X: 0 Loading @@ -246,9 +260,7 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz - radiusZ; index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z ); index = threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); Loading @@ -262,7 +274,7 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz - radiusZ; index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); Loading @@ -276,7 +288,7 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz - radiusZ; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); Loading @@ -290,7 +302,7 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz + radiusZ; index = threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; index = threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); Loading @@ -304,7 +316,7 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz - radiusZ; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); Loading @@ -318,7 +330,7 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz + radiusZ; index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); Loading @@ -332,7 +344,7 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz + radiusZ; index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); Loading @@ -346,7 +358,7 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz + radiusZ; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); Loading @@ -355,6 +367,10 @@ convolution3D( Index kernelWidth, data[ index ] = fetchData( x, y, z ); } index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; kernel[index] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z ); __syncthreads(); if( ix >= endX || iy >= endY || iz >= endZ ) Loading @@ -364,11 +380,11 @@ convolution3D( Index kernelWidth, #pragma unroll for( Index k = 0; k < kernelDepth; k++ ) { Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x; Index xyAlign = ( k + threadIdx.z ) * dataBlockXYVolume; Index xyKernelAlign = k * blockDim.x * blockDim.y; #pragma unroll for( Index j = 0; j < kernelHeight; j++ ) { Index xAlign = ( j + threadIdx.y ) * blockDim.x; Index xAlign = ( j + threadIdx.y ) * dataBlockWidth; Index xKernelAlign = j * blockDim.x; #pragma unroll for( Index i = 0; i < kernelWidth; i++ ) { Loading src/Benchmarks/Convolution/support/DummyBenchmark.h +3 −3 Original line number Diff line number Diff line Loading @@ -16,7 +16,7 @@ class DummyBenchmark : public Benchmark< Dimension, Device > { public: using Vector = TNL::Containers::StaticVector< Dimension, int >; using DataStore = TNL::Containers::Array< float, Device, int >; using DataStore = TNL::Containers::Vector< float, Device, int >; using Base = Benchmark< Dimension, Device >; using TNLBenchmark = typename Base::TNLBenchmark; Loading Loading @@ -103,9 +103,9 @@ public: result = 1; kernel = 1; auto inputView = input.getView(); auto inputView = input.getConstView(); auto kernelView = kernel.getConstView(); auto resultView = result.getView(); auto kernelView = kernel.getView(); auto measure = [ & ]() { Loading src/Benchmarks/Convolution/support/DummySolver.h +3 −3 Original line number Diff line number Diff line Loading @@ -13,7 +13,7 @@ class DummySolver : public Solver< Dimension, Device > public: using Base = Solver< Dimension, Device >; using Vector = TNL::Containers::StaticVector< Dimension, int >; using DataStore = TNL::Containers::Array< float, Device, int >; using DataStore = TNL::Containers::Vector< float, Device, int >; virtual void start( const TNL::Config::ParameterContainer& parameters ) const override Loading Loading @@ -55,9 +55,9 @@ public: result = 1; kernel = 1; auto inputView = input.getView(); auto inputView = input.getConstView(); auto kernelView = kernel.getConstView(); auto resultView = result.getView(); auto kernelView = kernel.getView(); DummyTask<int, float, Dimension, Device>::exec(dimension, kernelSize, inputView, resultView, kernelView); Loading Loading
src/Benchmarks/Convolution/CMakeLists.txt +9 −0 Original line number Diff line number Diff line Loading @@ -17,6 +17,10 @@ if (${BUILD_CUDA}) SET(EXECUTABLE_NAME "${PREFIX}_${DIMENSION}_${MODULE_NAME}_${TEMPLATE_NAME}") CUDA_ADD_EXECUTABLE(${EXECUTABLE_NAME} ${SOURCE_FILE}) if( PNG_FOUND ) target_link_libraries( ${EXECUTABLE_NAME} ${PNG_LIBRARIES} ) endif() else() MESSAGE(WARNING "Convolutions are not supported on CPU") endif() Loading Loading @@ -54,3 +58,8 @@ GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/shar GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h") GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/naive.h") GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedData.h") GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedKernel.h") GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedDataAndKernel.h")
src/Benchmarks/Convolution/kernels/sharedData.h +75 −66 Original line number Diff line number Diff line Loading @@ -33,27 +33,27 @@ convolution1D( Index kernelWidth, { Index ix = threadIdx.x + blockIdx.x * blockDim.x; Real* shared = TNL::Cuda::getSharedMemory< Real >(); Real* data = TNL::Cuda::getSharedMemory< Real >(); Index radius = kernelWidth >> 1; // Left Index lhs = ix - radius; if( lhs < 0 || lhs >= endX ) { shared[ threadIdx.x ] = fetchBoundary( lhs ); data[ threadIdx.x ] = fetchBoundary( lhs ); } else { shared[ threadIdx.x ] = fetchData( lhs ); data[ threadIdx.x ] = fetchData( lhs ); } // Right Index rhs = ix + radius; if( rhs < 0 || rhs >= endX ) { shared[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs ); data[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs ); } else { shared[ threadIdx.x + blockDim.x ] = fetchData( rhs ); data[ threadIdx.x + blockDim.x ] = fetchData( rhs ); } __syncthreads(); Loading @@ -67,7 +67,7 @@ convolution1D( Index kernelWidth, for( Index i = 0; i < kernelWidth; i++ ) { Index elementIndex = i + threadIdx.x; result = convolve( result, shared[ elementIndex ], fetchKernel( i ) ); result = convolve( result, data[ elementIndex ], fetchKernel( i ) ); } store( ix, result ); Loading @@ -92,69 +92,68 @@ convolution2D( Index kernelWidth, Convolve convolve, Store store ) { Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; Real* data = TNL::Cuda::getSharedMemory< Real >(); Real* shared = TNL::Cuda::getSharedMemory< Real >(); const Index iy = threadIdx.y + blockIdx.y * blockDim.y; const Index ix = threadIdx.x + blockIdx.x * blockDim.x; Index radiusY = kernelHeight >> 1; Index radiusX = kernelWidth >> 1; const Index radiusY = kernelHeight >> 1; const Index radiusX = kernelWidth >> 1; Index x, y, index; const Index dataBlockWidth = 2 * kernelWidth - 1; const Index dataBlockHeight = 2 * kernelHeight - 1; const Index dataBlockRadiusX = dataBlockWidth >> 1; const Index dataBlockRadiusY = dataBlockHeight >> 1; Index kernelHorizontalPadding = kernelWidth == 1 ? 0 : kernelWidth; Index kernelVerticalPadding = kernelHeight == 1 ? 0 : kernelHeight; Index x, y, index; // Top Left x = ix - radiusX; y = iy - radiusY; index = threadIdx.x + threadIdx.y * blockDim.x; index = threadIdx.x + threadIdx.y * dataBlockWidth; if( x < 0 || y < 0 || x >= endX || y >= endY ) { shared[ index ] = fetchBoundary( x, y ); data[ index ] = fetchBoundary( x, y ); } else { shared[ index ] = fetchData( x, y ); data[ index ] = fetchData( x, y ); } // Top right x = ix + radiusX; y = iy - radiusY; index = kernelHorizontalPadding + threadIdx.x + threadIdx.y * blockDim.x; index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth; if( x < 0 || y < 0 || x >= endX || y >= endY ) { shared[ index ] = fetchBoundary( x, y ); data[ index ] = fetchBoundary( x, y ); } else { shared[ index ] = fetchData( x, y ); data[ index ] = fetchData( x, y ); } // Bottom Left x = ix - radiusX; y = iy + radiusY; index = threadIdx.x + ( kernelVerticalPadding + threadIdx.y ) * blockDim.x; index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth; if(x < 0 || y < 0 || x >= endX || y >= endY ) { shared[ index ] = fetchBoundary( x, y ); data[ index ] = fetchBoundary( x, y ); } else { shared[ index ] = fetchData( x, y ); data[ index ] = fetchData( x, y ); } // Bottom Right x = ix + radiusX; y = iy + radiusY; index = kernelHorizontalPadding + threadIdx.x + ( kernelVerticalPadding + threadIdx.y ) * blockDim.x; index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth; if( x < 0 || y < 0 || x >= endX || y >= endY ) { shared[ index ] = fetchBoundary( x, y ); data[ index ] = fetchBoundary( x, y ); } else { shared[ index ] = fetchData( x, y ); data[ index ] = fetchData( x, y ); } __syncthreads(); Loading @@ -165,12 +164,12 @@ convolution2D( Index kernelWidth, Real result = 0; for( Index j = 0; j < kernelHeight; j++ ) { Index align = ( j + threadIdx.y ) * blockDim.x; Index align = ( j + threadIdx.y ) * dataBlockWidth; for( Index i = 0; i < kernelWidth; i++ ) { Index index = i + threadIdx.x + align; result = convolve( result, shared[ index ], fetchKernel( i, j ) ); result = convolve( result, data[ index ], fetchKernel( i, j ) ); } } Loading Loading @@ -198,15 +197,25 @@ convolution3D( Index kernelWidth, Convolve convolve, Store store ) { Index iz = threadIdx.z + blockIdx.z * blockDim.z; Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; Real* data = TNL::Cuda::getSharedMemory< Real >(); const Index ix = threadIdx.x + blockIdx.x * blockDim.x; const Index iy = threadIdx.y + blockIdx.y * blockDim.y; const Index iz = threadIdx.z + blockIdx.z * blockDim.z; const Index radiusX = kernelWidth >> 1; const Index radiusY = kernelHeight >> 1; const Index radiusZ = kernelDepth >> 1; const Index dataBlockWidth = 2 * kernelWidth - 1; const Index dataBlockHeight = 2 * kernelHeight - 1; const Index dataBlockDepth = 2 * kernelDepth - 1; Real* shared = TNL::Cuda::getSharedMemory< Real >(); const Index dataBlockXYVolume = dataBlockWidth * dataBlockHeight; Index radiusZ = kernelDepth >> 1; Index radiusY = kernelHeight >> 1; Index radiusX = kernelWidth >> 1; const Index dataBlockRadiusX = dataBlockWidth >> 1; const Index dataBlockRadiusY = dataBlockHeight >> 1; const Index dataBlockRadiusZ = dataBlockDepth >> 1; Index x, y, z, index; Loading @@ -215,13 +224,13 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz - radiusZ; index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; index = threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); data[ index ] = fetchBoundary( x, y, z ); } else { shared[ index ] = fetchData( x, y, z ); data[ index ] = fetchData( x, y, z ); } // Z: 0 Y: 0 X: 1 Loading @@ -229,13 +238,13 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz - radiusZ; index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); data[ index ] = fetchBoundary( x, y, z ); } else { shared[ index ] = fetchData( x, y, z ); data[ index ] = fetchData( x, y, z ); } // Z: 0 Y: 1 X: 0 Loading @@ -243,13 +252,13 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz - radiusZ; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); data[ index ] = fetchBoundary( x, y, z ); } else { shared[ index ] = fetchData( x, y, z ); data[ index ] = fetchData( x, y, z ); } // Z: 1 Y: 0 X: 0 Loading @@ -257,13 +266,13 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz + radiusZ; index = threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; index = threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); data[ index ] = fetchBoundary( x, y, z ); } else { shared[ index ] = fetchData( x, y, z ); data[ index ] = fetchData( x, y, z ); } // Z: 0 Y: 1 X: 1 Loading @@ -271,13 +280,13 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz - radiusZ; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); data[ index ] = fetchBoundary( x, y, z ); } else { shared[ index ] = fetchData( x, y, z ); data[ index ] = fetchData( x, y, z ); } // Z: 1 Y: 0 X: 1 Loading @@ -285,13 +294,13 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz + radiusZ; index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); data[ index ] = fetchBoundary( x, y, z ); } else { shared[ index ] = fetchData( x, y, z ); data[ index ] = fetchData( x, y, z ); } // Z: 1 Y: 1 X: 0 Loading @@ -299,13 +308,13 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz + radiusZ; index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); data[ index ] = fetchBoundary( x, y, z ); } else { shared[ index ] = fetchData( x, y, z ); data[ index ] = fetchData( x, y, z ); } // Z: 1 Y: 1 X: 1 Loading @@ -313,13 +322,13 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz + radiusZ; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); data[ index ] = fetchBoundary( x, y, z ); } else { shared[ index ] = fetchData( x, y, z ); data[ index ] = fetchData( x, y, z ); } __syncthreads(); Loading @@ -330,15 +339,15 @@ convolution3D( Index kernelWidth, Real result = 0; for( Index k = 0; k < kernelDepth; k++ ) { Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x; Index xyAlign = ( k + threadIdx.z ) * dataBlockXYVolume; for( Index j = 0; j < kernelHeight; j++ ) { Index xAlign = ( j + threadIdx.y ) * blockDim.x; Index xAlign = ( j + threadIdx.y ) * dataBlockWidth; for( Index i = 0; i < kernelWidth; i++ ) { Index index = i + threadIdx.x + xAlign + xyAlign; result = convolve( result, shared[ index ], fetchKernel( i, j, k ) ); result = convolve( result, data[ index ], fetchKernel( i, j, k ) ); } } } Loading
src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h +58 −42 Original line number Diff line number Diff line Loading @@ -8,7 +8,7 @@ #include <TNL/Cuda/SharedMemory.h> /** * This method stores kernel and data in the shared memory to reduce amount of loads. * This method stores kernel and data in the data memory to reduce amount of loads. * * We can calculate the size of shared memory needed the next way: * 1. We need to store in shared memory: Loading Loading @@ -49,7 +49,7 @@ convolution1D( Index kernelWidth, { Index ix = threadIdx.x + blockIdx.x * blockDim.x; Index kernelOffset = 2 * kernelWidth; Index kernelOffset = 2 * kernelWidth - 1; Real* data = TNL::Cuda::getSharedMemory< Real >(); Real* kernel = data + kernelOffset; Loading Loading @@ -114,26 +114,29 @@ convolution2D( Index kernelWidth, Convolve convolve, Store store ) { Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; const Index iy = threadIdx.y + blockIdx.y * blockDim.y; const Index ix = threadIdx.x + blockIdx.x * blockDim.x; const Index radiusY = kernelHeight >> 1; const Index radiusX = kernelWidth >> 1; const Index dataBlockWidth = 2 * kernelWidth - 1; const Index dataBlockHeight = 2 * kernelHeight - 1; const Index dataBlockRadiusX = dataBlockWidth >> 1; const Index dataBlockRadiusY = dataBlockHeight >> 1; Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ); const Index kernelOffset = dataBlockWidth * dataBlockHeight; Real* data = TNL::Cuda::getSharedMemory< Real >(); Real* kernel = data + kernelOffset; Index radiusY = kernelHeight >> 1; Index radiusX = kernelWidth >> 1; Index x, y, index; // Top Left x = ix - radiusX; y = iy - radiusY; index = threadIdx.x + threadIdx.y * blockDim.x; kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y ); index = threadIdx.x + threadIdx.y * dataBlockWidth; if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); Loading @@ -145,8 +148,7 @@ convolution2D( Index kernelWidth, // Top right x = ix + radiusX; y = iy - radiusY; index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x; index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth; if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); Loading @@ -158,8 +160,7 @@ convolution2D( Index kernelWidth, // Bottom Left x = ix - radiusX; y = iy + radiusY; index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x; index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth; if(x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); Loading @@ -171,8 +172,7 @@ convolution2D( Index kernelWidth, // Bottom Right x = ix + radiusX; y = iy + radiusY; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x; index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth; if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); Loading @@ -181,6 +181,10 @@ convolution2D( Index kernelWidth, data[ index ] = fetchData( x, y ); } index = threadIdx.x + threadIdx.y * blockDim.x; kernel[index] = fetchKernel( threadIdx.x, threadIdx.y ); __syncthreads(); if( ix >= endX || iy >= endY ) Loading @@ -190,7 +194,7 @@ convolution2D( Index kernelWidth, #pragma unroll for( Index j = 0; j < kernelHeight; j++ ) { Index elementAlign = ( j + threadIdx.y ) * blockDim.x; Index elementAlign = ( j + threadIdx.y ) * dataBlockWidth; Index kernelAlign = j * blockDim.x; #pragma unroll Loading Loading @@ -226,19 +230,29 @@ convolution3D( Index kernelWidth, Convolve convolve, Store store ) { Index iz = threadIdx.z + blockIdx.z * blockDim.z; Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; const Index ix = threadIdx.x + blockIdx.x * blockDim.x; const Index iy = threadIdx.y + blockIdx.y * blockDim.y; const Index iz = threadIdx.z + blockIdx.z * blockDim.z; const Index radiusX = kernelWidth >> 1; const Index radiusY = kernelHeight >> 1; const Index radiusZ = kernelDepth >> 1; const Index dataBlockWidth = 2 * kernelWidth - 1; const Index dataBlockHeight = 2 * kernelHeight - 1; const Index dataBlockDepth = 2 * kernelDepth - 1; Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ) * ( 2 * kernelDepth - 1 ); const Index dataBlockXYVolume = dataBlockWidth * dataBlockHeight; const Index dataBlockRadiusX = dataBlockWidth >> 1; const Index dataBlockRadiusY = dataBlockHeight >> 1; const Index dataBlockRadiusZ = dataBlockDepth >> 1; const Index kernelOffset = dataBlockWidth * dataBlockHeight * dataBlockDepth; Real* data = TNL::Cuda::getSharedMemory< Real >(); Real* kernel = data + kernelOffset; Index radiusZ = kernelDepth >> 1; Index radiusY = kernelHeight >> 1; Index radiusX = kernelWidth >> 1; Index x, y, z, index; // Z: 0 Y: 0 X: 0 Loading @@ -246,9 +260,7 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz - radiusZ; index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z ); index = threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); Loading @@ -262,7 +274,7 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz - radiusZ; index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); Loading @@ -276,7 +288,7 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz - radiusZ; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); Loading @@ -290,7 +302,7 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz + radiusZ; index = threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; index = threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); Loading @@ -304,7 +316,7 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz - radiusZ; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); Loading @@ -318,7 +330,7 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz + radiusZ; index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); Loading @@ -332,7 +344,7 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz + radiusZ; index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); Loading @@ -346,7 +358,7 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz + radiusZ; index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); Loading @@ -355,6 +367,10 @@ convolution3D( Index kernelWidth, data[ index ] = fetchData( x, y, z ); } index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; kernel[index] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z ); __syncthreads(); if( ix >= endX || iy >= endY || iz >= endZ ) Loading @@ -364,11 +380,11 @@ convolution3D( Index kernelWidth, #pragma unroll for( Index k = 0; k < kernelDepth; k++ ) { Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x; Index xyAlign = ( k + threadIdx.z ) * dataBlockXYVolume; Index xyKernelAlign = k * blockDim.x * blockDim.y; #pragma unroll for( Index j = 0; j < kernelHeight; j++ ) { Index xAlign = ( j + threadIdx.y ) * blockDim.x; Index xAlign = ( j + threadIdx.y ) * dataBlockWidth; Index xKernelAlign = j * blockDim.x; #pragma unroll for( Index i = 0; i < kernelWidth; i++ ) { Loading
src/Benchmarks/Convolution/support/DummyBenchmark.h +3 −3 Original line number Diff line number Diff line Loading @@ -16,7 +16,7 @@ class DummyBenchmark : public Benchmark< Dimension, Device > { public: using Vector = TNL::Containers::StaticVector< Dimension, int >; using DataStore = TNL::Containers::Array< float, Device, int >; using DataStore = TNL::Containers::Vector< float, Device, int >; using Base = Benchmark< Dimension, Device >; using TNLBenchmark = typename Base::TNLBenchmark; Loading Loading @@ -103,9 +103,9 @@ public: result = 1; kernel = 1; auto inputView = input.getView(); auto inputView = input.getConstView(); auto kernelView = kernel.getConstView(); auto resultView = result.getView(); auto kernelView = kernel.getView(); auto measure = [ & ]() { Loading
src/Benchmarks/Convolution/support/DummySolver.h +3 −3 Original line number Diff line number Diff line Loading @@ -13,7 +13,7 @@ class DummySolver : public Solver< Dimension, Device > public: using Base = Solver< Dimension, Device >; using Vector = TNL::Containers::StaticVector< Dimension, int >; using DataStore = TNL::Containers::Array< float, Device, int >; using DataStore = TNL::Containers::Vector< float, Device, int >; virtual void start( const TNL::Config::ParameterContainer& parameters ) const override Loading Loading @@ -55,9 +55,9 @@ public: result = 1; kernel = 1; auto inputView = input.getView(); auto inputView = input.getConstView(); auto kernelView = kernel.getConstView(); auto resultView = result.getView(); auto kernelView = kernel.getView(); DummyTask<int, float, Dimension, Device>::exec(dimension, kernelSize, inputView, resultView, kernelView); Loading