Loading src/Benchmarks/Convolution/CMakeLists.txt +9 −0 Original line number Diff line number Diff line Loading @@ -10,6 +10,7 @@ if (${BUILD_CUDA}) FILE(READ ${TEMPLATE} TEMPLATE_CONTENT) STRING(REGEX REPLACE "DIMENSION_VALUE" ${DIMENSION} TEMPLATE_CONTENT "${TEMPLATE_CONTENT}") STRING(REGEX REPLACE "KERNEL_VALUE" "\"../${KERNEL_HEADER}\"" TEMPLATE_CONTENT "${TEMPLATE_CONTENT}") FILE(WRITE ${SOURCE_FILE} "${TEMPLATE_CONTENT}") Loading @@ -29,3 +30,11 @@ GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/naiv GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/naive.h") GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/naive.h") GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/naive.h") GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/sharedKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/sharedKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/sharedKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedKernel.h") src/Benchmarks/Convolution/kernels/naive.h +15 −7 Original line number Diff line number Diff line #pragma once #ifdef HAVE_CUDA #include <TNL/Devices/Cuda.h> #include <TNL/Containers/StaticVector.h> #include <TNL/Cuda/LaunchHelpers.h> /** * There are several pitfalls with such configuration. * * 1. At first we don't use shared memory * 2. At second we don't control block size, so we may launch extremely small kernels or otherwise we can launch extremely large kernels. 
*/ template< int Dimension, typename Device > struct Convolution; Loading @@ -15,7 +24,7 @@ public: template< typename Index > using Vector = TNL::Containers::StaticVector< 1, Index >; template< typename Index > template< typename Index, typename Real > static void setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { Loading Loading @@ -76,13 +85,12 @@ public: template< typename Index > using Vector = TNL::Containers::StaticVector< 2, Index >; template< typename Index > template< typename Index, typename Real > static void setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { configuration.dynamicSharedMemorySize = 0; // TODO: - Benchmark the best value configuration.blockSize.x = kernelSize.x(); configuration.blockSize.y = kernelSize.y(); Loading Loading @@ -151,7 +159,7 @@ public: template< typename Index > using Vector = TNL::Containers::StaticVector< 3, Index >; template< typename Index > template< typename Index, typename Real > static void setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { Loading src/Benchmarks/Convolution/kernels/sharedKernel.h 0 → 100644 +274 −0 Original line number Diff line number Diff line #pragma once #ifdef HAVE_CUDA #include <TNL/Devices/Cuda.h> #include <TNL/Containers/StaticVector.h> #include <TNL/Cuda/LaunchHelpers.h> #include <TNL/Cuda/SharedMemory.h> template< int Dimension, typename Device > struct Convolution; template<> struct Convolution< 1, TNL::Devices::Cuda > { public: template< typename Index > using Vector = TNL::Containers::StaticVector< 1, Index >; template< typename Index, typename Real > static void setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { Index kernelElementCount = 1; for( Index i = 0; i < kernelSize.getSize(); 
i++ ) kernelElementCount *= kernelSize[ i ]; configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); configuration.blockSize.x = kernelSize.x(); configuration.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); } }; template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > __global__ static void convolution1D( Index kernelWidth, Index endX, FetchData fetchData, FetchBoundary fetchBoundary, FetchKernel fetchKernel, Convolve convolve, Store store ) { Real* shared = TNL::Cuda::getSharedMemory< Real >(); Index radius = kernelWidth >> 1; Index ix = threadIdx.x + blockIdx.x * blockDim.x; // The size of the block is equal to the kernel size shared[ threadIdx.x ] = fetchKernel( threadIdx.x ); __syncthreads(); Real result = 0; for( Index i = -radius; i <= radius; i++ ) { Index elementIndex = i + ix; Index kernelIndex = i + radius; if( elementIndex < 0 || elementIndex >= endX ) { result = convolve( result, fetchBoundary( elementIndex ), shared[ kernelIndex ] ); } else { result = convolve( result, fetchData( elementIndex ), shared[ kernelIndex ] ); } } store( ix, result ); } template<> struct Convolution< 2, TNL::Devices::Cuda > { public: template< typename Index > using Vector = TNL::Containers::StaticVector< 2, Index >; template< typename Index, typename Real > static void setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { Index kernelElementCount = 1; for( Index i = 0; i < kernelSize.getSize(); i++ ) kernelElementCount *= kernelSize[ i ]; configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); configuration.blockSize.x = kernelSize.x(); configuration.blockSize.y = kernelSize.y(); configuration.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), 
configuration.blockSize.x ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); } }; template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > __global__ static void convolution2D( Index kernelWidth, Index kernelHeight, Index endX, Index endY, FetchData fetchData, FetchBoundary fetchBoundary, FetchKernel fetchKernel, Convolve convolve, Store store ) { Real* shared = TNL::Cuda::getSharedMemory< Real >(); Index radiusY = kernelHeight >> 1; Index radiusX = kernelWidth >> 1; Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; Index threadIndex = threadIdx.x + blockDim.x * threadIdx.y; // The size of the block is equal to the kernel size shared[ threadIndex ] = fetchKernel( threadIdx.x, threadIdx.y ); __syncthreads(); Real result = 0; for( Index j = -radiusY; j <= radiusY; j++ ) { Index elementIndexY = j + iy; Index kernelIndexY = j + radiusY; for( Index i = -radiusX; i <= radiusX; i++ ) { Index elementIndexX = i + ix; Index kernelIndexX = i + radiusX; Index threadIndex = kernelIndexX + kernelWidth * kernelIndexY; if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY ) { result = convolve( result, fetchBoundary( elementIndexX, elementIndexY ), shared[ threadIndex ] ); } else { result = convolve( result, fetchData( elementIndexX, elementIndexY ), shared[ threadIndex ] ); } } } store( ix, iy, result ); } template<> struct Convolution< 3, TNL::Devices::Cuda > { public: template< typename Index > using Vector = TNL::Containers::StaticVector< 3, Index >; template< typename Index, typename Real > static void setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { Index kernelElementCount = 1; for( Index i = 0; i < kernelSize.getSize(); i++ ) 
kernelElementCount *= kernelSize[ i ]; configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); configuration.blockSize.x = kernelSize.x(); configuration.blockSize.y = kernelSize.y(); configuration.blockSize.z = kernelSize.z(); configuration.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); } }; template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > __global__ static void convolution3D( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ, FetchData fetchData, FetchBoundary fetchBoundary, FetchKernel fetchKernel, Convolve convolve, Store store ) { Real* shared = TNL::Cuda::getSharedMemory< Real >(); Index iz = threadIdx.z + blockIdx.z * blockDim.z; Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; Index radiusZ = kernelDepth >> 1; Index radiusY = kernelHeight >> 1; Index radiusX = kernelWidth >> 1; Index threadIndex = threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z; printf( "%d\n", threadIndex ); // The size of the block is equal to the kernel size shared[ threadIndex ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z ); __syncthreads(); Real result = 0; for( Index k = -radiusZ; k <= radiusZ; k++ ) { Index elementIndexZ = k + iz; Index kernelIndexZ = k + radiusZ; for( Index j = -radiusY; j <= radiusY; j++ ) { Index elementIndexY = j + iy; Index kernelIndexY = j + radiusY; for( Index i = -radiusX; i <= radiusX; i++ ) { Index elementIndexX = i + ix; Index kernelIndexX = i + 
radiusX; Index threadIndex = kernelIndexX + kernelWidth * kernelIndexY + kernelWidth * kernelHeight * kernelIndexZ; if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0 || elementIndexZ >= endZ ) { result = convolve( result, fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), shared[threadIndex] ); } else { result = convolve( result, fetchData( elementIndexX, elementIndexY, elementIndexZ ), shared[threadIndex] ); } } } } store( ix, iy, iz, result ); } #endif src/Benchmarks/Convolution/support/Launcher.h +11 −4 Original line number Diff line number Diff line Loading @@ -5,7 +5,14 @@ #include <TNL/Cuda/KernelLaunch.h> template< int Dimension, typename Device > struct Convolution; struct Convolution { template< typename Index > using Vector = TNL::Containers::StaticVector< 1, Index >; template< typename Index, typename Real > static void setup(TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize); }; template< int Dimension, typename Device > struct Launcher; Loading @@ -29,7 +36,7 @@ public: { TNL::Cuda::LaunchConfiguration launchConfig; ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize); ConvolutionKernel::setup<Index, Real>(launchConfig, dimensions, kernelSize); constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; Loading Loading @@ -65,7 +72,7 @@ public: { TNL::Cuda::LaunchConfiguration launchConfig; ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize); ConvolutionKernel::setup<Index, Real>(launchConfig, dimensions, kernelSize); constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; Loading Loading @@ -107,7 +114,7 @@ public: TNL::Cuda::LaunchConfiguration launchConfig; ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize); ConvolutionKernel::setup<Index, 
Real>(launchConfig, dimensions, kernelSize); constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; Loading src/Benchmarks/Convolution/templates/main_benchmark.h +4 −3 Original line number Diff line number Diff line #include "../kernels/naive.h" #define KERNEL KERNEL_VALUE #define DIMENSION DIMENSION_VALUE #include KERNEL_VALUE #include "../support/DummyBenchmark.h" #include <TNL/Config/parseCommandLine.h> #define DIMENSION DIMENSION_VALUE using TaskBenchmark = DummyBenchmark< DIMENSION, TNL::Devices::Cuda >; int main(int argc, char* argv[]) Loading Loading
src/Benchmarks/Convolution/CMakeLists.txt +9 −0 Original line number Diff line number Diff line Loading @@ -10,6 +10,7 @@ if (${BUILD_CUDA}) FILE(READ ${TEMPLATE} TEMPLATE_CONTENT) STRING(REGEX REPLACE "DIMENSION_VALUE" ${DIMENSION} TEMPLATE_CONTENT "${TEMPLATE_CONTENT}") STRING(REGEX REPLACE "KERNEL_VALUE" "\"../${KERNEL_HEADER}\"" TEMPLATE_CONTENT "${TEMPLATE_CONTENT}") FILE(WRITE ${SOURCE_FILE} "${TEMPLATE_CONTENT}") Loading @@ -29,3 +30,11 @@ GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/naiv GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/naive.h") GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/naive.h") GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/naive.h") GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/sharedKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/sharedKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/sharedKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedKernel.h")
src/Benchmarks/Convolution/kernels/naive.h +15 −7 Original line number Diff line number Diff line #pragma once #ifdef HAVE_CUDA #include <TNL/Devices/Cuda.h> #include <TNL/Containers/StaticVector.h> #include <TNL/Cuda/LaunchHelpers.h> /** * There are several pitfalls with such configuration. * * 1. At first we don't use shared memory * 2. At second we don't control block size, so we may launch extremely small kernels or otherwise we can launch extremely large kernels. */ template< int Dimension, typename Device > struct Convolution; Loading @@ -15,7 +24,7 @@ public: template< typename Index > using Vector = TNL::Containers::StaticVector< 1, Index >; template< typename Index > template< typename Index, typename Real > static void setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { Loading Loading @@ -76,13 +85,12 @@ public: template< typename Index > using Vector = TNL::Containers::StaticVector< 2, Index >; template< typename Index > template< typename Index, typename Real > static void setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { configuration.dynamicSharedMemorySize = 0; // TODO: - Benchmark the best value configuration.blockSize.x = kernelSize.x(); configuration.blockSize.y = kernelSize.y(); Loading Loading @@ -151,7 +159,7 @@ public: template< typename Index > using Vector = TNL::Containers::StaticVector< 3, Index >; template< typename Index > template< typename Index, typename Real > static void setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { Loading
// src/Benchmarks/Convolution/kernels/sharedKernel.h
//
// CUDA convolution kernels that stage the convolution filter coefficients in
// dynamic shared memory. setup() makes the block size equal to the filter
// size, so each thread of a block loads exactly one coefficient before the
// convolution loop runs.
#pragma once

#ifdef HAVE_CUDA

#include <TNL/Devices/Cuda.h>
#include <TNL/Containers/StaticVector.h>
#include <TNL/Cuda/LaunchHelpers.h>
#include <TNL/Cuda/SharedMemory.h>

template< int Dimension, typename Device >
struct Convolution;

template<>
struct Convolution< 1, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 1, Index >;

   // Prepares the launch configuration: dynamic shared memory sized to hold
   // every filter coefficient (as Real), block size equal to the filter
   // width, grid covering all of dimensions.x() (capped by the device limit).
   template< typename Index, typename Real >
   static void setup( TNL::Cuda::LaunchConfiguration& configuration,
                      const Vector< Index >& dimensions,
                      const Vector< Index >& kernelSize )
   {
      Index kernelElementCount = 1;
      for( Index i = 0; i < kernelSize.getSize(); i++ )
         kernelElementCount *= kernelSize[ i ];
      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );

      configuration.blockSize.x = kernelSize.x();
      configuration.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(),
                                           TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
   }
};

// 1D convolution: one thread per output element, filter cached in shared
// memory (blockDim.x == kernelWidth, so thread i loads coefficient i).
template< typename Index,
          typename Real,
          typename FetchData,
          typename FetchBoundary,
          typename FetchKernel,
          typename Convolve,
          typename Store >
__global__
static void
convolution1D( Index kernelWidth,
               Index endX,
               FetchData fetchData,
               FetchBoundary fetchBoundary,
               FetchKernel fetchKernel,
               Convolve convolve,
               Store store )
{
   Real* shared = TNL::Cuda::getSharedMemory< Real >();
   // NOTE(review): the `elementIndex < 0` boundary test assumes Index is a
   // signed type — confirm against the instantiating code.
   Index radius = kernelWidth >> 1;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   // The size of the block is equal to the kernel size, so each thread loads
   // exactly one filter coefficient.
   shared[ threadIdx.x ] = fetchKernel( threadIdx.x );
   __syncthreads();

   Real result = 0;
   for( Index i = -radius; i <= radius; i++ ) {
      Index elementIndex = i + ix;
      Index kernelIndex = i + radius;
      if( elementIndex < 0 || elementIndex >= endX ) {
         result = convolve( result, fetchBoundary( elementIndex ), shared[ kernelIndex ] );
      }
      else {
         result = convolve( result, fetchData( elementIndex ), shared[ kernelIndex ] );
      }
   }
   // NOTE(review): when endX is not a multiple of blockDim.x, threads with
   // ix >= endX still call store( ix, ... ) — confirm that Store guards
   // out-of-range indices.
   store( ix, result );
}

template<>
struct Convolution< 2, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 2, Index >;

   // Same scheme as the 1D setup: shared memory holds the whole filter,
   // block size equals the filter size in each direction.
   // NOTE(review): getMaxGridSize() is used as the cap for the y dimension
   // too — verify it reflects the (smaller) device limit for gridDim.y.
   template< typename Index, typename Real >
   static void setup( TNL::Cuda::LaunchConfiguration& configuration,
                      const Vector< Index >& dimensions,
                      const Vector< Index >& kernelSize )
   {
      Index kernelElementCount = 1;
      for( Index i = 0; i < kernelSize.getSize(); i++ )
         kernelElementCount *= kernelSize[ i ];
      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );

      configuration.blockSize.x = kernelSize.x();
      configuration.blockSize.y = kernelSize.y();
      configuration.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(),
                                           TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
      configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(),
                                           TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
   }
};

// 2D convolution: one thread per output element; the block (sized exactly
// like the filter) cooperatively loads the filter into shared memory.
template< typename Index,
          typename Real,
          typename FetchData,
          typename FetchBoundary,
          typename FetchKernel,
          typename Convolve,
          typename Store >
__global__
static void
convolution2D( Index kernelWidth,
               Index kernelHeight,
               Index endX,
               Index endY,
               FetchData fetchData,
               FetchBoundary fetchBoundary,
               FetchKernel fetchKernel,
               Convolve convolve,
               Store store )
{
   Real* shared = TNL::Cuda::getSharedMemory< Real >();
   Index radiusY = kernelHeight >> 1;
   Index radiusX = kernelWidth >> 1;
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
   Index threadIndex = threadIdx.x + blockDim.x * threadIdx.y;

   // The size of the block is equal to the kernel size, so each thread loads
   // exactly one filter coefficient.
   shared[ threadIndex ] = fetchKernel( threadIdx.x, threadIdx.y );
   __syncthreads();

   Real result = 0;
   for( Index j = -radiusY; j <= radiusY; j++ ) {
      Index elementIndexY = j + iy;
      Index kernelIndexY = j + radiusY;
      for( Index i = -radiusX; i <= radiusX; i++ ) {
         Index elementIndexX = i + ix;
         Index kernelIndexX = i + radiusX;
         // Renamed from `threadIndex`, which shadowed the load index above
         // and suggested a per-thread quantity; this is a filter offset.
         Index sharedIndex = kernelIndexX + kernelWidth * kernelIndexY;
         if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY ) {
            result = convolve( result, fetchBoundary( elementIndexX, elementIndexY ), shared[ sharedIndex ] );
         }
         else {
            result = convolve( result, fetchData( elementIndexX, elementIndexY ), shared[ sharedIndex ] );
         }
      }
   }
   store( ix, iy, result );
}

template<>
struct Convolution< 3, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 3, Index >;

   // Same scheme as the 1D/2D setups, extended to three dimensions.
   template< typename Index, typename Real >
   static void setup( TNL::Cuda::LaunchConfiguration& configuration,
                      const Vector< Index >& dimensions,
                      const Vector< Index >& kernelSize )
   {
      Index kernelElementCount = 1;
      for( Index i = 0; i < kernelSize.getSize(); i++ )
         kernelElementCount *= kernelSize[ i ];
      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );

      configuration.blockSize.x = kernelSize.x();
      configuration.blockSize.y = kernelSize.y();
      configuration.blockSize.z = kernelSize.z();
      configuration.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(),
                                           TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
      configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(),
                                           TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
      // BUGFIX: this assignment previously wrote gridSize.y a second time,
      // leaving gridSize.z at its default so the z extent was never covered.
      configuration.gridSize.z = TNL::min( TNL::Cuda::getMaxGridSize(),
                                           TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
   }
};

// 3D convolution: one thread per output element; the block (sized exactly
// like the filter) cooperatively loads the filter into shared memory.
template< typename Index,
          typename Real,
          typename FetchData,
          typename FetchBoundary,
          typename FetchKernel,
          typename Convolve,
          typename Store >
__global__
static void
convolution3D( Index kernelWidth,
               Index kernelHeight,
               Index kernelDepth,
               Index endX,
               Index endY,
               Index endZ,
               FetchData fetchData,
               FetchBoundary fetchBoundary,
               FetchKernel fetchKernel,
               Convolve convolve,
               Store store )
{
   Real* shared = TNL::Cuda::getSharedMemory< Real >();
   Index iz = threadIdx.z + blockIdx.z * blockDim.z;
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
   Index radiusZ = kernelDepth >> 1;
   Index radiusY = kernelHeight >> 1;
   Index radiusX = kernelWidth >> 1;
   Index threadIndex = threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z;
   // BUGFIX: removed leftover debug `printf( "%d\n", threadIndex )` — it ran
   // in every thread, flooding stdout and invalidating benchmark timings.

   // The size of the block is equal to the kernel size, so each thread loads
   // exactly one filter coefficient.
   shared[ threadIndex ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z );
   __syncthreads();

   Real result = 0;
   for( Index k = -radiusZ; k <= radiusZ; k++ ) {
      Index elementIndexZ = k + iz;
      Index kernelIndexZ = k + radiusZ;
      for( Index j = -radiusY; j <= radiusY; j++ ) {
         Index elementIndexY = j + iy;
         Index kernelIndexY = j + radiusY;
         for( Index i = -radiusX; i <= radiusX; i++ ) {
            Index elementIndexX = i + ix;
            Index kernelIndexX = i + radiusX;
            // Renamed from `threadIndex`, which shadowed the load index
            // above; this is an offset into the cached filter.
            Index sharedIndex = kernelIndexX + kernelWidth * kernelIndexY + kernelWidth * kernelHeight * kernelIndexZ;
            if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY
                || elementIndexZ < 0 || elementIndexZ >= endZ )
            {
               result = convolve( result, fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), shared[ sharedIndex ] );
            }
            else {
               result = convolve( result, fetchData( elementIndexX, elementIndexY, elementIndexZ ), shared[ sharedIndex ] );
            }
         }
      }
   }
   store( ix, iy, iz, result );
}

#endif
src/Benchmarks/Convolution/support/Launcher.h +11 −4 Original line number Diff line number Diff line Loading @@ -5,7 +5,14 @@ #include <TNL/Cuda/KernelLaunch.h> template< int Dimension, typename Device > struct Convolution; struct Convolution { template< typename Index > using Vector = TNL::Containers::StaticVector< 1, Index >; template< typename Index, typename Real > static void setup(TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize); }; template< int Dimension, typename Device > struct Launcher; Loading @@ -29,7 +36,7 @@ public: { TNL::Cuda::LaunchConfiguration launchConfig; ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize); ConvolutionKernel::setup<Index, Real>(launchConfig, dimensions, kernelSize); constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; Loading Loading @@ -65,7 +72,7 @@ public: { TNL::Cuda::LaunchConfiguration launchConfig; ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize); ConvolutionKernel::setup<Index, Real>(launchConfig, dimensions, kernelSize); constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; Loading Loading @@ -107,7 +114,7 @@ public: TNL::Cuda::LaunchConfiguration launchConfig; ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize); ConvolutionKernel::setup<Index, Real>(launchConfig, dimensions, kernelSize); constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; Loading
src/Benchmarks/Convolution/templates/main_benchmark.h +4 −3 Original line number Diff line number Diff line #include "../kernels/naive.h" #define KERNEL KERNEL_VALUE #define DIMENSION DIMENSION_VALUE #include KERNEL_VALUE #include "../support/DummyBenchmark.h" #include <TNL/Config/parseCommandLine.h> #define DIMENSION DIMENSION_VALUE using TaskBenchmark = DummyBenchmark< DIMENSION, TNL::Devices::Cuda >; int main(int argc, char* argv[]) Loading