Loading src/Benchmarks/Convolution/kernels/naive.h +64 −21 Original line number Diff line number Diff line Loading @@ -2,6 +2,7 @@ #ifdef HAVE_CUDA #include <TNL/Devices/Cuda.h> #include <TNL/Containers/StaticVector.h> #include <TNL/Cuda/LaunchHelpers.h> template< int Dimension, typename Device > Loading @@ -12,10 +13,18 @@ struct Convolution< 1, TNL::Devices::Cuda > { public: template< typename Index > static size_t getDynamicSharedMemorySize( Index kernelWidth, Index endX ) using Vector = TNL::Containers::StaticVector< 1, Index >; template< typename Index > static void setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { return 0; configuration.dynamicSharedMemorySize = 0; // TODO: - Benchmark the best value configuration.blockSize.x = kernelSize.x(); configuration.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); } }; Loading Loading @@ -65,10 +74,22 @@ struct Convolution< 2, TNL::Devices::Cuda > { public: template< typename Index > static size_t getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index endX, Index endY ) using Vector = TNL::Containers::StaticVector< 2, Index >; template< typename Index > static void setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { return 0; configuration.dynamicSharedMemorySize = 0; // TODO: - Benchmark the best value configuration.blockSize.x = kernelSize.x(); configuration.blockSize.y = kernelSize.y(); configuration.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); } }; Loading Loading @@ -111,7 +132,8 @@ convolution2D( Index kernelWidth, Index kernelIndexX = i + radiusX; if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY ) { result = convolve( result, fetchBoundary( elementIndexX, elementIndexY ), fetchKernel ( kernelIndexX, kernelIndexY ) ); result = convolve( result, fetchBoundary( elementIndexX, elementIndexY ), fetchKernel( kernelIndexX, kernelIndexY ) ); } else { result = convolve( result, fetchData( elementIndexX, elementIndexY ), fetchKernel( kernelIndexX, kernelIndexY ) ); Loading @@ -127,10 +149,25 @@ struct Convolution< 3, TNL::Devices::Cuda > { public: template< typename Index > static size_t getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ ) using Vector = TNL::Containers::StaticVector< 3, Index >; template< typename Index > static void setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { return 0; configuration.dynamicSharedMemorySize = 0; // TODO: - Benchmark the best value configuration.blockSize.x = kernelSize.x(); configuration.blockSize.y = kernelSize.y(); configuration.blockSize.z = kernelSize.z(); configuration.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); } }; Loading Loading @@ -180,11 +217,17 @@ convolution3D( Index kernelWidth, Index elementIndexX = i + ix; Index kernelIndexX = i + radiusX; if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0 || elementIndexZ >= endZ ) { result = convolve( result, fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) ); if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0 || elementIndexZ >= endZ ) { result = convolve( result, fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) ); } else { result = convolve( result, fetchData( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) ); result = convolve( result, fetchData( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) ); } } } Loading src/Benchmarks/Convolution/support/DummyBenchmark.h +3 −2 Original line number Diff line number Diff line Loading @@ -32,13 +32,14 @@ public: start[ i ] = parameters.getParameter< int >( minDimensionIds[ i ] ); end[ i ] = parameters.getParameter< int >( maxDimensionIds[ i ] ); minKernelSize[ i ] = parameters.getParameter< int >( minKernelSizeIds[ i ] ); maxKernelSizeIds[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] ); maxKernelSize[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] ); TNL_ASSERT_GT( start[ i ], 1, "Start dimension must be positive integer" ); TNL_ASSERT_GT( end[ i ], start[ i ], "End dimension must be greater than start dimension" ); TNL_ASSERT_GE( minKernelSize[ i ], 1, "Minimal kernel size must be a positive number" ); TNL_ASSERT_EQ( minKernelSize[ i ] % 2, 1, "Minimal kernel size must be odd" ); TNL_ASSERT_GT( maxKernelSize[ i ], minKernelSize[ i ], "End dimension must be greater than start dimension" ); TNL_ASSERT_GT( end[ i ], start[ i ], "End kernel size must be greater than start kernel size" ); } Loading Loading @@ -85,7 +86,7 @@ public: for( size_t i = 0; i < currentDimension.getSize() - 1; i++ ) { if( currentDimension[ i ] >= maxDimension[ i ] ) { currentDimension[ i ] = minDimension[ i ]; currentDimension[ i ] = maxDimension[ i ]; currentDimension[ i + 1 ] *= dimensionStep; } } Loading src/Benchmarks/Convolution/support/Launcher.h +3 −83 Original line number Diff line number Diff line Loading @@ -29,20 +29,7 @@ public: { TNL::Cuda::LaunchConfiguration launchConfig; launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >( kernelSize.x(), dimensions.x() ); // TODO: - Benchmark the best value launchConfig.blockSize.x = 256; launchConfig.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), launchConfig.blockSize.x ) ); if( (std::size_t) launchConfig.blockSize.x * launchConfig.gridSize.x < (std::size_t) dimensions.x() ) { const int desGridSize = 32 * TNL::Cuda::DeviceInfo::getCudaMultiprocessors( TNL::Cuda::DeviceInfo::getActiveDevice() ); launchConfig.gridSize.x = TNL::min( desGridSize, TNL::Cuda::getNumberOfBlocks( dimensions.x(), launchConfig.blockSize.x ) ); } ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize); constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; Loading Loading @@ -78,29 +65,7 @@ public: { TNL::Cuda::LaunchConfiguration launchConfig; launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >( kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y() ); const Index sizeX = dimensions.x(); const Index sizeY = dimensions.y(); if( sizeX >= sizeY * sizeY ) { launchConfig.blockSize.x = TNL::min( 256, sizeX ); launchConfig.blockSize.y = 1; } else if( sizeY >= sizeX * sizeX ) { launchConfig.blockSize.x = 1; launchConfig.blockSize.y = TNL::min( 256, sizeY ); } else { launchConfig.blockSize.x = TNL::min( 32, sizeX ); launchConfig.blockSize.y = TNL::min( 8, sizeY ); } launchConfig.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) ); launchConfig.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) ); ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize); constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; Loading Loading @@ -142,52 +107,7 @@ public: TNL::Cuda::LaunchConfiguration launchConfig; launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >( kernelSize.x(), kernelSize.y(), kernelSize.z(), dimensions.x(), dimensions.y(), dimensions.z() ); if( sizeX >= sizeY * sizeY * sizeZ * sizeZ ) { launchConfig.blockSize.x = TNL::min( 256, sizeX ); launchConfig.blockSize.y = 1; launchConfig.blockSize.z = 1; } else if( sizeY >= sizeX * sizeX * sizeZ * sizeZ ) { launchConfig.blockSize.x = 1; launchConfig.blockSize.y = TNL::min( 256, sizeY ); launchConfig.blockSize.z = 1; } else if( sizeZ >= sizeX * sizeX * sizeY * sizeY ) { launchConfig.blockSize.x = TNL::min( 2, sizeX ); launchConfig.blockSize.y = TNL::min( 2, sizeY ); // CUDA allows max 64 for launchConfig.blockSize.z launchConfig.blockSize.z = TNL::min( 64, sizeZ ); } else if( sizeX >= sizeZ * sizeZ && sizeY >= sizeZ * sizeZ ) { launchConfig.blockSize.x = TNL::min( 32, sizeX ); launchConfig.blockSize.y = TNL::min( 8, sizeY ); launchConfig.blockSize.z = 1; } else if( sizeX >= sizeY * sizeY && sizeZ >= sizeY * sizeY ) { launchConfig.blockSize.x = TNL::min( 32, sizeX ); launchConfig.blockSize.y = 1; launchConfig.blockSize.z = TNL::min( 8, sizeZ ); } else if( sizeY >= sizeX * sizeX && sizeZ >= sizeX * sizeX ) { launchConfig.blockSize.x = 1; launchConfig.blockSize.y = TNL::min( 32, sizeY ); launchConfig.blockSize.z = TNL::min( 8, sizeZ ); } else { launchConfig.blockSize.x = TNL::min( 16, sizeX ); launchConfig.blockSize.y = TNL::min( 4, sizeY ); launchConfig.blockSize.z = TNL::min( 4, sizeZ ); } launchConfig.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) ); launchConfig.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) ); launchConfig.gridSize.z = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeZ, launchConfig.blockSize.z ) ); ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize); constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; Loading Loading
src/Benchmarks/Convolution/kernels/naive.h +64 −21 Original line number Diff line number Diff line Loading @@ -2,6 +2,7 @@ #ifdef HAVE_CUDA #include <TNL/Devices/Cuda.h> #include <TNL/Containers/StaticVector.h> #include <TNL/Cuda/LaunchHelpers.h> template< int Dimension, typename Device > Loading @@ -12,10 +13,18 @@ struct Convolution< 1, TNL::Devices::Cuda > { public: template< typename Index > static size_t getDynamicSharedMemorySize( Index kernelWidth, Index endX ) using Vector = TNL::Containers::StaticVector< 1, Index >; template< typename Index > static void setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { return 0; configuration.dynamicSharedMemorySize = 0; // TODO: - Benchmark the best value configuration.blockSize.x = kernelSize.x(); configuration.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); } }; Loading Loading @@ -65,10 +74,22 @@ struct Convolution< 2, TNL::Devices::Cuda > { public: template< typename Index > static size_t getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index endX, Index endY ) using Vector = TNL::Containers::StaticVector< 2, Index >; template< typename Index > static void setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { return 0; configuration.dynamicSharedMemorySize = 0; // TODO: - Benchmark the best value configuration.blockSize.x = kernelSize.x(); configuration.blockSize.y = kernelSize.y(); configuration.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); } }; Loading Loading @@ -111,7 +132,8 @@ convolution2D( Index kernelWidth, Index kernelIndexX = i + radiusX; if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY ) { result = convolve( result, fetchBoundary( elementIndexX, elementIndexY ), fetchKernel ( kernelIndexX, kernelIndexY ) ); result = convolve( result, fetchBoundary( elementIndexX, elementIndexY ), fetchKernel( kernelIndexX, kernelIndexY ) ); } else { result = convolve( result, fetchData( elementIndexX, elementIndexY ), fetchKernel( kernelIndexX, kernelIndexY ) ); Loading @@ -127,10 +149,25 @@ struct Convolution< 3, TNL::Devices::Cuda > { public: template< typename Index > static size_t getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ ) using Vector = TNL::Containers::StaticVector< 3, Index >; template< typename Index > static void setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { return 0; configuration.dynamicSharedMemorySize = 0; // TODO: - Benchmark the best value configuration.blockSize.x = kernelSize.x(); configuration.blockSize.y = kernelSize.y(); configuration.blockSize.z = kernelSize.z(); configuration.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); } }; Loading Loading @@ -180,11 +217,17 @@ convolution3D( Index kernelWidth, Index elementIndexX = i + ix; Index kernelIndexX = i + radiusX; if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0 || elementIndexZ >= endZ ) { result = convolve( result, fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) ); if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0 || elementIndexZ >= endZ ) { result = convolve( result, fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) ); } else { result = convolve( result, fetchData( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) ); result = convolve( result, fetchData( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) ); } } } Loading
src/Benchmarks/Convolution/support/DummyBenchmark.h +3 −2 Original line number Diff line number Diff line Loading @@ -32,13 +32,14 @@ public: start[ i ] = parameters.getParameter< int >( minDimensionIds[ i ] ); end[ i ] = parameters.getParameter< int >( maxDimensionIds[ i ] ); minKernelSize[ i ] = parameters.getParameter< int >( minKernelSizeIds[ i ] ); maxKernelSizeIds[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] ); maxKernelSize[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] ); TNL_ASSERT_GT( start[ i ], 1, "Start dimension must be positive integer" ); TNL_ASSERT_GT( end[ i ], start[ i ], "End dimension must be greater than start dimension" ); TNL_ASSERT_GE( minKernelSize[ i ], 1, "Minimal kernel size must be a positive number" ); TNL_ASSERT_EQ( minKernelSize[ i ] % 2, 1, "Minimal kernel size must be odd" ); TNL_ASSERT_GT( maxKernelSize[ i ], minKernelSize[ i ], "End dimension must be greater than start dimension" ); TNL_ASSERT_GT( end[ i ], start[ i ], "End kernel size must be greater than start kernel size" ); } Loading Loading @@ -85,7 +86,7 @@ public: for( size_t i = 0; i < currentDimension.getSize() - 1; i++ ) { if( currentDimension[ i ] >= maxDimension[ i ] ) { currentDimension[ i ] = minDimension[ i ]; currentDimension[ i ] = maxDimension[ i ]; currentDimension[ i + 1 ] *= dimensionStep; } } Loading
src/Benchmarks/Convolution/support/Launcher.h +3 −83 Original line number Diff line number Diff line Loading @@ -29,20 +29,7 @@ public: { TNL::Cuda::LaunchConfiguration launchConfig; launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >( kernelSize.x(), dimensions.x() ); // TODO: - Benchmark the best value launchConfig.blockSize.x = 256; launchConfig.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), launchConfig.blockSize.x ) ); if( (std::size_t) launchConfig.blockSize.x * launchConfig.gridSize.x < (std::size_t) dimensions.x() ) { const int desGridSize = 32 * TNL::Cuda::DeviceInfo::getCudaMultiprocessors( TNL::Cuda::DeviceInfo::getActiveDevice() ); launchConfig.gridSize.x = TNL::min( desGridSize, TNL::Cuda::getNumberOfBlocks( dimensions.x(), launchConfig.blockSize.x ) ); } ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize); constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; Loading Loading @@ -78,29 +65,7 @@ public: { TNL::Cuda::LaunchConfiguration launchConfig; launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >( kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y() ); const Index sizeX = dimensions.x(); const Index sizeY = dimensions.y(); if( sizeX >= sizeY * sizeY ) { launchConfig.blockSize.x = TNL::min( 256, sizeX ); launchConfig.blockSize.y = 1; } else if( sizeY >= sizeX * sizeX ) { launchConfig.blockSize.x = 1; launchConfig.blockSize.y = TNL::min( 256, sizeY ); } else { launchConfig.blockSize.x = TNL::min( 32, sizeX ); launchConfig.blockSize.y = TNL::min( 8, sizeY ); } launchConfig.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) ); launchConfig.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) ); ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize); constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; Loading Loading @@ -142,52 +107,7 @@ public: TNL::Cuda::LaunchConfiguration launchConfig; launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >( kernelSize.x(), kernelSize.y(), kernelSize.z(), dimensions.x(), dimensions.y(), dimensions.z() ); if( sizeX >= sizeY * sizeY * sizeZ * sizeZ ) { launchConfig.blockSize.x = TNL::min( 256, sizeX ); launchConfig.blockSize.y = 1; launchConfig.blockSize.z = 1; } else if( sizeY >= sizeX * sizeX * sizeZ * sizeZ ) { launchConfig.blockSize.x = 1; launchConfig.blockSize.y = TNL::min( 256, sizeY ); launchConfig.blockSize.z = 1; } else if( sizeZ >= sizeX * sizeX * sizeY * sizeY ) { launchConfig.blockSize.x = TNL::min( 2, sizeX ); launchConfig.blockSize.y = TNL::min( 2, sizeY ); // CUDA allows max 64 for launchConfig.blockSize.z launchConfig.blockSize.z = TNL::min( 64, sizeZ ); } else if( sizeX >= sizeZ * sizeZ && sizeY >= sizeZ * sizeZ ) { launchConfig.blockSize.x = TNL::min( 32, sizeX ); launchConfig.blockSize.y = TNL::min( 8, sizeY ); launchConfig.blockSize.z = 1; } else if( sizeX >= sizeY * sizeY && sizeZ >= sizeY * sizeY ) { launchConfig.blockSize.x = TNL::min( 32, sizeX ); launchConfig.blockSize.y = 1; launchConfig.blockSize.z = TNL::min( 8, sizeZ ); } else if( sizeY >= sizeX * sizeX && sizeZ >= sizeX * sizeX ) { launchConfig.blockSize.x = 1; launchConfig.blockSize.y = TNL::min( 32, sizeY ); launchConfig.blockSize.z = TNL::min( 8, sizeZ ); } else { launchConfig.blockSize.x = TNL::min( 16, sizeX ); launchConfig.blockSize.y = TNL::min( 4, sizeY ); launchConfig.blockSize.z = TNL::min( 4, sizeZ ); } launchConfig.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) ); launchConfig.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) ); launchConfig.gridSize.z = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeZ, launchConfig.blockSize.z ) ); ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize); constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; Loading