Loading src/Benchmarks/Convolution/CMakeLists.txt +8 −0 Original line number Diff line number Diff line Loading @@ -46,3 +46,11 @@ GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/shar GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedData.h") GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedData.h") GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedData.h") GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/sharedDataAndKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/sharedDataAndKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/sharedDataAndKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h") src/Benchmarks/Convolution/kernels/sharedData.h +2 −2 Original line number Diff line number Diff line Loading @@ -162,7 +162,7 @@ convolution2D( Index kernelWidth, Real result = 0; for( Index j = 0; j <= radiusY; j++ ) { Index align = ( j + threadIdx.y ) * blockDim.y; Index align = ( j + threadIdx.y ) * blockDim.x; for( Index i = 0; i <= radiusX; i++ ) { Index index = i + threadIdx.x + align; Loading Loading @@ -330,7 +330,7 @@ convolution3D( Index kernelWidth, Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x; for( Index j = 0; j <= radiusY; j++ ) { Index xAlign = ( j + threadIdx.y ) * blockDim.y; Index xAlign = ( j + threadIdx.y ) * blockDim.x; for( Index i = 0; i <= radiusX; i++ ) { Index index = i + threadIdx.x + xAlign + xyAlign; Loading src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h 0 → 100644 +577 −0 Original line number Diff line number Diff line #pragma 
once #ifdef HAVE_CUDA #include <TNL/Devices/Cuda.h> #include <TNL/Containers/StaticVector.h> #include <TNL/Cuda/LaunchHelpers.h> #include <TNL/Cuda/SharedMemory.h> /** * This method stores kernel and data in the shared memory to reduce amount of loads. * * We can calculate the size of shared memory needed the next way: * 1. We need to store in shared memory: * * for 1D -> (2 * kernelWidth) - 1 < 2 * kernelWidth * * for 2D -> ( (2 * kernelWidth) - 1 ) * ( (2 * kernelHeight) - 1 ) < 4 * kernelWidth * kernelHeight * * for 3D -> ( (2 * kernelWidth) - 1 ) * ( (2 * kernelHeight) - 1 ) * ( (2 * kernelDepth) - 1 ) < 8 * kernelWidth * * kernelHeight * kernelDepth * 2. We take into account, that the maximal block size is 1024, so the maximum volume of kernel is 1024. * Then the maximal amount of shared memory is: * * for 1D -> 2 * 1024 -> 2048 elements (Note, that even if we take long double (16B) we still can fit in the shared * memory) * * for 2D -> 4 * 1024 -> 4096 elements * * for 3D -> 8 * 1024 -> 8196 elements (Note, that if double takes 8 bytes, then we can't fit tile into shared memory, * because we have 64 KB of data) * 3. The last thing is, that even if we take 1D and 2D case we have enough space to store 1024 kernel element. 
* Then the maximal amount of shared memory is: * * for 1D -> 3 * 1024 -> can use long double, double, float * * for 2D -> 5 * 1024 -> can use double, float * * for 3D -> 9 * 1024 -> can use float */ template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > __global__ static void convolution1D( Index kernelWidth, Index endX, FetchData fetchData, FetchBoundary fetchBoundary, FetchKernel fetchKernel, Convolve convolve, Store store ) { Index ix = threadIdx.x + blockIdx.x * blockDim.x; if( ix >= endX ) return; Index kernelOffset = 2 * kernelWidth; Real* data = TNL::Cuda::getSharedMemory< Real >(); Real* kernel = data + kernelOffset; Index radius = kernelWidth >> 1; // Left Index lhs = ix - radius; if( lhs < 0 ) { data[ threadIdx.x ] = fetchBoundary( lhs ); } else { data[ threadIdx.x ] = fetchData( lhs ); } // Right Index rhs = ix + radius; if( rhs >= endX ) { data[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs ); } else { data[ threadIdx.x + blockDim.x ] = fetchData( rhs ); } kernel[ threadIdx.x ] = fetchKernel( threadIdx.x ); __syncthreads(); Real result = 0; #pragma unroll for( Index i = 0; i < kernelWidth; i++ ) { Index elementIndex = i + threadIdx.x; result = convolve( result, data[ elementIndex ], kernel[ i ] ); } store( ix, result ); } template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > __global__ static void convolution2D( Index kernelWidth, Index kernelHeight, Index endX, Index endY, FetchData fetchData, FetchBoundary fetchBoundary, FetchKernel fetchKernel, Convolve convolve, Store store ) { Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; if( ix >= endX || iy >= endY ) return; Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ); Real* data = TNL::Cuda::getSharedMemory< Real >(); Real* kernel = data + kernelOffset; 
Index radiusY = kernelHeight >> 1; Index radiusX = kernelWidth >> 1; Index x, y, index; // Top Left x = ix - radiusX; y = iy - radiusY; index = threadIdx.x + threadIdx.y * blockDim.x; kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y ); if( x < 0 || y < 0 ) { data[ index ] = fetchBoundary( x, y ); } else { data[ index ] = fetchData( x, y ); } // Top right x = ix + radiusX; y = iy - radiusY; index = radiusX + threadIdx.x + threadIdx.y * blockDim.x; if( x >= endX || y < 0 ) { data[ index ] = fetchBoundary( x, y ); } else { data[ index ] = fetchData( x, y ); } // Bottom Left x = ix - radiusX; y = iy + radiusY; index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x; if( x < 0 || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { data[ index ] = fetchData( x, y ); } // Bottom Right x = ix + radiusX; y = iy + radiusY; index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x; if( x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { data[ index ] = fetchData( x, y ); } __syncthreads(); Real result = 0; #pragma unroll for( Index j = 0; j <= radiusY; j++ ) { Index elementAlign = ( j + threadIdx.y ) * blockDim.x; Index kernelAlign = j * blockDim.x; #pragma unroll for( Index i = 0; i <= radiusX; i++ ) { Index elementIndex = i + threadIdx.x + elementAlign; Index kernelIndex = i + kernelAlign; result = convolve( result, data[ elementIndex ], kernel[ kernelIndex ] ); } } store( ix, iy, result ); } template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > __global__ static void convolution3D( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ, FetchData fetchData, FetchBoundary fetchBoundary, FetchKernel fetchKernel, Convolve convolve, Store store ) { Index iz = threadIdx.z + blockIdx.z * blockDim.z; Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; 
if( ix >= endX || iy >= endY || iz >= endZ ) return; Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ) * ( 2 * kernelDepth - 1 ); Real* data = TNL::Cuda::getSharedMemory< Real >(); Real* kernel = data + kernelOffset; Index radiusZ = kernelDepth >> 1; Index radiusY = kernelHeight >> 1; Index radiusX = kernelWidth >> 1; Index x, y, z, index; // Z: 0 Y: 0 X: 0 x = ix - radiusX; y = iy - radiusY; z = iz - radiusZ; index = threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z ); if( x < 0 || y < 0 || z < 0 ) { data[ index ] = fetchBoundary( x, y, z ); } else { data[ index ] = fetchData( x, y, z ); } // Z: 0 Y: 0 X: 1 x = ix + radiusX; y = iy - radiusY; z = iz - radiusZ; index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; if( x >= endX || y < 0 || z < 0 ) { data[ index ] = fetchBoundary( x, y, z ); } else { data[ index ] = fetchData( x, y, z ); } // Z: 0 Y: 1 X: 0 x = ix - radiusX; y = iy + radiusY; z = iz - radiusZ; index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; if( x < 0 || y >= endY || z < 0 ) { data[ index ] = fetchBoundary( x, y, z ); } else { data[ index ] = fetchData( x, y, z ); } // Z: 1 Y: 0 X: 0 x = ix - radiusX; y = iy - radiusY; z = iz + radiusZ; index = threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; if( x < 0 || y < 0 || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { data[ index ] = fetchData( x, y, z ); } // Z: 0 Y: 1 X: 1 x = ix + radiusX; y = iy + radiusY; z = iz - radiusZ; index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; if( x >= endX || y >= endY || z < 0 ) { data[ index ] = fetchBoundary( x, y, z ); } else { data[ index ] = fetchData( x, y, z ); } // Z: 1 Y: 0 X: 1 x = ix + radiusX; y = iy - radiusY; z 
= iz + radiusZ; index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; if( x >= endX || y < 0 || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { data[ index ] = fetchData( x, y, z ); } // Z: 1 Y: 1 X: 0 x = ix - radiusX; y = iy + radiusY; z = iz + radiusZ; index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; if( x < 0 || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { data[ index ] = fetchData( x, y, z ); } // Z: 1 Y: 1 X: 1 x = ix + radiusX; y = iy + radiusY; z = iz + radiusZ; index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; if( x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { data[ index ] = fetchData( x, y, z ); } __syncthreads(); Real result = 0; for( Index k = 0; k <= radiusZ; k++ ) { Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x; Index xyKernelAlign = k * blockDim.x * blockDim.y; for( Index j = 0; j <= radiusY; j++ ) { Index xAlign = ( j + threadIdx.y ) * blockDim.x; Index xKernelAlign = j * blockDim.x; for( Index i = 0; i <= radiusX; i++ ) { Index elementIndex = i + threadIdx.x + xAlign + xyAlign; Index kernelIndex = i + xKernelAlign + xyKernelAlign; result = convolve( result, data[ index ], kernel[ kernelIndex ] ); } } } store( ix, iy, iz, result ); } template< int Dimension, typename Device > struct Convolution; template<> struct Convolution< 1, TNL::Devices::Cuda > { public: template< typename Index > using Vector = TNL::Containers::StaticVector< 1, Index >; template< typename Index, typename Real > static void setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { Index kernelElementCount = 1; for( Index i = 0; i < kernelSize.getSize(); i++ ) kernelElementCount *= ( 2 * kernelSize[ i ] ) 
- 1; configuration.dynamicSharedMemorySize = ( kernelSize.x() + kernelElementCount ) * sizeof( Real ); configuration.blockSize.x = kernelSize.x(); configuration.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); } template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > static void execute( const Vector< Index >& dimensions, const Vector< Index >& kernelSize, FetchData&& fetchData, FetchBoundary&& fetchBoundary, FetchKernel&& fetchKernel, Convolve&& convolve, Store&& store ) { TNL::Cuda::LaunchConfiguration configuration; setup< Index, Real >( configuration, dimensions, kernelSize ); constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; TNL::Cuda::launchKernel< true >( kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store ); }; }; template<> struct Convolution< 2, TNL::Devices::Cuda > { public: template< typename Index > using Vector = TNL::Containers::StaticVector< 2, Index >; template< typename Index, typename Real > static void setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { Index kernelElementCount = 1; Index kernelVolume = 1; for( Index i = 0; i < kernelSize.getSize(); i++ ) { kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1; kernelVolume *= kernelSize[ i ]; } configuration.dynamicSharedMemorySize = ( kernelVolume + kernelElementCount ) * sizeof( Real ); configuration.blockSize.x = kernelSize.x(); configuration.blockSize.y = kernelSize.y(); configuration.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), 
configuration.blockSize.y ) ); } template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > static void execute( const Vector< Index >& dimensions, const Vector< Index >& kernelSize, FetchData&& fetchData, FetchBoundary&& fetchBoundary, FetchKernel&& fetchKernel, Convolve&& convolve, Store&& store ) { TNL::Cuda::LaunchConfiguration configuration; setup< Index, Real >( configuration, dimensions, kernelSize ); constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; TNL::Cuda::launchKernel< true >( kernel, 0, configuration, kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y(), fetchData, fetchBoundary, fetchKernel, convolve, store ); }; }; template<> struct Convolution< 3, TNL::Devices::Cuda > { public: template< typename Index > using Vector = TNL::Containers::StaticVector< 3, Index >; template< typename Index, typename Real > static void setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { Index kernelElementCount = 1; Index kernelVolume = 1; for( Index i = 0; i < kernelSize.getSize(); i++ ) { kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1; kernelVolume *= kernelSize[ i ]; } configuration.dynamicSharedMemorySize = ( kernelVolume + kernelElementCount ) * sizeof( Real ); configuration.blockSize.x = kernelSize.x(); configuration.blockSize.y = kernelSize.y(); configuration.blockSize.z = kernelSize.z(); configuration.gridSize.x = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); } template< typename 
Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > static void execute( const Vector< Index >& dimensions, const Vector< Index >& kernelSize, FetchData&& fetchData, FetchBoundary&& fetchBoundary, FetchKernel&& fetchKernel, Convolve&& convolve, Store&& store ) { TNL::Cuda::LaunchConfiguration configuration; setup< Index, Real >( configuration, dimensions, kernelSize ); constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; TNL::Cuda::launchKernel< true >( kernel, 0, configuration, kernelSize.x(), kernelSize.y(), kernelSize.z(), dimensions.x(), dimensions.y(), dimensions.z(), fetchData, fetchBoundary, fetchKernel, convolve, store ); }; }; #endif src/Benchmarks/Convolution/support/Benchmark.h +2 −11 Original line number Diff line number Diff line Loading @@ -19,12 +19,12 @@ public: void run( const TNL::Config::ParameterContainer& parameters ) const { if( ! TNL::Devices::Host::setup( parameters ) || ! TNL::Devices::Cuda::setup( parameters ) ) if( ! 
TNL::Devices::Cuda::setup( parameters ) ) return; const TNL::String logFileName = parameters.getParameter< TNL::String >( "log-file" ); const TNL::String outputMode = parameters.getParameter< TNL::String >( "output-mode" ); const TNL::String device = parameters.getParameter< TNL::String >( "device" ); const int verbose = parameters.getParameter< int >( "verbose" ); const int loops = parameters.getParameter< int >( "loops" ); Loading Loading @@ -58,19 +58,10 @@ public: config.addEntryEnum( "append" ); config.addEntryEnum( "overwrite" ); config.addEntry< TNL::String >( "device", "Device the computation will run on.", "cuda" ); config.addEntryEnum< TNL::String >( "all" ); config.addEntryEnum< TNL::String >( "host" ); #ifdef HAVE_CUDA config.addEntryEnum< TNL::String >( "cuda" ); #endif config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); config.addEntry< int >( "verbose", "Verbose mode.", 1 ); config.addDelimiter( "Device settings:" ); TNL::Devices::Host::configSetup( config ); #ifdef HAVE_CUDA TNL::Devices::Cuda::configSetup( config ); Loading src/Benchmarks/Convolution/support/DummyBenchmark.h +20 −50 Original line number Diff line number Diff line Loading @@ -23,56 +23,42 @@ public: virtual void start( TNLBenchmark& benchmark, const TNL::Config::ParameterContainer& parameters ) const override { Vector start; Vector end; Vector dimension; Vector minKernelSize; Vector maxKernelSize; for( int i = 0; i < Dimension; i++ ) { start[ i ] = parameters.getParameter< int >( minDimensionIds[ i ] ); end[ i ] = parameters.getParameter< int >( maxDimensionIds[ i ] ); dimension[ i ] = parameters.getParameter< int >( dimensionIds[ i ] ); minKernelSize[ i ] = parameters.getParameter< int >( minKernelSizeIds[ i ] ); maxKernelSize[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] ); TNL_ASSERT_GT( start[ i ], 1, "Start dimension must be positive integer" ); TNL_ASSERT_GT( end[ i ], start[ i ], "End dimension must be greater than start 
dimension" ); TNL_ASSERT_GE( minKernelSize[ i ], 1, "Minimal kernel size must be a positive number" ); TNL_ASSERT_EQ( minKernelSize[ i ] % 2, 1, "Minimal kernel size must be odd" ); TNL_ASSERT_GT( maxKernelSize[ i ], minKernelSize[ i ], "End dimension must be greater than start dimension" ); TNL_ASSERT_GT( end[ i ], start[ i ], "End kernel size must be greater than start kernel size" ); TNL_ASSERT_GT( maxKernelSize[ i ], minKernelSize[ i ], "End kernel size must be greater than start kernel size" ); } int dimensionStep = parameters.getParameter< int >( "dimension-step" ); int kernelStep = parameters.getParameter< int >( "kernel-step" ); TNL_ASSERT_GT( dimensionStep, 1, "Dimension step must be a positive number" ); TNL_ASSERT_GT( kernelStep, 0, "Kernel step must be a positive number" ); TNL_ASSERT_EQ( kernelStep % 2, 0, "Kernel step must be even" ); TNL::String id = parameters.getParameter< TNL::String >( "id" ); time( id, benchmark, start, end, dimensionStep, minKernelSize, maxKernelSize, kernelStep ); time( id, benchmark, dimension, minKernelSize, maxKernelSize, kernelStep ); } virtual void time( const TNL::String& id, TNLBenchmark& benchmark, const Vector& minDimension, const Vector& maxDimension, const int dimensionStep, const Vector& dimension, const Vector& minKernelSize, const Vector& maxKernelSize, const int kernelStep ) const { Vector currentDimension = minDimension; Vector currentKernelSize; do { currentKernelSize = minKernelSize; Vector currentKernelSize = minKernelSize; do { timeConvolution( id, benchmark, currentDimension, currentKernelSize ); timeConvolution( id, benchmark, dimension, currentKernelSize ); currentKernelSize[ 0 ] += kernelStep; Loading @@ -83,17 +69,6 @@ public: } } } while( currentKernelSize < maxKernelSize ); currentDimension[ 0 ] *= dimensionStep; for( size_t i = 0; i < currentDimension.getSize() - 1; i++ ) { if( currentDimension[ i ] >= maxDimension[ i ] ) { currentDimension[ i ] = minDimension[ i ]; currentDimension[ i + 1 ] *= 
dimensionStep; } } } while( currentDimension < maxDimension ); } void Loading Loading @@ -148,12 +123,7 @@ public: config.addDelimiter( "Grid dimension settings:" ); for( int i = 0; i < Dimension; i++ ) config.addEntry< int >( minDimensionIds[ i ], minDimensionIds[ i ], 16 ); for( int i = 0; i < Dimension; i++ ) config.addEntry< int >( maxDimensionIds[ i ], maxDimensionIds[ i ], 128 ); config.addEntry< int >( "dimension-step", "Step of kernel increase by which dimension is multiplied (must be even)", 2 ); config.addEntry< int >( dimensionIds[ i ], dimensionIds[ i ], 16 ); config.addDelimiter( "Kernel settings:" ); Loading Loading
src/Benchmarks/Convolution/CMakeLists.txt +8 −0 Original line number Diff line number Diff line Loading @@ -46,3 +46,11 @@ GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/shar GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedData.h") GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedData.h") GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedData.h") GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/sharedDataAndKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/sharedDataAndKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/sharedDataAndKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h")
src/Benchmarks/Convolution/kernels/sharedData.h +2 −2 Original line number Diff line number Diff line Loading @@ -162,7 +162,7 @@ convolution2D( Index kernelWidth, Real result = 0; for( Index j = 0; j <= radiusY; j++ ) { Index align = ( j + threadIdx.y ) * blockDim.y; Index align = ( j + threadIdx.y ) * blockDim.x; for( Index i = 0; i <= radiusX; i++ ) { Index index = i + threadIdx.x + align; Loading Loading @@ -330,7 +330,7 @@ convolution3D( Index kernelWidth, Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x; for( Index j = 0; j <= radiusY; j++ ) { Index xAlign = ( j + threadIdx.y ) * blockDim.y; Index xAlign = ( j + threadIdx.y ) * blockDim.x; for( Index i = 0; i <= radiusX; i++ ) { Index index = i + threadIdx.x + xAlign + xyAlign; Loading
src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h 0 → 100644 +577 −0 Original line number Diff line number Diff line #pragma once #ifdef HAVE_CUDA #include <TNL/Devices/Cuda.h> #include <TNL/Containers/StaticVector.h> #include <TNL/Cuda/LaunchHelpers.h> #include <TNL/Cuda/SharedMemory.h> /** * This method stores kernel and data in the shared memory to reduce amount of loads. * * We can calculate the size of shared memory needed the next way: * 1. We need to store in shared memory: * * for 1D -> (2 * kernelWidth) - 1 < 2 * kernelWidth * * for 2D -> ( (2 * kernelWidth) - 1 ) * ( (2 * kernelHeight) - 1 ) < 4 * kernelWidth * kernelHeight * * for 3D -> ( (2 * kernelWidth) - 1 ) * ( (2 * kernelHeight) - 1 ) * ( (2 * kernelDepth) - 1 ) < 8 * kernelWidth * * kernelHeight * kernelDepth * 2. We take into account, that the maximal block size is 1024, so the maximum volume of kernel is 1024. * Then the maximal amount of shared memory is: * * for 1D -> 2 * 1024 -> 2048 elements (Note, that even if we take long double (16B) we still can fit in the shared * memory) * * for 2D -> 4 * 1024 -> 4096 elements * * for 3D -> 8 * 1024 -> 8196 elements (Note, that if double takes 8 bytes, then we can't fit tile into shared memory, * because we have 64 KB of data) * 3. The last thing is, that even if we take 1D and 2D case we have enough space to store 1024 kernel element. 
* Then the maximal amount of shared memory is: * * for 1D -> 3 * 1024 -> can use long double, double, float * * for 2D -> 5 * 1024 -> can use double, float * * for 3D -> 9 * 1024 -> can use float */ template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > __global__ static void convolution1D( Index kernelWidth, Index endX, FetchData fetchData, FetchBoundary fetchBoundary, FetchKernel fetchKernel, Convolve convolve, Store store ) { Index ix = threadIdx.x + blockIdx.x * blockDim.x; if( ix >= endX ) return; Index kernelOffset = 2 * kernelWidth; Real* data = TNL::Cuda::getSharedMemory< Real >(); Real* kernel = data + kernelOffset; Index radius = kernelWidth >> 1; // Left Index lhs = ix - radius; if( lhs < 0 ) { data[ threadIdx.x ] = fetchBoundary( lhs ); } else { data[ threadIdx.x ] = fetchData( lhs ); } // Right Index rhs = ix + radius; if( rhs >= endX ) { data[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs ); } else { data[ threadIdx.x + blockDim.x ] = fetchData( rhs ); } kernel[ threadIdx.x ] = fetchKernel( threadIdx.x ); __syncthreads(); Real result = 0; #pragma unroll for( Index i = 0; i < kernelWidth; i++ ) { Index elementIndex = i + threadIdx.x; result = convolve( result, data[ elementIndex ], kernel[ i ] ); } store( ix, result ); } template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > __global__ static void convolution2D( Index kernelWidth, Index kernelHeight, Index endX, Index endY, FetchData fetchData, FetchBoundary fetchBoundary, FetchKernel fetchKernel, Convolve convolve, Store store ) { Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; if( ix >= endX || iy >= endY ) return; Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ); Real* data = TNL::Cuda::getSharedMemory< Real >(); Real* kernel = data + kernelOffset; 
Index radiusY = kernelHeight >> 1; Index radiusX = kernelWidth >> 1; Index x, y, index; // Top Left x = ix - radiusX; y = iy - radiusY; index = threadIdx.x + threadIdx.y * blockDim.x; kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y ); if( x < 0 || y < 0 ) { data[ index ] = fetchBoundary( x, y ); } else { data[ index ] = fetchData( x, y ); } // Top right x = ix + radiusX; y = iy - radiusY; index = radiusX + threadIdx.x + threadIdx.y * blockDim.x; if( x >= endX || y < 0 ) { data[ index ] = fetchBoundary( x, y ); } else { data[ index ] = fetchData( x, y ); } // Bottom Left x = ix - radiusX; y = iy + radiusY; index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x; if( x < 0 || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { data[ index ] = fetchData( x, y ); } // Bottom Right x = ix + radiusX; y = iy + radiusY; index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x; if( x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { data[ index ] = fetchData( x, y ); } __syncthreads(); Real result = 0; #pragma unroll for( Index j = 0; j <= radiusY; j++ ) { Index elementAlign = ( j + threadIdx.y ) * blockDim.x; Index kernelAlign = j * blockDim.x; #pragma unroll for( Index i = 0; i <= radiusX; i++ ) { Index elementIndex = i + threadIdx.x + elementAlign; Index kernelIndex = i + kernelAlign; result = convolve( result, data[ elementIndex ], kernel[ kernelIndex ] ); } } store( ix, iy, result ); } template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > __global__ static void convolution3D( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ, FetchData fetchData, FetchBoundary fetchBoundary, FetchKernel fetchKernel, Convolve convolve, Store store ) { Index iz = threadIdx.z + blockIdx.z * blockDim.z; Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; 
if( ix >= endX || iy >= endY || iz >= endZ ) return; Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ) * ( 2 * kernelDepth - 1 ); Real* data = TNL::Cuda::getSharedMemory< Real >(); Real* kernel = data + kernelOffset; Index radiusZ = kernelDepth >> 1; Index radiusY = kernelHeight >> 1; Index radiusX = kernelWidth >> 1; Index x, y, z, index; // Z: 0 Y: 0 X: 0 x = ix - radiusX; y = iy - radiusY; z = iz - radiusZ; index = threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z ); if( x < 0 || y < 0 || z < 0 ) { data[ index ] = fetchBoundary( x, y, z ); } else { data[ index ] = fetchData( x, y, z ); } // Z: 0 Y: 0 X: 1 x = ix + radiusX; y = iy - radiusY; z = iz - radiusZ; index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; if( x >= endX || y < 0 || z < 0 ) { data[ index ] = fetchBoundary( x, y, z ); } else { data[ index ] = fetchData( x, y, z ); } // Z: 0 Y: 1 X: 0 x = ix - radiusX; y = iy + radiusY; z = iz - radiusZ; index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; if( x < 0 || y >= endY || z < 0 ) { data[ index ] = fetchBoundary( x, y, z ); } else { data[ index ] = fetchData( x, y, z ); } // Z: 1 Y: 0 X: 0 x = ix - radiusX; y = iy - radiusY; z = iz + radiusZ; index = threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; if( x < 0 || y < 0 || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { data[ index ] = fetchData( x, y, z ); } // Z: 0 Y: 1 X: 1 x = ix + radiusX; y = iy + radiusY; z = iz - radiusZ; index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; if( x >= endX || y >= endY || z < 0 ) { data[ index ] = fetchBoundary( x, y, z ); } else { data[ index ] = fetchData( x, y, z ); } // Z: 1 Y: 0 X: 1 x = ix + radiusX; y = iy - radiusY; z 
= iz + radiusZ;
   // Z: 1  Y: 0  X: 1 halo corner.
   // NOTE(review): the compute loop below reads the shared data tile with row
   // stride blockDim.x; the original halo loads used blockDim.y for the y-term,
   // which disagrees whenever the block is not square. Changed to blockDim.x
   // for consistency (same fix as applied to kernels/sharedData.h).
   index = radiusX + threadIdx.x + threadIdx.y * blockDim.x + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
   if( x >= endX || y < 0 || z >= endZ ) {
      data[ index ] = fetchBoundary( x, y, z );
   }
   else {
      data[ index ] = fetchData( x, y, z );
   }

   // Z: 1  Y: 1  X: 0 halo corner.
   x = ix - radiusX;
   y = iy + radiusY;
   z = iz + radiusZ;
   index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
   if( x < 0 || y >= endY || z >= endZ ) {
      data[ index ] = fetchBoundary( x, y, z );
   }
   else {
      data[ index ] = fetchData( x, y, z );
   }

   // Z: 1  Y: 1  X: 1 halo corner.
   x = ix + radiusX;
   y = iy + radiusY;
   z = iz + radiusZ;
   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
   if( x >= endX || y >= endY || z >= endZ ) {
      data[ index ] = fetchBoundary( x, y, z );
   }
   else {
      data[ index ] = fetchData( x, y, z );
   }

   // All halo regions are loaded — make the shared tile visible block-wide
   // before any thread starts reading it.
   __syncthreads();

   // Accumulate the convolution entirely from shared memory.
   Real result = 0;
   for( Index k = 0; k <= radiusZ; k++ ) {
      Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x;
      Index xyKernelAlign = k * blockDim.x * blockDim.y;
      for( Index j = 0; j <= radiusY; j++ ) {
         Index xAlign = ( j + threadIdx.y ) * blockDim.x;
         Index xKernelAlign = j * blockDim.x;
         for( Index i = 0; i <= radiusX; i++ ) {
            Index elementIndex = i + threadIdx.x + xAlign + xyAlign;
            Index kernelIndex = i + xKernelAlign + xyKernelAlign;
            // BUG FIX: the original read data[ index ] — a stale value left
            // over from the halo-loading phase above — and never used the
            // freshly computed elementIndex.
            result = convolve( result, data[ elementIndex ], kernel[ kernelIndex ] );
         }
      }
   }
   store( ix, iy, iz, result );
}

// Host-side dispatcher for the shared-data-and-kernel convolution,
// specialized per dimension and device.
template< int Dimension, typename Device >
struct Convolution;

template<>
struct Convolution< 1, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 1, Index >;

   // Builds the launch configuration: block size equals the kernel size along
   // x, and the dynamic shared memory holds the kernel coefficients plus the
   // data tile of ( 2 * kernelSize - 1 ) elements.
   template< typename Index, typename Real >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
      Index kernelElementCount = 1;
      for( Index i = 0; i < kernelSize.getSize(); i++ )
         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
      configuration.dynamicSharedMemorySize = ( kernelSize.x() + kernelElementCount ) * sizeof( Real );
      configuration.blockSize.x = kernelSize.x();
      configuration.gridSize.x =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
   }

   // Configures and launches the 1D convolution kernel.
   template< typename Index,
             typename Real,
             typename FetchData,
             typename FetchBoundary,
             typename FetchKernel,
             typename Convolve,
             typename Store >
   static void
   execute( const Vector< Index >& dimensions,
            const Vector< Index >& kernelSize,
            FetchData&& fetchData,
            FetchBoundary&& fetchBoundary,
            FetchKernel&& fetchKernel,
            Convolve&& convolve,
            Store&& store )
   {
      TNL::Cuda::LaunchConfiguration configuration;
      setup< Index, Real >( configuration, dimensions, kernelSize );
      constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
      TNL::Cuda::launchKernel< true >(
         kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store );
   }
};

template<>
struct Convolution< 2, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 2, Index >;

   // Builds the launch configuration: block size equals the kernel size per
   // axis; shared memory holds the kernel coefficients (kernelVolume) plus the
   // data tile (product of 2 * kernelSize - 1 per axis).
   template< typename Index, typename Real >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
      Index kernelElementCount = 1;
      Index kernelVolume = 1;
      for( Index i = 0; i < kernelSize.getSize(); i++ ) {
         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
         kernelVolume *= kernelSize[ i ];
      }
      configuration.dynamicSharedMemorySize = ( kernelVolume + kernelElementCount ) * sizeof( Real );
      configuration.blockSize.x = kernelSize.x();
      configuration.blockSize.y = kernelSize.y();
      configuration.gridSize.x =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
      configuration.gridSize.y =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
   }

   // Configures and launches the 2D convolution kernel.
   template< typename Index,
             typename Real,
             typename FetchData,
             typename FetchBoundary,
             typename FetchKernel,
             typename Convolve,
             typename Store >
   static void
   execute( const Vector< Index >& dimensions,
            const Vector< Index >& kernelSize,
            FetchData&& fetchData,
            FetchBoundary&& fetchBoundary,
            FetchKernel&& fetchKernel,
            Convolve&& convolve,
            Store&& store )
   {
      TNL::Cuda::LaunchConfiguration configuration;
      setup< Index, Real >( configuration, dimensions, kernelSize );
      constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
      TNL::Cuda::launchKernel< true >( kernel,
                                       0,
                                       configuration,
                                       kernelSize.x(),
                                       kernelSize.y(),
                                       dimensions.x(),
                                       dimensions.y(),
                                       fetchData,
                                       fetchBoundary,
                                       fetchKernel,
                                       convolve,
                                       store );
   }
};

template<>
struct Convolution< 3, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 3, Index >;

   // Builds the launch configuration: block size equals the kernel size per
   // axis; shared memory holds the kernel coefficients (kernelVolume) plus the
   // data tile (product of 2 * kernelSize - 1 per axis).
   template< typename Index, typename Real >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
      Index kernelElementCount = 1;
      Index kernelVolume = 1;
      for( Index i = 0; i < kernelSize.getSize(); i++ ) {
         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
         kernelVolume *= kernelSize[ i ];
      }
      configuration.dynamicSharedMemorySize = ( kernelVolume + kernelElementCount ) * sizeof( Real );
      configuration.blockSize.x = kernelSize.x();
      configuration.blockSize.y = kernelSize.y();
      configuration.blockSize.z = kernelSize.z();
      configuration.gridSize.x =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
      configuration.gridSize.y =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
      // BUG FIX: the original assigned gridSize.y twice (copy-paste error),
      // clobbering the y extent and leaving gridSize.z at its default.
      configuration.gridSize.z =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
   }

   // Configures and launches the 3D convolution kernel.
   template< typename Index,
             typename Real,
             typename FetchData,
             typename FetchBoundary,
             typename FetchKernel,
             typename Convolve,
             typename Store >
   static void
   execute( const Vector< Index >& dimensions,
            const Vector< Index >& kernelSize,
            FetchData&& fetchData,
            FetchBoundary&& fetchBoundary,
            FetchKernel&& fetchKernel,
            Convolve&& convolve,
            Store&& store )
   {
      TNL::Cuda::LaunchConfiguration configuration;
      setup< Index, Real >( configuration, dimensions, kernelSize );
      constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
      TNL::Cuda::launchKernel< true >( kernel,
                                       0,
                                       configuration,
                                       kernelSize.x(),
                                       kernelSize.y(),
                                       kernelSize.z(),
                                       dimensions.x(),
                                       dimensions.y(),
                                       dimensions.z(),
                                       fetchData,
                                       fetchBoundary,
                                       fetchKernel,
                                       convolve,
                                       store );
   }
};
#endif
src/Benchmarks/Convolution/support/Benchmark.h +2 −11 Original line number Diff line number Diff line Loading @@ -19,12 +19,12 @@ public: void run( const TNL::Config::ParameterContainer& parameters ) const { if( ! TNL::Devices::Host::setup( parameters ) || ! TNL::Devices::Cuda::setup( parameters ) ) if( ! TNL::Devices::Cuda::setup( parameters ) ) return; const TNL::String logFileName = parameters.getParameter< TNL::String >( "log-file" ); const TNL::String outputMode = parameters.getParameter< TNL::String >( "output-mode" ); const TNL::String device = parameters.getParameter< TNL::String >( "device" ); const int verbose = parameters.getParameter< int >( "verbose" ); const int loops = parameters.getParameter< int >( "loops" ); Loading Loading @@ -58,19 +58,10 @@ public: config.addEntryEnum( "append" ); config.addEntryEnum( "overwrite" ); config.addEntry< TNL::String >( "device", "Device the computation will run on.", "cuda" ); config.addEntryEnum< TNL::String >( "all" ); config.addEntryEnum< TNL::String >( "host" ); #ifdef HAVE_CUDA config.addEntryEnum< TNL::String >( "cuda" ); #endif config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); config.addEntry< int >( "verbose", "Verbose mode.", 1 ); config.addDelimiter( "Device settings:" ); TNL::Devices::Host::configSetup( config ); #ifdef HAVE_CUDA TNL::Devices::Cuda::configSetup( config ); Loading
src/Benchmarks/Convolution/support/DummyBenchmark.h +20 −50 Original line number Diff line number Diff line Loading @@ -23,56 +23,42 @@ public: virtual void start( TNLBenchmark& benchmark, const TNL::Config::ParameterContainer& parameters ) const override { Vector start; Vector end; Vector dimension; Vector minKernelSize; Vector maxKernelSize; for( int i = 0; i < Dimension; i++ ) { start[ i ] = parameters.getParameter< int >( minDimensionIds[ i ] ); end[ i ] = parameters.getParameter< int >( maxDimensionIds[ i ] ); dimension[ i ] = parameters.getParameter< int >( dimensionIds[ i ] ); minKernelSize[ i ] = parameters.getParameter< int >( minKernelSizeIds[ i ] ); maxKernelSize[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] ); TNL_ASSERT_GT( start[ i ], 1, "Start dimension must be positive integer" ); TNL_ASSERT_GT( end[ i ], start[ i ], "End dimension must be greater than start dimension" ); TNL_ASSERT_GE( minKernelSize[ i ], 1, "Minimal kernel size must be a positive number" ); TNL_ASSERT_EQ( minKernelSize[ i ] % 2, 1, "Minimal kernel size must be odd" ); TNL_ASSERT_GT( maxKernelSize[ i ], minKernelSize[ i ], "End dimension must be greater than start dimension" ); TNL_ASSERT_GT( end[ i ], start[ i ], "End kernel size must be greater than start kernel size" ); TNL_ASSERT_GT( maxKernelSize[ i ], minKernelSize[ i ], "End kernel size must be greater than start kernel size" ); } int dimensionStep = parameters.getParameter< int >( "dimension-step" ); int kernelStep = parameters.getParameter< int >( "kernel-step" ); TNL_ASSERT_GT( dimensionStep, 1, "Dimension step must be a positive number" ); TNL_ASSERT_GT( kernelStep, 0, "Kernel step must be a positive number" ); TNL_ASSERT_EQ( kernelStep % 2, 0, "Kernel step must be even" ); TNL::String id = parameters.getParameter< TNL::String >( "id" ); time( id, benchmark, start, end, dimensionStep, minKernelSize, maxKernelSize, kernelStep ); time( id, benchmark, dimension, minKernelSize, maxKernelSize, kernelStep ); } 
virtual void time( const TNL::String& id, TNLBenchmark& benchmark, const Vector& minDimension, const Vector& maxDimension, const int dimensionStep, const Vector& dimension, const Vector& minKernelSize, const Vector& maxKernelSize, const int kernelStep ) const { Vector currentDimension = minDimension; Vector currentKernelSize; do { currentKernelSize = minKernelSize; Vector currentKernelSize = minKernelSize; do { timeConvolution( id, benchmark, currentDimension, currentKernelSize ); timeConvolution( id, benchmark, dimension, currentKernelSize ); currentKernelSize[ 0 ] += kernelStep; Loading @@ -83,17 +69,6 @@ public: } } } while( currentKernelSize < maxKernelSize ); currentDimension[ 0 ] *= dimensionStep; for( size_t i = 0; i < currentDimension.getSize() - 1; i++ ) { if( currentDimension[ i ] >= maxDimension[ i ] ) { currentDimension[ i ] = minDimension[ i ]; currentDimension[ i + 1 ] *= dimensionStep; } } } while( currentDimension < maxDimension ); } void Loading Loading @@ -148,12 +123,7 @@ public: config.addDelimiter( "Grid dimension settings:" ); for( int i = 0; i < Dimension; i++ ) config.addEntry< int >( minDimensionIds[ i ], minDimensionIds[ i ], 16 ); for( int i = 0; i < Dimension; i++ ) config.addEntry< int >( maxDimensionIds[ i ], maxDimensionIds[ i ], 128 ); config.addEntry< int >( "dimension-step", "Step of kernel increase by which dimension is multiplied (must be even)", 2 ); config.addEntry< int >( dimensionIds[ i ], dimensionIds[ i ], 16 ); config.addDelimiter( "Kernel settings:" ); Loading