Commit 5d0dac39 authored by Yury Hayeu's avatar Yury Hayeu
Browse files

Implement shared-data-and-kernel convolution kernel

parent 3571b27e
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -46,3 +46,11 @@ GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/shar
GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedData.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedData.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedData.h")

GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/sharedDataAndKernel.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/sharedDataAndKernel.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/sharedDataAndKernel.h")

GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h")
+2 −2
Original line number Diff line number Diff line
@@ -162,7 +162,7 @@ convolution2D( Index kernelWidth,
   Real result = 0;

   for( Index j = 0; j <= radiusY; j++ ) {
      Index align = ( j + threadIdx.y ) * blockDim.y;
      Index align = ( j + threadIdx.y ) * blockDim.x;

      for( Index i = 0; i <= radiusX; i++ ) {
         Index index = i + threadIdx.x + align;
@@ -330,7 +330,7 @@ convolution3D( Index kernelWidth,
      Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x;

      for( Index j = 0; j <= radiusY; j++ ) {
         Index xAlign = ( j + threadIdx.y ) * blockDim.y;
         Index xAlign = ( j + threadIdx.y ) * blockDim.x;

         for( Index i = 0; i <= radiusX; i++ ) {
            Index index = i + threadIdx.x + xAlign + xyAlign;
+577 −0
Original line number Diff line number Diff line
#pragma once

#ifdef HAVE_CUDA

   #include <TNL/Devices/Cuda.h>
   #include <TNL/Containers/StaticVector.h>
   #include <TNL/Cuda/LaunchHelpers.h>
   #include <TNL/Cuda/SharedMemory.h>

/**
 * This method stores kernel and data in the shared memory to reduce amount of loads.
 *
 * We can calculate the size of shared memory needed the next way:
 * 1. We need to store in shared memory:
 *      * for 1D -> (2 * kernelWidth) - 1 < 2 * kernelWidth
 *      * for 2D -> ( (2 * kernelWidth) - 1 ) * ( (2 * kernelHeight) - 1 ) < 4 * kernelWidth * kernelHeight
 *      * for 3D -> ( (2 * kernelWidth) - 1 ) * ( (2 * kernelHeight) - 1 ) * ( (2 * kernelDepth) - 1 ) < 8 * kernelWidth *
 * kernelHeight * kernelDepth
 * 2. We take into account, that the maximal block size is 1024, so the maximum volume of kernel is 1024.
 *    Then the maximal amount of shared memory is:
 *      * for 1D -> 2 * 1024 -> 2048 elements (Note, that even if we take long double (16B) we still can fit in the shared
 * memory)
 *      * for 2D -> 4 * 1024 -> 4096 elements
 *      * for 3D -> 8 * 1024 -> 8192 elements (Note, that if double takes 8 bytes, then we can't fit tile into shared memory,
 * because we have 64 KB of data)
 * 3. The last thing is, that even if we take 1D and 2D case we have enough space to store 1024 kernel element.
 *    Then the maximal amount of shared memory is:
 *      * for 1D -> 3 * 1024 -> can use long double, double, float
 *      * for 2D -> 5 * 1024 -> can use double, float
 *      * for 3D -> 9 * 1024 -> can use float
 */

/**
 * 1D convolution with both the data tile and the kernel staged in shared memory.
 *
 * Launch contract (see Convolution< 1, Cuda >::setup): blockDim.x == kernelWidth
 * and the dynamic shared memory holds 2 * kernelWidth data elements followed by
 * kernelWidth kernel elements.
 */
template< typename Index,
          typename Real,
          typename FetchData,
          typename FetchBoundary,
          typename FetchKernel,
          typename Convolve,
          typename Store >
__global__
static void
convolution1D( Index kernelWidth,
               Index endX,
               FetchData fetchData,
               FetchBoundary fetchBoundary,
               FetchKernel fetchKernel,
               Convolve convolve,
               Store store )
{
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   // Shared memory layout: [ 2 * kernelWidth data | kernelWidth kernel ].
   Index kernelOffset = 2 * kernelWidth;

   Real* data = TNL::Cuda::getSharedMemory< Real >();
   Real* kernel = data + kernelOffset;

   Index radius = kernelWidth >> 1;

   // NOTE: every thread of the block must take part in the shared-memory loads
   // and reach __syncthreads(). The previous version returned early for
   // ix >= endX, which left parts of the tile uninitialized in boundary blocks
   // and made the barrier divergent. Out-of-domain threads are masked out only
   // after the barrier.

   // Left halo / interior element. The lhs >= endX check is needed because
   // out-of-domain threads now participate in the loads.
   Index lhs = ix - radius;

   if( lhs < 0 || lhs >= endX ) {
      data[ threadIdx.x ] = fetchBoundary( lhs );
   }
   else {
      data[ threadIdx.x ] = fetchData( lhs );
   }

   // Right halo / interior element
   Index rhs = ix + radius;

   if( rhs < 0 || rhs >= endX ) {
      data[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs );
   }
   else {
      data[ threadIdx.x + blockDim.x ] = fetchData( rhs );
   }

   // Each thread loads one kernel element (blockDim.x == kernelWidth).
   kernel[ threadIdx.x ] = fetchKernel( threadIdx.x );

   __syncthreads();

   if( ix >= endX )
      return;

   Real result = 0;

   #pragma unroll
   for( Index i = 0; i < kernelWidth; i++ ) {
      Index elementIndex = i + threadIdx.x;

      result = convolve( result, data[ elementIndex ], kernel[ i ] );
   }

   store( ix, result );
}

/**
 * 2D convolution with both the data tile and the kernel staged in shared memory.
 *
 * Launch contract (see Convolution< 2, Cuda >::setup): blockDim.{x,y} equal the
 * kernel width/height; the dynamic shared memory holds the data tile followed
 * by the kernel elements.
 */
template< typename Index,
          typename Real,
          typename FetchData,
          typename FetchBoundary,
          typename FetchKernel,
          typename Convolve,
          typename Store >
__global__
static void
convolution2D( Index kernelWidth,
               Index kernelHeight,
               Index endX,
               Index endY,
               FetchData fetchData,
               FetchBoundary fetchBoundary,
               FetchKernel fetchKernel,
               Convolve convolve,
               Store store )
{
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   // Shared memory layout: data tile first, then the convolution kernel.
   Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 );

   Real* data = TNL::Cuda::getSharedMemory< Real >();
   Real* kernel = data + kernelOffset;

   Index radiusY = kernelHeight >> 1;
   Index radiusX = kernelWidth >> 1;

   Index x, y, index;

   // NOTE: every thread of the block must take part in the shared-memory loads
   // and reach __syncthreads(). The previous version returned early for
   // out-of-domain threads, which left parts of the tile uninitialized in
   // boundary blocks and made the barrier divergent. Each load therefore uses
   // the full range predicate; for in-domain threads the added checks can never
   // trigger, so in-domain behavior is unchanged.

   // Each thread loads one kernel element (blockDim == kernel size).
   index = threadIdx.x + threadIdx.y * blockDim.x;

   kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y );

   // Top Left
   x = ix - radiusX;
   y = iy - radiusY;

   if( x < 0 || x >= endX || y < 0 || y >= endY ) {
      data[ index ] = fetchBoundary( x, y );
   }
   else {
      data[ index ] = fetchData( x, y );
   }

   // Top Right
   x = ix + radiusX;
   y = iy - radiusY;

   index = radiusX + threadIdx.x + threadIdx.y * blockDim.x;

   if( x < 0 || x >= endX || y < 0 || y >= endY ) {
      data[ index ] = fetchBoundary( x, y );
   }
   else {
      data[ index ] = fetchData( x, y );
   }

   // Bottom Left
   x = ix - radiusX;
   y = iy + radiusY;

   index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x;

   if( x < 0 || x >= endX || y < 0 || y >= endY ) {
      data[ index ] = fetchBoundary( x, y );
   }
   else {
      data[ index ] = fetchData( x, y );
   }

   // Bottom Right
   x = ix + radiusX;
   y = iy + radiusY;

   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x;

   if( x < 0 || x >= endX || y < 0 || y >= endY ) {
      data[ index ] = fetchBoundary( x, y );
   }
   else {
      data[ index ] = fetchData( x, y );
   }

   __syncthreads();

   // Out-of-domain threads are only masked out after the barrier.
   if( ix >= endX || iy >= endY )
      return;

   Real result = 0;

   #pragma unroll
   for( Index j = 0; j <= radiusY; j++ ) {
      Index elementAlign = ( j + threadIdx.y ) * blockDim.x;
      Index kernelAlign = j * blockDim.x;

   #pragma unroll
      for( Index i = 0; i <= radiusX; i++ ) {
         Index elementIndex = i + threadIdx.x + elementAlign;
         Index kernelIndex = i + kernelAlign;

         result = convolve( result, data[ elementIndex ], kernel[ kernelIndex ] );
      }
   }

   store( ix, iy, result );
}

/**
 * 3D convolution with both the data tile and the kernel staged in shared memory.
 *
 * Launch contract (see Convolution< 3, Cuda >::setup): blockDim.{x,y,z} equal
 * the kernel width/height/depth; the dynamic shared memory holds the data tile
 * followed by the kernel elements.
 *
 * Fixes relative to the previous version:
 *  - the compute loop read data[ index ] (a stale variable from the load
 *    phase) instead of data[ elementIndex ];
 *  - the halo loads used blockDim.y as the row stride while the compute loop
 *    used blockDim.x (wrong for non-cubic blocks), and the "Y:1 X:0" octant
 *    carried a stray radiusX offset;
 *  - out-of-domain threads returned before the shared loads and
 *    __syncthreads(), leaving the tile partially uninitialized in boundary
 *    blocks and making the barrier divergent.
 */
template< typename Index,
          typename Real,
          typename FetchData,
          typename FetchBoundary,
          typename FetchKernel,
          typename Convolve,
          typename Store >
__global__
static void
convolution3D( Index kernelWidth,
               Index kernelHeight,
               Index kernelDepth,
               Index endX,
               Index endY,
               Index endZ,
               FetchData fetchData,
               FetchBoundary fetchBoundary,
               FetchKernel fetchKernel,
               Convolve convolve,
               Store store )
{
   Index iz = threadIdx.z + blockIdx.z * blockDim.z;
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   // Shared memory layout: data tile first, then the convolution kernel.
   Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ) * ( 2 * kernelDepth - 1 );

   Real* data = TNL::Cuda::getSharedMemory< Real >();
   Real* kernel = data + kernelOffset;

   Index radiusZ = kernelDepth >> 1;
   Index radiusY = kernelHeight >> 1;
   Index radiusX = kernelWidth >> 1;

   // Row/plane strides of the shared tile — must match the compute loop below.
   Index rowStride = blockDim.x;
   Index planeStride = blockDim.x * blockDim.y;

   // Each thread loads one kernel element (blockDim == kernel size).
   Index kernelLoadIndex = threadIdx.x + threadIdx.y * rowStride + threadIdx.z * planeStride;

   kernel[ kernelLoadIndex ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z );

   // Load the eight halo octants of the shared data tile. Every thread of the
   // block participates so the tile is fully initialized before the barrier.
   for( Index oz = 0; oz < 2; oz++ ) {
      Index z = oz == 0 ? iz - radiusZ : iz + radiusZ;
      Index zAlign = ( oz * radiusZ + threadIdx.z ) * planeStride;

      for( Index oy = 0; oy < 2; oy++ ) {
         Index y = oy == 0 ? iy - radiusY : iy + radiusY;
         Index yAlign = ( oy * radiusY + threadIdx.y ) * rowStride;

         for( Index ox = 0; ox < 2; ox++ ) {
            Index x = ox == 0 ? ix - radiusX : ix + radiusX;
            Index index = ox * radiusX + threadIdx.x + yAlign + zAlign;

            if( x < 0 || x >= endX || y < 0 || y >= endY || z < 0 || z >= endZ ) {
               data[ index ] = fetchBoundary( x, y, z );
            }
            else {
               data[ index ] = fetchData( x, y, z );
            }
         }
      }
   }

   __syncthreads();

   // Out-of-domain threads are only masked out after the barrier.
   if( ix >= endX || iy >= endY || iz >= endZ )
      return;

   Real result = 0;

   for( Index k = 0; k <= radiusZ; k++ ) {
      Index xyAlign = ( k + threadIdx.z ) * planeStride;
      Index xyKernelAlign = k * planeStride;

      for( Index j = 0; j <= radiusY; j++ ) {
         Index xAlign = ( j + threadIdx.y ) * rowStride;
         Index xKernelAlign = j * rowStride;

         for( Index i = 0; i <= radiusX; i++ ) {
            Index elementIndex = i + threadIdx.x + xAlign + xyAlign;
            Index kernelIndex = i + xKernelAlign + xyKernelAlign;

            // Was data[ index ] — a leftover of the load phase.
            result = convolve( result, data[ elementIndex ], kernel[ kernelIndex ] );
         }
      }
   }

   store( ix, iy, iz, result );
}

template< int Dimension, typename Device >
struct Convolution;

template<>
struct Convolution< 1, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 1, Index >;

   /**
    * Configures block/grid sizes and dynamic shared memory for convolution1D.
    *
    * The kernel addresses 2 * kernelWidth shared data elements (its
    * kernelOffset) plus kernelWidth kernel elements, i.e. 3 * kernelWidth
    * elements in total. The previous formula
    * kernelWidth + ( 2 * kernelWidth - 1 ) was one element short, causing an
    * out-of-bounds shared-memory access.
    */
   template< typename Index, typename Real >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
      configuration.dynamicSharedMemorySize = 3 * kernelSize.x() * sizeof( Real );

      configuration.blockSize.x = kernelSize.x();
      configuration.gridSize.x =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
   }

   /**
    * Launches convolution1D over the whole 1D domain.
    */
   template< typename Index,
             typename Real,
             typename FetchData,
             typename FetchBoundary,
             typename FetchKernel,
             typename Convolve,
             typename Store >
   static void
   execute( const Vector< Index >& dimensions,
            const Vector< Index >& kernelSize,
            FetchData&& fetchData,
            FetchBoundary&& fetchBoundary,
            FetchKernel&& fetchKernel,
            Convolve&& convolve,
            Store&& store )
   {
      TNL::Cuda::LaunchConfiguration configuration;

      setup< Index, Real >( configuration, dimensions, kernelSize );

      constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;

      TNL::Cuda::launchKernel< true >(
         kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store );
   }
};

template<>
struct Convolution< 2, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 2, Index >;

   /**
    * Configures block/grid sizes and dynamic shared memory for convolution2D.
    *
    * Shared memory holds the data tile of ( 2w - 1 ) * ( 2h - 1 ) elements
    * followed by the w * h kernel elements; the block matches the kernel size.
    */
   template< typename Index, typename Real >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
      Index tileElements = 1;
      Index kernelElements = 1;

      for( Index i = 0; i < kernelSize.getSize(); i++ ) {
         tileElements *= 2 * kernelSize[ i ] - 1;
         kernelElements *= kernelSize[ i ];
      }

      configuration.dynamicSharedMemorySize = ( kernelElements + tileElements ) * sizeof( Real );

      configuration.blockSize.x = kernelSize.x();
      configuration.blockSize.y = kernelSize.y();

      configuration.gridSize.x =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
      configuration.gridSize.y =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
   }

   /**
    * Launches convolution2D over the whole 2D domain.
    */
   template< typename Index,
             typename Real,
             typename FetchData,
             typename FetchBoundary,
             typename FetchKernel,
             typename Convolve,
             typename Store >
   static void
   execute( const Vector< Index >& dimensions,
            const Vector< Index >& kernelSize,
            FetchData&& fetchData,
            FetchBoundary&& fetchBoundary,
            FetchKernel&& fetchKernel,
            Convolve&& convolve,
            Store&& store )
   {
      TNL::Cuda::LaunchConfiguration configuration;

      setup< Index, Real >( configuration, dimensions, kernelSize );

      constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;

      TNL::Cuda::launchKernel< true >( kernel,
                                       0,
                                       configuration,
                                       kernelSize.x(),
                                       kernelSize.y(),
                                       dimensions.x(),
                                       dimensions.y(),
                                       fetchData,
                                       fetchBoundary,
                                       fetchKernel,
                                       convolve,
                                       store );
   }
};

template<>
struct Convolution< 3, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 3, Index >;

   /**
    * Configures block/grid sizes and dynamic shared memory for convolution3D.
    *
    * Fix: the z grid size was assigned to gridSize.y, which both clobbered the
    * y dimension and left gridSize.z unset.
    */
   template< typename Index, typename Real >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
      Index kernelElementCount = 1;
      Index kernelVolume = 1;

      for( Index i = 0; i < kernelSize.getSize(); i++ ) {
         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
         kernelVolume *= kernelSize[ i ];
      }

      // Data tile of (2w-1)(2h-1)(2d-1) elements followed by the kernel.
      configuration.dynamicSharedMemorySize = ( kernelVolume + kernelElementCount ) * sizeof( Real );

      configuration.blockSize.x = kernelSize.x();
      configuration.blockSize.y = kernelSize.y();
      configuration.blockSize.z = kernelSize.z();

      configuration.gridSize.x =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
      configuration.gridSize.y =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
      configuration.gridSize.z =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
   }

   /**
    * Launches convolution3D over the whole 3D domain.
    */
   template< typename Index,
             typename Real,
             typename FetchData,
             typename FetchBoundary,
             typename FetchKernel,
             typename Convolve,
             typename Store >
   static void
   execute( const Vector< Index >& dimensions,
            const Vector< Index >& kernelSize,
            FetchData&& fetchData,
            FetchBoundary&& fetchBoundary,
            FetchKernel&& fetchKernel,
            Convolve&& convolve,
            Store&& store )
   {
      TNL::Cuda::LaunchConfiguration configuration;

      setup< Index, Real >( configuration, dimensions, kernelSize );

      constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;

      TNL::Cuda::launchKernel< true >( kernel,
                                       0,
                                       configuration,
                                       kernelSize.x(),
                                       kernelSize.y(),
                                       kernelSize.z(),
                                       dimensions.x(),
                                       dimensions.y(),
                                       dimensions.z(),
                                       fetchData,
                                       fetchBoundary,
                                       fetchKernel,
                                       convolve,
                                       store );
   }
};

#endif
+2 −11
Original line number Diff line number Diff line
@@ -19,12 +19,12 @@ public:
   void
   run( const TNL::Config::ParameterContainer& parameters ) const
   {
      if( ! TNL::Devices::Host::setup( parameters ) || ! TNL::Devices::Cuda::setup( parameters ) )
      if( ! TNL::Devices::Cuda::setup( parameters ) )
         return;

      const TNL::String logFileName = parameters.getParameter< TNL::String >( "log-file" );
      const TNL::String outputMode = parameters.getParameter< TNL::String >( "output-mode" );
      const TNL::String device = parameters.getParameter< TNL::String >( "device" );
   

      const int verbose = parameters.getParameter< int >( "verbose" );
      const int loops = parameters.getParameter< int >( "loops" );
@@ -58,19 +58,10 @@ public:
      config.addEntryEnum( "append" );
      config.addEntryEnum( "overwrite" );

      config.addEntry< TNL::String >( "device", "Device the computation will run on.", "cuda" );
      config.addEntryEnum< TNL::String >( "all" );
      config.addEntryEnum< TNL::String >( "host" );

#ifdef HAVE_CUDA
      config.addEntryEnum< TNL::String >( "cuda" );
#endif

      config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
      config.addEntry< int >( "verbose", "Verbose mode.", 1 );

      config.addDelimiter( "Device settings:" );
      TNL::Devices::Host::configSetup( config );

#ifdef HAVE_CUDA
      TNL::Devices::Cuda::configSetup( config );
+20 −50
Original line number Diff line number Diff line
@@ -23,56 +23,42 @@ public:
   virtual void
   start( TNLBenchmark& benchmark, const TNL::Config::ParameterContainer& parameters ) const override
   {
      Vector start;
      Vector end;
      Vector dimension;
      Vector minKernelSize;
      Vector maxKernelSize;

      for( int i = 0; i < Dimension; i++ ) {
         start[ i ] = parameters.getParameter< int >( minDimensionIds[ i ] );
         end[ i ] = parameters.getParameter< int >( maxDimensionIds[ i ] );
         dimension[ i ] = parameters.getParameter< int >( dimensionIds[ i ] );
         minKernelSize[ i ] = parameters.getParameter< int >( minKernelSizeIds[ i ] );
         maxKernelSize[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] );

         TNL_ASSERT_GT( start[ i ], 1, "Start dimension must be positive integer" );
         TNL_ASSERT_GT( end[ i ], start[ i ], "End dimension must be greater than start dimension" );

         TNL_ASSERT_GE( minKernelSize[ i ], 1, "Minimal kernel size must be a positive number" );
         TNL_ASSERT_EQ( minKernelSize[ i ] % 2, 1, "Minimal kernel size must be odd" );
         TNL_ASSERT_GT( maxKernelSize[ i ], minKernelSize[ i ], "End dimension must be greater than start dimension" );
         TNL_ASSERT_GT( end[ i ], start[ i ], "End kernel size must be greater than start kernel size" );
         TNL_ASSERT_GT( maxKernelSize[ i ], minKernelSize[ i ], "End kernel size must be greater than start kernel size" );
      }

      int dimensionStep = parameters.getParameter< int >( "dimension-step" );
      int kernelStep = parameters.getParameter< int >( "kernel-step" );

      TNL_ASSERT_GT( dimensionStep, 1, "Dimension step must be a positive number" );
      TNL_ASSERT_GT( kernelStep, 0, "Kernel step must be a positive number" );
      TNL_ASSERT_EQ( kernelStep % 2, 0, "Kernel step must be even" );

      TNL::String id = parameters.getParameter< TNL::String >( "id" );

      time( id, benchmark, start, end, dimensionStep, minKernelSize, maxKernelSize, kernelStep );
      time( id, benchmark, dimension, minKernelSize, maxKernelSize, kernelStep );
   }

   virtual void
   time( const TNL::String& id,
         TNLBenchmark& benchmark,
         const Vector& minDimension,
         const Vector& maxDimension,
         const int dimensionStep,
         const Vector& dimension,
         const Vector& minKernelSize,
         const Vector& maxKernelSize,
         const int kernelStep ) const
   {
      Vector currentDimension = minDimension;
      Vector currentKernelSize;

      do {
         currentKernelSize = minKernelSize;
      Vector currentKernelSize = minKernelSize;

      do {
            timeConvolution( id, benchmark, currentDimension, currentKernelSize );
         timeConvolution( id, benchmark, dimension, currentKernelSize );

         currentKernelSize[ 0 ] += kernelStep;

@@ -83,17 +69,6 @@ public:
            }
         }
      } while( currentKernelSize < maxKernelSize );

         currentDimension[ 0 ] *= dimensionStep;

         for( size_t i = 0; i < currentDimension.getSize() - 1; i++ ) {
            if( currentDimension[ i ] >= maxDimension[ i ] ) {
               currentDimension[ i ] = minDimension[ i ];
               currentDimension[ i + 1 ] *= dimensionStep;
            }
         }

      } while( currentDimension < maxDimension );
   }

   void
@@ -148,12 +123,7 @@ public:
      config.addDelimiter( "Grid dimension settings:" );

      for( int i = 0; i < Dimension; i++ )
         config.addEntry< int >( minDimensionIds[ i ], minDimensionIds[ i ], 16 );

      for( int i = 0; i < Dimension; i++ )
         config.addEntry< int >( maxDimensionIds[ i ], maxDimensionIds[ i ], 128 );

      config.addEntry< int >( "dimension-step", "Step of kernel increase by which dimension is multiplied (must be even)", 2 );
         config.addEntry< int >( dimensionIds[ i ], dimensionIds[ i ], 16 );

      config.addDelimiter( "Kernel settings:" );

Loading