diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt
index 50e46776282b3a13845337c26ddb9d83c31751ae..e3b14d8518bc57ee74964fe181eff630d6ef98bc 100644
--- a/src/Benchmarks/CMakeLists.txt
+++ b/src/Benchmarks/CMakeLists.txt
@@ -1,3 +1,4 @@
+add_subdirectory( Convolution )
 add_subdirectory( HeatEquation )
 add_subdirectory( BLAS )
 add_subdirectory( NDArray )
diff --git a/src/Benchmarks/Convolution/.gitignore b/src/Benchmarks/Convolution/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..86d4c2dd380eb2e9e2a050e882f40bc65dc2a48b
--- /dev/null
+++ b/src/Benchmarks/Convolution/.gitignore
@@ -0,0 +1 @@
+generated
diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..31d46cbf70d34245f377bd5309e190629f7a903e
--- /dev/null
+++ b/src/Benchmarks/Convolution/CMakeLists.txt
@@ -0,0 +1,77 @@
+
+function(generate_cuda_executable PREFIX DIMENSION TEMPLATE KERNEL_HEADER)
+
+get_filename_component(MODULE_NAME ${KERNEL_HEADER} NAME_WE)
+get_filename_component(TEMPLATE_NAME ${TEMPLATE} NAME_WE)
+
+if (${BUILD_CUDA})
+   SET(SOURCE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/generated/${MODULE_NAME}_${DIMENSION}_${TEMPLATE_NAME}.cu")
+
+   FILE(READ ${TEMPLATE} TEMPLATE_CONTENT)
+
+   STRING(REGEX REPLACE "DIMENSION_VALUE" ${DIMENSION} TEMPLATE_CONTENT "${TEMPLATE_CONTENT}")
+   STRING(REGEX REPLACE "KERNEL_VALUE" "\"../${KERNEL_HEADER}\"" TEMPLATE_CONTENT "${TEMPLATE_CONTENT}")
+
+   get_filename_component(ABSOLUTE_SUPPORT_PATH ${SOURCE_FILE} ABSOLUTE)
+
+   if(NOT EXISTS ${ABSOLUTE_SUPPORT_PATH})
+      FILE(WRITE ${ABSOLUTE_SUPPORT_PATH} "")
+   endif()
+
+   FILE(READ ${SOURCE_FILE} SOURCE_FILE_CONTENT)
+
+   if ( NOT "${SOURCE_FILE_CONTENT}" STREQUAL "${TEMPLATE_CONTENT}" )
+      FILE(WRITE ${SOURCE_FILE} "${TEMPLATE_CONTENT}")
+   endif()
+
+   SET(EXECUTABLE_NAME "${PREFIX}_${DIMENSION}_${MODULE_NAME}_${TEMPLATE_NAME}")
+
+   CUDA_ADD_EXECUTABLE(${EXECUTABLE_NAME} ${SOURCE_FILE})
+
+   if( PNG_FOUND )
+      target_link_libraries( ${EXECUTABLE_NAME} ${PNG_LIBRARIES} )
+   endif()
+else()
+   MESSAGE(WARNING "Convolutions are not supported on CPU")
+endif()
+
+endfunction()
+
+GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/naive.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/naive.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/naive.h")
+
+GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/naive.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/naive.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/naive.h")
+
+GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/sharedKernel.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/sharedKernel.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/sharedKernel.h")
+
+GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedKernel.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedKernel.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedKernel.h")
+
+GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/sharedData.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/sharedData.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/sharedData.h")
+
+GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedData.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedData.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedData.h")
+
+GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/sharedDataAndKernel.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/sharedDataAndKernel.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/sharedDataAndKernel.h")
+
+GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h")
+
+GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/naive.h")
+GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedData.h")
+GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedKernel.h")
+GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedDataAndKernel.h")
+
+GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/sharedDataAndKernel.h")
diff --git a/src/Benchmarks/Convolution/kernels/naive.h b/src/Benchmarks/Convolution/kernels/naive.h
new file mode 100644
index 0000000000000000000000000000000000000000..5705deb04be7089688bd0212d7c8c517bab56cc5
--- /dev/null
+++ b/src/Benchmarks/Convolution/kernels/naive.h
@@ -0,0 +1,351 @@
+
+#pragma once
+
+#ifdef HAVE_CUDA
+
+   #include <TNL/Devices/Cuda.h>
+   #include <TNL/Containers/StaticVector.h>
+   #include <TNL/Cuda/LaunchHelpers.h>
+
+/**
+ * There are several pitfalls with such configuration.
+ *
+ * 1. At first we don't use shared memory
+ * 2. At second we don't control block size, so we may launch extremely small kernels or otherwise we can launch extremely large
+ * kernels.
+ */
+
+template< int Dimension, typename Device >
+struct Convolution;
+
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename FetchKernel,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution1D( Index kernelWidth,
+               Index endX,
+               FetchData fetchData,
+               FetchBoundary fetchBoundary,
+               FetchKernel fetchKernel,
+               Convolve convolve,
+               Store store )
+{
+   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+   if( ix >= endX )
+      return;
+
+   Index radius = kernelWidth >> 1;
+
+   Real result = 0;
+
+   for( Index i = -radius; i <= radius; i++ ) {
+      Index elementIndex = i + ix;
+      Index kernelIndex = i + radius;
+
+      if( elementIndex < 0 || elementIndex >= endX ) {
+         result = convolve( result, fetchBoundary( elementIndex ), fetchKernel( kernelIndex ) );
+      }
+      else {
+         result = convolve( result, fetchData( elementIndex ), fetchKernel( kernelIndex ) );
+      }
+   }
+
+   store( ix, result );
+}
+
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename FetchKernel,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution2D( Index kernelWidth,
+               Index kernelHeight,
+               Index endX,
+               Index endY,
+               FetchData fetchData,
+               FetchBoundary fetchBoundary,
+               FetchKernel fetchKernel,
+               Convolve convolve,
+               Store store )
+{
+   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
+   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+   if( ix >= endX || iy >= endY )
+      return;
+
+   Index radiusY = kernelHeight >> 1;
+   Index radiusX = kernelWidth >> 1;
+
+   Real result = 0;
+
+   for( Index j = -radiusY; j <= radiusY; j++ ) {
+      Index elementIndexY = j + iy;
+      Index kernelIndexY = j + radiusY;
+
+      for( Index i = -radiusX; i <= radiusX; i++ ) {
+         Index elementIndexX = i + ix;
+         Index kernelIndexX = i + radiusX;
+
+         if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY ) {
+            result =
+               convolve( result, fetchBoundary( elementIndexX, elementIndexY ), fetchKernel( kernelIndexX, kernelIndexY ) );
+         }
+         else {
+            result = convolve( result, fetchData( elementIndexX, elementIndexY ), fetchKernel( kernelIndexX, kernelIndexY ) );
+         }
+      }
+   }
+
+   store( ix, iy, result );
+}
+
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename FetchKernel,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution3D( Index kernelWidth,
+               Index kernelHeight,
+               Index kernelDepth,
+               Index endX,
+               Index endY,
+               Index endZ,
+               FetchData fetchData,
+               FetchBoundary fetchBoundary,
+               FetchKernel fetchKernel,
+               Convolve convolve,
+               Store store )
+{
+   Index iz = threadIdx.z + blockIdx.z * blockDim.z;
+   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
+   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+   if( ix >= endX || iy >= endY || iz >= endZ )
+      return;
+
+   Index radiusZ = kernelDepth >> 1;
+   Index radiusY = kernelHeight >> 1;
+   Index radiusX = kernelWidth >> 1;
+
+   Real result = 0;
+
+   for( Index k = -radiusZ; k <= radiusZ; k++ ) {
+      Index elementIndexZ = k + iz;
+      Index kernelIndexZ = k + radiusZ;
+
+      for( Index j = -radiusY; j <= radiusY; j++ ) {
+         Index elementIndexY = j + iy;
+         Index kernelIndexY = j + radiusY;
+
+         for( Index i = -radiusX; i <= radiusX; i++ ) {
+            Index elementIndexX = i + ix;
+            Index kernelIndexX = i + radiusX;
+
+            if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0
+                || elementIndexZ >= endZ )
+            {
+               result = convolve( result,
+                                  fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ),
+                                  fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) );
+            }
+            else {
+               result = convolve( result,
+                                  fetchData( elementIndexX, elementIndexY, elementIndexZ ),
+                                  fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) );
+            }
+         }
+      }
+   }
+
+   store( ix, iy, iz, result );
+}
+
+template< int Dimension, typename Device >
+struct Convolution;
+
+template<>
+struct Convolution< 1, TNL::Devices::Cuda >
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 1, Index >;
+
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      configuration.dynamicSharedMemorySize = 0;
+
+      // TODO: - Benchmark the best value
+      configuration.blockSize.x = kernelSize.x();
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+   }
+
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+
+      TNL::Cuda::launchKernel< true >(
+         kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store );
+   };
+};
+
+template<>
+struct Convolution< 2, TNL::Devices::Cuda >
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 2, Index >;
+
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      configuration.dynamicSharedMemorySize = 0;
+
+      configuration.blockSize.x = kernelSize.x();
+      configuration.blockSize.y = kernelSize.y();
+
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+      configuration.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
+   }
+
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+
+      TNL::Cuda::launchKernel< true >( kernel,
+                                       0,
+                                       configuration,
+                                       kernelSize.x(),
+                                       kernelSize.y(),
+                                       dimensions.x(),
+                                       dimensions.y(),
+                                       fetchData,
+                                       fetchBoundary,
+                                       fetchKernel,
+                                       convolve,
+                                       store );
+   };
+};
+
+template<>
+struct Convolution< 3, TNL::Devices::Cuda >
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 3, Index >;
+
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      configuration.dynamicSharedMemorySize = 0;
+
+      // TODO: - Benchmark the best value
+      configuration.blockSize.x = kernelSize.x();
+      configuration.blockSize.y = kernelSize.y();
+      configuration.blockSize.z = kernelSize.z();
+
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+      configuration.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
+      configuration.gridSize.z =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
+   }
+
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+
+      TNL::Cuda::launchKernel< true >( kernel,
+                                       0,
+                                       configuration,
+                                       kernelSize.x(),
+                                       kernelSize.y(),
+                                       kernelSize.z(),
+                                       dimensions.x(),
+                                       dimensions.y(),
+                                       dimensions.z(),
+                                       fetchData,
+                                       fetchBoundary,
+                                       fetchKernel,
+                                       convolve,
+                                       store );
+   };
+};
+
+#endif
diff --git a/src/Benchmarks/Convolution/kernels/sharedData.h b/src/Benchmarks/Convolution/kernels/sharedData.h
new file mode 100644
index 0000000000000000000000000000000000000000..f1dfb90086fb3b75fda3b2f325d7d39bb6cee6a9
--- /dev/null
+++ b/src/Benchmarks/Convolution/kernels/sharedData.h
@@ -0,0 +1,544 @@
+#pragma once
+
+#ifdef HAVE_CUDA
+
+/**
+ * This method stores image tile into shared memory
+ * and then calculates convolution.
+ *
+ * Thanks for the idea https://www.evl.uic.edu/sjames/cs525/final.html
+ */
+
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Containers/StaticVector.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Cuda/SharedMemory.h>
+
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename FetchKernel,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution1D( Index kernelWidth,
+               Index endX,
+               FetchData fetchData,
+               FetchBoundary fetchBoundary,
+               FetchKernel fetchKernel,
+               Convolve convolve,
+               Store store )
+{
+   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+   Real* data = TNL::Cuda::getSharedMemory< Real >();
+   Index radius = kernelWidth >> 1;
+
+   // Left
+   Index lhs = ix - radius;
+
+   if( lhs < 0 || lhs >= endX ) {
+      data[ threadIdx.x ] = fetchBoundary( lhs );
+   }
+   else {
+      data[ threadIdx.x ] = fetchData( lhs );
+   }
+
+   // Right
+   Index rhs = ix + radius;
+
+   if( rhs < 0 || rhs >= endX ) {
+      data[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs );
+   }
+   else {
+      data[ threadIdx.x + blockDim.x ] = fetchData( rhs );
+   }
+
+   __syncthreads();
+
+   if( ix >= endX )
+      return;
+
+   Real result = 0;
+
+   #pragma unroll
+   for( Index i = 0; i < kernelWidth; i++ ) {
+      Index elementIndex = i + threadIdx.x;
+
+      result = convolve( result, data[ elementIndex ], fetchKernel( i ) );
+   }
+
+   store( ix, result );
+}
+
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename FetchKernel,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution2D( Index kernelWidth,
+               Index kernelHeight,
+               Index endX,
+               Index endY,
+               FetchData fetchData,
+               FetchBoundary fetchBoundary,
+               FetchKernel fetchKernel,
+               Convolve convolve,
+               Store store )
+{
+   Real* data = TNL::Cuda::getSharedMemory< Real >();
+
+   const Index iy = threadIdx.y + blockIdx.y * blockDim.y;
+   const Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+   const Index radiusY = kernelHeight >> 1;
+   const Index radiusX = kernelWidth >> 1;
+
+   const Index dataBlockWidth = 2 * kernelWidth - 1;
+   const Index dataBlockHeight = 2 * kernelHeight - 1;
+
+   const Index dataBlockRadiusX = dataBlockWidth >> 1;
+   const Index dataBlockRadiusY = dataBlockHeight >> 1;
+
+   Index x, y, index;
+
+   // Top Left
+   x = ix - radiusX;
+   y = iy - radiusY;
+   index = threadIdx.x + threadIdx.y * dataBlockWidth;
+
+   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
+      data[ index ] = fetchBoundary( x, y );
+   }
+   else {
+      data[ index ] = fetchData( x, y );
+   }
+
+   // Top right
+   x = ix + radiusX;
+   y = iy - radiusY;
+   index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth;
+
+   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
+      data[ index ] = fetchBoundary( x, y );
+   }
+   else {
+      data[ index ] = fetchData( x, y );
+   }
+
+   // Bottom Left
+   x = ix - radiusX;
+   y = iy + radiusY;
+   index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth;
+
+   if(x < 0 || y < 0 || x >= endX || y >= endY ) {
+      data[ index ] = fetchBoundary( x, y );
+   }
+   else {
+      data[ index ] = fetchData( x, y );
+   }
+
+   // Bottom Right
+   x = ix + radiusX;
+   y = iy + radiusY;
+   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth;
+
+   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
+      data[ index ] = fetchBoundary( x, y );
+   }
+   else {
+      data[ index ] = fetchData( x, y );
+   }
+
+   __syncthreads();
+
+   if( ix >= endX || iy >= endY )
+      return;
+
+   Real result = 0;
+
+   for( Index j = 0; j < kernelHeight; j++ ) {
+      Index align = ( j + threadIdx.y ) * dataBlockWidth;
+
+      for( Index i = 0; i < kernelWidth; i++ ) {
+         Index index = i + threadIdx.x + align;
+
+         result = convolve( result, data[ index ], fetchKernel( i, j ) );
+      }
+   }
+
+   store( ix, iy, result );
+}
+
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename FetchKernel,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution3D( Index kernelWidth,
+               Index kernelHeight,
+               Index kernelDepth,
+               Index endX,
+               Index endY,
+               Index endZ,
+               FetchData fetchData,
+               FetchBoundary fetchBoundary,
+               FetchKernel fetchKernel,
+               Convolve convolve,
+               Store store )
+{
+   Real* data = TNL::Cuda::getSharedMemory< Real >();
+
+   const Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+   const Index iy = threadIdx.y + blockIdx.y * blockDim.y;
+   const Index iz = threadIdx.z + blockIdx.z * blockDim.z;
+
+   const Index radiusX = kernelWidth >> 1;
+   const Index radiusY = kernelHeight >> 1;
+   const Index radiusZ = kernelDepth >> 1;
+
+   const Index dataBlockWidth = 2 * kernelWidth - 1;
+   const Index dataBlockHeight = 2 * kernelHeight - 1;
+   const Index dataBlockDepth = 2 * kernelDepth - 1;
+
+   const Index dataBlockXYVolume = dataBlockWidth * dataBlockHeight;
+
+   const Index dataBlockRadiusX = dataBlockWidth >> 1;
+   const Index dataBlockRadiusY = dataBlockHeight >> 1;
+   const Index dataBlockRadiusZ = dataBlockDepth >> 1;
+
+   Index x, y, z, index;
+
+   // Z: 0 Y: 0 X: 0
+   x = ix - radiusX;
+   y = iy - radiusY;
+   z = iz - radiusZ;
+
+   index = threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume;
+
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
+      data[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      data[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 0 Y: 0 X: 1
+   x = ix + radiusX;
+   y = iy - radiusY;
+   z = iz - radiusZ;
+
+   index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume;
+
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
+      data[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      data[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 0 Y: 1 X: 0
+   x = ix - radiusX;
+   y = iy + radiusY;
+   z = iz - radiusZ;
+
+   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume;
+
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
+      data[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      data[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 1 Y: 0 X: 0
+   x = ix - radiusX;
+   y = iy - radiusY;
+   z = iz + radiusZ;
+
+   index = threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume;
+
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
+      data[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      data[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 0 Y: 1 X: 1
+   x = ix + radiusX;
+   y = iy + radiusY;
+   z = iz - radiusZ;
+
+   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume;
+
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
+      data[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      data[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 1 Y: 0 X: 1
+   x = ix + radiusX;
+   y = iy - radiusY;
+   z = iz + radiusZ;
+
+   index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume;
+
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
+      data[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      data[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 1 Y: 1 X: 0
+   x = ix - radiusX;
+   y = iy + radiusY;
+   z = iz + radiusZ;
+
+   index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume;
+
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
+      data[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      data[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 1 Y: 1 X: 1
+   x = ix + radiusX;
+   y = iy + radiusY;
+   z = iz + radiusZ;
+
+   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume;
+
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
+      data[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      data[ index ] = fetchData( x, y, z );
+   }
+
+   __syncthreads();
+
+   if( ix >= endX || iy >= endY || iz >= endZ )
+      return;
+
+   Real result = 0;
+
+   for( Index k = 0; k < kernelDepth; k++ ) {
+      Index xyAlign = ( k + threadIdx.z ) * dataBlockXYVolume;
+
+      for( Index j = 0; j < kernelHeight; j++ ) {
+         Index xAlign = ( j + threadIdx.y ) * dataBlockWidth;
+
+         for( Index i = 0; i < kernelWidth; i++ ) {
+            Index index = i + threadIdx.x + xAlign + xyAlign;
+
+            result = convolve( result, data[ index ], fetchKernel( i, j, k ) );
+         }
+      }
+   }
+
+   store( ix, iy, iz, result );
+}
+
+template< int Dimension, typename Device >
+struct Convolution;
+
+template<>
+struct Convolution< 1, TNL::Devices::Cuda >
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 1, Index >;
+
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      Index kernelElementCount = 1;
+
+      for( Index i = 0; i < kernelSize.getSize(); i++ )
+         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1 ;
+
+      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );
+
+      configuration.blockSize.x = kernelSize.x();
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+   }
+
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+
+      TNL::Cuda::launchKernel< true >(
+         kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store );
+   };
+};
+
+template<>
+struct Convolution< 2, TNL::Devices::Cuda >
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 2, Index >;
+
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      Index kernelElementCount = 1;
+
+      for( Index i = 0; i < kernelSize.getSize(); i++ )
+         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
+
+      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );
+
+      configuration.blockSize.x = kernelSize.x();
+      configuration.blockSize.y = kernelSize.y();
+
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+      configuration.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
+   }
+
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+
+      TNL::Cuda::launchKernel< true >( kernel,
+                                       0,
+                                       configuration,
+                                       kernelSize.x(),
+                                       kernelSize.y(),
+                                       dimensions.x(),
+                                       dimensions.y(),
+                                       fetchData,
+                                       fetchBoundary,
+                                       fetchKernel,
+                                       convolve,
+                                       store );
+   };
+};
+
+template<>
+struct Convolution< 3, TNL::Devices::Cuda >
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 3, Index >;
+
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      Index kernelElementCount = 1;
+
+      for( Index i = 0; i < kernelSize.getSize(); i++ )
+         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
+
+      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );
+
+      configuration.blockSize.x = kernelSize.x();
+      configuration.blockSize.y = kernelSize.y();
+      configuration.blockSize.z = kernelSize.z();
+
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+      configuration.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
+      configuration.gridSize.z =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
+   }
+
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+
+      TNL::Cuda::launchKernel< true >( kernel,
+                                       0,
+                                       configuration,
+                                       kernelSize.x(),
+                                       kernelSize.y(),
+                                       kernelSize.z(),
+                                       dimensions.x(),
+                                       dimensions.y(),
+                                       dimensions.z(),
+                                       fetchData,
+                                       fetchBoundary,
+                                       fetchKernel,
+                                       convolve,
+                                       store );
+   };
+};
+
+#endif
diff --git a/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h b/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..b9d094203b8787fade33e15be996fc2a4277c66c
--- /dev/null
+++ b/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h
@@ -0,0 +1,600 @@
+#pragma once
+
+#ifdef HAVE_CUDA
+
+   #include <TNL/Devices/Cuda.h>
+   #include <TNL/Containers/StaticVector.h>
+   #include <TNL/Cuda/LaunchHelpers.h>
+   #include <TNL/Cuda/SharedMemory.h>
+
+/**
+ * This method stores kernel and data in the data memory to reduce amount of loads.
+ *
+ * We can calculate the size of shared memory needed the next way:
+ * 1. We need to store in shared memory:
+ *      * for 1D -> (2 * kernelWidth) - 1 < 2 * kernelWidth
+ *      * for 2D -> ( (2 * kernelWidth) - 1 ) * ( (2 * kernelHeight) - 1 ) < 4 * kernelWidth * kernelHeight
+ *      * for 3D -> ( (2 * kernelWidth) - 1 ) * ( (2 * kernelHeight) - 1 ) * ( (2 * kernelDepth) - 1 ) < 8 * kernelWidth *
+ * kernelHeight * kernelDepth
+ * 2. We take into account, that the maximal block size is 1024, so the maximum volume of kernel is 1024.
+ *    Then the maximal amount of shared memory is:
+ *      * for 1D -> 2 * 1024 -> 2048 elements (Note, that even if we take long double (16B) we still can fit in the shared
+ * memory)
+ *      * for 2D -> 4 * 1024 -> 4096 elements
+ *      * for 3D -> 8 * 1024 -> 8196 elements (Note, that if double takes 8 bytes, then we can't fit tile into shared memory,
+ * because we have 64 KB of data)
+ * 3. The last thing is, that even if we take 1D and 2D case we have enough space to store 1024 kernel element.
+ *    Then the maximal amount of shared memory is:
+ *      * for 1D -> 3 * 1024 -> can use long double, double, float
+ *      * for 2D -> 5 * 1024 -> can use double, float
+ *      * for 3D -> 9 * 1024 -> can use float
+ */
+
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename FetchKernel,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution1D( Index kernelWidth,
+               Index endX,
+               FetchData fetchData,
+               FetchBoundary fetchBoundary,
+               FetchKernel fetchKernel,
+               Convolve convolve,
+               Store store )
+{
+   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+   Index kernelOffset = 2 * kernelWidth - 1;
+
+   Real* data = TNL::Cuda::getSharedMemory< Real >();
+   Real* kernel = data + kernelOffset;
+
+   Index radius = kernelWidth >> 1;
+
+   // Left
+   Index lhs = ix - radius;
+
+   if( lhs < 0 || lhs >= endX ) {
+      data[ threadIdx.x ] = fetchBoundary( lhs );
+   }
+   else {
+      data[ threadIdx.x ] = fetchData( lhs );
+   }
+
+   // Right
+   Index rhs = ix + radius;
+
+   if( rhs < 0 || rhs >= endX ) {
+      data[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs );
+   }
+   else {
+      data[ threadIdx.x + blockDim.x ] = fetchData( rhs );
+   }
+
+   kernel[ threadIdx.x ] = fetchKernel( threadIdx.x );
+
+   __syncthreads();
+
+   if( ix >= endX )
+      return;
+
+   Real result = 0;
+
+   #pragma unroll
+   for( Index i = 0; i < kernelWidth; i++ ) {
+      Index elementIndex = i + threadIdx.x;
+
+      result = convolve( result, data[ elementIndex ], kernel[ i ] );
+   }
+
+   store( ix, result );
+}
+
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename FetchKernel,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution2D( Index kernelWidth,
+               Index kernelHeight,
+               Index endX,
+               Index endY,
+               FetchData fetchData,
+               FetchBoundary fetchBoundary,
+               FetchKernel fetchKernel,
+               Convolve convolve,
+               Store store )
+{
+   const Index iy = threadIdx.y + blockIdx.y * blockDim.y;
+   const Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+   const Index radiusY = kernelHeight >> 1;
+   const Index radiusX = kernelWidth >> 1;
+
+   const Index dataBlockWidth = 2 * kernelWidth - 1;
+   const Index dataBlockHeight = 2 * kernelHeight - 1;
+
+   const Index dataBlockRadiusX = dataBlockWidth >> 1;
+   const Index dataBlockRadiusY = dataBlockHeight >> 1;
+
+   const Index kernelOffset = dataBlockWidth * dataBlockHeight;
+
+   Real* data = TNL::Cuda::getSharedMemory< Real >();
+   Real* kernel = data + kernelOffset;
+
+   Index x, y, index;
+
+   // Top Left
+   x = ix - radiusX;
+   y = iy - radiusY;
+   index = threadIdx.x + threadIdx.y * dataBlockWidth;
+
+   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
+      data[ index ] = fetchBoundary( x, y );
+   }
+   else {
+      data[ index ] = fetchData( x, y );
+   }
+
+   // Top right
+   x = ix + radiusX;
+   y = iy - radiusY;
+   index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth;
+
+   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
+      data[ index ] = fetchBoundary( x, y );
+   }
+   else {
+      data[ index ] = fetchData( x, y );
+   }
+
+   // Bottom Left
+   x = ix - radiusX;
+   y = iy + radiusY;
+   index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth;
+
+   if(x < 0 || y < 0 || x >= endX || y >= endY ) {
+      data[ index ] = fetchBoundary( x, y );
+   }
+   else {
+      data[ index ] = fetchData( x, y );
+   }
+
+   // Bottom Right
+   x = ix + radiusX;
+   y = iy + radiusY;
+   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth;
+
+   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
+      data[ index ] = fetchBoundary( x, y );
+   }
+   else {
+      data[ index ] = fetchData( x, y );
+   }
+
+   index = threadIdx.x + threadIdx.y * blockDim.x;
+
+   kernel[index] = fetchKernel( threadIdx.x, threadIdx.y );
+
+   __syncthreads();
+
+    if( ix >= endX || iy >= endY )
+      return;
+
+   Real result = 0;
+
+   #pragma unroll
+   for( Index j = 0; j < kernelHeight; j++ ) {
+      Index elementAlign = ( j + threadIdx.y ) * dataBlockWidth;
+      Index kernelAlign = j * blockDim.x;
+
+   #pragma unroll
+      for( Index i = 0; i < kernelWidth; i++ ) {
+         Index elementIndex = i + threadIdx.x + elementAlign;
+         Index kernelIndex = i + kernelAlign;
+
+         result = convolve( result, data[ elementIndex ], kernel[ kernelIndex ] );
+      }
+   }
+
+   store( ix, iy, result );
+}
+
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename FetchKernel,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution3D( Index kernelWidth,
+               Index kernelHeight,
+               Index kernelDepth,
+               Index endX,
+               Index endY,
+               Index endZ,
+               FetchData fetchData,
+               FetchBoundary fetchBoundary,
+               FetchKernel fetchKernel,
+               Convolve convolve,
+               Store store )
+{
+   const Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+   const Index iy = threadIdx.y + blockIdx.y * blockDim.y;
+   const Index iz = threadIdx.z + blockIdx.z * blockDim.z;
+
+   const Index radiusX = kernelWidth >> 1;
+   const Index radiusY = kernelHeight >> 1;
+   const Index radiusZ = kernelDepth >> 1;
+
+   const Index dataBlockWidth = 2 * kernelWidth - 1;
+   const Index dataBlockHeight = 2 * kernelHeight - 1;
+   const Index dataBlockDepth = 2 * kernelDepth - 1;
+
+   const Index dataBlockXYVolume = dataBlockWidth * dataBlockHeight;
+
+   const Index dataBlockRadiusX = dataBlockWidth >> 1;
+   const Index dataBlockRadiusY = dataBlockHeight >> 1;
+   const Index dataBlockRadiusZ = dataBlockDepth >> 1;
+
+   const Index kernelOffset = dataBlockWidth * dataBlockHeight * dataBlockDepth;
+
+   Real* data = TNL::Cuda::getSharedMemory< Real >();
+   Real* kernel = data + kernelOffset;
+
+   Index x, y, z, index;
+
+   // Z: 0 Y: 0 X: 0
+   x = ix - radiusX;
+   y = iy - radiusY;
+   z = iz - radiusZ;
+
+   index = threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume;
+
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
+      data[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      data[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 0 Y: 0 X: 1
+   x = ix + radiusX;
+   y = iy - radiusY;
+   z = iz - radiusZ;
+
+   index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume;
+
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
+      data[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      data[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 0 Y: 1 X: 0
+   x = ix - radiusX;
+   y = iy + radiusY;
+   z = iz - radiusZ;
+
+   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume;
+
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
+      data[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      data[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 1 Y: 0 X: 0
+   x = ix - radiusX;
+   y = iy - radiusY;
+   z = iz + radiusZ;
+
+   index = threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume;
+
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
+      data[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      data[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 0 Y: 1 X: 1
+   x = ix + radiusX;
+   y = iy + radiusY;
+   z = iz - radiusZ;
+
+   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume;
+
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
+      data[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      data[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 1 Y: 0 X: 1
+   x = ix + radiusX;
+   y = iy - radiusY;
+   z = iz + radiusZ;
+
+   index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume;
+
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
+      data[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      data[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 1 Y: 1 X: 0
+   x = ix - radiusX;
+   y = iy + radiusY;
+   z = iz + radiusZ;
+
+   index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume;
+
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
+      data[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      data[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 1 Y: 1 X: 1
+   x = ix + radiusX;
+   y = iy + radiusY;
+   z = iz + radiusZ;
+
+   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume;
+
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
+      data[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      data[ index ] = fetchData( x, y, z );
+   }
+
+   index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
+
+   kernel[index] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z );
+
+   __syncthreads();
+
+   if( ix >= endX || iy >= endY || iz >= endZ )
+      return;
+
+   Real result = 0;
+
+   #pragma unroll
+   for( Index k = 0; k < kernelDepth; k++ ) {
+      Index xyAlign = ( k + threadIdx.z ) * dataBlockXYVolume;
+      Index xyKernelAlign = k * blockDim.x * blockDim.y;
+   #pragma unroll
+      for( Index j = 0; j < kernelHeight; j++ ) {
+         Index xAlign = ( j + threadIdx.y ) * dataBlockWidth;
+         Index xKernelAlign = j * blockDim.x;
+   #pragma unroll
+         for( Index i = 0; i < kernelWidth; i++ ) {
+            Index elementIndex = i + threadIdx.x + xAlign + xyAlign;
+            Index kernelIndex = i + xKernelAlign + xyKernelAlign;
+
+            result = convolve( result, data[ elementIndex ], kernel[ kernelIndex ] );
+         }
+      }
+   }
+
+   store( ix, iy, iz, result );
+}
+
+template< int Dimension, typename Device >
+struct Convolution;
+
+template<>
+struct Convolution< 1, TNL::Devices::Cuda >
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 1, Index >;
+
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      Index kernelElementCount = 1;
+
+      for( Index i = 0; i < kernelSize.getSize(); i++ )
+         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
+
+      configuration.dynamicSharedMemorySize = ( kernelSize.x() + kernelElementCount ) * sizeof( Real );
+
+      configuration.blockSize.x = kernelSize.x();
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+   }
+
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+
+      cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared);
+
+      TNL::Cuda::launchKernel< true >(
+         kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store );
+   };
+};
+
+template<>
+struct Convolution< 2, TNL::Devices::Cuda >
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 2, Index >;
+
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      Index kernelElementCount = 1;
+      Index kernelVolume = 1;
+
+      for( Index i = 0; i < kernelSize.getSize(); i++ ) {
+         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
+         kernelVolume *= kernelSize[ i ];
+      }
+
+      configuration.dynamicSharedMemorySize = ( kernelVolume + kernelElementCount ) * sizeof( Real );
+
+      configuration.blockSize.x = kernelSize.x();
+      configuration.blockSize.y = kernelSize.y();
+
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+      configuration.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
+   }
+
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+
+      cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared);
+
+      TNL::Cuda::launchKernel< true >( kernel,
+                                       0,
+                                       configuration,
+                                       kernelSize.x(),
+                                       kernelSize.y(),
+                                       dimensions.x(),
+                                       dimensions.y(),
+                                       fetchData,
+                                       fetchBoundary,
+                                       fetchKernel,
+                                       convolve,
+                                       store );
+   };
+};
+
+template<>
+struct Convolution< 3, TNL::Devices::Cuda >
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 3, Index >;
+
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      Index kernelElementCount = 1;
+      Index kernelVolume = 1;
+
+      for( Index i = 0; i < kernelSize.getSize(); i++ ) {
+         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
+         kernelVolume *= kernelSize[ i ];
+      }
+
+      configuration.dynamicSharedMemorySize = ( kernelVolume + kernelElementCount ) * sizeof( Real );
+
+      configuration.blockSize.x = kernelSize.x();
+      configuration.blockSize.y = kernelSize.y();
+      configuration.blockSize.z = kernelSize.z();
+
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+      configuration.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
+      configuration.gridSize.z =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
+   }
+
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+
+      cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared);
+
+      TNL::Cuda::launchKernel< true >( kernel,
+                                       0,
+                                       configuration,
+                                       kernelSize.x(),
+                                       kernelSize.y(),
+                                       kernelSize.z(),
+                                       dimensions.x(),
+                                       dimensions.y(),
+                                       dimensions.z(),
+                                       fetchData,
+                                       fetchBoundary,
+                                       fetchKernel,
+                                       convolve,
+                                       store );
+   };
+};
+
+#endif
diff --git a/src/Benchmarks/Convolution/kernels/sharedKernel.h b/src/Benchmarks/Convolution/kernels/sharedKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba98efe736cde1246c1b4f84aaeefaebfa22cf19
--- /dev/null
+++ b/src/Benchmarks/Convolution/kernels/sharedKernel.h
@@ -0,0 +1,394 @@
+
+#pragma once
+
+#ifdef HAVE_CUDA
+
+   #include <TNL/Devices/Cuda.h>
+   #include <TNL/Containers/StaticVector.h>
+   #include <TNL/Cuda/LaunchHelpers.h>
+   #include <TNL/Cuda/SharedMemory.h>
+
+/**
+ * This method stores kernel in the shared memory to reduce amount of loads.
+ */
+
+template< int Dimension, typename Device >
+struct Convolution;
+
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename FetchKernel,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution1D( Index kernelWidth,
+               Index endX,
+               FetchData fetchData,
+               FetchBoundary fetchBoundary,
+               FetchKernel fetchKernel,
+               Convolve convolve,
+               Store store )
+{
+   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+   Real* shared = TNL::Cuda::getSharedMemory< Real >();
+
+   Index radius = kernelWidth >> 1;
+
+   // The size of the block is equal to the kernel size
+   shared[ threadIdx.x ] = fetchKernel( threadIdx.x );
+
+   __syncthreads();
+
+   if( ix >= endX )
+      return;
+
+   Real result = 0;
+
+   #pragma unroll
+   for( Index i = -radius; i <= radius; i++ ) {
+      Index elementIndex = i + ix;
+      Index kernelIndex = i + radius;
+
+      if( elementIndex < 0 || elementIndex >= endX ) {
+         result = convolve( result, fetchBoundary( elementIndex ), shared[ kernelIndex ] );
+      }
+      else {
+         result = convolve( result, fetchData( elementIndex ), shared[ kernelIndex ] );
+      }
+   }
+
+   store( ix, result );
+}
+
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename FetchKernel,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution2D( Index kernelWidth,
+               Index kernelHeight,
+               Index endX,
+               Index endY,
+               FetchData fetchData,
+               FetchBoundary fetchBoundary,
+               FetchKernel fetchKernel,
+               Convolve convolve,
+               Store store )
+{
+   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
+   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+   Real* shared = TNL::Cuda::getSharedMemory< Real >();
+
+   Index radiusY = kernelHeight >> 1;
+   Index radiusX = kernelWidth >> 1;
+
+   Index threadIndex = threadIdx.x + blockDim.x * threadIdx.y;
+
+   // The size of the block is equal to the kernel size
+   shared[ threadIndex ] = fetchKernel( threadIdx.x, threadIdx.y );
+
+   __syncthreads();
+
+   if( ix >= endX || iy >= endY )
+      return;
+
+   Real result = 0;
+
+   #pragma unroll
+   for( Index j = -radiusY; j <= radiusY; j++ ) {
+      Index elementIndexY = j + iy;
+      Index kernelIndexY = j + radiusY;
+
+      #pragma unroll
+      for( Index i = -radiusX; i <= radiusX; i++ ) {
+         Index elementIndexX = i + ix;
+         Index kernelIndexX = i + radiusX;
+
+         Index threadIndex = kernelIndexX + kernelWidth * kernelIndexY;
+
+         if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY ) {
+            result = convolve( result, fetchBoundary( elementIndexX, elementIndexY ), shared[ threadIndex ] );
+         }
+         else {
+            result = convolve( result, fetchData( elementIndexX, elementIndexY ), shared[ threadIndex ] );
+         }
+      }
+   }
+
+   store( ix, iy, result );
+}
+
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename FetchKernel,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution3D( Index kernelWidth,
+               Index kernelHeight,
+               Index kernelDepth,
+               Index endX,
+               Index endY,
+               Index endZ,
+               FetchData fetchData,
+               FetchBoundary fetchBoundary,
+               FetchKernel fetchKernel,
+               Convolve convolve,
+               Store store )
+{
+   Index iz = threadIdx.z + blockIdx.z * blockDim.z;
+   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
+   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+   Real* shared = TNL::Cuda::getSharedMemory< Real >();
+
+   Index radiusZ = kernelDepth >> 1;
+   Index radiusY = kernelHeight >> 1;
+   Index radiusX = kernelWidth >> 1;
+
+   Index threadIndex = threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z;
+
+   // The size of the block is equal to the kernel size
+   shared[ threadIndex ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z );
+
+   __syncthreads();
+
+   if( ix >= endX || iy >= endY || iz >= endZ )
+      return;
+
+   Real result = 0;
+
+   #pragma unroll
+   for( Index k = -radiusZ; k <= radiusZ; k++ ) {
+      Index elementIndexZ = k + iz;
+      Index kernelIndexZ = k + radiusZ;
+
+      #pragma unroll
+      for( Index j = -radiusY; j <= radiusY; j++ ) {
+         Index elementIndexY = j + iy;
+         Index kernelIndexY = j + radiusY;
+
+         #pragma unroll
+         for( Index i = -radiusX; i <= radiusX; i++ ) {
+            Index elementIndexX = i + ix;
+            Index kernelIndexX = i + radiusX;
+
+            Index threadIndex = kernelIndexX + kernelWidth * kernelIndexY + kernelWidth * kernelHeight * kernelIndexZ;
+
+            if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0
+                || elementIndexZ >= endZ )
+            {
+               result = convolve( result, fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), shared[ threadIndex ] );
+            }
+            else {
+               result = convolve( result, fetchData( elementIndexX, elementIndexY, elementIndexZ ), shared[ threadIndex ] );
+            }
+         }
+      }
+   }
+
+   store( ix, iy, iz, result );
+}
+
+template<>
+struct Convolution< 1, TNL::Devices::Cuda >
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 1, Index >;
+
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      Index kernelElementCount = 1;
+
+      for( Index i = 0; i < kernelSize.getSize(); i++ )
+         kernelElementCount *= kernelSize[ i ];
+
+      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );
+
+      configuration.blockSize.x = kernelSize.x();
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+   }
+
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+
+      cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared);
+
+      TNL::Cuda::launchKernel< true >(
+         kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store );
+   };
+};
+
+template<>
+struct Convolution< 2, TNL::Devices::Cuda >
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 2, Index >;
+
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      Index kernelElementCount = 1;
+
+      for( Index i = 0; i < kernelSize.getSize(); i++ )
+         kernelElementCount *= kernelSize[ i ];
+
+      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );
+
+      configuration.blockSize.x = kernelSize.x();
+      configuration.blockSize.y = kernelSize.y();
+
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+      configuration.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
+   }
+
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+
+      cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared);
+
+      TNL::Cuda::launchKernel< true >( kernel,
+                                       0,
+                                       configuration,
+                                       kernelSize.x(),
+                                       kernelSize.y(),
+                                       dimensions.x(),
+                                       dimensions.y(),
+                                       fetchData,
+                                       fetchBoundary,
+                                       fetchKernel,
+                                       convolve,
+                                       store );
+   };
+};
+
+template<>
+struct Convolution< 3, TNL::Devices::Cuda >
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 3, Index >;
+
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      Index kernelElementCount = 1;
+
+      for( Index i = 0; i < kernelSize.getSize(); i++ )
+         kernelElementCount *= kernelSize[ i ];
+
+      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );
+
+      configuration.blockSize.x = kernelSize.x();
+      configuration.blockSize.y = kernelSize.y();
+      configuration.blockSize.z = kernelSize.z();
+
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+      configuration.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
+      configuration.gridSize.z =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
+   }
+
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+
+      cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared);
+
+      TNL::Cuda::launchKernel< true >( kernel,
+                                       0,
+                                       configuration,
+                                       kernelSize.x(),
+                                       kernelSize.y(),
+                                       kernelSize.z(),
+                                       dimensions.x(),
+                                       dimensions.y(),
+                                       dimensions.z(),
+                                       fetchData,
+                                       fetchBoundary,
+                                       fetchKernel,
+                                       convolve,
+                                       store );
+   };
+};
+
+#endif
diff --git a/src/Benchmarks/Convolution/support/Benchmark.h b/src/Benchmarks/Convolution/support/Benchmark.h
new file mode 100644
index 0000000000000000000000000000000000000000..b489000d4a952ae93cc6e27e5252fd4859d47a2a
--- /dev/null
+++ b/src/Benchmarks/Convolution/support/Benchmark.h
@@ -0,0 +1,71 @@
+
+#pragma once
+
+#include <TNL/Config/parseCommandLine.h>
+
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+#include <TNL/Benchmarks/Benchmarks.h>
+#include <TNL/Containers/StaticVector.h>
+#include <TNL/Containers/Array.h>
+
+template< int Dimension, typename Device >
+class Benchmark
+{
+public:
+   using TNLBenchmark = typename TNL::Benchmarks::Benchmark<>;
+
+   void
+   run( const TNL::Config::ParameterContainer& parameters ) const
+   {
+      if( ! TNL::Devices::Cuda::setup( parameters ) )
+         return;
+
+      const TNL::String logFileName = parameters.getParameter< TNL::String >( "log-file" );
+      const TNL::String outputMode = parameters.getParameter< TNL::String >( "output-mode" );
+   
+
+      const int verbose = parameters.getParameter< int >( "verbose" );
+      const int loops = parameters.getParameter< int >( "loops" );
+
+      auto mode = std::ios::out;
+
+      if( outputMode == "append" )
+         mode |= std::ios::app;
+
+      std::ofstream logFile( logFileName.getString(), mode );
+
+      TNLBenchmark benchmark( logFile, loops, verbose );
+
+      std::map< std::string, std::string > metadata = TNL::Benchmarks::getHardwareMetadata();
+      TNL::Benchmarks::writeMapAsJson( metadata, logFileName, ".metadata.json" );
+
+      start(benchmark, parameters);
+   }
+
+   virtual void start( TNLBenchmark& benchmark, const TNL::Config::ParameterContainer& parameters) const {
+      TNL_ASSERT_TRUE(false, "Should be overriden");
+   }
+
+   virtual TNL::Config::ConfigDescription makeInputConfig() const {
+      TNL::Config::ConfigDescription config;
+
+      config.addDelimiter( "Benchmark settings:" );
+      config.addEntry< TNL::String >( "id", "Identifier of the run", "unknown" );
+      config.addEntry< TNL::String >( "log-file", "Log file name.", "output.log" );
+      config.addEntry< TNL::String >( "output-mode", "Mode for opening the log file.", "overwrite" );
+      config.addEntryEnum( "append" );
+      config.addEntryEnum( "overwrite" );
+
+      config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
+      config.addEntry< int >( "verbose", "Verbose mode.", 1 );
+
+      config.addDelimiter( "Device settings:" );
+
+#ifdef HAVE_CUDA
+      TNL::Devices::Cuda::configSetup( config );
+#endif
+      return config;
+   }
+};
diff --git a/src/Benchmarks/Convolution/support/DummyBenchmark.h b/src/Benchmarks/Convolution/support/DummyBenchmark.h
new file mode 100644
index 0000000000000000000000000000000000000000..f8e2d4ae8113adac6849fb5ca05d80c9a1484cfd
--- /dev/null
+++ b/src/Benchmarks/Convolution/support/DummyBenchmark.h
@@ -0,0 +1,140 @@
+
+#pragma once
+
+#include "Benchmark.h"
+#include "DummyTask.h"
+
+static std::vector< TNL::String > minDimensionIds = { "min-x-dimension", "min-y-dimension", "min-z-dimension" };
+static std::vector< TNL::String > dimensionIds = { "x-dimension", "y-dimension", "z-dimension" };
+static std::vector< TNL::String > maxDimensionIds = { "max-x-dimension", "max-y-dimension", "max-z-dimension" };
+static std::vector< TNL::String > minKernelSizeIds = { "min-kernel-width", "min-kernel-height", "min-kernel-depth" };
+static std::vector< TNL::String > kernelSizeIds = { "x-kernelSize", "y-kernelSize", "z-kernelSize" };
+static std::vector< TNL::String > maxKernelSizeIds = { "max-kernel-width", "max-kernel-height", "max-kernel-depth" };
+
+template< int Dimension, typename Device >
+class DummyBenchmark : public Benchmark< Dimension, Device >
+{
+public:
+   using Vector = TNL::Containers::StaticVector< Dimension, int >;
+   using DataStore = TNL::Containers::Vector< float, Device, int >;
+   using Base = Benchmark< Dimension, Device >;
+   using TNLBenchmark = typename Base::TNLBenchmark;
+
+   virtual void
+   start( TNLBenchmark& benchmark, const TNL::Config::ParameterContainer& parameters ) const override
+   {
+      Vector dimension;
+      Vector minKernelSize;
+      Vector maxKernelSize;
+
+      for( int i = 0; i < Dimension; i++ ) {
+         dimension[ i ] = parameters.getParameter< int >( dimensionIds[ i ] );
+         minKernelSize[ i ] = parameters.getParameter< int >( minKernelSizeIds[ i ] );
+         maxKernelSize[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] );
+
+         TNL_ASSERT_GE( minKernelSize[ i ], 1, "Minimal kernel size must be a positive number" );
+         TNL_ASSERT_EQ( minKernelSize[ i ] % 2, 1, "Minimal kernel size must be odd" );
+         TNL_ASSERT_GT( maxKernelSize[ i ], minKernelSize[ i ], "End kernel size must be greater than start kernel size" );
+      }
+
+      int kernelStep = parameters.getParameter< int >( "kernel-step" );
+
+      TNL_ASSERT_GT( kernelStep, 0, "Kernel step must be a positive number" );
+      TNL_ASSERT_EQ( kernelStep % 2, 0, "Kernel step must be even" );
+
+      TNL::String id = parameters.getParameter< TNL::String >( "id" );
+
+      time( id, benchmark, dimension, minKernelSize, maxKernelSize, kernelStep );
+   }
+
+   virtual void
+   time( const TNL::String& id,
+         TNLBenchmark& benchmark,
+         const Vector& dimension,
+         const Vector& minKernelSize,
+         const Vector& maxKernelSize,
+         const int kernelStep ) const
+   {
+      Vector currentKernelSize = minKernelSize;
+
+      do {
+         timeConvolution( id, benchmark, dimension, currentKernelSize );
+
+         currentKernelSize[ 0 ] += kernelStep;
+
+         for( size_t i = 0; i < currentKernelSize.getSize() - 1; i++ ) {
+            if( currentKernelSize[ i ] >= maxKernelSize[ i ] ) {
+               currentKernelSize[ i ] = minKernelSize[ i ];
+               currentKernelSize[ i + 1 ] += kernelStep;
+            }
+         }
+      } while( currentKernelSize < maxKernelSize );
+   }
+
+   void
+   timeConvolution( const TNL::String& id, TNLBenchmark& benchmark, const Vector& dimension, const Vector& kernelSize ) const
+   {
+      auto device = TNL::getType< Device >();
+
+      typename TNLBenchmark::MetadataColumns columns = { { "id", id } };
+
+      size_t elementsCount = 1;
+      size_t kernelElementsCount = 1;
+
+      for( size_t i = 0; i < dimension.getSize(); i++ ) {
+         elementsCount *= dimension[ i ];
+         kernelElementsCount *= kernelSize[ i ];
+
+         columns.push_back( { dimensionIds[ i ], TNL::convertToString( dimension[ i ] ) } );
+         columns.push_back( { kernelSizeIds[ i ], TNL::convertToString( kernelSize[ i ] ) } );
+      }
+
+      benchmark.setDatasetSize( ( elementsCount * 4 ) / 1.e9, 1.0 );
+      benchmark.setMetadataColumns( columns );
+
+      // Setup input data
+      DataStore input, result, kernel;
+
+      input.resize( elementsCount );
+      result.resize( elementsCount );
+      kernel.resize( kernelElementsCount );
+
+      input = 1;
+      result = 1;
+      kernel = 1;
+
+      auto inputView = input.getConstView();
+      auto kernelView = kernel.getConstView();
+      auto resultView = result.getView();
+
+      auto measure = [ & ]()
+      {
+         DummyTask< int, float, Dimension, Device >::exec( dimension, kernelSize, inputView, resultView, kernelView );
+      };
+
+      benchmark.template time< Device >( device, measure );
+   }
+
+   TNL::Config::ConfigDescription
+   makeInputConfig() const override
+   {
+      TNL::Config::ConfigDescription config = Base::makeInputConfig();
+
+      config.addDelimiter( "Grid dimension settings:" );
+
+      for( int i = 0; i < Dimension; i++ )
+         config.addEntry< int >( dimensionIds[ i ], dimensionIds[ i ], 16 );
+
+      config.addDelimiter( "Kernel settings:" );
+
+      for( int i = 0; i < Dimension; i++ )
+         config.addEntry< int >( minKernelSizeIds[ i ], minKernelSizeIds[ i ] + " (odd) :", 1 );
+
+      for( int i = 0; i < Dimension; i++ )
+         config.addEntry< int >( maxKernelSizeIds[ i ], maxKernelSizeIds[ i ] + " (odd) :", 11 );
+
+      config.addEntry< int >( "kernel-step", "Step of kernel increase which is added to kernel (must be even)", 2 );
+
+      return config;
+   }
+};
diff --git a/src/Benchmarks/Convolution/support/DummySolver.h b/src/Benchmarks/Convolution/support/DummySolver.h
new file mode 100644
index 0000000000000000000000000000000000000000..2b1e60041236d6127b7a74483d3c8fb384d358e7
--- /dev/null
+++ b/src/Benchmarks/Convolution/support/DummySolver.h
@@ -0,0 +1,89 @@
+
+#pragma once
+
+#include "Solver.h"
+#include "DummyTask.h"
+
+static std::vector< TNL::String > dimensionIds = { "x-dimension", "y-dimension", "z-dimension" };
+static std::vector< TNL::String > kernelSizeIds = { "x-kernel-size", "y-kernel-size", "z-kernel-size" };
+
+template< int Dimension, typename Device >
+class DummySolver : public Solver< Dimension, Device >
+{
+public:
+   using Base = Solver< Dimension, Device >;
+   using Vector = TNL::Containers::StaticVector< Dimension, int >;
+   using DataStore = TNL::Containers::Vector< float, Device, int >;
+
+   virtual void
+   start( const TNL::Config::ParameterContainer& parameters ) const override
+   {
+      Vector dimensions;
+      Vector kernelSize;
+
+      for( int i = 0; i < Dimension; i++ ) {
+         dimensions[ i ] = parameters.getParameter< int >( dimensionIds[ i ] );
+         kernelSize[ i ] = parameters.getParameter< int >( kernelSizeIds[ i ] );
+
+         TNL_ASSERT_GT( dimensions[ i ], 1, "Start dimension must be positive integer" );
+
+         TNL_ASSERT_GE( kernelSize[ i ], 1, "Minimal kernel size must be a positive number" );
+         TNL_ASSERT_EQ( kernelSize[ i ] % 2, 1, "Minimal kernel size must be odd" );
+      }
+
+      launchConvolution( dimensions, kernelSize );
+   }
+
+   void
+   launchConvolution( const Vector& dimension, const Vector& kernelSize ) const
+   {
+      DataStore input, result, kernel;
+
+      size_t elementsCount = 1;
+      size_t kernelElementsCount = 1;
+
+      for( size_t i = 0; i < (size_t) dimension.getSize(); i++ ) {
+         elementsCount *= dimension[ i ];
+         kernelElementsCount *= kernelSize[ i ];
+      }
+
+      input.resize( elementsCount );
+      result.resize( elementsCount );
+      kernel.resize( kernelElementsCount );
+
+      input = 1;
+      result = 1;
+      kernel = 1;
+
+      auto inputView = input.getConstView();
+      auto kernelView = kernel.getConstView();
+      auto resultView = result.getView();
+
+      DummyTask<int, float, Dimension, Device>::exec(dimension, kernelSize, inputView, resultView, kernelView);
+
+      TNL::Containers::Array< float, TNL::Devices::Host, int > host(result);
+
+      for (int i = 0; i < host.getSize(); i++)
+         TNL_ASSERT_EQ(host[i], kernelElementsCount, "Dummy task always sets volume of kernel");
+
+      std::cout << "Everything is fine" << std::endl;
+   }
+
+   virtual TNL::Config::ConfigDescription
+   makeInputConfig() const override
+   {
+      TNL::Config::ConfigDescription config = Base::makeInputConfig();
+
+      config.addDelimiter( "Grid dimension settings:" );
+
+      for( int i = 0; i < Dimension; i++ )
+         config.addEntry< int >( dimensionIds[ i ], dimensionIds[ i ], 64 );
+
+      config.addDelimiter( "Kernel settings:" );
+
+      for( int i = 0; i < Dimension; i++ )
+         config.addEntry< int >( kernelSizeIds[ i ], kernelSizeIds[ i ] + " (odd) :", 9 );
+
+      return config;
+   }
+};
diff --git a/src/Benchmarks/Convolution/support/DummyTask.h b/src/Benchmarks/Convolution/support/DummyTask.h
new file mode 100644
index 0000000000000000000000000000000000000000..07a58e98dc026565bea1f3e27ed95b395a7780ee
--- /dev/null
+++ b/src/Benchmarks/Convolution/support/DummyTask.h
@@ -0,0 +1,187 @@
+
+#pragma once
+
+template< int Dimension, typename Device >
+struct Convolution
+{
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< Dimension, Index >;
+
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store );
+};
+
+template< typename Index, typename Real, int Dimension, typename Device >
+struct DummyTask;
+
+template< typename Index, typename Real >
+struct DummyTask< Index, Real, 1, TNL::Devices::Cuda >
+{
+public:
+   static constexpr int Dimension = 1;
+   using Device = TNL::Devices::Cuda;
+   using Vector = TNL::Containers::StaticVector< Dimension, Index >;
+   using ConstDataStore = typename TNL::Containers::Vector< Real, Device, Index >::ConstViewType;
+   using DataStore = typename TNL::Containers::Vector< Real, Device, Index >::ViewType;
+   using ConvolutionLauncher = Convolution< Dimension, Device >;
+
+   static void
+   exec( const Vector& dimensions, const Vector& kernelSize, ConstDataStore& input, DataStore& result, ConstDataStore& kernel, int boundaryValue = 1 )
+   {
+      auto fetchData = [ = ] __cuda_callable__( Index i )
+      {
+         return input[ i ];
+      };
+
+      auto fetchBoundary = [ = ] __cuda_callable__( Index i )
+      {
+         return boundaryValue;
+      };
+
+      auto fetchKernel = [ = ] __cuda_callable__( Index i )
+      {
+         return kernel[ i ];
+      };
+
+      auto convolve = [ = ] __cuda_callable__( Real result, Real data, Real kernel )
+      {
+         return result + data * kernel;
+      };
+
+      auto store = [ = ] __cuda_callable__( Index i, Real resultValue ) mutable
+      {
+         result[ i ] = resultValue;
+      };
+
+      ConvolutionLauncher::execute< Index, Real >( dimensions,
+                                                   kernelSize,
+                                                   std::forward< decltype( fetchData ) >( fetchData ),
+                                                   std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
+                                                   std::forward< decltype( fetchKernel ) >( fetchKernel ),
+                                                   std::forward< decltype( convolve ) >( convolve ),
+                                                   std::forward< decltype( store ) >( store ) );
+   }
+};
+
+template< typename Index, typename Real >
+struct DummyTask< Index, Real, 2, TNL::Devices::Cuda >
+{
+public:
+   static constexpr int Dimension = 2;
+   using Device = TNL::Devices::Cuda;
+   using Vector = TNL::Containers::StaticVector< Dimension, Index >;
+   using ConstDataStore = typename TNL::Containers::Vector< Real, Device, Index >::ConstViewType;
+   using DataStore = typename TNL::Containers::Vector< Real, Device, Index >::ViewType;
+   using ConvolutionLauncher = Convolution< Dimension, Device >;
+
+   static void
+   exec( const Vector& dimensions, const Vector& kernelSize, ConstDataStore& input, DataStore& result, ConstDataStore& kernel, int boundaryValue = 1 )
+   {
+      auto fetchData = [ = ] __cuda_callable__( Index i, Index j )
+      {
+         auto index = i + j * dimensions.x();
+
+         return input[ index ];
+      };
+
+      auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j )
+      {
+         return boundaryValue;
+      };
+
+      auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j )
+      {
+         auto index = i + j * kernelSize.x();
+
+         return kernel[ index ];
+      };
+
+      auto convolve = [ = ] __cuda_callable__( Real result, Real data, Real kernel )
+      {
+         return result + data * kernel;
+      };
+
+      auto store = [ = ] __cuda_callable__( Index i, Index j, Real resultValue ) mutable
+      {
+         auto index = i + j * dimensions.x();
+
+         result[ index ] = resultValue;
+      };
+
+      ConvolutionLauncher::execute< Index, Real >( dimensions,
+                                                   kernelSize,
+                                                   std::forward< decltype( fetchData ) >( fetchData ),
+                                                   std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
+                                                   std::forward< decltype( fetchKernel ) >( fetchKernel ),
+                                                   std::forward< decltype( convolve ) >( convolve ),
+                                                   std::forward< decltype( store ) >( store ) );
+   }
+};
+
+template< typename Index, typename Real >
+struct DummyTask< Index, Real, 3, TNL::Devices::Cuda >
+{
+public:
+   static constexpr int Dimension = 3;
+   using Device = TNL::Devices::Cuda;
+   using Vector = TNL::Containers::StaticVector< Dimension, Index >;
+   using ConstDataStore = typename TNL::Containers::Vector< Real, Device, Index >::ConstViewType;
+   using DataStore = typename TNL::Containers::Vector< Real, Device, Index >::ViewType;
+   using ConvolutionLauncher = Convolution< Dimension, Device >;
+
+   static void
+   exec( const Vector& dimensions, const Vector& kernelSize, ConstDataStore& input, DataStore& result, ConstDataStore& kernel, int boundaryValue = 1 )
+   {
+      auto fetchData = [ = ] __cuda_callable__( Index i, Index j, Index k )
+      {
+         auto index = i + j * dimensions.x() + k * dimensions.x() * dimensions.y();
+
+         return input[ index ];
+      };
+
+      auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j, Index k )
+      {
+         return boundaryValue;
+      };
+
+      auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j, Index k )
+      {
+         auto index = i + j * kernelSize.x() + k * kernelSize.x() * kernelSize.y();
+
+         return kernel[ index ];
+      };
+
+      auto convolve = [ = ] __cuda_callable__( Real result, Real data, Real kernel )
+      {
+         return result + data * kernel;
+      };
+
+      auto store = [ = ] __cuda_callable__( Index i, Index j, Index k, Real resultValue ) mutable
+      {
+         auto index = i + j * dimensions.x() + k * dimensions.x() * dimensions.y();
+
+         result[ index ] = resultValue;
+      };
+
+      ConvolutionLauncher::execute< Index, Real >( dimensions,
+                                                   kernelSize,
+                                                   std::forward< decltype( fetchData ) >( fetchData ),
+                                                   std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
+                                                   std::forward< decltype( fetchKernel ) >( fetchKernel ),
+                                                   std::forward< decltype( convolve ) >( convolve ),
+                                                   std::forward< decltype( store ) >( store ) );
+   }
+};
diff --git a/src/Benchmarks/Convolution/support/HeatEquationSolver.h b/src/Benchmarks/Convolution/support/HeatEquationSolver.h
new file mode 100644
index 0000000000000000000000000000000000000000..eebe89c4bec41b08be063f0e383774b5cac4f851
--- /dev/null
+++ b/src/Benchmarks/Convolution/support/HeatEquationSolver.h
@@ -0,0 +1,240 @@
+
+#pragma once
+
+#include "Solver.h"
+#include "DummyTask.h"
+
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Timer.h>
+
+static std::vector< TNL::String > dimensionIds = { "grid-x-size", "grid-y-size" };
+static std::vector< TNL::String > kernelSizeIds = { "kernel-x-size", "kernel-y-size" };
+static std::vector< TNL::String > domainIds = { "domain-x-size", "domain-y-size" };
+static std::vector< TNL::String > kernelDomainIds = { "kernel-domain-x-size", "kernel-domain-y-size" };
+
+static std::string alphaKey = "alpha";
+static std::string betaKey = "beta";
+static std::string gammaKey = "gamma";
+
+static std::string timeStepKey = "timeStep";
+static std::string startTimeKey = "startTime";
+static std::string finalTimeKey = "finalTime";
+static std::string outputFilenamePrefix = "outputFilenamePrefix";
+
+template< typename Real = double >
+class HeatEquationSolver : public Solver< 2, TNL::Devices::Cuda >
+{
+public:
+   constexpr static int Dimension = 2;
+   using Device = TNL::Devices::Cuda;
+
+   using Base = Solver< Dimension, Device >;
+   using Vector = TNL::Containers::StaticVector< Dimension, int >;
+   using Point = TNL::Containers::StaticVector< Dimension, Real >;
+   using DataStore = TNL::Containers::Vector< Real, Device, int >;
+   using HostDataStore = TNL::Containers::Vector< Real, TNL::Devices::Host, int >;
+
+   virtual void
+   start( const TNL::Config::ParameterContainer& parameters ) const override
+   {
+      int gridXSize = parameters.getParameter< int >( dimensionIds[ 0 ] );
+      int gridYSize = parameters.getParameter< int >( dimensionIds[ 1 ] );
+
+      int kernelXSize = parameters.getParameter< int >( kernelSizeIds[ 0 ] );
+      int kernelYSize = parameters.getParameter< int >( kernelSizeIds[ 1 ] );
+
+      Real xDomainSize = parameters.getParameter< Real >( domainIds[ 0 ] );
+      Real yDomainSize = parameters.getParameter< Real >( domainIds[ 1 ] );
+
+      Real kernelXDomainSize = parameters.getParameter< Real >( kernelDomainIds[ 0 ] );
+      Real kernelYDomainSize = parameters.getParameter< Real >( kernelDomainIds[ 1 ] );
+
+      Real hx = xDomainSize / (Real) gridXSize;
+      Real hy = yDomainSize / (Real) gridYSize;
+
+      Point domain = { xDomainSize, yDomainSize };
+      Point kernelDomain = { kernelXDomainSize, kernelYDomainSize };
+      Point spaceSteps = { hx, hy };
+
+      Vector dimensions = { gridXSize, gridYSize };
+      Vector kernelSize = { kernelXSize, kernelYSize };
+
+      DataStore function = prepareFunction( parameters, dimensions, domain, spaceSteps );
+
+      auto filenamePrefix = parameters.getParameter< TNL::String >( outputFilenamePrefix );
+      auto initialFilename = filenamePrefix + "_initial.txt";
+
+      if( ! writeGNUPlot( initialFilename, dimensions, spaceSteps, domain, function.getConstView() ) ) {
+         std::cout << "Did fail during file write";
+         return;
+      }
+
+      DataStore result;
+
+      result.setLike( function );
+      result = 0;
+
+      auto timeStep = parameters.getParameter< double >( timeStepKey );
+      auto startTime = parameters.getParameter< double >( startTimeKey );
+      auto finalTime = parameters.getParameter< double >( finalTimeKey );
+
+      int iteration = (startTime / timeStep) + 1;
+      int finalIteration = finalTime / timeStep;
+
+      double time = iteration * timeStep;
+
+      for (int i = iteration; i <= finalIteration; i++) {
+         printf("Time: %lf\n", time);
+
+         convolve( dimensions, domain, kernelSize, kernelDomain, function.getConstView(), result.getView(), time );
+
+         auto filename = TNL::String("data_") + TNL::convertToString(i) + ".txt";
+
+         if( ! writeGNUPlot( filename, dimensions, spaceSteps, domain, result.getConstView() ) ) {
+            std::cout << "Did fail during file write";
+            return;
+         }
+
+         result = 0;
+
+         time += timeStep;
+      }
+   }
+
+   virtual TNL::Config::ConfigDescription
+   makeInputConfig() const override
+   {
+      TNL::Config::ConfigDescription config = Base::makeInputConfig();
+
+      config.addDelimiter( "Grid settings:" );
+      config.addEntry< int >( dimensionIds[ 0 ], "Grid size along x-axis.", 200 );
+      config.addEntry< int >( dimensionIds[ 1 ], "Grid size along y-axis.", 200 );
+
+      config.addDelimiter( "Kernel settings:" );
+      config.addEntry< int >( kernelSizeIds[ 0 ], "Kernel size along x-axis.", 3 );
+      config.addEntry< int >( kernelSizeIds[ 1 ], "Kernel size along y-axis.", 3 );
+
+      config.addDelimiter( "Problem settings:" );
+      config.addEntry< TNL::String >( outputFilenamePrefix, "The prefix in name of the output file", "data" );
+
+      config.addEntry< Real >( domainIds[ 0 ], "Domain size along x-axis.", 4.0 );
+      config.addEntry< Real >( domainIds[ 1 ], "Domain size along y-axis.", 4.0 );
+
+      config.addEntry< Real >( kernelDomainIds[ 0 ], "Kernel domain size along x-axis.", 4.0 );
+      config.addEntry< Real >( kernelDomainIds[ 1 ], "Kernel domain size along y-axis.", 4.0 );
+
+      config.addDelimiter( "Initial condition settings ( (x^2/alpha + y^2/beta) + gamma)):" );
+      config.addEntry< Real >( alphaKey, "Alpha value in initial condition", -0.05 );
+      config.addEntry< Real >( betaKey, "Beta value in initial condition", -0.05 );
+      config.addEntry< Real >( gammaKey, "Gamma key in initial condition", 15 );
+
+      config.addDelimiter( "Time settings:" );
+      config.addEntry< Real >( startTimeKey, "Final time of the simulation.", 0.0);
+      config.addEntry< Real >( timeStepKey, "Time step of the simulation.", 0.005);
+      config.addEntry< Real >( finalTimeKey, "Final time of the simulation.", 0.36);
+
+      return config;
+   }
+
+   DataStore
+   prepareFunction( const TNL::Config::ParameterContainer& parameters,
+                    const Vector& dimensions,
+                    const Point& domain,
+                    const Point& spaceSteps ) const
+   {
+      DataStore function;
+
+      function.resize( dimensions.x() * dimensions.y() );
+
+      auto functionView = function.getView();
+
+      auto xDomainSize = parameters.getParameter< Real >( domainIds[ 0 ] );
+      auto yDomainSize = parameters.getParameter< Real >( domainIds[ 1 ] );
+
+      auto alpha = parameters.getParameter< Real >( alphaKey );
+      auto beta = parameters.getParameter< Real >( betaKey );
+      auto gamma = parameters.getParameter< Real >( gammaKey );
+
+      auto init = [ = ] __cuda_callable__( int i, int j ) mutable
+      {
+         auto index = j * dimensions.x() + i;
+
+         auto x = i * spaceSteps.x() - domain.x() / 2.;
+         auto y = j * spaceSteps.y() - domain.y() / 2.;
+
+         functionView[ index ] = TNL::max((x * x / alpha)  + (y * y / beta) + gamma, 0);
+      };
+
+      TNL::Algorithms::ParallelFor2D< Device >::exec( 0, 0, dimensions.x(), dimensions.y(), init );
+
+      return function;
+   }
+
+   void
+   convolve( const Vector& dimensions,
+             const Point& domain,
+             const Vector& kernelSize,
+             const Point& kernelDomain,
+             typename DataStore::ConstViewType input,
+             typename DataStore::ViewType result,
+             const Real time ) const
+   {
+      DataStore kernel;
+      kernel.resize(kernelSize.x() * kernelSize.y());
+
+      auto kernelView = kernel.getView();
+      auto domainSpaceSteps = Point(domain.x() / dimensions.x(), domain.y() / dimensions.y());
+      auto kernelSpaceSteps = Point(kernelDomain.x() / (kernelSize.x() - 1), kernelDomain.y() / (kernelSize.y() - 1));
+
+      auto init = [ = ] __cuda_callable__( int i, int j ) mutable {
+         auto index = j * kernelSize.x() + i;
+
+         auto x = i * kernelSpaceSteps.x() - kernelDomain.x() / 2.;
+         auto y = j * kernelSpaceSteps.y() - kernelDomain.y() / 2.;
+
+         // The space step is given by the function domain
+         // However, because the kernel is limited to 31x31 size
+         // The user can specify it custom kernel domain from which values are taken
+         kernelView[ index ] = domainSpaceSteps.x() * domainSpaceSteps.y() * ( (Real)1 / ( (Real)4 * M_PI * time ) ) * exp( - ( pow(x, 2.) + pow(y, 2.)  ) / ( (Real)4 * time ) );
+      };
+
+      TNL::Algorithms::ParallelFor2D< Device >::exec( 0, 0, kernelSize.x(), kernelSize.y(), init );
+
+      // std::cout << std::endl << std::endl << std::endl;
+
+      for (int i = 0; i < kernelSize.x(); i++) {
+         for (int j = 0; j < kernelSize.y(); j++) {
+            auto index = j * kernelSize.x() + i;
+
+            printf("%lf ", kernelView.getElement(index));
+         }
+
+         printf("\n");
+      }
+
+      auto kernelConstView = kernel.getConstView();
+
+      DummyTask<int, Real, Dimension, Device>::exec(dimensions, kernelSize, input, result, kernelConstView, 0);
+   }
+
+   bool
+   writeGNUPlot( const std::string& filename,
+                 const Vector& dimensions,
+                 const Point& spaceSteps,
+                 const Point& domain,
+                 const typename DataStore::ConstViewType& map ) const
+   {
+      std::ofstream out( filename, std::ios::out );
+
+      if( ! out.is_open() )
+         return false;
+
+      for( int j = 0; j < dimensions.y(); j++ )
+         for( int i = 0; i < dimensions.x(); i++ )
+            out << i * spaceSteps.x() - domain.x() / 2. << " "
+                << j * spaceSteps.y() - domain.y() / 2. << " "
+                << map.getElement( j * dimensions.x() + i ) << std::endl;
+
+      return out.good();
+   }
+};
diff --git a/src/Benchmarks/Convolution/support/ImageSolver.h b/src/Benchmarks/Convolution/support/ImageSolver.h
new file mode 100644
index 0000000000000000000000000000000000000000..069553573b0ae933e8d0e11786ba1090c873bc18
--- /dev/null
+++ b/src/Benchmarks/Convolution/support/ImageSolver.h
@@ -0,0 +1,245 @@
+
+#pragma once
+
+#include "Solver.h"
+#include "DummyTask.h"
+
+#include <TNL/FileName.h>
+#include <TNL/Timer.h>
+#include <TNL/Meshes/Grid.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include <TNL/Functions/MeshFunction.h>
+#include <TNL/Images/PNGImage.h>
+#include <TNL/Images/RegionOfInterest.h>
+
+static std::vector< TNL::String > dimensionIds = { "x-dimension", "y-dimension", "z-dimension" };
+static std::vector< TNL::String > kernelSizeIds = { "kernel-x-size", "kernel-y-size", "kernel-z-size" };
+static std::vector< TNL::String > kernels = { "identity",        "gauss3x3",      "gauss5x5",
+                                              "sobelHorizontal", "sobelVertical", "edgeDetection" };
+
+class ImageSolver : public Solver< 2, TNL::Devices::Cuda >
+{
+public:
+   constexpr static int Dimension = 2;
+   using Device = TNL::Devices::Cuda;
+
+   using Base = Solver< Dimension, Device >;
+   using Vector = TNL::Containers::StaticVector< Dimension, int >;
+   using DataStore = TNL::Containers::Vector< float, Device, int >;
+   using HostDataStore = TNL::Containers::Vector< float, TNL::Devices::Host, int >;
+
+   using GridType = TNL::Meshes::Grid< 2, float, Device, int >;
+   using GridPointer = TNL::Pointers::SharedPointer< GridType >;
+   using MeshFunctionType = TNL::Functions::MeshFunction< GridType >;
+
+   virtual void
+   start( const TNL::Config::ParameterContainer& parameters ) const override
+   {
+      GridPointer grid;
+      MeshFunctionType meshFunction;
+      TNL::Images::PNGImage< int > image;
+      TNL::Images::RegionOfInterest< int > roi;
+
+      meshFunction.setMesh( grid );
+
+      auto output = parameters.getParameter< TNL::String >( "output" );
+
+      if( ! this->readImage( parameters, grid, meshFunction, image, roi ) || ! this->convolve( parameters, meshFunction )
+          || ! this->write( parameters, image, meshFunction ) )
+         return;
+   }
+
+   virtual TNL::Config::ConfigDescription
+   makeInputConfig() const override
+   {
+      TNL::Config::ConfigDescription config = Base::makeInputConfig();
+
+      config.addDelimiter( "Image settings:" );
+
+      config.addEntry< TNL::String >( "input", "PNG image" );
+      config.addEntry< TNL::String >( "output", "PNG image" );
+      config.addEntry< TNL::String >( "kernel", "A kernel to apply", kernels[ 0 ] );
+
+      for( const auto& kernel : kernels )
+         config.addEntryEnum( kernel );
+
+      config.addDelimiter( "Roi settings:" );
+
+      config.addEntry< int >( "roi-top", "Top (smaller number) line of the region of interest.", -1 );
+      config.addEntry< int >( "roi-bottom", "Bottom (larger number) line of the region of interest.", -1 );
+      config.addEntry< int >( "roi-left", "Left (smaller number) column of the region of interest.", -1 );
+      config.addEntry< int >( "roi-right", "Right (larger number) column of the region of interest.", -1 );
+
+      return config;
+   }
+
+   template< typename Image >
+   bool
+   readImage( const TNL::Config::ParameterContainer& parameters,
+              GridPointer& grid,
+              MeshFunctionType& meshFunction,
+              Image& image,
+              TNL::Images::RegionOfInterest< int >& roi ) const
+   {
+      auto input = parameters.getParameter< TNL::String >( "input" );
+
+      if( image.openForRead( input ) ) {
+         if( ! roi.setup( parameters, &image ) ) {
+            std::cout << "Invalid image roi.";
+            image.close();
+            return false;
+         }
+
+         std::cout << image.getWidth() << " " << image.getHeight() << std::endl;
+
+         auto meshPointer = meshFunction.getMeshPointer();
+
+         meshPointer->setDimensions( image.getWidth(), image.getHeight() );
+
+         meshFunction.setMesh( meshPointer );
+
+         if( ! image.read( roi, meshFunction ) ) {
+            std::cout << "Invalid image size" << std::endl;
+
+            image.close();
+            return false;
+         }
+
+         image.close();
+
+         std::cout << "Image read was successful: " << meshFunction.getData().getSize() << " elements count" << std::endl;
+         return true;
+      }
+
+      std::cout << "Image open for read failed. Please check file path" << std::endl;
+
+      return false;
+   }
+
+   bool
+   convolve( const TNL::Config::ParameterContainer& parameters, MeshFunctionType& meshFunction ) const
+   {
+      auto imageData = meshFunction.getData().getConstView();
+
+      Vector kernelSize;
+      DataStore kernel;
+
+      kernel = getKernel( parameters, kernelSize );
+
+      DataStore result;
+
+      result.setLike( imageData );
+      result = 0;
+
+      TNL::Timer timer;
+
+      timer.start();
+
+      std::cout << imageData.getSize() << " " << result.getSize() << std::endl;
+
+      launchConvolution(
+         imageData, kernel.getConstView(), result.getView(), meshFunction.getMeshPointer()->getDimensions(), kernelSize );
+
+      timer.stop();
+
+      result.forAllElements(
+         [] __cuda_callable__( int i, float& value )
+         {
+            value = TNL::max( TNL::min( value, 1.0 ), 0.0 );
+         } );
+
+      meshFunction.getData() = result;
+
+      std::cout << "Image convolution was successful. Time: " << timer.getRealTime() << " sec" << std::endl;
+
+      return true;
+   }
+
+   template< typename Image >
+   bool
+   write( const TNL::Config::ParameterContainer& parameters, Image& image, MeshFunctionType& meshFunction ) const
+   {
+      auto output = parameters.getParameter< TNL::String >( "output" );
+      GridType grid = meshFunction.getMesh();
+
+      if( image.openForWrite( output, grid ) ) {
+         if( ! image.write( meshFunction ) ) {
+            std::cout << "Image write failed" << std::endl;
+
+            image.close();
+            return false;
+         }
+
+         image.close();
+
+         return true;
+      }
+
+      std::cout << "Image open for write failed. Please check file path" << std::endl;
+
+      return false;
+   }
+
+   HostDataStore
+   getKernel( const TNL::Config::ParameterContainer& parameters, Vector& kernelDimension ) const
+   {
+      auto kernel = parameters.getParameter< TNL::String >( "kernel" );
+
+      if( kernel == "identity" ) {
+         kernelDimension = { 3, 3 };
+
+         return { 0, 0, 0, 0, 1, 0, 0, 0, 0 };
+      }
+
+      if( kernel == "gauss3x3" ) {
+         kernelDimension = { 3, 3 };
+
+         HostDataStore kernel = { 1, 2, 1, 2, 4, 2, 1, 2, 1 };
+
+         kernel /= 16;
+
+         return kernel;
+      }
+
+      if( kernel == "gauss5x5" ) {
+         kernelDimension = { 5, 5 };
+
+         HostDataStore kernel = { 1, 4, 7, 4, 1, 4, 16, 26, 16, 4, 7, 26, 41, 26, 7, 4, 16, 26, 16, 4, 1, 4, 7, 4, 1 };
+
+         kernel /= 273;
+
+         return kernel;
+      }
+
+      if( kernel == "sobelHorizontal" ) {
+         kernelDimension = { 3, 3 };
+
+         return { 1, 2, 1, 0, 0, 0, -1, -2, -1 };
+      }
+
+      if( kernel == "sobelVertical" ) {
+         kernelDimension = { 3, 3 };
+
+         return { 1, 0, -1, 2, 0, -2, 1, 0, -1 };
+      }
+
+      if( kernel == "edgeDetection" ) {
+         kernelDimension = { 3, 3 };
+
+         return { -1, -1, -1, -1, 8, -1, -1, -1, -1 };
+      }
+
+      std::cout << "Unknown kernel " << kernel << ". Exit" << std::endl;
+      exit( 1 );
+   }
+
+   void
+   launchConvolution( DataStore::ConstViewType image,
+                      DataStore::ConstViewType kernel,
+                      DataStore::ViewType result,
+                      const GridType::CoordinatesType& imageDimension,
+                      const GridType::CoordinatesType& kernelDimension ) const
+   {
+      DummyTask< int, float, Dimension, Device >::exec( imageDimension, kernelDimension, image, result, kernel );
+   }
+};
diff --git a/src/Benchmarks/Convolution/support/Solver.h b/src/Benchmarks/Convolution/support/Solver.h
new file mode 100644
index 0000000000000000000000000000000000000000..d80373f0df34d7b552adc3a5940e8b7b18386d01
--- /dev/null
+++ b/src/Benchmarks/Convolution/support/Solver.h
@@ -0,0 +1,41 @@
+
+#pragma once
+
+#include <vector>
+
+#include <TNL/Containers/StaticVector.h>
+#include <TNL/Containers/Vector.h>
+
+template< int Dimension, typename Device >
+class Solver
+{
+public:
+   void
+   solve( const TNL::Config::ParameterContainer& parameters ) const
+   {
+      if( ! TNL::Devices::Cuda::setup( parameters ) )
+         return;
+
+      start( parameters );
+   }
+
+   virtual void
+   start( const TNL::Config::ParameterContainer& parameters ) const
+   {
+      TNL_ASSERT_TRUE( false, "Should be overriden" );
+   }
+
+   virtual TNL::Config::ConfigDescription
+   makeInputConfig() const
+   {
+      TNL::Config::ConfigDescription config;
+
+      config.addDelimiter( "Device settings:" );
+
+#ifdef HAVE_CUDA
+      TNL::Devices::Cuda::configSetup( config );
+#endif
+
+      return config;
+   }
+};
diff --git a/src/Benchmarks/Convolution/templates/main_benchmark.h b/src/Benchmarks/Convolution/templates/main_benchmark.h
new file mode 100644
index 0000000000000000000000000000000000000000..4be1e80e5b4494ea0e80da66e9357d3a771347d7
--- /dev/null
+++ b/src/Benchmarks/Convolution/templates/main_benchmark.h
@@ -0,0 +1,26 @@
+
+#define KERNEL KERNEL_VALUE
+#define DIMENSION DIMENSION_VALUE
+
+#include KERNEL_VALUE
+#include "../support/DummyBenchmark.h"
+
+#include <TNL/Config/parseCommandLine.h>
+
+using TaskBenchmark = DummyBenchmark< DIMENSION, TNL::Devices::Cuda >;
+
+int main(int argc, char* argv[])
+{
+   TaskBenchmark benchmark;
+
+   auto config = benchmark.makeInputConfig();
+
+   TNL::Config::ParameterContainer parameters;
+
+   if( ! parseCommandLine( argc, argv, config, parameters ) )
+      return EXIT_FAILURE;
+
+   benchmark.run( parameters );
+
+   return 0;
+}
diff --git a/src/Benchmarks/Convolution/templates/main_heat_equation_solver.h b/src/Benchmarks/Convolution/templates/main_heat_equation_solver.h
new file mode 100644
index 0000000000000000000000000000000000000000..c258f074005746a4e78b8f466952bfe404415a77
--- /dev/null
+++ b/src/Benchmarks/Convolution/templates/main_heat_equation_solver.h
@@ -0,0 +1,26 @@
+
+#define KERNEL KERNEL_VALUE
+#define DIMENSION DIMENSION_VALUE
+
+#include KERNEL
+#include "../support/HeatEquationSolver.h"
+
+#include <TNL/Config/parseCommandLine.h>
+
+using TaskSolver = HeatEquationSolver<>;
+
+int main(int argc, char* argv[])
+{
+   TaskSolver solver;
+
+   auto config = solver.makeInputConfig();
+
+   TNL::Config::ParameterContainer parameters;
+
+   if( ! parseCommandLine( argc, argv, config, parameters ) )
+      return EXIT_FAILURE;
+
+   solver.solve( parameters );
+
+   return 0;
+}
diff --git a/src/Benchmarks/Convolution/templates/main_image_solver.h b/src/Benchmarks/Convolution/templates/main_image_solver.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2b9c1440c17c4f9c38f00393e85b19848b0c037
--- /dev/null
+++ b/src/Benchmarks/Convolution/templates/main_image_solver.h
@@ -0,0 +1,26 @@
+
+#define KERNEL KERNEL_VALUE
+#define DIMENSION DIMENSION_VALUE
+
+#include KERNEL
+#include "../support/ImageSolver.h"
+
+#include <TNL/Config/parseCommandLine.h>
+
+using TaskSolver = ImageSolver;
+
+int main(int argc, char* argv[])
+{
+   TaskSolver solver;
+
+   auto config = solver.makeInputConfig();
+
+   TNL::Config::ParameterContainer parameters;
+
+   if( ! parseCommandLine( argc, argv, config, parameters ) )
+      return EXIT_FAILURE;
+
+   solver.solve( parameters );
+
+   return 0;
+}
diff --git a/src/Benchmarks/Convolution/templates/main_solver.h b/src/Benchmarks/Convolution/templates/main_solver.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab2bc86990cbe2123d198d489d347dfe8444d372
--- /dev/null
+++ b/src/Benchmarks/Convolution/templates/main_solver.h
@@ -0,0 +1,27 @@
+
+#define KERNEL KERNEL_VALUE
+#define DIMENSION DIMENSION_VALUE
+
+#include KERNEL
+#include "../support/DummySolver.h"
+
+#include <TNL/Config/parseCommandLine.h>
+
+
+using TaskSolver = DummySolver< DIMENSION, TNL::Devices::Cuda >;
+
+int main(int argc, char* argv[])
+{
+   TaskSolver solver;
+
+   auto config = solver.makeInputConfig();
+
+   TNL::Config::ParameterContainer parameters;
+
+   if( ! parseCommandLine( argc, argv, config, parameters ) )
+      return EXIT_FAILURE;
+
+   solver.solve( parameters );
+
+   return 0;
+}
diff --git a/src/TNL/Images/PNGImage_impl.h b/src/TNL/Images/PNGImage_impl.h
index 3b946ff8276f5ea798f92dad5039adedd7417a73..5efd66c84b5ab05c8078eba910a2804c30679bc8 100644
--- a/src/TNL/Images/PNGImage_impl.h
+++ b/src/TNL/Images/PNGImage_impl.h
@@ -266,6 +266,7 @@ PNGImage< Index >::write( const Meshes::Grid< 2, Real, Device, Index >& grid, Ve
       for( j = 0; j < grid.getDimensions().x(); j++ ) {
          cell.getCoordinates().x() = j;
          cell.getCoordinates().y() = grid.getDimensions().y() - 1 - i;
+         cell.refresh();
 
          // Index cellIndex = grid.getCellIndex( CoordinatesType( j,
          //                                      grid.getDimensions().y() - 1 - i ) );
@@ -305,6 +306,7 @@ PNGImage< Index >::write( const Functions::MeshFunction< Meshes::Grid< 2, MeshRe
       for( j = 0; j < grid.getDimensions().x(); j++ ) {
          cell.getCoordinates().x() = j;
          cell.getCoordinates().y() = grid.getDimensions().y() - 1 - i;
+         cell.refresh();
 
          // Index cellIndex = grid.getCellIndex( CoordinatesType( j,
          //                                      grid.getDimensions().y() - 1 - i ) );