From 77b4e0813cd1f1401a2566db1a14725d6aa28f8f Mon Sep 17 00:00:00 2001
From: hayeuyur <hayeuyur@fit.cvut.cz>
Date: Sat, 2 Apr 2022 13:05:51 +0200
Subject: [PATCH 01/19] Implement naive convolution for 1D kernel

---
 src/Benchmarks/Convolution/.gitignore         |   1 +
 src/Benchmarks/Convolution/CMakeLists.txt     |  25 ++
 src/Benchmarks/Convolution/kernels/naive.h    | 164 +++++++++++++
 .../Convolution/support/Benchmark.h           |  81 +++++++
 .../Convolution/support/DummyBenchmark.h      | 165 +++++++++++++
 .../Convolution/support/DummySolver.h         |  84 +++++++
 .../Convolution/support/DummyTask.h           | 153 ++++++++++++
 src/Benchmarks/Convolution/support/Launcher.h | 218 ++++++++++++++++++
 src/Benchmarks/Convolution/support/Solver.h   |  52 +++++
 .../Convolution/templates/main_benchmark.h    |   0
 .../Convolution/templates/main_solver.h       |  25 ++
 11 files changed, 968 insertions(+)
 create mode 100644 src/Benchmarks/Convolution/.gitignore
 create mode 100644 src/Benchmarks/Convolution/CMakeLists.txt
 create mode 100644 src/Benchmarks/Convolution/kernels/naive.h
 create mode 100644 src/Benchmarks/Convolution/support/Benchmark.h
 create mode 100644 src/Benchmarks/Convolution/support/DummyBenchmark.h
 create mode 100644 src/Benchmarks/Convolution/support/DummySolver.h
 create mode 100644 src/Benchmarks/Convolution/support/DummyTask.h
 create mode 100644 src/Benchmarks/Convolution/support/Launcher.h
 create mode 100644 src/Benchmarks/Convolution/support/Solver.h
 create mode 100644 src/Benchmarks/Convolution/templates/main_benchmark.h
 create mode 100644 src/Benchmarks/Convolution/templates/main_solver.h

diff --git a/src/Benchmarks/Convolution/.gitignore b/src/Benchmarks/Convolution/.gitignore
new file mode 100644
index 000000000..86d4c2dd3
--- /dev/null
+++ b/src/Benchmarks/Convolution/.gitignore
@@ -0,0 +1 @@
+generated
diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt
new file mode 100644
index 000000000..22d0876bc
--- /dev/null
+++ b/src/Benchmarks/Convolution/CMakeLists.txt
@@ -0,0 +1,25 @@
+
+function(generate_cuda_executable PREFIX DIMENSION TEMPLATE KERNEL_HEADER)
+# Copies TEMPLATE with DIMENSION_VALUE replaced by DIMENSION into generated/ and builds it with CUDA.
+get_filename_component(MODULE_NAME ${KERNEL_HEADER} NAME_WE)
+get_filename_component(TEMPLATE_NAME ${TEMPLATE} NAME_WE)
+# BUILD_CUDA is tested by name; if(${BUILD_CUDA}) would evaluate the already expanded value instead.
+if(BUILD_CUDA)
+   SET(SOURCE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/generated/${MODULE_NAME}_${DIMENSION}_${TEMPLATE_NAME}.cu")
+
+   FILE(READ "${TEMPLATE}" TEMPLATE_CONTENT)
+   # The placeholder is a fixed string, so a literal (non-regex) replacement is sufficient.
+   STRING(REPLACE "DIMENSION_VALUE" "${DIMENSION}" TEMPLATE_CONTENT "${TEMPLATE_CONTENT}")
+
+   FILE(WRITE "${SOURCE_FILE}" "${TEMPLATE_CONTENT}")
+
+   SET(EXECUTABLE_NAME "${PREFIX}_${DIMENSION}_${MODULE_NAME}")
+
+   CUDA_ADD_EXECUTABLE(${EXECUTABLE_NAME} ${SOURCE_FILE})
+else()
+   MESSAGE(WARNING "Convolutions are not supported on CPU")
+endif()
+
+endfunction()
+
+GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/naive.h")
diff --git a/src/Benchmarks/Convolution/kernels/naive.h b/src/Benchmarks/Convolution/kernels/naive.h
new file mode 100644
index 000000000..a9e00d890
--- /dev/null
+++ b/src/Benchmarks/Convolution/kernels/naive.h
@@ -0,0 +1,164 @@
+#pragma once  // this header defines a struct specialization and a kernel; guard against double inclusion
+#ifdef HAVE_CUDA
+
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+
+template< int Dimension, typename Device >
+struct Convolution;
+
+// Launch traits of the naive 1D CUDA kernel.
+template<>
+struct Convolution< 1, TNL::Devices::Cuda >
+{
+   // Dynamic shared memory size to request for a launch. The naive kernel
+   // requests none, so both arguments are ignored and 0 is returned.
+   template< typename Index >
+   static size_t
+   getDynamicSharedMemorySize( Index /* kernelWidth */, Index /* endX */ )
+   { return 0; }
+};
+
+// Naive 1D convolution: every loop iteration produces one output element, no shared memory is used.
+// Launch layout: 1D blocks on a 1D grid; any grid size is valid because each thread walks the
+// output with a grid-stride loop. Samples outside [0, endX) are read through fetchBoundary.
+// The tap loop covers 2 * (kernelWidth / 2) + 1 kernel entries, so kernelWidth has to be odd.
+template< typename Index, typename Real, typename FetchData, typename FetchBoundary,
+          typename FetchKernel, typename Convolve, typename Store >
+__global__
+static void
+convolution1D( Index kernelWidth,
+               Index endX,
+               FetchData fetchData,
+               FetchBoundary fetchBoundary,
+               FetchKernel fetchKernel,
+               Convolve convolve,
+               Store store )
+{
+   const Index radius = kernelWidth >> 1;
+   const Index stride = blockDim.x * gridDim.x;
+
+   // The loop bound also keeps threads of a partially filled last block from storing past endX.
+   for( Index ix = threadIdx.x + blockIdx.x * blockDim.x; ix < endX; ix += stride ) {
+      Real result = 0;
+
+      for( Index i = -radius; i <= radius; i++ ) {
+         const Index elementIndex = i + ix;
+         const Index kernelIndex = i + radius;
+
+         if( elementIndex < 0 || elementIndex >= endX )
+            result = convolve( result, fetchBoundary( elementIndex ), fetchKernel( kernelIndex ) );
+         else
+            result = convolve( result, fetchData( elementIndex ), fetchKernel( kernelIndex ) );
+      }
+
+      store( ix, result );
+   }
+}
+
+// template<>
+// struct Convolution< 2, TNL::Devices::Cuda >
+// {
+// public:
+//    template< typename Index >
+//    static size_t
+//    getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index endX, Index endY )
+//    {
+//       return 0;
+//    }
+// };
+
+// template< typename Index,
+//           typename Real,
+//           typename FetchData,
+//           typename FetchBoundary,
+//           typename FetchKernel,
+//           typename Convolve,
+//           typename Store >
+// __global__
+// static void
+// convolution2D( Index kernelWidth,
+//                Index kernelHeight,
+//                Index endX,
+//                Index endY,
+//                FetchData& fetchData,
+//                FetchBoundary& fetchBoundary,
+//                FetchKernel& fetchKernel,
+//                Convolve& convolve,
+//                Store& store )
+// {
+//    int iy = threadIdx.y + blockIdx.y * blockDim.y;
+//    int ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+//    Real result = 0;
+
+//    for( Index j = iy - kernelHeight; j <= iy + kernelHeight; j++ ) {
+//       for( Index i = ix - kernelWidth; i <= ix + kernelWidth; i++ ) {
+//          if( i < 0 || i >= endX || j < 0 || j >= endY ) {
+//             result = convolve( result, fetchBoundary( i, j ) );
+//          }
+//          else {
+//             result = convolve( result, fetchData( i, j ), fetchKernel( i, j ) );
+//          }
+//       }
+//    }
+
+//    store( ix, iy, result );
+// }
+
+// template<>
+// struct Convolution< 3, TNL::Devices::Cuda >
+// {
+// public:
+//    template< typename Index >
+//    static size_t
+//    getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ )
+//    {
+//       return 0;
+//    }
+// };
+
+// template< typename Index,
+//           typename Real,
+//           typename FetchData,
+//           typename FetchBoundary,
+//           typename FetchKernel,
+//           typename Convolve,
+//           typename Store >
+// __global__
+// static void
+// convolution3D( Index kernelWidth,
+//                Index kernelHeight,
+//                Index kernelDepth,
+//                Index endX,
+//                Index endY,
+//                Index endZ,
+//                FetchData& fetchData,
+//                FetchBoundary& fetchBoundary,
+//                FetchKernel& fetchKernel,
+//                Convolve& convolve,
+//                Store& store )
+// {
+//    int ix = threadIdx.x + blockIdx.x * blockDim.x;
+//    int iy = threadIdx.y + blockIdx.y * blockDim.y;
+//    int iz = threadIdx.z + blockIdx.z * blockDim.z;
+
+//    Real result = 0;
+
+//    for( Index k = iz - kernelDepth; k <= iz + kernelDepth; k++ ) {
+//       for( Index j = iy - kernelHeight; j <= iy + kernelHeight; j++ ) {
+//          for( Index i = ix - kernelWidth; i <= ix + kernelWidth; i++ ) {
+//             if( i < 0 || i >= endX || j < 0 || j >= endY || k < 0 || k >= endZ ) {
+//                result = convolve( result, fetchBoundary( i, j, k ) );
+//             }
+//             else {
+//                result = convolve( result, fetchData( i, j, k ), fetchKernel( i, j, k ) );
+//             }
+//          }
+//       }
+//    }
+
+//    store( ix, iy, iz, result );
+// }
+
+#endif
diff --git a/src/Benchmarks/Convolution/support/Benchmark.h b/src/Benchmarks/Convolution/support/Benchmark.h
new file mode 100644
index 000000000..f5671a06b
--- /dev/null
+++ b/src/Benchmarks/Convolution/support/Benchmark.h
@@ -0,0 +1,81 @@
+
+#pragma once
+
+#include <TNL/Config/parseCommandLine.h>
+
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+#include <TNL/Benchmarks/Benchmarks.h>
+#include <TNL/Containers/StaticVector.h>
+#include <TNL/Containers/Array.h>
+// Common driver of the convolution benchmarks: prepares the log file and the TNL benchmark, then calls start().
+template< int Dimension, typename Device >
+class Benchmark
+{
+public:
+   using BenchmarkType = TNL::Benchmarks::Benchmark<>;  // a member type must not reuse the class name
+   // Sets up the devices and the log, writes the hardware metadata and hands over to start().
+   void
+   runBenchmark( const TNL::Config::ParameterContainer& parameters ) const
+   {
+      if( ! TNL::Devices::Host::setup( parameters ) || ! TNL::Devices::Cuda::setup( parameters ) )
+         return;
+
+      const TNL::String logFileName = parameters.getParameter< TNL::String >( "log-file" );
+      const TNL::String outputMode = parameters.getParameter< TNL::String >( "output-mode" );
+      // NOTE(review): the "device" parameter is declared in makeInputConfig() but is not used by this driver yet.
+
+      const int verbose = parameters.getParameter< int >( "verbose" );
+      const int loops = parameters.getParameter< int >( "loops" );
+
+      auto mode = std::ios::out;
+
+      if( outputMode == "append" )
+         mode |= std::ios::app;
+
+      std::ofstream logFile( logFileName.getString(), mode );
+
+      BenchmarkType benchmark( logFile, loops, verbose );
+
+      std::map< std::string, std::string > metadata = TNL::Benchmarks::getHardwareMetadata();
+      TNL::Benchmarks::writeMapAsJson( metadata, logFileName, ".metadata.json" );
+
+      start( benchmark, parameters );
+   }
+   // Hook executed by runBenchmark(); subclasses override it with the actual measurements.
+   virtual void start(const BenchmarkType& benchmark, const TNL::Config::ParameterContainer& parameters) const {
+      TNL_ASSERT_TRUE( false, "Should be overridden" );
+   }
+   // Declares the command-line options shared by all convolution benchmarks.
+   virtual TNL::Config::ConfigDescription makeInputConfig() const {
+      TNL::Config::ConfigDescription config;
+
+      config.addDelimiter( "Benchmark settings:" );
+      config.addEntry< TNL::String >( "id", "Identifier of the run", "unknown" );
+      config.addEntry< TNL::String >( "log-file", "Log file name.", "output.log" );
+      config.addEntry< TNL::String >( "output-mode", "Mode for opening the log file.", "overwrite" );
+      config.addEntryEnum( "append" );
+      config.addEntryEnum( "overwrite" );
+
+      config.addEntry< TNL::String >( "device", "Device the computation will run on.", "cuda" );
+      config.addEntryEnum< TNL::String >( "all" );
+      config.addEntryEnum< TNL::String >( "host" );
+
+#ifdef HAVE_CUDA
+      config.addEntryEnum< TNL::String >( "cuda" );
+#endif
+
+      config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
+      config.addEntry< int >( "verbose", "Verbose mode.", 1 );
+
+
+      config.addDelimiter( "Device settings:" );
+      TNL::Devices::Host::configSetup( config );
+
+#ifdef HAVE_CUDA
+      TNL::Devices::Cuda::configSetup( config );
+#endif
+      return config;
+   }
+};
diff --git a/src/Benchmarks/Convolution/support/DummyBenchmark.h b/src/Benchmarks/Convolution/support/DummyBenchmark.h
new file mode 100644
index 000000000..1830e7484
--- /dev/null
+++ b/src/Benchmarks/Convolution/support/DummyBenchmark.h
@@ -0,0 +1,165 @@
+
+#pragma once
+
+#include "Benchmark.h"
+#include "DummyTask.h"
+// Per-axis names (index 0 = x, 1 = y, 2 = z) of the command-line parameters and metadata columns used by DummyBenchmark.
+static std::vector< TNL::String > minDimensionIds = { "min-x-dimension", "min-y-dimension", "min-z-dimension" };
+static std::vector< TNL::String > dimensionIds = { "x-dimension", "y-dimension", "z-dimension" };
+static std::vector< TNL::String > maxDimensionIds = { "max-x-dimension", "max-y-dimension", "max-z-dimension" };
+static std::vector< TNL::String > minKernelSizeIds = { "min-kernel-width", "min-kernel-height", "min-kernel-depth" };
+static std::vector< TNL::String > kernelSizeIds = { "x-kernelSize", "y-kernelSize", "z-kernelSize" };
+static std::vector< TNL::String > maxKernelSizeIds = { "max-kernel-width", "max-kernel-height", "max-kernel-depth" };
+// Benchmark driver that sweeps grid dimensions and kernel sizes of the dummy convolution task.
+template< int Dimension, typename Device >
+class DummyBenchmark : public Benchmark< Dimension, Device >
+{
+public:
+   using Vector = TNL::Containers::StaticVector< Dimension, int >;
+   using DataStore = TNL::Containers::Array< float, Device, int >;
+   using Base = ::Benchmark< Dimension, Device >;
+   using Benchmark = TNL::Benchmarks::Benchmark<>;
+   // Reads the sweep bounds and steps from the parameters, checks them and runs the sweep.
+   virtual void
+   start( const Benchmark& benchmark, const TNL::Config::ParameterContainer& parameters ) const override
+   {
+      Vector start;
+      Vector end;
+      Vector minKernelSize;
+      Vector maxKernelSize;
+
+      for( int i = 0; i < Dimension; i++ ) {
+         start[ i ] = parameters.getParameter< int >( minDimensionIds[ i ] );
+         end[ i ] = parameters.getParameter< int >( maxDimensionIds[ i ] );
+         minKernelSize[ i ] = parameters.getParameter< int >( minKernelSizeIds[ i ] );
+         maxKernelSize[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] );
+
+         TNL_ASSERT_GT( start[ i ], 1, "Start dimension must be positive integer" );
+         TNL_ASSERT_GE( end[ i ], start[ i ], "End dimension must not be smaller than start dimension" );
+
+         TNL_ASSERT_GE( minKernelSize[ i ], 1, "Minimal kernel size must be a positive number" );
+         TNL_ASSERT_EQ( minKernelSize[ i ] % 2, 1, "Minimal kernel size must be odd" );
+         TNL_ASSERT_GT( maxKernelSize[ i ], minKernelSize[ i ], "End kernel size must be greater than start kernel size" );
+      }
+
+      int dimensionStep = parameters.getParameter< int >( "dimension-step" );
+      int kernelStep = parameters.getParameter< int >( "kernel-step" );
+
+      TNL_ASSERT_GT( dimensionStep, 1, "Dimension step must be greater than one" );
+      TNL_ASSERT_GT( kernelStep, 0, "Kernel step must be a positive number" );
+      TNL_ASSERT_EQ( kernelStep % 2, 0, "Kernel step must be even" );
+      // runBenchmark() passes its non-const local, so the cast is well-defined. NOTE(review): relax the base signature instead.
+      time( const_cast< Benchmark& >( benchmark ), start, end, dimensionStep, minKernelSize, maxKernelSize, kernelStep );
+   }
+   // Times the convolution for every combination of dimension (multiplied by dimensionStep) and kernel size (advanced by kernelStep).
+   virtual void
+   time( Benchmark& benchmark,
+         const Vector& minDimension,
+         const Vector& maxDimension,
+         const int dimensionStep,
+         const Vector& minKernelSize,
+         const Vector& maxKernelSize,
+         const int kernelStep ) const
+   {
+      Vector currentDimension = minDimension;
+      Vector currentKernelSize;
+      // Both loops stop via StaticVector's operator< (presumably a component-wise comparison - TODO confirm).
+      do {
+         currentKernelSize = minKernelSize;
+
+         do {
+            timeConvolution( benchmark, currentDimension, currentKernelSize );
+
+            currentKernelSize[ 0 ] += kernelStep;
+            // Carry: an axis that reached its maximum restarts and the next axis advances.
+            for( int i = 0; i < Dimension - 1; i++ ) {
+               if( currentKernelSize[ i ] >= maxKernelSize[ i ] ) {
+                  currentKernelSize[ i ] = minKernelSize[ i ];
+                  currentKernelSize[ i + 1 ] += kernelStep;
+               }
+            }
+         } while( currentKernelSize < maxKernelSize );
+
+         currentDimension[ 0 ] *= dimensionStep;
+
+         for( int i = 0; i < Dimension - 1; i++ ) {
+            if( currentDimension[ i ] >= maxDimension[ i ] ) {
+               currentDimension[ i ] = minDimension[ i ];
+               currentDimension[ i + 1 ] *= dimensionStep;
+            }
+         }
+
+      } while( currentDimension < maxDimension );
+   }
+   // Measures one convolution of the given grid dimension and kernel size on arrays filled with ones.
+   void
+   timeConvolution( Benchmark& benchmark, const Vector& dimension, const Vector& kernelSize ) const
+   {
+      auto device = TNL::getType< Device >();
+
+      Benchmark::MetadataColumns columns;
+
+      size_t elementsCount = 1;
+      size_t kernelElementsCount = 1;
+
+      for( int i = 0; i < Dimension; i++ ) {
+         elementsCount *= dimension[ i ];
+         kernelElementsCount *= kernelSize[ i ];
+
+         columns.push_back( { dimensionIds[ i ], std::to_string( dimension[ i ] ) } );
+         columns.push_back( { kernelSizeIds[ i ], std::to_string( kernelSize[ i ] ) } );
+      }
+      benchmark.setMetadataColumns( columns );
+      benchmark.setDatasetSize( ( elementsCount * sizeof( float ) ) / 1.e9, 1.0 );
+
+      // Setup input data
+      DataStore input, result, kernel;
+
+      input.resize( elementsCount );
+      result.resize( elementsCount );
+      kernel.resize( kernelElementsCount );
+
+      input = 1;
+      result = 1;
+      kernel = 1;
+
+      auto inputView = input.getView();
+      auto resultView = result.getView();
+      auto kernelView = kernel.getView();
+
+      auto measure = [ & ]()
+      {
+         DummyTask< int, float, Dimension, Device >::exec( dimension, kernelSize, inputView, resultView, kernelView );
+      };
+
+      benchmark.time< Device >( device, measure );
+   }
+   // Extends the base options with the per-axis sweep bounds and the two step sizes.
+   TNL::Config::ConfigDescription
+   makeInputConfig() const override
+   {
+      auto config = Base::makeInputConfig();
+
+      config.addDelimiter( "Grid dimension settings:" );
+
+      for( int i = 0; i < Dimension; i++ )
+         config.addEntry< int >( minDimensionIds[ i ], minDimensionIds[ i ], 512 );
+
+      for( int i = 0; i < Dimension; i++ )
+         config.addEntry< int >( maxDimensionIds[ i ], maxDimensionIds[ i ], 512 );
+
+      config.addEntry< int >( "dimension-step", "Factor by which the dimension is multiplied (must be greater than one)", 2 );
+
+      config.addDelimiter( "Kernel settings:" );
+
+      for( int i = 0; i < Dimension; i++ )
+         config.addEntry< int >( minKernelSizeIds[ i ], minKernelSizeIds[ i ] + " (odd) :", 1 );
+
+      for( int i = 0; i < Dimension; i++ )
+         config.addEntry< int >( maxKernelSizeIds[ i ], maxKernelSizeIds[ i ] + " (odd) :", 11 );
+
+      config.addEntry< int >( "kernel-step", "Step of kernel increase which is added to kernel (must be even)", 2 );
+
+      return config;
+   }
+};
diff --git a/src/Benchmarks/Convolution/support/DummySolver.h b/src/Benchmarks/Convolution/support/DummySolver.h
new file mode 100644
index 000000000..a871c7f3f
--- /dev/null
+++ b/src/Benchmarks/Convolution/support/DummySolver.h
@@ -0,0 +1,84 @@
+
+#pragma once
+
+#include "Solver.h"
+#include "DummyTask.h"
+// Names of the per-axis command-line parameters read by DummySolver (index 0 = x, 1 = y, 2 = z).
+static std::vector< TNL::String > dimensionIds = { "x-dimension", "y-dimension", "z-dimension" };
+static std::vector< TNL::String > kernelSizeIds = { "x-kernel-size", "y-kernel-size", "z-kernel-size" };
+// Single-shot driver: runs one dummy convolution of the configured size through DummyTask.
+template< int Dimension, typename Device >
+class DummySolver : public Solver< Dimension, Device >
+{
+public:
+   using Base = Solver< Dimension, Device >;
+   using Vector = TNL::Containers::StaticVector< Dimension, int >;
+   using DataStore = TNL::Containers::Array< float, Device, int >;
+   // Reads the per-axis grid and kernel sizes, checks them and launches the convolution.
+   void
+   start( const TNL::Config::ParameterContainer& parameters ) const override
+   {
+      Vector dimensions;
+      Vector kernelSize;
+
+      for( int i = 0; i < Dimension; ++i ) {
+         dimensions[ i ] = parameters.getParameter< int >( dimensionIds[ i ] );
+         kernelSize[ i ] = parameters.getParameter< int >( kernelSizeIds[ i ] );
+
+         TNL_ASSERT_GT( dimensions[ i ], 1, "Start dimension must be positive integer" );
+
+         TNL_ASSERT_GE( kernelSize[ i ], 1, "Minimal kernel size must be a positive number" );
+         TNL_ASSERT_EQ( kernelSize[ i ] % 2, 1, "Minimal kernel size must be odd" );
+      }
+
+      launchConvolution( dimensions, kernelSize );
+   }
+   // Allocates input, result and kernel arrays filled with ones and runs DummyTask on their views.
+   void
+   launchConvolution( const Vector& dimension, const Vector& kernelSize ) const
+   {
+      // Element counts are the products of the per-axis extents.
+      size_t gridVolume = 1;
+      size_t stencilVolume = 1;
+
+      for( int axis = 0; axis < Dimension; ++axis ) {
+         gridVolume *= dimension[ axis ];
+         stencilVolume *= kernelSize[ axis ];
+      }
+
+      DataStore input;
+      input.resize( gridVolume );
+      input = 1;
+
+      DataStore result;
+      result.resize( gridVolume );
+      result = 1;
+
+      DataStore kernel;
+      kernel.resize( stencilVolume );
+      kernel = 1;
+
+      DummyTask< int, float, Dimension, Device >::exec(
+         dimension, kernelSize, input.getView(), result.getView(), kernel.getView() );
+
+      std::cout << "Everything is fine" << std::endl;
+   }
+   // Extends the base configuration with one grid-size and one kernel-size entry per axis.
+   TNL::Config::ConfigDescription
+   makeInputConfig() const override
+   {
+      TNL::Config::ConfigDescription config = Base::makeInputConfig();
+
+      config.addDelimiter( "Grid dimension settings:" );
+      for( int axis = 0; axis < Dimension; ++axis ) {
+         const TNL::String& name = dimensionIds[ axis ];
+         config.addEntry< int >( name, name, 512 );
+      }
+
+      config.addDelimiter( "Kernel settings:" );
+      for( int axis = 0; axis < Dimension; ++axis )
+         config.addEntry< int >( kernelSizeIds[ axis ], kernelSizeIds[ axis ] + " (odd) :", 11 );
+
+      return config;
+   }
+};
diff --git a/src/Benchmarks/Convolution/support/DummyTask.h b/src/Benchmarks/Convolution/support/DummyTask.h
new file mode 100644
index 000000000..22565ac1b
--- /dev/null
+++ b/src/Benchmarks/Convolution/support/DummyTask.h
@@ -0,0 +1,153 @@
+
+#pragma once
+
+#include "Launcher.h"
+
+template< typename Index, typename Real, int Dimension, typename Device >
+struct DummyTask;
+// Dummy 1D task: multiply-accumulates input samples with kernel weights; its boundary functor returns 1.
+template< typename Index, typename Real >
+struct DummyTask< Index, Real, 1, TNL::Devices::Cuda >
+{
+public:
+   static constexpr int Dimension = 1;
+   using Device = TNL::Devices::Cuda;
+   using Vector = TNL::Containers::StaticVector< Dimension, Index >;
+   using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
+   using Launcher = ::Launcher< Dimension, Device >;  // qualified, because the alias reuses the template's name
+   // Builds the device functors over the given views and hands them to the launcher.
+   static void
+   exec( const Vector& dimensions, const Vector& kernelSize, DataStore input, DataStore result, DataStore kernel )
+   {
+      auto fetchData = [ = ] __cuda_callable__( Index i )
+      {
+         return input[ i ];
+      };
+      // Value handed to the launcher for samples it reads through the boundary functor.
+      auto fetchBoundary = [ = ] __cuda_callable__( Index i )
+      {
+         return 1;
+      };
+
+      auto fetchKernel = [ = ] __cuda_callable__( Index i )
+      {
+         return kernel[ i ];
+      };
+      // Operands are Real: fetched samples and kernel weights must not be truncated to Index.
+      auto convolve = [ = ] __cuda_callable__( Real result, Real data, Real kernel )
+      {
+         return result + data * kernel;
+      };
+
+      auto store = [ = ] __cuda_callable__( Index i, Real resultValue ) mutable
+      {
+         result[ i ] = resultValue;
+      };
+      // The functors are passed as rvalues, so the launcher deduces non-reference functor types.
+      Launcher::exec< Index, Real >( dimensions,
+                                     kernelSize,
+                                     std::forward< decltype( fetchData ) >( fetchData ),
+                                     std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
+                                     std::forward< decltype( fetchKernel ) >( fetchKernel ),
+                                     std::forward< decltype( convolve ) >( convolve ),
+                                     std::forward< decltype( store ) >( store ) );
+   }
+};
+
+// template< typename Index, typename Real >
+// struct DummyTask< Index, Real, 2, TNL::Devices::Cuda >
+// {
+// public:
+//    static constexpr int Dimension = 2;
+//    using Device = TNL::Devices::Cuda;
+//    using Vector = TNL::Containers::StaticVector< Dimension, Index >;
+//    using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
+//    using Launcher = Launcher< Dimension, Device >;
+
+//    static void
+//    exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
+//    {
+//       auto fetchData = [ = ] __cuda_callable__( Index i, Index j )
+//       {
+//          auto index = i + j * dimensions.x();
+
+//          return input[ index ];
+//       };
+
+//       auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j )
+//       {
+//          return -1;
+//       };
+
+//       auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j )
+//       {
+//          auto index = i + j * kernel.x();
+
+//          return kernel[ index ];
+//       };
+
+//       auto convolve = [ = ] __cuda_callable__( Real result, Index data, Index kernel )
+//       {
+//          return result + data * kernel;
+//       };
+
+//       auto store = [ = ] __cuda_callable__( Index i, Index j, Real resultValue )
+//       {
+//          auto index = i + j * dimensions.x();
+
+//          result[ index ] = resultValue;
+//       };
+
+//       Launcher::exec< Index >( dimensions,
+//                                kernelSize,
+//                                std::forward< decltype( fetchData ) >( fetchData ),
+//                                std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
+//                                std::forward< decltype( fetchKernel ) >( fetchKernel ),
+//                                std::forward< decltype( convolve ) >( convolve ),
+//                                std::forward< decltype( store ) >( store ) );
+//    }
+// };
+
+// template< typename Index, typename Real >
+// struct DummyTask< Index, Real, 3, TNL::Devices::Cuda >
+// {
+// public:
+//    static constexpr int Dimension = 3;
+//    using Device = TNL::Devices::Cuda;
+//    using Vector = TNL::Containers::StaticVector< Dimension, Index >;
+//    using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
+//    using Launcher = Launcher< Dimension, Device >;
+
+//    static void
+//    exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
+//    {
+//       auto fetchData = [ = ] __cuda_callable__( Index i, Index j, Index k ) {
+
+//       };
+
+//       auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j, Index k ) {
+
+//       };
+
+//       auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j, Index k ) {
+
+//       };
+
+//       auto convolve = [ = ] __cuda_callable__( float result, Index data, Index kernel )
+//       {
+//          return result + data * kernel;
+//       };
+
+//       auto store = [ = ] __cuda_callable__( Index i, Index j, Index k, Real result ) {
+
+//       };
+
+//       Launcher::exec< Index >( dimensions,
+//                                kernelSize,
+//                                std::forward< decltype( fetchData ) >( fetchData ),
+//                                std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
+//                                std::forward< decltype( fetchKernel ) >( fetchKernel ),
+//                                std::forward< decltype( convolve ) >( convolve ),
+//                                std::forward< decltype( store ) >( store ) );
+//    }
+// };
diff --git a/src/Benchmarks/Convolution/support/Launcher.h b/src/Benchmarks/Convolution/support/Launcher.h
new file mode 100644
index 000000000..c86ed2057
--- /dev/null
+++ b/src/Benchmarks/Convolution/support/Launcher.h
@@ -0,0 +1,218 @@
+
+#pragma once
+
+#include <TNL/Containers/StaticVector.h>
+#include <TNL/Cuda/KernelLaunch.h>
+
+template< int Dimension, typename Device >
+struct Convolution;
+
+template< int Dimension, typename Device >
+struct Launcher;
+// Host-side launcher of the 1D convolution kernel on a CUDA device.
+template<>
+struct Launcher< 1, TNL::Devices::Cuda >
+{
+public:
+   using Vector = TNL::Containers::StaticVector< 1, int >;
+   using ConvolutionKernel = Convolution< 1, TNL::Devices::Cuda >;
+   // Functors are taken by value: the deduced types are then never references, so convolution1D always receives copies.
+   template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store >
+   static inline void
+   exec( const Vector& dimensions,
+         const Vector& kernelSize,
+         FetchData fetchData,
+         FetchBoundary fetchBoundary,
+         FetchKernel fetchKernel,
+         Convolve convolve,
+         Store store )
+   {
+      TNL::Cuda::LaunchConfiguration launchConfig;
+
+      launchConfig.dynamicSharedMemorySize =
+         ConvolutionKernel::getDynamicSharedMemorySize< Index >( kernelSize.x(), dimensions.x() );
+
+      // TODO: - Benchmark the best value
+      launchConfig.blockSize.x = 256;
+      launchConfig.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), launchConfig.blockSize.x ) );
+      // A grid capped by getMaxGridSize() may not cover all elements in one pass; a smaller grid is used then.
+      if( (std::size_t) launchConfig.blockSize.x * launchConfig.gridSize.x < (std::size_t) dimensions.x() ) {
+         const int desGridSize = 32 * TNL::Cuda::DeviceInfo::getCudaMultiprocessors( TNL::Cuda::DeviceInfo::getActiveDevice() );
+         // NOTE(review): in this branch the kernel has to stride over the remaining elements itself.
+         launchConfig.gridSize.x =
+            TNL::min( desGridSize, TNL::Cuda::getNumberOfBlocks( dimensions.x(), launchConfig.blockSize.x ) );
+      }
+
+      constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+
+      TNL::Cuda::launchKernel< true >( kernel,
+                                       0,
+                                       launchConfig,
+                                       kernelSize.x(),
+                                       dimensions.x(),
+                                       fetchData,
+                                       fetchBoundary,
+                                       fetchKernel,
+                                       convolve,
+                                       store );
+   }
+};
+
+// template<>
+// struct Launcher< 2, TNL::Devices::Cuda >
+// {
+// public:
+//    using Vector = TNL::Containers::StaticVector< 2, int >;
+//    using ConvolutionKernel = Convolution< 2, TNL::Devices::Cuda >;
+
+//    template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store >
+//    static inline void
+//    exec( const Vector& dimensions,
+//          const Vector& kernelSize,
+//          FetchData&& fetchData,
+//          FetchBoundary&& fetchBoundary,
+//          FetchKernel&& fetchKernel,
+//          Convolve&& convolve,
+//          Store&& store )
+//    {
+//       TNL::Cuda::LaunchConfiguration launchConfig;
+
+//       launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >(
+//          kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y() );
+
+//       const Index sizeX = dimensions.x();
+//       const Index sizeY = dimensions.y();
+
+//       if( sizeX >= sizeY * sizeY ) {
+//          launchConfig.blockSize.x = TNL::min( 256, sizeX );
+//          launchConfig.blockSize.y = 1;
+//       }
+//       else if( sizeY >= sizeX * sizeX ) {
+//          launchConfig.blockSize.x = 1;
+//          launchConfig.blockSize.y = TNL::min( 256, sizeY );
+//       }
+//       else {
+//          launchConfig.blockSize.x = TNL::min( 32, sizeX );
+//          launchConfig.blockSize.y = TNL::min( 8, sizeY );
+//       }
+
+//       launchConfig.gridSize.x =
+//          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) );
+//       launchConfig.gridSize.y =
+//          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) );
+
+//       dim3 gridCount;
+
+//       gridCount.x = roundUpDivision( sizeX, launchConfig.blockSize.x * launchConfig.gridSize.x );
+//       gridCount.y = roundUpDivision( sizeY, launchConfig.blockSize.y * launchConfig.gridSize.y );
+
+//       constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+
+//       TNL::Cuda::launchKernel< true >( kernel,
+//                                        0,
+//                                        launchConfig,
+//                                        kernelSize.x(),
+//                                        kernelSize.y(),
+//                                        dimensions.x(),
+//                                        dimensions.y(),
+//                                        std::forward< FetchData >( fetchData ),
+//                                        std::forward< FetchBoundary >( fetchBoundary ),
+//                                        std::forward< FetchKernel >( fetchKernel ),
+//                                        std::forward< Convolve >( convolve ),
+//                                        std::forward< Store >( store ) );
+//    }
+// };
+
+// template<>
+// struct Launcher< 3, TNL::Devices::Cuda >
+// {
+// public:
+//    using Vector = TNL::Containers::StaticVector< 3, int >;
+//    using ConvolutionKernel = Convolution< 3, TNL::Devices::Cuda >;
+
+//    template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store >
+//    static inline void
+//    exec( const Vector& dimensions,
+//          const Vector& kernelSize,
+//          FetchData&& fetchData,
+//          FetchBoundary&& fetchBoundary,
+//          FetchKernel&& fetchKernel,
+//          Convolve&& convolve,
+//          Store&& store )
+//    {
+//       const Index sizeX = dimensions.x();
+//       const Index sizeY = dimensions.y();
+//       const Index sizeZ = dimensions.z();
+
+//       TNL::Cuda::LaunchConfiguration launchConfig;
+
+//       launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >(
+//          kernelSize.x(), kernelSize.y(), kernelSize.z(), dimensions.x(), dimensions.y(), dimensions.z() );
+
+//       if( sizeX >= sizeY * sizeY * sizeZ * sizeZ ) {
+//          launchConfig.blockSize.x = TNL::min( 256, sizeX );
+//          launchConfig.blockSize.y = 1;
+//          launchConfig.blockSize.z = 1;
+//       }
+//       else if( sizeY >= sizeX * sizeX * sizeZ * sizeZ ) {
+//          launchConfig.blockSize.x = 1;
+//          launchConfig.blockSize.y = TNL::min( 256, sizeY );
+//          launchConfig.blockSize.z = 1;
+//       }
+//       else if( sizeZ >= sizeX * sizeX * sizeY * sizeY ) {
+//          launchConfig.blockSize.x = TNL::min( 2, sizeX );
+//          launchConfig.blockSize.y = TNL::min( 2, sizeY );
+//          // CUDA allows max 64 for launchConfig.blockSize.z
+//          launchConfig.blockSize.z = TNL::min( 64, sizeZ );
+//       }
+//       else if( sizeX >= sizeZ * sizeZ && sizeY >= sizeZ * sizeZ ) {
+//          launchConfig.blockSize.x = TNL::min( 32, sizeX );
+//          launchConfig.blockSize.y = TNL::min( 8, sizeY );
+//          launchConfig.blockSize.z = 1;
+//       }
+//       else if( sizeX >= sizeY * sizeY && sizeZ >= sizeY * sizeY ) {
+//          launchConfig.blockSize.x = TNL::min( 32, sizeX );
+//          launchConfig.blockSize.y = 1;
+//          launchConfig.blockSize.z = TNL::min( 8, sizeZ );
+//       }
+//       else if( sizeY >= sizeX * sizeX && sizeZ >= sizeX * sizeX ) {
+//          launchConfig.blockSize.x = 1;
+//          launchConfig.blockSize.y = TNL::min( 32, sizeY );
+//          launchConfig.blockSize.z = TNL::min( 8, sizeZ );
+//       }
+//       else {
+//          launchConfig.blockSize.x = TNL::min( 16, sizeX );
+//          launchConfig.blockSize.y = TNL::min( 4, sizeY );
+//          launchConfig.blockSize.z = TNL::min( 4, sizeZ );
+//       }
+//       launchConfig.gridSize.x =
+//          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) );
+//       launchConfig.gridSize.y =
+//          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) );
+//       launchConfig.gridSize.z =
+//          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeZ, launchConfig.blockSize.z ) );
+
+//       dim3 gridCount;
+//       gridCount.x = roundUpDivision( sizeX, launchConfig.blockSize.x * launchConfig.gridSize.x );
+//       gridCount.y = roundUpDivision( sizeY, launchConfig.blockSize.y * launchConfig.gridSize.y );
+//       gridCount.z = roundUpDivision( sizeZ, launchConfig.blockSize.z * launchConfig.gridSize.z );
+
+//       constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+
+//       TNL::Cuda::launchKernel< true >( kernel,
+//                                        0,
+//                                        launchConfig,
+//                                        kernelSize.x(),
+//                                        kernelSize.y(),
+//                                        kernelSize.z(),
+//                                        dimensions.x(),
+//                                        dimensions.y(),
+//                                        dimensions.z(),
+//                                        std::forward< FetchData >( fetchData ),
+//                                        std::forward< FetchBoundary >( fetchBoundary ),
+//                                        std::forward< FetchKernel >( fetchKernel ),
+//                                        std::forward< Convolve >( convolve ),
+//                                        std::forward< Store >( store ) );
+//    }
+// };
diff --git a/src/Benchmarks/Convolution/support/Solver.h b/src/Benchmarks/Convolution/support/Solver.h
new file mode 100644
index 000000000..a6b1d2c91
--- /dev/null
+++ b/src/Benchmarks/Convolution/support/Solver.h
@@ -0,0 +1,52 @@
+
+#pragma once
+
+#include <vector>
+
+#include <TNL/Containers/StaticVector.h>
+#include <TNL/Containers/Array.h>
+
+#include "Launcher.h"
+
+template< int Dimension, typename Device >
+class Solver
+{
+public:
+   // Polymorphic base (virtual members below): keep destruction through a base pointer well defined.
+   virtual ~Solver() = default;
+
+   // Runs the Host and Cuda device setup on `parameters`, then calls start(); returns without starting if either setup fails.
+   void
+   solve( const TNL::Config::ParameterContainer& parameters ) const
+   {
+      if( ! TNL::Devices::Host::setup( parameters ) || ! TNL::Devices::Cuda::setup( parameters ) )
+         return;
+      start( parameters );
+   }
+
+   // Hook invoked by solve(); this base version only trips an assertion.
+   virtual void
+   start( const TNL::Config::ParameterContainer& parameters ) const
+   {
+      TNL_ASSERT_TRUE( false, "Should be overriden" );
+   }
+
+   // Builds the configuration description: the "device" entry with its allowed values plus the device-specific entries.
+   virtual TNL::Config::ConfigDescription
+   makeInputConfig() const
+   {
+      TNL::Config::ConfigDescription config;
+      config.addEntry< TNL::String >( "device", "Device the computation will run on.", "cuda" );
+      config.addEntryEnum< TNL::String >( "all" );
+      config.addEntryEnum< TNL::String >( "host" );
+#ifdef HAVE_CUDA
+      config.addEntryEnum< TNL::String >( "cuda" );
+#endif
+      config.addDelimiter( "Device settings:" );
+      TNL::Devices::Host::configSetup( config );
+#ifdef HAVE_CUDA
+      TNL::Devices::Cuda::configSetup( config );
+#endif
+      return config;
+   }
+};
diff --git a/src/Benchmarks/Convolution/templates/main_benchmark.h b/src/Benchmarks/Convolution/templates/main_benchmark.h
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/Benchmarks/Convolution/templates/main_solver.h b/src/Benchmarks/Convolution/templates/main_solver.h
new file mode 100644
index 000000000..1a6c33a9b
--- /dev/null
+++ b/src/Benchmarks/Convolution/templates/main_solver.h
@@ -0,0 +1,25 @@
+
+#include "../kernels/naive.h"
+#include "../support/DummySolver.h"
+
+#include <TNL/Config/parseCommandLine.h>
+
+#define DIMENSION DIMENSION_VALUE
+
+using TaskSolver = DummySolver< DIMENSION, TNL::Devices::Cuda >;
+
+int main(int argc, char* argv[])
+{
+   // The solver provides the configuration the command line is parsed against.
+   TaskSolver taskSolver;
+   auto inputConfig = taskSolver.makeInputConfig();
+
+   // Fill the parameter container from the command line; give up when parsing reports failure.
+   TNL::Config::ParameterContainer parsedParameters;
+   if( ! parseCommandLine( argc, argv, inputConfig, parsedParameters ) )
+      return EXIT_FAILURE;
+
+   // Hand the parsed parameters over to the solver.
+   taskSolver.solve( parsedParameters );
+   return 0;
+}
-- 
GitLab


From 5a5a294c8ee811687be19bd7cfd05832d57decbe Mon Sep 17 00:00:00 2001
From: hayeuyur <hayeuyur@fit.cvut.cz>
Date: Sat, 2 Apr 2022 13:15:37 +0200
Subject: [PATCH 02/19] Implement naive 2D kernel

---
 src/Benchmarks/Convolution/CMakeLists.txt     |   1 +
 src/Benchmarks/Convolution/kernels/naive.h    |  99 ++++++++--------
 .../Convolution/support/DummyTask.h           |  88 +++++++-------
 src/Benchmarks/Convolution/support/Launcher.h | 109 +++++++++---------
 4 files changed, 151 insertions(+), 146 deletions(-)

diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt
index 22d0876bc..6e31beaeb 100644
--- a/src/Benchmarks/Convolution/CMakeLists.txt
+++ b/src/Benchmarks/Convolution/CMakeLists.txt
@@ -23,3 +23,4 @@ endif()
 endfunction()
 
 GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/naive.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/naive.h")
diff --git a/src/Benchmarks/Convolution/kernels/naive.h b/src/Benchmarks/Convolution/kernels/naive.h
index a9e00d890..76c73237d 100644
--- a/src/Benchmarks/Convolution/kernels/naive.h
+++ b/src/Benchmarks/Convolution/kernels/naive.h
@@ -56,55 +56,64 @@ convolution1D( Index kernelWidth,
    store( ix, result );
 }
 
-// template<>
-// struct Convolution< 2, TNL::Devices::Cuda >
-// {
-// public:
-//    template< typename Index >
-//    static size_t
-//    getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index endX, Index endY )
-//    {
-//       return 0;
-//    }
-// };
+template<>
+struct Convolution< 2, TNL::Devices::Cuda >  // 2D specialization for the Cuda device
+{
+public:
+   template< typename Index >
+   static size_t
+   getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index endX, Index endY )
+   {
+      return 0;  // always zero; none of the arguments is consulted
+   }
+};
 
-// template< typename Index,
-//           typename Real,
-//           typename FetchData,
-//           typename FetchBoundary,
-//           typename FetchKernel,
-//           typename Convolve,
-//           typename Store >
-// __global__
-// static void
-// convolution2D( Index kernelWidth,
-//                Index kernelHeight,
-//                Index endX,
-//                Index endY,
-//                FetchData& fetchData,
-//                FetchBoundary& fetchBoundary,
-//                FetchKernel& fetchKernel,
-//                Convolve& convolve,
-//                Store& store )
-// {
-//    int iy = threadIdx.y + blockIdx.y * blockDim.y;
-//    int ix = threadIdx.x + blockIdx.x * blockDim.x;
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename FetchKernel,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution2D( Index kernelWidth,            // number of kernel taps along x
+               Index kernelHeight,           // number of kernel taps along y
+               Index endX,                   // x indices >= endX are read through fetchBoundary
+               Index endY,                   // y indices >= endY are read through fetchBoundary
+               FetchData fetchData,          // ( x, y ) -> input sample inside the grid
+               FetchBoundary fetchBoundary,  // ( x, y ) -> substitute sample outside the grid
+               FetchKernel fetchKernel,      // ( kx, ky ) -> kernel tap
+               Convolve convolve,            // ( accumulator, sample, tap ) -> new accumulator
+               Store store )                 // ( x, y, value ) receives this thread's result
+{
+   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
+   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
 
-//    Real result = 0;
+   Index radiusY = kernelHeight >> 1;
+   Index radiusX = kernelHeight >> 1;
 
-//    for( Index j = iy - kernelHeight; j <= iy + kernelHeight; j++ ) {
-//       for( Index i = ix - kernelWidth; i <= ix + kernelWidth; i++ ) {
-//          if( i < 0 || i >= endX || j < 0 || j >= endY ) {
-//             result = convolve( result, fetchBoundary( i, j ) );
-//          }
-//          else {
-//             result = convolve( result, fetchData( i, j ), fetchKernel( i, j ) );
-//          }
-//       }
-//    }
+   Real result = 0;
 
-//    store( ix, iy, result );
-// }
+   // Iterate over the kernel's own indices so an even-sized kernel is never read one tap past its end.
+   for( Index kernelIndexY = 0; kernelIndexY < kernelHeight; kernelIndexY++ ) {
+      Index elementIndexY = iy + kernelIndexY - radiusY;
+
+      for( Index kernelIndexX = 0; kernelIndexX < kernelWidth; kernelIndexX++ ) {
+         Index elementIndexX = ix + kernelIndexX - radiusX;
+
+         // Samples that fall outside the input extents come from the boundary functor.
+         if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY ) {
+            result = convolve( result, fetchBoundary( elementIndexX, elementIndexY ), fetchKernel( kernelIndexX, kernelIndexY ) );
+         }
+         else {
+            result = convolve( result, fetchData( elementIndexX, elementIndexY ), fetchKernel( kernelIndexX, kernelIndexY ) );
+         }
+      }
+   }
+
+   store( ix, iy, result );
+}
 
 // template<>
 // struct Convolution< 3, TNL::Devices::Cuda >
diff --git a/src/Benchmarks/Convolution/support/DummyTask.h b/src/Benchmarks/Convolution/support/DummyTask.h
index 22565ac1b..f92a5c2fc 100644
--- a/src/Benchmarks/Convolution/support/DummyTask.h
+++ b/src/Benchmarks/Convolution/support/DummyTask.h
@@ -54,59 +54,59 @@ public:
    }
 };
 
-// template< typename Index, typename Real >
-// struct DummyTask< Index, Real, 2, TNL::Devices::Cuda >
-// {
-// public:
-//    static constexpr int Dimension = 2;
-//    using Device = TNL::Devices::Cuda;
-//    using Vector = TNL::Containers::StaticVector< Dimension, Index >;
-//    using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
-//    using Launcher = Launcher< Dimension, Device >;
+template< typename Index, typename Real >
+struct DummyTask< Index, Real, 2, TNL::Devices::Cuda >
+{
+public:
+   static constexpr int Dimension = 2;
+   using Device = TNL::Devices::Cuda;
+   using Vector = TNL::Containers::StaticVector< Dimension, Index >;
+   using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
+   using Launcher = Launcher< Dimension, Device >;
 
-//    static void
-//    exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
-//    {
-//       auto fetchData = [ = ] __cuda_callable__( Index i, Index j )
-//       {
-//          auto index = i + j * dimensions.x();
+   static void
+   exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
+   {
+      auto fetchData = [ = ] __cuda_callable__( Index i, Index j )
+      {
+         auto index = i + j * dimensions.x();
 
-//          return input[ index ];
-//       };
+         return input[ index ];
+      };
 
-//       auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j )
-//       {
-//          return -1;
-//       };
+      auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j )
+      {
+         return -1;
+      };
 
-//       auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j )
-//       {
-//          auto index = i + j * kernel.x();
+      auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j )
+      {
+         auto index = i + j * kernelSize.x();
 
-//          return kernel[ index ];
-//       };
+         return kernel[ index ];
+      };
 
-//       auto convolve = [ = ] __cuda_callable__( Real result, Index data, Index kernel )
-//       {
-//          return result + data * kernel;
-//       };
+      auto convolve = [ = ] __cuda_callable__( Real result, Real data, Real kernel )  // samples and taps are Real values, not indices
+      {
+         return result + data * kernel;  // add one sample-times-tap product to the accumulator
+      };
 
-//       auto store = [ = ] __cuda_callable__( Index i, Index j, Real resultValue )
-//       {
-//          auto index = i + j * dimensions.x();
+      auto store = [ = ] __cuda_callable__( Index i, Index j, Real resultValue ) mutable
+      {
+         auto index = i + j * dimensions.x();
 
-//          result[ index ] = resultValue;
-//       };
+         result[ index ] = resultValue;
+      };
 
-//       Launcher::exec< Index >( dimensions,
-//                                kernelSize,
-//                                std::forward< decltype( fetchData ) >( fetchData ),
-//                                std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
-//                                std::forward< decltype( fetchKernel ) >( fetchKernel ),
-//                                std::forward< decltype( convolve ) >( convolve ),
-//                                std::forward< decltype( store ) >( store ) );
-//    }
-// };
+      Launcher::exec< Index, Real >( dimensions,
+                                     kernelSize,
+                                     std::forward< decltype( fetchData ) >( fetchData ),
+                                     std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
+                                     std::forward< decltype( fetchKernel ) >( fetchKernel ),
+                                     std::forward< decltype( convolve ) >( convolve ),
+                                     std::forward< decltype( store ) >( store ) );
+   }
+};
 
 // template< typename Index, typename Real >
 // struct DummyTask< Index, Real, 3, TNL::Devices::Cuda >
diff --git a/src/Benchmarks/Convolution/support/Launcher.h b/src/Benchmarks/Convolution/support/Launcher.h
index c86ed2057..fde1e91ab 100644
--- a/src/Benchmarks/Convolution/support/Launcher.h
+++ b/src/Benchmarks/Convolution/support/Launcher.h
@@ -59,70 +59,65 @@ public:
    }
 };
 
-// template<>
-// struct Launcher< 2, TNL::Devices::Cuda >
-// {
-// public:
-//    using Vector = TNL::Containers::StaticVector< 2, int >;
-//    using ConvolutionKernel = Convolution< 2, TNL::Devices::Cuda >;
-
-//    template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store >
-//    static inline void
-//    exec( const Vector& dimensions,
-//          const Vector& kernelSize,
-//          FetchData&& fetchData,
-//          FetchBoundary&& fetchBoundary,
-//          FetchKernel&& fetchKernel,
-//          Convolve&& convolve,
-//          Store&& store )
-//    {
-//       TNL::Cuda::LaunchConfiguration launchConfig;
-
-//       launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >(
-//          kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y() );
+template<>
+struct Launcher< 2, TNL::Devices::Cuda >
+{
+public:
+   using Vector = TNL::Containers::StaticVector< 2, int >;
+   using ConvolutionKernel = Convolution< 2, TNL::Devices::Cuda >;
 
-//       const Index sizeX = dimensions.x();
-//       const Index sizeY = dimensions.y();
+   template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store >
+   static inline void
+   exec( const Vector& dimensions,
+         const Vector& kernelSize,
+         FetchData&& fetchData,
+         FetchBoundary&& fetchBoundary,
+         FetchKernel&& fetchKernel,
+         Convolve&& convolve,
+         Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration launchConfig;
 
-//       if( sizeX >= sizeY * sizeY ) {
-//          launchConfig.blockSize.x = TNL::min( 256, sizeX );
-//          launchConfig.blockSize.y = 1;
-//       }
-//       else if( sizeY >= sizeX * sizeX ) {
-//          launchConfig.blockSize.x = 1;
-//          launchConfig.blockSize.y = TNL::min( 256, sizeY );
-//       }
-//       else {
-//          launchConfig.blockSize.x = TNL::min( 32, sizeX );
-//          launchConfig.blockSize.y = TNL::min( 8, sizeY );
-//       }
+      launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >(
+         kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y() );
 
-//       launchConfig.gridSize.x =
-//          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) );
-//       launchConfig.gridSize.y =
-//          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) );
+      const Index sizeX = dimensions.x();
+      const Index sizeY = dimensions.y();
 
-//       dim3 gridCount;
+      if( sizeX >= sizeY * sizeY ) {
+         launchConfig.blockSize.x = TNL::min( 256, sizeX );
+         launchConfig.blockSize.y = 1;
+      }
+      else if( sizeY >= sizeX * sizeX ) {
+         launchConfig.blockSize.x = 1;
+         launchConfig.blockSize.y = TNL::min( 256, sizeY );
+      }
+      else {
+         launchConfig.blockSize.x = TNL::min( 32, sizeX );
+         launchConfig.blockSize.y = TNL::min( 8, sizeY );
+      }
 
-//       gridCount.x = roundUpDivision( sizeX, launchConfig.blockSize.x * launchConfig.gridSize.x );
-//       gridCount.y = roundUpDivision( sizeY, launchConfig.blockSize.y * launchConfig.gridSize.y );
+      launchConfig.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) );
+      launchConfig.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) );
 
-//       constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+      constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
 
-//       TNL::Cuda::launchKernel< true >( kernel,
-//                                        0,
-//                                        launchConfig,
-//                                        kernelSize.x(),
-//                                        kernelSize.y(),
-//                                        dimensions.x(),
-//                                        dimensions.y(),
-//                                        std::forward< FetchData >( fetchData ),
-//                                        std::forward< FetchBoundary >( fetchBoundary ),
-//                                        std::forward< FetchKernel >( fetchKernel ),
-//                                        std::forward< Convolve >( convolve ),
-//                                        std::forward< Store >( store ) );
-//    }
-// };
+      TNL::Cuda::launchKernel< true >( kernel,
+                                       0,
+                                       launchConfig,
+                                       kernelSize.x(),
+                                       kernelSize.y(),
+                                       dimensions.x(),
+                                       dimensions.y(),
+                                       fetchData,
+                                       fetchBoundary,
+                                       fetchKernel,
+                                       convolve,
+                                       store );
+   }
+};
 
 // template<>
 // struct Launcher< 3, TNL::Devices::Cuda >
-- 
GitLab


From c710d0b218937bc01e84419f4894ba009d9c87ef Mon Sep 17 00:00:00 2001
From: hayeuyur <hayeuyur@fit.cvut.cz>
Date: Sat, 2 Apr 2022 14:22:51 +0200
Subject: [PATCH 03/19] Implement naive 3D kernel

---
 src/Benchmarks/Convolution/CMakeLists.txt     |   2 +-
 src/Benchmarks/Convolution/kernels/naive.h    | 133 +++++++------
 .../Convolution/support/DummyTask.h           |  84 ++++----
 src/Benchmarks/Convolution/support/Launcher.h | 180 +++++++++---------
 4 files changed, 214 insertions(+), 185 deletions(-)

diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt
index 6e31beaeb..4c80ff07e 100644
--- a/src/Benchmarks/Convolution/CMakeLists.txt
+++ b/src/Benchmarks/Convolution/CMakeLists.txt
@@ -13,7 +13,7 @@ if (${BUILD_CUDA})
 
    FILE(WRITE ${SOURCE_FILE} "${TEMPLATE_CONTENT}")
 
-   SET(EXECUTABLE_NAME "${PREFIX}_${DIMENSION}_${MODULE_NAME}")
+   SET(EXECUTABLE_NAME "${PREFIX}_${DIMENSION}_${MODULE_NAME}_${TEMPLATE_NAME}")
 
    CUDA_ADD_EXECUTABLE(${EXECUTABLE_NAME} ${SOURCE_FILE})
 else()
diff --git a/src/Benchmarks/Convolution/kernels/naive.h b/src/Benchmarks/Convolution/kernels/naive.h
index 76c73237d..2a8cf47ca 100644
--- a/src/Benchmarks/Convolution/kernels/naive.h
+++ b/src/Benchmarks/Convolution/kernels/naive.h
@@ -37,6 +37,10 @@ convolution1D( Index kernelWidth,
                Store store )
 {
    Index ix =  threadIdx.x + blockIdx.x * blockDim.x;
+
+   if (ix >= endX)
+      return;
+
    Index radius = kernelWidth >> 1;
 
    Real result = 0;
@@ -90,8 +94,11 @@ convolution2D( Index kernelWidth,
    Index iy = threadIdx.y + blockIdx.y * blockDim.y;
    Index ix = threadIdx.x + blockIdx.x * blockDim.x;
 
+   if (ix >= endX || iy >= endY)
+      return;
+
    Index radiusY = kernelHeight >> 1;
-   Index radiusX = kernelHeight >> 1;
+   Index radiusX = kernelWidth >> 1;
 
    Real result = 0;
 
@@ -115,59 +122,75 @@ convolution2D( Index kernelWidth,
    store( ix, iy, result );
 }
 
-// template<>
-// struct Convolution< 3, TNL::Devices::Cuda >
-// {
-// public:
-//    template< typename Index >
-//    static size_t
-//    getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ )
-//    {
-//       return 0;
-//    }
-// };
-
-// template< typename Index,
-//           typename Real,
-//           typename FetchData,
-//           typename FetchBoundary,
-//           typename FetchKernel,
-//           typename Convolve,
-//           typename Store >
-// __global__
-// static void
-// convolution3D( Index kernelWidth,
-//                Index kernelHeight,
-//                Index kernelDepth,
-//                Index endX,
-//                Index endY,
-//                Index endZ,
-//                FetchData& fetchData,
-//                FetchBoundary& fetchBoundary,
-//                FetchKernel& fetchKernel,
-//                Convolve& convolve,
-//                Store& store )
-// {
-//    int ix = threadIdx.x + blockIdx.x * blockDim.x;
-//    int iy = threadIdx.y + blockIdx.y * blockDim.y;
-//    int iz = threadIdx.z + blockIdx.z * blockDim.z;
-
-//    Real result = 0;
-
-//    for( Index k = iz - kernelDepth; k <= iz + kernelDepth; k++ ) {
-//       for( Index j = iy - kernelHeight; j <= iy + kernelHeight; j++ ) {
-//          for( Index i = ix - kernelWidth; i <= ix + kernelWidth; i++ ) {
-//             if( i < 0 || i >= endX || j < 0 || j >= endY || k < 0 || k >= endZ ) {
-//                result = convolve( result, fetchBoundary( i, j, k ) );
-//             }
-//             else {
-//                result = convolve( result, fetchData( i, j, k ), fetchKernel( i, j, k ) );
-//             }
-//          }
-//       }
-//    }
-
-//    store( ix, iy, iz, result );
-// }
+template<>
+struct Convolution< 3, TNL::Devices::Cuda >  // 3D specialization for the Cuda device
+{
+public:
+   template< typename Index >
+   static size_t
+   getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ )
+   {
+      return 0;  // always zero; none of the arguments is consulted
+   }
+};
+
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename FetchKernel,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution3D( Index kernelWidth,            // number of kernel taps along x
+               Index kernelHeight,           // number of kernel taps along y
+               Index kernelDepth,            // number of kernel taps along z
+               Index endX,                   // x indices >= endX are read through fetchBoundary
+               Index endY,                   // y indices >= endY are read through fetchBoundary
+               Index endZ,                   // z indices >= endZ are read through fetchBoundary
+               FetchData fetchData,          // ( x, y, z ) -> input sample inside the grid
+               FetchBoundary fetchBoundary,  // ( x, y, z ) -> substitute sample outside the grid
+               FetchKernel fetchKernel,      // ( kx, ky, kz ) -> kernel tap
+               Convolve convolve,            // ( accumulator, sample, tap ) -> new accumulator
+               Store store )                 // ( x, y, z, value ) receives this thread's result
+{
+   Index iz = threadIdx.z + blockIdx.z * blockDim.z;
+   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
+   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+   if (ix >= endX || iy >= endY || iz >= endZ)
+      return;
+
+   // Index of the kernel tap that lines up with this thread's own element along each axis.
+   Index radiusZ = kernelDepth >> 1;
+   Index radiusY = kernelHeight >> 1;
+   Index radiusX = kernelWidth >> 1;
+
+   Real result = 0;
+
+   // Iterate over the kernel's own indices so an even-sized kernel is never read one tap past its end.
+   for( Index kernelIndexZ = 0; kernelIndexZ < kernelDepth; kernelIndexZ++ ) {
+      Index elementIndexZ = iz + kernelIndexZ - radiusZ;
+
+      for( Index kernelIndexY = 0; kernelIndexY < kernelHeight; kernelIndexY++ ) {
+         Index elementIndexY = iy + kernelIndexY - radiusY;
+
+         for( Index kernelIndexX = 0; kernelIndexX < kernelWidth; kernelIndexX++ ) {
+            Index elementIndexX = ix + kernelIndexX - radiusX;
+
+            // Samples that fall outside the input extents come from the boundary functor.
+            if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0 || elementIndexZ >= endZ ) {
+               result = convolve( result, fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) );
+            }
+            else {
+               result = convolve( result, fetchData( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) );
+            }
+         }
+      }
+   }
+
+   store( ix, iy, iz, result );
+}
 
 #endif
diff --git a/src/Benchmarks/Convolution/support/DummyTask.h b/src/Benchmarks/Convolution/support/DummyTask.h
index f92a5c2fc..f7db47e34 100644
--- a/src/Benchmarks/Convolution/support/DummyTask.h
+++ b/src/Benchmarks/Convolution/support/DummyTask.h
@@ -17,7 +17,7 @@ public:
    using Launcher = Launcher< Dimension, Device >;
 
    static void
-   exec( const Vector& dimensions, const Vector& kernelSize, DataStore input, DataStore result, DataStore kernel )
+   exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
    {
       auto fetchData = [ = ] __cuda_callable__( Index i )
       {
@@ -36,12 +36,12 @@ public:
 
       auto convolve = [ = ] __cuda_callable__( Real result, Index data, Index kernel )
       {
-        return result + data * kernel;
+         return result + data * kernel;
       };
 
       auto store = [ = ] __cuda_callable__( Index i, Real resultValue ) mutable
       {
-         result[i] = resultValue;
+         result[ i ] = resultValue;
       };
 
       Launcher::exec< Index, Real >( dimensions,
@@ -108,46 +108,56 @@ public:
    }
 };
 
-// template< typename Index, typename Real >
-// struct DummyTask< Index, Real, 3, TNL::Devices::Cuda >
-// {
-// public:
-//    static constexpr int Dimension = 3;
-//    using Device = TNL::Devices::Cuda;
-//    using Vector = TNL::Containers::StaticVector< Dimension, Index >;
-//    using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
-//    using Launcher = Launcher< Dimension, Device >;
-
-//    static void
-//    exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
-//    {
-//       auto fetchData = [ = ] __cuda_callable__( Index i, Index j, Index k ) {
+template< typename Index, typename Real >
+struct DummyTask< Index, Real, 3, TNL::Devices::Cuda >
+{
+public:
+   static constexpr int Dimension = 3;
+   using Device = TNL::Devices::Cuda;
+   using Vector = TNL::Containers::StaticVector< Dimension, Index >;
+   using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
+   using Launcher = Launcher< Dimension, Device >;
 
-//       };
+   static void
+   exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
+   {
+      auto fetchData = [ = ] __cuda_callable__( Index i, Index j, Index k )
+      {
+         auto index = i + j * dimensions.x() + k * dimensions.x() * dimensions.y();
 
-//       auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j, Index k ) {
+         return input[index];
+      };
 
-//       };
+      auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j, Index k )
+      {
+         return 1;
+      };
 
-//       auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j, Index k ) {
+      auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j, Index k )
+      {
+         auto index = i + j * kernelSize.x() + k * kernelSize.x() * kernelSize.y();
 
-//       };
+         return kernel[ index ];
+      };
 
-//       auto convolve = [ = ] __cuda_callable__( float result, Index data, Index kernel )
-//       {
-//          return result + data * kernel;
-//       };
+      auto convolve = [ = ] __cuda_callable__( Real result, Real data, Real kernel )  // accumulate in Real; samples and taps are Real values
+      {
+         return result + data * kernel;  // add one sample-times-tap product to the accumulator
+      };
 
-//       auto store = [ = ] __cuda_callable__( Index i, Index j, Index k, Real result ) {
+      auto store = [ = ] __cuda_callable__( Index i, Index j, Index k, Real resultValue ) mutable
+      {
+         auto index = i + j * dimensions.x() + k * dimensions.x() * dimensions.y();
 
-//       };
+         result[ index ] = resultValue;
+      };
 
-//       Launcher::exec< Index >( dimensions,
-//                                kernelSize,
-//                                std::forward< decltype( fetchData ) >( fetchData ),
-//                                std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
-//                                std::forward< decltype( fetchKernel ) >( fetchKernel ),
-//                                std::forward< decltype( convolve ) >( convolve ),
-//                                std::forward< decltype( store ) >( store ) );
-//    }
-// };
+      Launcher::exec< Index, Real >( dimensions,
+                                     kernelSize,
+                                     std::forward< decltype( fetchData ) >( fetchData ),
+                                     std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
+                                     std::forward< decltype( fetchKernel ) >( fetchKernel ),
+                                     std::forward< decltype( convolve ) >( convolve ),
+                                     std::forward< decltype( store ) >( store ) );
+   }
+};
diff --git a/src/Benchmarks/Convolution/support/Launcher.h b/src/Benchmarks/Convolution/support/Launcher.h
index fde1e91ab..94e9b096b 100644
--- a/src/Benchmarks/Convolution/support/Launcher.h
+++ b/src/Benchmarks/Convolution/support/Launcher.h
@@ -119,95 +119,91 @@ public:
    }
 };
 
-// template<>
-// struct Launcher< 3, TNL::Devices::Cuda >
-// {
-// public:
-//    using Vector = TNL::Containers::StaticVector< 3, int >;
-//    using ConvolutionKernel = Convolution< 3, TNL::Devices::Cuda >;
-
-//    template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store >
-//    static inline void
-//    exec( const Vector& dimensions,
-//          const Vector& kernelSize,
-//          FetchData&& fetchData,
-//          FetchBoundary&& fetchBoundary,
-//          FetchKernel&& fetchKernel,
-//          Convolve&& convolve,
-//          Store&& store )
-//    {
-//       const Index sizeX = dimensions.x();
-//       const Index sizeY = dimensions.y();
-//       const Index sizeZ = dimensions.z();
-
-//       TNL::Cuda::LaunchConfiguration launchConfig;
-
-//       launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >(
-//          kernelSize.x(), kernelSize.y(), kernelSize.z(), dimensions.x(), dimensions.y(), dimensions.z() );
-
-//       if( sizeX >= sizeY * sizeY * sizeZ * sizeZ ) {
-//          launchConfig.blockSize.x = TNL::min( 256, sizeX );
-//          launchConfig.blockSize.y = 1;
-//          launchConfig.blockSize.z = 1;
-//       }
-//       else if( sizeY >= sizeX * sizeX * sizeZ * sizeZ ) {
-//          launchConfig.blockSize.x = 1;
-//          launchConfig.blockSize.y = TNL::min( 256, sizeY );
-//          launchConfig.blockSize.z = 1;
-//       }
-//       else if( sizeZ >= sizeX * sizeX * sizeY * sizeY ) {
-//          launchConfig.blockSize.x = TNL::min( 2, sizeX );
-//          launchConfig.blockSize.y = TNL::min( 2, sizeY );
-//          // CUDA allows max 64 for launchConfig.blockSize.z
-//          launchConfig.blockSize.z = TNL::min( 64, sizeZ );
-//       }
-//       else if( sizeX >= sizeZ * sizeZ && sizeY >= sizeZ * sizeZ ) {
-//          launchConfig.blockSize.x = TNL::min( 32, sizeX );
-//          launchConfig.blockSize.y = TNL::min( 8, sizeY );
-//          launchConfig.blockSize.z = 1;
-//       }
-//       else if( sizeX >= sizeY * sizeY && sizeZ >= sizeY * sizeY ) {
-//          launchConfig.blockSize.x = TNL::min( 32, sizeX );
-//          launchConfig.blockSize.y = 1;
-//          launchConfig.blockSize.z = TNL::min( 8, sizeZ );
-//       }
-//       else if( sizeY >= sizeX * sizeX && sizeZ >= sizeX * sizeX ) {
-//          launchConfig.blockSize.x = 1;
-//          launchConfig.blockSize.y = TNL::min( 32, sizeY );
-//          launchConfig.blockSize.z = TNL::min( 8, sizeZ );
-//       }
-//       else {
-//          launchConfig.blockSize.x = TNL::min( 16, sizeX );
-//          launchConfig.blockSize.y = TNL::min( 4, sizeY );
-//          launchConfig.blockSize.z = TNL::min( 4, sizeZ );
-//       }
-//       launchConfig.gridSize.x =
-//          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) );
-//       launchConfig.gridSize.y =
-//          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) );
-//       launchConfig.gridSize.z =
-//          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeZ, launchConfig.blockSize.z ) );
-
-//       dim3 gridCount;
-//       gridCount.x = roundUpDivision( sizeX, launchConfig.blockSize.x * launchConfig.gridSize.x );
-//       gridCount.y = roundUpDivision( sizeY, launchConfig.blockSize.y * launchConfig.gridSize.y );
-//       gridCount.z = roundUpDivision( sizeZ, launchConfig.blockSize.z * launchConfig.gridSize.z );
-
-//       constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
-
-//       TNL::Cuda::launchKernel< true >( kernel,
-//                                        0,
-//                                        launchConfig,
-//                                        kernelSize.x(),
-//                                        kernelSize.y(),
-//                                        kernelSize.z(),
-//                                        dimensions.x(),
-//                                        dimensions.y(),
-//                                        dimensions.z(),
-//                                        std::forward< FetchData >( fetchData ),
-//                                        std::forward< FetchBoundary >( fetchBoundary ),
-//                                        std::forward< FetchKernel >( fetchKernel ),
-//                                        std::forward< Convolve >( convolve ),
-//                                        std::forward< Store >( store ) );
-//    }
-// };
+template<>
+struct Launcher< 3, TNL::Devices::Cuda >
+{
+public:
+   using Vector = TNL::Containers::StaticVector< 3, int >;
+   using ConvolutionKernel = Convolution< 3, TNL::Devices::Cuda >;
+
+   template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store >
+   static inline void
+   exec( const Vector& dimensions,
+         const Vector& kernelSize,
+         FetchData&& fetchData,
+         FetchBoundary&& fetchBoundary,
+         FetchKernel&& fetchKernel,
+         Convolve&& convolve,
+         Store&& store )
+   {
+      const Index sizeX = dimensions.x();
+      const Index sizeY = dimensions.y();
+      const Index sizeZ = dimensions.z();
+
+      TNL::Cuda::LaunchConfiguration launchConfig;
+
+      launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >(
+         kernelSize.x(), kernelSize.y(), kernelSize.z(), dimensions.x(), dimensions.y(), dimensions.z() );
+
+      if( sizeX >= sizeY * sizeY * sizeZ * sizeZ ) {
+         launchConfig.blockSize.x = TNL::min( 256, sizeX );
+         launchConfig.blockSize.y = 1;
+         launchConfig.blockSize.z = 1;
+      }
+      else if( sizeY >= sizeX * sizeX * sizeZ * sizeZ ) {
+         launchConfig.blockSize.x = 1;
+         launchConfig.blockSize.y = TNL::min( 256, sizeY );
+         launchConfig.blockSize.z = 1;
+      }
+      else if( sizeZ >= sizeX * sizeX * sizeY * sizeY ) {
+         launchConfig.blockSize.x = TNL::min( 2, sizeX );
+         launchConfig.blockSize.y = TNL::min( 2, sizeY );
+         // CUDA allows max 64 for launchConfig.blockSize.z
+         launchConfig.blockSize.z = TNL::min( 64, sizeZ );
+      }
+      else if( sizeX >= sizeZ * sizeZ && sizeY >= sizeZ * sizeZ ) {
+         launchConfig.blockSize.x = TNL::min( 32, sizeX );
+         launchConfig.blockSize.y = TNL::min( 8, sizeY );
+         launchConfig.blockSize.z = 1;
+      }
+      else if( sizeX >= sizeY * sizeY && sizeZ >= sizeY * sizeY ) {
+         launchConfig.blockSize.x = TNL::min( 32, sizeX );
+         launchConfig.blockSize.y = 1;
+         launchConfig.blockSize.z = TNL::min( 8, sizeZ );
+      }
+      else if( sizeY >= sizeX * sizeX && sizeZ >= sizeX * sizeX ) {
+         launchConfig.blockSize.x = 1;
+         launchConfig.blockSize.y = TNL::min( 32, sizeY );
+         launchConfig.blockSize.z = TNL::min( 8, sizeZ );
+      }
+      else {
+         launchConfig.blockSize.x = TNL::min( 16, sizeX );
+         launchConfig.blockSize.y = TNL::min( 4, sizeY );
+         launchConfig.blockSize.z = TNL::min( 4, sizeZ );
+      }
+
+      launchConfig.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) );
+      launchConfig.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) );
+      launchConfig.gridSize.z =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeZ, launchConfig.blockSize.z ) );
+
+      constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+
+      TNL::Cuda::launchKernel< true >( kernel,
+                                       0,
+                                       launchConfig,
+                                       kernelSize.x(),
+                                       kernelSize.y(),
+                                       kernelSize.z(),
+                                       dimensions.x(),
+                                       dimensions.y(),
+                                       dimensions.z(),
+                                       fetchData,
+                                       fetchBoundary,
+                                       fetchKernel,
+                                       convolve,
+                                       store );
+   }
+};
-- 
GitLab


From 868f233e64c3a6bf1c4c4057a42dbac3e93878e9 Mon Sep 17 00:00:00 2001
From: hayeuyur <hayeuyur@fit.cvut.cz>
Date: Sat, 2 Apr 2022 14:23:20 +0200
Subject: [PATCH 04/19] Implement benchmarks for naive kernel

---
 src/Benchmarks/Convolution/CMakeLists.txt     |  5 +++
 .../Convolution/support/Benchmark.h           | 11 +++---
 .../Convolution/support/DummyBenchmark.h      | 37 ++++++++++---------
 .../Convolution/templates/main_benchmark.h    | 25 +++++++++++++
 4 files changed, 54 insertions(+), 24 deletions(-)

diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt
index 4c80ff07e..d8a0c683c 100644
--- a/src/Benchmarks/Convolution/CMakeLists.txt
+++ b/src/Benchmarks/Convolution/CMakeLists.txt
@@ -24,3 +24,8 @@ endfunction()
 
 GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/naive.h")
 GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/naive.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/naive.h")
+
+GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/naive.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/naive.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/naive.h")
diff --git a/src/Benchmarks/Convolution/support/Benchmark.h b/src/Benchmarks/Convolution/support/Benchmark.h
index f5671a06b..ce1b91b23 100644
--- a/src/Benchmarks/Convolution/support/Benchmark.h
+++ b/src/Benchmarks/Convolution/support/Benchmark.h
@@ -14,10 +14,10 @@ template< int Dimension, typename Device >
 class Benchmark
 {
 public:
-   using Benchmark = typename TNL::Benchmarks::Benchmark<>;
+   using TNLBenchmark = typename TNL::Benchmarks::Benchmark<>;
 
    void
-   runBenchmark( const TNL::Config::ParameterContainer& parameters ) const
+   run( const TNL::Config::ParameterContainer& parameters ) const
    {
       if( ! TNL::Devices::Host::setup( parameters ) || ! TNL::Devices::Cuda::setup( parameters ) )
          return;
@@ -36,7 +36,7 @@ public:
 
       std::ofstream logFile( logFileName.getString(), mode );
 
-      Benchmark benchmark( logFile, loops, verbose );
+      TNLBenchmark benchmark( logFile, loops, verbose );
 
       std::map< std::string, std::string > metadata = TNL::Benchmarks::getHardwareMetadata();
       TNL::Benchmarks::writeMapAsJson( metadata, logFileName, ".metadata.json" );
@@ -44,8 +44,8 @@ public:
       start(benchmark, parameters);
    }
 
-   virtual void start(const Benchmark& benchmark, const TNL::Config::ParameterContainer& parameters) const {
-      TNL_ASSERT_TRUE(false, << "Should be overriden");
+   virtual void start( TNLBenchmark& benchmark, const TNL::Config::ParameterContainer& parameters) const {
+      TNL_ASSERT_TRUE(false, "Should be overriden");
    }
 
    virtual TNL::Config::ConfigDescription makeInputConfig() const {
@@ -69,7 +69,6 @@ public:
       config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
       config.addEntry< int >( "verbose", "Verbose mode.", 1 );
 
-
       config.addDelimiter( "Device settings:" );
       TNL::Devices::Host::configSetup( config );
 
diff --git a/src/Benchmarks/Convolution/support/DummyBenchmark.h b/src/Benchmarks/Convolution/support/DummyBenchmark.h
index 1830e7484..8c8e60be7 100644
--- a/src/Benchmarks/Convolution/support/DummyBenchmark.h
+++ b/src/Benchmarks/Convolution/support/DummyBenchmark.h
@@ -16,12 +16,12 @@ class DummyBenchmark : public Benchmark< Dimension, Device >
 {
 public:
    using Vector = TNL::Containers::StaticVector< Dimension, int >;
-   using DataStore = TNL::Containers::Array< int, Device, float >;
-   using Benchmark = Base::Benchmark;
+   using DataStore = TNL::Containers::Array< float, Device, int >;
    using Base = Benchmark< Dimension, Device >;
+   using TNLBenchmark = typename Base::TNLBenchmark;
 
    virtual void
-   start( const Benchmark& benchmark, const TNL::Config::ParameterContainer& parameters ) const override
+   start( TNLBenchmark& benchmark, const TNL::Config::ParameterContainer& parameters ) const override
    {
       Vector start;
       Vector end;
@@ -53,7 +53,7 @@ public:
    }
 
    virtual void
-   time( Benchmark& bencmark,
+   time( TNLBenchmark& benchmark,
          const Vector& minDimension,
          const Vector& maxDimension,
          const int dimensionStep,
@@ -68,14 +68,14 @@ public:
          currentKernelSize = minKernelSize;
 
          do {
-            time( benchmark, currentDimension, currentKernelSize );
+            timeConvolution( benchmark, currentDimension, currentKernelSize );
 
             currentKernelSize[ 0 ] += kernelStep;
 
             for( size_t i = 0; i < currentKernelSize.getSize() - 1; i++ ) {
                if( currentKernelSize[ i ] >= maxKernelSize[ i ] ) {
                   currentKernelSize[ i ] = minKernelSize[ i ];
-                  maxKernelSize[ i + 1 ] += kernelStep;
+                  currentKernelSize[ i + 1 ] += kernelStep;
                }
             }
          } while( currentKernelSize < maxKernelSize );
@@ -85,7 +85,7 @@ public:
          for( size_t i = 0; i < currentDimension.getSize() - 1; i++ ) {
             if( currentDimension[ i ] >= maxDimension[ i ] ) {
                currentDimension[ i ] = minDimension[ i ];
-               maxDimension[ i ] = maxDimension[ i ];
+               currentDimension[ i ] = maxDimension[ i ];
             }
          }
 
@@ -93,11 +93,11 @@ public:
    }
 
    void
-   timeConvolution( Benchmark& benchmark, const Vector& dimension, const Vector& kernelSize ) const
+   timeConvolution( TNLBenchmark& benchmark, const Vector& dimension, const Vector& kernelSize ) const
    {
       auto device = TNL::getType< Device >();
 
-      Benchmark::MetadataColumns columns = {};
+      typename TNLBenchmark::MetadataColumns columns;
 
       size_t elementsCount = 1;
       size_t kernelElementsCount = 1;
@@ -106,18 +106,19 @@ public:
          elementsCount *= dimension[ i ];
          kernelElementsCount *= kernelSize[ i ];
 
-         columns.insert( { dimensionIds[ i ], dimension[ i ] } );
-         columns.insert( { kernelSizeIds[ i ], kernelSize[ i ] } );
+         columns.push_back( { dimensionIds[ i ], TNL::convertToString(dimension[ i ]) } );
+         columns.push_back( { kernelSizeIds[ i ], TNL::convertToString(kernelSize[ i ]) } );
       }
 
       benchmark.setDatasetSize( ( elementsCount * 4 ) / 1.e9, 1.0 );
+      benchmark.setMetadataColumns( columns );
 
       // Setup input data
       DataStore input, result, kernel;
 
       input.resize( elementsCount );
       result.resize( elementsCount );
-      kernel.resize( kernelSize );
+      kernel.resize( kernelElementsCount );
 
       input = 1;
       result = 1;
@@ -129,24 +130,24 @@ public:
 
       auto measure = [ & ]()
       {
-         DummyTask<Dimension, Device>::exec(dimension, kernelSize, inputView, resultView, kernelView);
+         DummyTask<int, float, Dimension, Device>::exec(dimension, kernelSize, inputView, resultView, kernelView);
       };
 
-      benchmark.time< Device >( device, measure );
+      benchmark.template time<Device>( device, measure );
    }
 
    TNL::Config::ConfigDescription
    makeInputConfig() const override
    {
-      auto config = Base::makeInputConfig();
+      TNL::Config::ConfigDescription config = Base::makeInputConfig();
 
       config.addDelimiter( "Grid dimension settings:" );
 
       for( int i = 0; i < Dimension; i++ )
-         config.addEntry< int >( minDimensionIds[ i ], minDimensionIds[ i ], 512 );
+         config.addEntry< int >( minDimensionIds[ i ], minDimensionIds[ i ], 16 );
 
       for( int i = 0; i < Dimension; i++ )
-         config.addEntry< int >( maxDimensionIds[ i ], maxDimensionIds[ i ], 512 );
+         config.addEntry< int >( maxDimensionIds[ i ], maxDimensionIds[ i ], 128 );
 
       config.addEntry< int >( "dimension-step", "Step of kernel increase by which dimension is multiplied (must be even)", 2 );
 
@@ -156,7 +157,7 @@ public:
          config.addEntry< int >( minKernelSizeIds[ i ], minKernelSizeIds[ i ] + " (odd) :", 1 );
 
       for( int i = 0; i < Dimension; i++ )
-         config.addEntry< int >( minKernelSizeIds[ i ], minKernelSizeIds[ i ] + " (odd) :", 11 );
+         config.addEntry< int >( maxKernelSizeIds[ i ], maxKernelSizeIds[ i ] + " (odd) :", 11 );
 
       config.addEntry< int >( "kernel-step", "Step of kernel increase which is added to kernel (must be even)", 2 );
 
diff --git a/src/Benchmarks/Convolution/templates/main_benchmark.h b/src/Benchmarks/Convolution/templates/main_benchmark.h
index e69de29bb..5124922e6 100644
--- a/src/Benchmarks/Convolution/templates/main_benchmark.h
+++ b/src/Benchmarks/Convolution/templates/main_benchmark.h
@@ -0,0 +1,25 @@
+
+#include "../kernels/naive.h"
+#include "../support/DummyBenchmark.h"
+
+#include <TNL/Config/parseCommandLine.h>
+
+#define DIMENSION DIMENSION_VALUE
+
+using TaskBenchmark = DummyBenchmark< DIMENSION, TNL::Devices::Cuda >;
+
+int main(int argc, char* argv[])
+{
+   TaskBenchmark benchmark;
+
+   auto config = benchmark.makeInputConfig();
+
+   TNL::Config::ParameterContainer parameters;
+
+   if( ! parseCommandLine( argc, argv, config, parameters ) )
+      return EXIT_FAILURE;
+
+   benchmark.run( parameters );
+
+   return 0;
+}
-- 
GitLab


From 3781cb67e0da09317df14979e9295443a9abfdc4 Mon Sep 17 00:00:00 2001
From: hayeuyur <hayeuyur@fit.cvut.cz>
Date: Sat, 2 Apr 2022 16:49:21 +0200
Subject: [PATCH 05/19] Move out kernel configuration to convolution task

---
 src/Benchmarks/Convolution/kernels/naive.h    | 85 +++++++++++++-----
 .../Convolution/support/DummyBenchmark.h      |  5 +-
 src/Benchmarks/Convolution/support/Launcher.h | 86 +------------------
 3 files changed, 70 insertions(+), 106 deletions(-)

diff --git a/src/Benchmarks/Convolution/kernels/naive.h b/src/Benchmarks/Convolution/kernels/naive.h
index 2a8cf47ca..f8b5966e2 100644
--- a/src/Benchmarks/Convolution/kernels/naive.h
+++ b/src/Benchmarks/Convolution/kernels/naive.h
@@ -1,8 +1,9 @@
 
 #ifdef HAVE_CUDA
 
-#include <TNL/Devices/Cuda.h>
-#include <TNL/Cuda/LaunchHelpers.h>
+   #include <TNL/Devices/Cuda.h>
+   #include <TNL/Containers/StaticVector.h>
+   #include <TNL/Cuda/LaunchHelpers.h>
 
 template< int Dimension, typename Device >
 struct Convolution;
@@ -12,10 +13,18 @@ struct Convolution< 1, TNL::Devices::Cuda >
 {
 public:
    template< typename Index >
-   static size_t
-   getDynamicSharedMemorySize( Index kernelWidth, Index endX )
+   using Vector = TNL::Containers::StaticVector< 1, Index >;
+
+   template< typename Index >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
    {
-      return 0;
+      configuration.dynamicSharedMemorySize = 0;
+
+      // TODO: - Benchmark the best value
+      configuration.blockSize.x = kernelSize.x();
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
    }
 };
 
@@ -36,9 +45,9 @@ convolution1D( Index kernelWidth,
                Convolve convolve,
                Store store )
 {
-   Index ix =  threadIdx.x + blockIdx.x * blockDim.x;
+   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
 
-   if (ix >= endX)
+   if( ix >= endX )
       return;
 
    Index radius = kernelWidth >> 1;
@@ -65,10 +74,22 @@ struct Convolution< 2, TNL::Devices::Cuda >
 {
 public:
    template< typename Index >
-   static size_t
-   getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index endX, Index endY )
+   using Vector = TNL::Containers::StaticVector< 2, Index >;
+
+   template< typename Index >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
    {
-      return 0;
+      configuration.dynamicSharedMemorySize = 0;
+
+      // TODO: - Benchmark the best value
+      configuration.blockSize.x = kernelSize.x();
+      configuration.blockSize.y = kernelSize.y();
+
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+      configuration.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
    }
 };
 
@@ -94,7 +115,7 @@ convolution2D( Index kernelWidth,
    Index iy = threadIdx.y + blockIdx.y * blockDim.y;
    Index ix = threadIdx.x + blockIdx.x * blockDim.x;
 
-   if (ix >= endX || iy >= endY)
+   if( ix >= endX || iy >= endY )
       return;
 
    Index radiusY = kernelHeight >> 1;
@@ -102,16 +123,17 @@ convolution2D( Index kernelWidth,
 
    Real result = 0;
 
-   for( Index j = - radiusY; j <= radiusY; j++ ) {
+   for( Index j = -radiusY; j <= radiusY; j++ ) {
       Index elementIndexY = j + iy;
       Index kernelIndexY = j + radiusY;
 
-      for( Index i = - radiusX; i <= radiusX; i++ ) {
+      for( Index i = -radiusX; i <= radiusX; i++ ) {
          Index elementIndexX = i + ix;
          Index kernelIndexX = i + radiusX;
 
          if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY ) {
-            result = convolve( result, fetchBoundary( elementIndexX, elementIndexY ), fetchKernel ( kernelIndexX, kernelIndexY ) );
+            result =
+               convolve( result, fetchBoundary( elementIndexX, elementIndexY ), fetchKernel( kernelIndexX, kernelIndexY ) );
          }
          else {
             result = convolve( result, fetchData( elementIndexX, elementIndexY ), fetchKernel( kernelIndexX, kernelIndexY ) );
@@ -127,10 +149,25 @@ struct Convolution< 3, TNL::Devices::Cuda >
 {
 public:
    template< typename Index >
-   static size_t
-   getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ )
+   using Vector = TNL::Containers::StaticVector< 3, Index >;
+
+   template< typename Index >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
    {
-      return 0;
+      configuration.dynamicSharedMemorySize = 0;
+
+      // TODO: - Benchmark the best value
+      configuration.blockSize.x = kernelSize.x();
+      configuration.blockSize.y = kernelSize.y();
+      configuration.blockSize.z = kernelSize.z();
+
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+      configuration.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
+      configuration.gridSize.z =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
    }
 };
 
@@ -159,7 +196,7 @@ convolution3D( Index kernelWidth,
    Index iy = threadIdx.y + blockIdx.y * blockDim.y;
    Index ix = threadIdx.x + blockIdx.x * blockDim.x;
 
-   if (ix >= endX || iy >= endY || iz >= endZ)
+   if( ix >= endX || iy >= endY || iz >= endZ )
       return;
 
    Index radiusZ = kernelDepth >> 1;
@@ -180,11 +217,17 @@ convolution3D( Index kernelWidth,
             Index elementIndexX = i + ix;
             Index kernelIndexX = i + radiusX;
 
-            if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0 || elementIndexZ >= endZ ) {
-               result = convolve( result, fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) );
+            if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0
+                || elementIndexZ >= endZ )
+            {
+               result = convolve( result,
+                                  fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ),
+                                  fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) );
             }
             else {
-               result = convolve( result, fetchData( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) );
+               result = convolve( result,
+                                  fetchData( elementIndexX, elementIndexY, elementIndexZ ),
+                                  fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) );
             }
          }
       }
diff --git a/src/Benchmarks/Convolution/support/DummyBenchmark.h b/src/Benchmarks/Convolution/support/DummyBenchmark.h
index 8c8e60be7..804ca03d7 100644
--- a/src/Benchmarks/Convolution/support/DummyBenchmark.h
+++ b/src/Benchmarks/Convolution/support/DummyBenchmark.h
@@ -32,13 +32,14 @@ public:
          start[ i ] = parameters.getParameter< int >( minDimensionIds[ i ] );
          end[ i ] = parameters.getParameter< int >( maxDimensionIds[ i ] );
          minKernelSize[ i ] = parameters.getParameter< int >( minKernelSizeIds[ i ] );
-         maxKernelSizeIds[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] );
+         maxKernelSize[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] );
 
          TNL_ASSERT_GT( start[ i ], 1, "Start dimension must be positive integer" );
          TNL_ASSERT_GT( end[ i ], start[ i ], "End dimension must be greater than start dimension" );
 
          TNL_ASSERT_GE( minKernelSize[ i ], 1, "Minimal kernel size must be a positive number" );
          TNL_ASSERT_EQ( minKernelSize[ i ] % 2, 1, "Minimal kernel size must be odd" );
+         TNL_ASSERT_GT( maxKernelSize[ i ], minKernelSize[ i ], "End dimension must be greater than start dimension" );
          TNL_ASSERT_GT( end[ i ], start[ i ], "End kernel size must be greater than start kernel size" );
       }
 
@@ -85,7 +86,7 @@ public:
          for( size_t i = 0; i < currentDimension.getSize() - 1; i++ ) {
             if( currentDimension[ i ] >= maxDimension[ i ] ) {
                currentDimension[ i ] = minDimension[ i ];
-               currentDimension[ i ] = maxDimension[ i ];
+               currentDimension[ i + 1 ] *= dimensionStep;
             }
          }
 
diff --git a/src/Benchmarks/Convolution/support/Launcher.h b/src/Benchmarks/Convolution/support/Launcher.h
index 94e9b096b..208deb080 100644
--- a/src/Benchmarks/Convolution/support/Launcher.h
+++ b/src/Benchmarks/Convolution/support/Launcher.h
@@ -29,20 +29,7 @@ public:
    {
       TNL::Cuda::LaunchConfiguration launchConfig;
 
-      launchConfig.dynamicSharedMemorySize =
-         ConvolutionKernel::getDynamicSharedMemorySize< Index >( kernelSize.x(), dimensions.x() );
-
-      // TODO: - Benchmark the best value
-      launchConfig.blockSize.x = 256;
-      launchConfig.gridSize.x =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), launchConfig.blockSize.x ) );
-
-      if( (std::size_t) launchConfig.blockSize.x * launchConfig.gridSize.x < (std::size_t) dimensions.x() ) {
-         const int desGridSize = 32 * TNL::Cuda::DeviceInfo::getCudaMultiprocessors( TNL::Cuda::DeviceInfo::getActiveDevice() );
-
-         launchConfig.gridSize.x =
-            TNL::min( desGridSize, TNL::Cuda::getNumberOfBlocks( dimensions.x(), launchConfig.blockSize.x ) );
-      }
+      ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize);
 
       constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
 
@@ -78,29 +65,7 @@ public:
    {
       TNL::Cuda::LaunchConfiguration launchConfig;
 
-      launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >(
-         kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y() );
-
-      const Index sizeX = dimensions.x();
-      const Index sizeY = dimensions.y();
-
-      if( sizeX >= sizeY * sizeY ) {
-         launchConfig.blockSize.x = TNL::min( 256, sizeX );
-         launchConfig.blockSize.y = 1;
-      }
-      else if( sizeY >= sizeX * sizeX ) {
-         launchConfig.blockSize.x = 1;
-         launchConfig.blockSize.y = TNL::min( 256, sizeY );
-      }
-      else {
-         launchConfig.blockSize.x = TNL::min( 32, sizeX );
-         launchConfig.blockSize.y = TNL::min( 8, sizeY );
-      }
-
-      launchConfig.gridSize.x =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) );
-      launchConfig.gridSize.y =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) );
+      ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize);
 
       constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
 
@@ -142,52 +107,7 @@ public:
 
       TNL::Cuda::LaunchConfiguration launchConfig;
 
-      launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >(
-         kernelSize.x(), kernelSize.y(), kernelSize.z(), dimensions.x(), dimensions.y(), dimensions.z() );
-
-      if( sizeX >= sizeY * sizeY * sizeZ * sizeZ ) {
-         launchConfig.blockSize.x = TNL::min( 256, sizeX );
-         launchConfig.blockSize.y = 1;
-         launchConfig.blockSize.z = 1;
-      }
-      else if( sizeY >= sizeX * sizeX * sizeZ * sizeZ ) {
-         launchConfig.blockSize.x = 1;
-         launchConfig.blockSize.y = TNL::min( 256, sizeY );
-         launchConfig.blockSize.z = 1;
-      }
-      else if( sizeZ >= sizeX * sizeX * sizeY * sizeY ) {
-         launchConfig.blockSize.x = TNL::min( 2, sizeX );
-         launchConfig.blockSize.y = TNL::min( 2, sizeY );
-         // CUDA allows max 64 for launchConfig.blockSize.z
-         launchConfig.blockSize.z = TNL::min( 64, sizeZ );
-      }
-      else if( sizeX >= sizeZ * sizeZ && sizeY >= sizeZ * sizeZ ) {
-         launchConfig.blockSize.x = TNL::min( 32, sizeX );
-         launchConfig.blockSize.y = TNL::min( 8, sizeY );
-         launchConfig.blockSize.z = 1;
-      }
-      else if( sizeX >= sizeY * sizeY && sizeZ >= sizeY * sizeY ) {
-         launchConfig.blockSize.x = TNL::min( 32, sizeX );
-         launchConfig.blockSize.y = 1;
-         launchConfig.blockSize.z = TNL::min( 8, sizeZ );
-      }
-      else if( sizeY >= sizeX * sizeX && sizeZ >= sizeX * sizeX ) {
-         launchConfig.blockSize.x = 1;
-         launchConfig.blockSize.y = TNL::min( 32, sizeY );
-         launchConfig.blockSize.z = TNL::min( 8, sizeZ );
-      }
-      else {
-         launchConfig.blockSize.x = TNL::min( 16, sizeX );
-         launchConfig.blockSize.y = TNL::min( 4, sizeY );
-         launchConfig.blockSize.z = TNL::min( 4, sizeZ );
-      }
-
-      launchConfig.gridSize.x =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) );
-      launchConfig.gridSize.y =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) );
-      launchConfig.gridSize.z =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeZ, launchConfig.blockSize.z ) );
+      ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize);
 
       constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
 
-- 
GitLab


From be08382dc41aa2720465424fe2f44adf200908a8 Mon Sep 17 00:00:00 2001
From: hayeuyur <hayeuyur@fit.cvut.cz>
Date: Sat, 2 Apr 2022 18:31:24 +0200
Subject: [PATCH 06/19] Implement convolution with storing kernel in shared
 memory

---
 src/Benchmarks/Convolution/CMakeLists.txt     |   9 +
 src/Benchmarks/Convolution/kernels/naive.h    |  22 +-
 .../Convolution/kernels/sharedKernel.h        | 274 ++++++++++++++++++
 src/Benchmarks/Convolution/support/Launcher.h |  15 +-
 .../Convolution/templates/main_benchmark.h    |   7 +-
 .../Convolution/templates/main_solver.h       |   6 +-
 6 files changed, 317 insertions(+), 16 deletions(-)
 create mode 100644 src/Benchmarks/Convolution/kernels/sharedKernel.h

diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt
index d8a0c683c..ec637f6dd 100644
--- a/src/Benchmarks/Convolution/CMakeLists.txt
+++ b/src/Benchmarks/Convolution/CMakeLists.txt
@@ -10,6 +10,7 @@ if (${BUILD_CUDA})
    FILE(READ ${TEMPLATE} TEMPLATE_CONTENT)
 
    STRING(REGEX REPLACE "DIMENSION_VALUE" ${DIMENSION} TEMPLATE_CONTENT "${TEMPLATE_CONTENT}")
+   STRING(REGEX REPLACE "KERNEL_VALUE" "\"../${KERNEL_HEADER}\"" TEMPLATE_CONTENT "${TEMPLATE_CONTENT}")
 
    FILE(WRITE ${SOURCE_FILE} "${TEMPLATE_CONTENT}")
 
@@ -29,3 +30,11 @@ GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/naiv
 GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/naive.h")
 GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/naive.h")
 GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/naive.h")
+
+GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/sharedKernel.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/sharedKernel.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/sharedKernel.h")
+
+GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedKernel.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedKernel.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedKernel.h")
diff --git a/src/Benchmarks/Convolution/kernels/naive.h b/src/Benchmarks/Convolution/kernels/naive.h
index f8b5966e2..d69e219d2 100644
--- a/src/Benchmarks/Convolution/kernels/naive.h
+++ b/src/Benchmarks/Convolution/kernels/naive.h
@@ -1,9 +1,18 @@
 
+#pragma once
+
 #ifdef HAVE_CUDA
 
-   #include <TNL/Devices/Cuda.h>
-   #include <TNL/Containers/StaticVector.h>
-   #include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Containers/StaticVector.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+
+/**
+ * This naive configuration has two drawbacks:
+ *
+ * 1. It does not use shared memory.
+ * 2. The block size is not tuned (it follows the convolution kernel size), so the launched blocks may be extremely small or extremely large.
+ */
 
 template< int Dimension, typename Device >
 struct Convolution;
@@ -15,7 +24,7 @@ public:
    template< typename Index >
    using Vector = TNL::Containers::StaticVector< 1, Index >;
 
-   template< typename Index >
+   template< typename Index, typename Real >
    static void
    setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
    {
@@ -76,13 +85,12 @@ public:
    template< typename Index >
    using Vector = TNL::Containers::StaticVector< 2, Index >;
 
-   template< typename Index >
+   template< typename Index, typename Real >
    static void
    setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
    {
       configuration.dynamicSharedMemorySize = 0;
 
-      // TODO: - Benchmark the best value
       configuration.blockSize.x = kernelSize.x();
       configuration.blockSize.y = kernelSize.y();
 
@@ -151,7 +159,7 @@ public:
    template< typename Index >
    using Vector = TNL::Containers::StaticVector< 3, Index >;
 
-   template< typename Index >
+   template< typename Index, typename Real >
    static void
    setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
    {
diff --git a/src/Benchmarks/Convolution/kernels/sharedKernel.h b/src/Benchmarks/Convolution/kernels/sharedKernel.h
new file mode 100644
index 000000000..de76dd32c
--- /dev/null
+++ b/src/Benchmarks/Convolution/kernels/sharedKernel.h
@@ -0,0 +1,274 @@
+
+#pragma once
+
+#ifdef HAVE_CUDA
+
+   #include <TNL/Devices/Cuda.h>
+   #include <TNL/Containers/StaticVector.h>
+   #include <TNL/Cuda/LaunchHelpers.h>
+   #include <TNL/Cuda/SharedMemory.h>
+
+template< int Dimension, typename Device >
+struct Convolution;
+
+template<>
+struct Convolution< 1, TNL::Devices::Cuda >
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 1, Index >;
+
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      Index kernelElementCount = 1;
+
+      for( Index i = 0; i < kernelSize.getSize(); i++ )
+         kernelElementCount *= kernelSize[ i ];
+
+      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );
+
+      configuration.blockSize.x = kernelSize.x();
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+   }
+};
+
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename FetchKernel,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution1D( Index kernelWidth,
+               Index endX,
+               FetchData fetchData,
+               FetchBoundary fetchBoundary,
+               FetchKernel fetchKernel,
+               Convolve convolve,
+               Store store )
+{
+   Real* shared = TNL::Cuda::getSharedMemory< Real >();
+
+   Index radius = kernelWidth >> 1;
+   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+   // The size of the block is equal to the kernel size
+   shared[ threadIdx.x ] = fetchKernel( threadIdx.x );
+
+   __syncthreads();
+
+   Real result = 0;
+
+   for( Index i = -radius; i <= radius; i++ ) {
+      Index elementIndex = i + ix;
+      Index kernelIndex = i + radius;
+
+      if( elementIndex < 0 || elementIndex >= endX ) {
+         result = convolve( result, fetchBoundary( elementIndex ), shared[ kernelIndex ] );
+      }
+      else {
+         result = convolve( result, fetchData( elementIndex ), shared[ kernelIndex ] );
+      }
+   }
+
+   store( ix, result );
+}
+
+template<>
+struct Convolution< 2, TNL::Devices::Cuda >
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 2, Index >;
+
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      Index kernelElementCount = 1;
+
+      for( Index i = 0; i < kernelSize.getSize(); i++ )
+         kernelElementCount *= kernelSize[ i ];
+
+      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );
+
+      configuration.blockSize.x = kernelSize.x();
+      configuration.blockSize.y = kernelSize.y();
+
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+      configuration.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
+   }
+};
+
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename FetchKernel,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution2D( Index kernelWidth,
+               Index kernelHeight,
+               Index endX,
+               Index endY,
+               FetchData fetchData,
+               FetchBoundary fetchBoundary,
+               FetchKernel fetchKernel,
+               Convolve convolve,
+               Store store )
+{
+   Real* shared = TNL::Cuda::getSharedMemory< Real >();
+
+   Index radiusY = kernelHeight >> 1;
+   Index radiusX = kernelWidth >> 1;
+
+   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
+   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+   Index threadIndex = threadIdx.x + blockDim.x * threadIdx.y;
+
+   // The size of the block is equal to the kernel size
+   shared[ threadIndex ] = fetchKernel( threadIdx.x, threadIdx.y );
+
+   __syncthreads();
+
+   Real result = 0;
+
+   for( Index j = -radiusY; j <= radiusY; j++ ) {
+      Index elementIndexY = j + iy;
+      Index kernelIndexY = j + radiusY;
+
+      for( Index i = -radiusX; i <= radiusX; i++ ) {
+         Index elementIndexX = i + ix;
+         Index kernelIndexX = i + radiusX;
+
+         Index threadIndex = kernelIndexX + kernelWidth * kernelIndexY;
+
+         if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY ) {
+            result = convolve( result, fetchBoundary( elementIndexX, elementIndexY ), shared[ threadIndex ] );
+         }
+         else {
+            result = convolve( result, fetchData( elementIndexX, elementIndexY ), shared[ threadIndex ] );
+         }
+      }
+   }
+
+   store( ix, iy, result );
+}
+
+template<>
+struct Convolution< 3, TNL::Devices::Cuda >
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 3, Index >;
+   // Fills the launch configuration (shared memory size, block size, grid size) for convolution3D.
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      Index kernelElementCount = 1;
+
+      for( Index i = 0; i < kernelSize.getSize(); i++ )
+         kernelElementCount *= kernelSize[ i ];
+      // One Real of dynamic shared memory per convolution kernel coefficient.
+      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );
+      // The block has the same extents as the convolution kernel.
+      configuration.blockSize.x = kernelSize.x();
+      configuration.blockSize.y = kernelSize.y();
+      configuration.blockSize.z = kernelSize.z();
+      // One grid dimension per data axis; the z axis has to set gridSize.z and must not overwrite gridSize.y.
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+      configuration.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
+      configuration.gridSize.z =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
+   }
+};
+
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename FetchKernel,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution3D( Index kernelWidth,
+               Index kernelHeight,
+               Index kernelDepth,
+               Index endX,
+               Index endY,
+               Index endZ,
+               FetchData fetchData,
+               FetchBoundary fetchBoundary,
+               FetchKernel fetchKernel,
+               Convolve convolve,
+               Store store )
+{
+   Real* shared = TNL::Cuda::getSharedMemory< Real >();
+
+   Index iz = threadIdx.z + blockIdx.z * blockDim.z;
+   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
+   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+   Index radiusZ = kernelDepth >> 1;
+   Index radiusY = kernelHeight >> 1;
+   Index radiusX = kernelWidth >> 1;
+
+   Index threadIndex = threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z;
+
+   printf( "%d\n", threadIndex );
+
+   // The size of the block is equal to the kernel size
+   shared[ threadIndex ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z );
+
+   __syncthreads();
+
+   Real result = 0;
+
+   for( Index k = -radiusZ; k <= radiusZ; k++ ) {
+      Index elementIndexZ = k + iz;
+      Index kernelIndexZ = k + radiusZ;
+
+      for( Index j = -radiusY; j <= radiusY; j++ ) {
+         Index elementIndexY = j + iy;
+         Index kernelIndexY = j + radiusY;
+
+         for( Index i = -radiusX; i <= radiusX; i++ ) {
+            Index elementIndexX = i + ix;
+            Index kernelIndexX = i + radiusX;
+
+            Index threadIndex = kernelIndexX + kernelWidth * kernelIndexY + kernelWidth * kernelHeight * kernelIndexZ;
+
+            if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0
+                || elementIndexZ >= endZ )
+            {
+               result = convolve( result,
+                                  fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ),
+                                  shared[threadIndex] );
+            }
+            else {
+               result = convolve( result,
+                                  fetchData( elementIndexX, elementIndexY, elementIndexZ ),
+                                  shared[threadIndex] );
+            }
+         }
+      }
+   }
+
+   store( ix, iy, iz, result );
+}
+
+#endif
diff --git a/src/Benchmarks/Convolution/support/Launcher.h b/src/Benchmarks/Convolution/support/Launcher.h
index 208deb080..c336dc8e5 100644
--- a/src/Benchmarks/Convolution/support/Launcher.h
+++ b/src/Benchmarks/Convolution/support/Launcher.h
@@ -5,7 +5,14 @@
 #include <TNL/Cuda/KernelLaunch.h>
 
 template< int Dimension, typename Device >
-struct Convolution;
+struct Convolution {
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< Dimension, Index >;
+   // Declaration only: the kernel headers specialize Convolution and provide the body of setup().
+   template< typename Index, typename Real >
+   static void
+   setup(TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize);
+};
 
 template< int Dimension, typename Device >
 struct Launcher;
@@ -29,7 +36,7 @@ public:
    {
       TNL::Cuda::LaunchConfiguration launchConfig;
 
-      ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize);
+      ConvolutionKernel::setup<Index, Real>(launchConfig, dimensions, kernelSize);
 
       constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
 
@@ -65,7 +72,7 @@ public:
    {
       TNL::Cuda::LaunchConfiguration launchConfig;
 
-      ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize);
+      ConvolutionKernel::setup<Index, Real>(launchConfig, dimensions, kernelSize);
 
       constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
 
@@ -107,7 +114,7 @@ public:
 
       TNL::Cuda::LaunchConfiguration launchConfig;
 
-      ConvolutionKernel::setup<Index>(launchConfig, dimensions, kernelSize);
+      ConvolutionKernel::setup<Index, Real>(launchConfig, dimensions, kernelSize);
 
       constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
 
diff --git a/src/Benchmarks/Convolution/templates/main_benchmark.h b/src/Benchmarks/Convolution/templates/main_benchmark.h
index 5124922e6..4be1e80e5 100644
--- a/src/Benchmarks/Convolution/templates/main_benchmark.h
+++ b/src/Benchmarks/Convolution/templates/main_benchmark.h
@@ -1,11 +1,12 @@
 
-#include "../kernels/naive.h"
+#define KERNEL KERNEL_VALUE
+#define DIMENSION DIMENSION_VALUE
+
+#include KERNEL_VALUE
 #include "../support/DummyBenchmark.h"
 
 #include <TNL/Config/parseCommandLine.h>
 
-#define DIMENSION DIMENSION_VALUE
-
 using TaskBenchmark = DummyBenchmark< DIMENSION, TNL::Devices::Cuda >;
 
 int main(int argc, char* argv[])
diff --git a/src/Benchmarks/Convolution/templates/main_solver.h b/src/Benchmarks/Convolution/templates/main_solver.h
index 1a6c33a9b..ab2bc8699 100644
--- a/src/Benchmarks/Convolution/templates/main_solver.h
+++ b/src/Benchmarks/Convolution/templates/main_solver.h
@@ -1,10 +1,12 @@
 
-#include "../kernels/naive.h"
+#define KERNEL KERNEL_VALUE
+#define DIMENSION DIMENSION_VALUE
+
+#include KERNEL
 #include "../support/DummySolver.h"
 
 #include <TNL/Config/parseCommandLine.h>
 
-#define DIMENSION DIMENSION_VALUE
 
 using TaskSolver = DummySolver< DIMENSION, TNL::Devices::Cuda >;
 
-- 
GitLab


From d3b6676ebf2911d6360ca65f0d55c4a0c2dc1ee2 Mon Sep 17 00:00:00 2001
From: hayeuyur <hayeuyur@fit.cvut.cz>
Date: Sat, 2 Apr 2022 18:38:43 +0200
Subject: [PATCH 07/19] Add logging of the id to the benchmark output

---
 src/Benchmarks/Convolution/kernels/sharedKernel.h   |  8 ++++----
 src/Benchmarks/Convolution/support/DummyBenchmark.h | 13 ++++++++-----
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/Benchmarks/Convolution/kernels/sharedKernel.h b/src/Benchmarks/Convolution/kernels/sharedKernel.h
index de76dd32c..c4f7f0199 100644
--- a/src/Benchmarks/Convolution/kernels/sharedKernel.h
+++ b/src/Benchmarks/Convolution/kernels/sharedKernel.h
@@ -3,10 +3,10 @@
 
 #ifdef HAVE_CUDA
 
-   #include <TNL/Devices/Cuda.h>
-   #include <TNL/Containers/StaticVector.h>
-   #include <TNL/Cuda/LaunchHelpers.h>
-   #include <TNL/Cuda/SharedMemory.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Containers/StaticVector.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Cuda/SharedMemory.h>
 
 template< int Dimension, typename Device >
 struct Convolution;
diff --git a/src/Benchmarks/Convolution/support/DummyBenchmark.h b/src/Benchmarks/Convolution/support/DummyBenchmark.h
index 804ca03d7..005096d12 100644
--- a/src/Benchmarks/Convolution/support/DummyBenchmark.h
+++ b/src/Benchmarks/Convolution/support/DummyBenchmark.h
@@ -50,11 +50,14 @@ public:
       TNL_ASSERT_GT( kernelStep, 0, "Kernel step must be a positive number" );
       TNL_ASSERT_EQ( kernelStep % 2, 0, "Kernel step must be even" );
 
-      time( benchmark, start, end, dimensionStep, minKernelSize, maxKernelSize, kernelStep );
+      TNL::String id = parameters.getParameter<TNL::String>("id");
+
+      time( id, benchmark, start, end, dimensionStep, minKernelSize, maxKernelSize, kernelStep );
    }
 
    virtual void
-   time( TNLBenchmark& benchmark,
+   time( const TNL::String& id,
+         TNLBenchmark& benchmark,
          const Vector& minDimension,
          const Vector& maxDimension,
          const int dimensionStep,
@@ -69,7 +72,7 @@ public:
          currentKernelSize = minKernelSize;
 
          do {
-            timeConvolution( benchmark, currentDimension, currentKernelSize );
+            timeConvolution( id, benchmark, currentDimension, currentKernelSize );
 
             currentKernelSize[ 0 ] += kernelStep;
 
@@ -94,11 +97,11 @@ public:
    }
 
    void
-   timeConvolution( TNLBenchmark& benchmark, const Vector& dimension, const Vector& kernelSize ) const
+   timeConvolution( const TNL::String& id, TNLBenchmark& benchmark, const Vector& dimension, const Vector& kernelSize ) const
    {
       auto device = TNL::getType< Device >();
 
-      typename TNLBenchmark::MetadataColumns columns;
+      typename TNLBenchmark::MetadataColumns columns = {{ "id", id }};
 
       size_t elementsCount = 1;
       size_t kernelElementsCount = 1;
-- 
GitLab


From 9728480563a24e4ee9eef4e1ab16355514318c87 Mon Sep 17 00:00:00 2001
From: hayeuyur <hayeuyur@fit.cvut.cz>
Date: Sat, 2 Apr 2022 21:03:49 +0200
Subject: [PATCH 08/19] Implement kernel, which loads data in shared memory

---
 src/Benchmarks/Convolution/CMakeLists.txt     |   8 +
 .../Convolution/kernels/sharedData.h          | 432 ++++++++++++++++++
 .../Convolution/kernels/sharedKernel.h        |  42 +-
 3 files changed, 466 insertions(+), 16 deletions(-)
 create mode 100644 src/Benchmarks/Convolution/kernels/sharedData.h

diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt
index ec637f6dd..b51e7de7d 100644
--- a/src/Benchmarks/Convolution/CMakeLists.txt
+++ b/src/Benchmarks/Convolution/CMakeLists.txt
@@ -38,3 +38,11 @@ GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/shar
 GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedKernel.h")
 GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedKernel.h")
 GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedKernel.h")
+
+GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/sharedData.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/sharedData.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/sharedData.h")
+
+GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedData.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedData.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedData.h")
diff --git a/src/Benchmarks/Convolution/kernels/sharedData.h b/src/Benchmarks/Convolution/kernels/sharedData.h
new file mode 100644
index 000000000..f6dbe48fb
--- /dev/null
+++ b/src/Benchmarks/Convolution/kernels/sharedData.h
@@ -0,0 +1,432 @@
+#pragma once
+
+#ifdef HAVE_CUDA
+
+/**
+ * This method stores image tile into shared memory
+ * and then calculates convolution.
+ *
+ * Thanks for the idea  https://www.evl.uic.edu/sjames/cs525/final.html
+ */
+
+   #include <TNL/Devices/Cuda.h>
+   #include <TNL/Containers/StaticVector.h>
+   #include <TNL/Cuda/LaunchHelpers.h>
+   #include <TNL/Cuda/SharedMemory.h>
+
+template< int Dimension, typename Device >
+struct Convolution;
+
+template<>
+struct Convolution< 1, TNL::Devices::Cuda >
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 1, Index >;
+
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      Index kernelElementCount = 1;
+
+      for( Index i = 0; i < kernelSize.getSize(); i++ )
+         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
+
+      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );
+
+      configuration.blockSize.x = kernelSize.x();
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+   }
+};
+
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename FetchKernel,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution1D( Index kernelWidth,
+               Index endX,
+               FetchData fetchData,
+               FetchBoundary fetchBoundary,
+               FetchKernel fetchKernel,
+               Convolve convolve,
+               Store store )
+{
+   // Expects blockDim.x == kernelWidth and 2 * blockDim.x - 1 values of Real in shared memory (see setup above).
+   Real* tile = TNL::Cuda::getSharedMemory< Real >();
+   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+   Index radius = kernelWidth >> 1;
+
+   // No early return here: every thread of the block has to fill its part of the
+   // tile and reach __syncthreads(), even when its own output index is out of range.
+   // tile[ t ] holds the element with index ( first index of the block ) + t - radius.
+   Index lhs = ix - radius;
+   if( lhs < 0 || lhs >= endX ) {
+      tile[ threadIdx.x ] = fetchBoundary( lhs );
+   }
+   else {
+      tile[ threadIdx.x ] = fetchData( lhs );
+   }
+
+   // Right halo: the remaining blockDim.x - 1 slots directly follow the first blockDim.x ones.
+   if( threadIdx.x + 1 < blockDim.x ) {
+      Index rhs = lhs + Index( blockDim.x );
+      if( rhs >= endX ) {
+         tile[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs );
+      }
+      else {
+         tile[ threadIdx.x + blockDim.x ] = fetchData( rhs );
+      }
+   }
+
+   __syncthreads();
+
+   // Threads past the end of the data only helped to load the tile.
+   if( ix >= endX )
+      return;
+
+   Real result = 0;
+
+   #pragma unroll
+   for( Index i = 0; i < kernelWidth; i++ )
+      result = convolve( result, tile[ i + threadIdx.x ], fetchKernel( i ) );
+
+   store( ix, result );
+}
+
+template<>
+struct Convolution< 2, TNL::Devices::Cuda >
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 2, Index >;
+
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      Index kernelElementCount = 1;
+
+      for( Index i = 0; i < kernelSize.getSize(); i++ )
+         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
+
+      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );
+
+      configuration.blockSize.x = kernelSize.x();
+      configuration.blockSize.y = kernelSize.y();
+
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+      configuration.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
+   }
+};
+
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename FetchKernel,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution2D( Index kernelWidth,
+               Index kernelHeight,
+               Index endX,
+               Index endY,
+               FetchData fetchData,
+               FetchBoundary fetchBoundary,
+               FetchKernel fetchKernel,
+               Convolve convolve,
+               Store store )
+{
+   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
+   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+   if( ix >= endX || iy >= endY )
+      return;
+
+   Real* shared = TNL::Cuda::getSharedMemory< Real >();
+
+   Index radiusY = kernelHeight >> 1;
+   Index radiusX = kernelWidth >> 1;
+
+   Index x, y, index;
+
+   // Top Left
+   x = ix - radiusX;
+   y = iy - radiusY;
+
+   index = threadIdx.x + threadIdx.y * blockDim.x;
+
+   if( x < 0 || y < 0 ) {
+      shared[ index ] = fetchBoundary( x, y );
+   }
+   else {
+      shared[ index ] = fetchData( x, y );
+   }
+
+   // Top right
+   x = ix + radiusX;
+   y = iy - radiusY;
+
+   index = radiusX + threadIdx.x + threadIdx.y * blockDim.x;
+
+   if( x >= endX || y < 0 ) {
+      shared[ index ] = fetchBoundary( x, y );
+   }
+   else {
+      shared[ index ] = fetchData( x, y );
+   }
+
+   // Bottom Left
+   x = ix - radiusX;
+   y = iy + radiusY;
+
+   index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x;
+
+   if( x < 0 || y >= endY ) {
+      shared[ index ] = fetchBoundary( x, y );
+   }
+   else {
+      shared[ index ] = fetchData( x, y );
+   }
+
+   // Bottom Right
+   x = ix + radiusX;
+   y = iy + radiusY;
+
+   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x;
+
+   if( x >= endX || y >= endY ) {
+      shared[ index ] = fetchBoundary( x, y );
+   }
+   else {
+      shared[ index ] = fetchData( x, y );
+   }
+
+   __syncthreads();
+
+   Real result = 0;
+
+   for( Index j = 0; j <= radiusY; j++ ) {
+      Index align = ( j + threadIdx.y ) * blockDim.y;
+
+      for( Index i = 0; i <= radiusX; i++ ) {
+         Index index = i + threadIdx.x + align;
+
+         result = convolve( result, shared[ index ], fetchKernel( i, j ) );
+      }
+   }
+
+   store( ix, iy, result );
+}
+
+template<>
+struct Convolution< 3, TNL::Devices::Cuda >
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 3, Index >;
+   // Fills the launch configuration (shared memory size, block size, grid size) for convolution3D.
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      Index kernelElementCount = 1;
+
+      for( Index i = 0; i < kernelSize.getSize(); i++ )
+         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
+      // Dynamic shared memory for a data tile of ( 2 * kernelSize - 1 ) values of Real along each axis.
+      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );
+      // The block has the same extents as the convolution kernel.
+      configuration.blockSize.x = kernelSize.x();
+      configuration.blockSize.y = kernelSize.y();
+      configuration.blockSize.z = kernelSize.z();
+      // One grid dimension per data axis; the z axis has to set gridSize.z and must not overwrite gridSize.y.
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+      configuration.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
+      configuration.gridSize.z =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
+   }
+};
+
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename FetchKernel,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution3D( Index kernelWidth,
+               Index kernelHeight,
+               Index kernelDepth,
+               Index endX,
+               Index endY,
+               Index endZ,
+               FetchData fetchData,
+               FetchBoundary fetchBoundary,
+               FetchKernel fetchKernel,
+               Convolve convolve,
+               Store store )
+{
+   Index iz = threadIdx.z + blockIdx.z * blockDim.z;
+   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
+   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+   if( ix >= endX || iy >= endY || iz >= endZ )
+      return;
+
+   Real* shared = TNL::Cuda::getSharedMemory< Real >();
+
+   Index radiusZ = kernelDepth >> 1;
+   Index radiusY = kernelHeight >> 1;
+   Index radiusX = kernelWidth >> 1;
+
+   Index x, y, z, index;
+
+   // Z: 0 Y: 0 X: 0
+   x = ix - radiusX;
+   y = iy - radiusY;
+   z = iz - radiusZ;
+
+   index = threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
+
+   if( x < 0 || y < 0 || z < 0 ) {
+      shared[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      shared[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 0 Y: 0 X: 1
+   x = ix + radiusX;
+   y = iy - radiusY;
+   z = iz - radiusZ;
+
+   index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
+
+   if( x >= endX || y < 0 || z < 0 ) {
+      shared[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      shared[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 0 Y: 1 X: 0
+   x = ix - radiusX;
+   y = iy + radiusY;
+   z = iz - radiusZ;
+
+   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
+
+   if( x < 0 || y >= endY || z < 0 ) {
+      shared[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      shared[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 1 Y: 0 X: 0
+   x = ix - radiusX;
+   y = iy - radiusY;
+   z = iz + radiusZ;
+
+   index = threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
+
+   if( x < 0 || y < 0 || z >= endZ ) {
+      shared[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      shared[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 0 Y: 1 X: 1
+   x = ix + radiusX;
+   y = iy + radiusY;
+   z = iz - radiusZ;
+
+   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
+
+   if( x >= endX || y >= endY || z < 0 ) {
+      shared[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      shared[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 1 Y: 0 X: 1
+   x = ix + radiusX;
+   y = iy - radiusY;
+   z = iz + radiusZ;
+
+   index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
+
+   if( x >= endX || y < 0 || z >= endZ ) {
+      shared[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      shared[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 1 Y: 1 X: 0
+   x = ix - radiusX;
+   y = iy + radiusY;
+   z = iz + radiusZ;
+
+   index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
+
+   if( x < 0 || y >= endY || z >= endZ ) {
+      shared[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      shared[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 1 Y: 1 X: 1
+   x = ix + radiusX;
+   y = iy + radiusY;
+   z = iz + radiusZ;
+
+   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
+
+   if( x >= endX || y >= endY || z >= endZ ) {
+      shared[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      shared[ index ] = fetchData( x, y, z );
+   }
+
+   __syncthreads();
+
+   Real result = 0;
+
+   for( Index k = 0; k <= radiusZ; k++ ) {
+      Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x;
+
+      for( Index j = 0; j <= radiusY; j++ ) {
+         Index xAlign = ( j + threadIdx.y ) * blockDim.y;
+
+         for( Index i = 0; i <= radiusX; i++ ) {
+            Index index = i + threadIdx.x + xAlign + xyAlign;
+
+            result = convolve( result, shared[ index ], fetchKernel( i, j, k ) );
+         }
+      }
+   }
+
+   store( ix, iy, iz, result );
+}
+
+#endif
diff --git a/src/Benchmarks/Convolution/kernels/sharedKernel.h b/src/Benchmarks/Convolution/kernels/sharedKernel.h
index c4f7f0199..eb6537270 100644
--- a/src/Benchmarks/Convolution/kernels/sharedKernel.h
+++ b/src/Benchmarks/Convolution/kernels/sharedKernel.h
@@ -3,10 +3,14 @@
 
 #ifdef HAVE_CUDA
 
-#include <TNL/Devices/Cuda.h>
-#include <TNL/Containers/StaticVector.h>
-#include <TNL/Cuda/LaunchHelpers.h>
-#include <TNL/Cuda/SharedMemory.h>
+   #include <TNL/Devices/Cuda.h>
+   #include <TNL/Containers/StaticVector.h>
+   #include <TNL/Cuda/LaunchHelpers.h>
+   #include <TNL/Cuda/SharedMemory.h>
+
+/**
+ * This method stores kernel in the shared memory to reduce amount of loads.
+ */
 
 template< int Dimension, typename Device >
 struct Convolution;
@@ -52,10 +56,14 @@ convolution1D( Index kernelWidth,
                Convolve convolve,
                Store store )
 {
+   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+   if( ix >= endX )
+      return;
+
    Real* shared = TNL::Cuda::getSharedMemory< Real >();
 
    Index radius = kernelWidth >> 1;
-   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
 
    // The size of the block is equal to the kernel size
    shared[ threadIdx.x ] = fetchKernel( threadIdx.x );
@@ -126,14 +134,17 @@ convolution2D( Index kernelWidth,
                Convolve convolve,
                Store store )
 {
+   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
+   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+   if( ix >= endX || iy >= endY )
+      return;
+
    Real* shared = TNL::Cuda::getSharedMemory< Real >();
 
    Index radiusY = kernelHeight >> 1;
    Index radiusX = kernelWidth >> 1;
 
-   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
-   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
-
    Index threadIndex = threadIdx.x + blockDim.x * threadIdx.y;
 
    // The size of the block is equal to the kernel size
@@ -217,12 +228,15 @@ convolution3D( Index kernelWidth,
                Convolve convolve,
                Store store )
 {
-   Real* shared = TNL::Cuda::getSharedMemory< Real >();
-
    Index iz = threadIdx.z + blockIdx.z * blockDim.z;
    Index iy = threadIdx.y + blockIdx.y * blockDim.y;
    Index ix = threadIdx.x + blockIdx.x * blockDim.x;
 
+   if( ix >= endX || iy >= endY || iz >= endZ )
+      return;
+
+   Real* shared = TNL::Cuda::getSharedMemory< Real >();
+
    Index radiusZ = kernelDepth >> 1;
    Index radiusY = kernelHeight >> 1;
    Index radiusX = kernelWidth >> 1;
@@ -255,14 +269,10 @@ convolution3D( Index kernelWidth,
             if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0
                 || elementIndexZ >= endZ )
             {
-               result = convolve( result,
-                                  fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ),
-                                  shared[threadIndex] );
+               result = convolve( result, fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), shared[ threadIndex ] );
             }
             else {
-               result = convolve( result,
-                                  fetchData( elementIndexX, elementIndexY, elementIndexZ ),
-                                  shared[threadIndex] );
+               result = convolve( result, fetchData( elementIndexX, elementIndexY, elementIndexZ ), shared[ threadIndex ] );
             }
          }
       }
-- 
GitLab


From 3571b27e99d774d874142950feb3f2aa8cd9a91d Mon Sep 17 00:00:00 2001
From: hayeuyur <hayeuyur@fit.cvut.cz>
Date: Sun, 3 Apr 2022 11:20:48 +0200
Subject: [PATCH 09/19] Move kernel launching in kernel definition

---
 src/Benchmarks/Convolution/kernels/naive.h    | 252 +++++++++++-----
 .../Convolution/kernels/sharedData.h          | 272 ++++++++++++------
 .../Convolution/kernels/sharedKernel.h        | 266 +++++++++++------
 .../Convolution/support/DummyTask.h           |  73 +++--
 src/Benchmarks/Convolution/support/Launcher.h | 136 ---------
 src/Benchmarks/Convolution/support/Solver.h   |   2 -
 6 files changed, 594 insertions(+), 407 deletions(-)
 delete mode 100644 src/Benchmarks/Convolution/support/Launcher.h

diff --git a/src/Benchmarks/Convolution/kernels/naive.h b/src/Benchmarks/Convolution/kernels/naive.h
index d69e219d2..4326e0d74 100644
--- a/src/Benchmarks/Convolution/kernels/naive.h
+++ b/src/Benchmarks/Convolution/kernels/naive.h
@@ -3,40 +3,21 @@
 
 #ifdef HAVE_CUDA
 
-#include <TNL/Devices/Cuda.h>
-#include <TNL/Containers/StaticVector.h>
-#include <TNL/Cuda/LaunchHelpers.h>
+   #include <TNL/Devices/Cuda.h>
+   #include <TNL/Containers/StaticVector.h>
+   #include <TNL/Cuda/LaunchHelpers.h>
 
 /**
  * There are several pitfalls with such configuration.
  *
  * 1. At first we don't use shared memory
- * 2. At second we don't control block size, so we may launch extremely small kernels or otherwise we can launch extremely large kernels.
+ * 2. At second we don't control block size, so we may launch extremely small kernels or otherwise we can launch extremely large
+ * kernels.
  */
 
 template< int Dimension, typename Device >
 struct Convolution;
 
-template<>
-struct Convolution< 1, TNL::Devices::Cuda >
-{
-public:
-   template< typename Index >
-   using Vector = TNL::Containers::StaticVector< 1, Index >;
-
-   template< typename Index, typename Real >
-   static void
-   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
-   {
-      configuration.dynamicSharedMemorySize = 0;
-
-      // TODO: - Benchmark the best value
-      configuration.blockSize.x = kernelSize.x();
-      configuration.gridSize.x =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
-   }
-};
-
 template< typename Index,
           typename Real,
           typename FetchData,
@@ -78,29 +59,6 @@ convolution1D( Index kernelWidth,
    store( ix, result );
 }
 
-template<>
-struct Convolution< 2, TNL::Devices::Cuda >
-{
-public:
-   template< typename Index >
-   using Vector = TNL::Containers::StaticVector< 2, Index >;
-
-   template< typename Index, typename Real >
-   static void
-   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
-   {
-      configuration.dynamicSharedMemorySize = 0;
-
-      configuration.blockSize.x = kernelSize.x();
-      configuration.blockSize.y = kernelSize.y();
-
-      configuration.gridSize.x =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
-      configuration.gridSize.y =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
-   }
-};
-
 template< typename Index,
           typename Real,
           typename FetchData,
@@ -152,33 +110,6 @@ convolution2D( Index kernelWidth,
    store( ix, iy, result );
 }
 
-template<>
-struct Convolution< 3, TNL::Devices::Cuda >
-{
-public:
-   template< typename Index >
-   using Vector = TNL::Containers::StaticVector< 3, Index >;
-
-   template< typename Index, typename Real >
-   static void
-   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
-   {
-      configuration.dynamicSharedMemorySize = 0;
-
-      // TODO: - Benchmark the best value
-      configuration.blockSize.x = kernelSize.x();
-      configuration.blockSize.y = kernelSize.y();
-      configuration.blockSize.z = kernelSize.z();
-
-      configuration.gridSize.x =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
-      configuration.gridSize.y =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
-      configuration.gridSize.y =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
-   }
-};
-
 template< typename Index,
           typename Real,
           typename FetchData,
@@ -244,4 +175,177 @@ convolution3D( Index kernelWidth,
    store( ix, iy, iz, result );
 }
 
+template< int Dimension, typename Device >
+struct Convolution;
+
+template<>
+struct Convolution< 1, TNL::Devices::Cuda >  // host-side launcher for the 1D kernel of naive.h
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 1, Index >;
+   // setup: fills `configuration` - no dynamic shared memory, block extent = kernel extent, grid derived from `dimensions`.
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      configuration.dynamicSharedMemorySize = 0;
+
+      // TODO: - Benchmark the best value
+      configuration.blockSize.x = kernelSize.x();
+      configuration.gridSize.x =  // smaller of getMaxGridSize() and getNumberOfBlocks( dimensions.x(), blockSize.x )
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+   }
+   // execute: builds a launch configuration with setup() and launches convolution1D with the supplied functors.
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+      // NOTE(review): the `true` template flag and the literal 0 are launchKernel options - confirm their meaning in TNL.
+      TNL::Cuda::launchKernel< true >(
+         kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store );
+   };
+};
+
+template<>
+struct Convolution< 2, TNL::Devices::Cuda >  // host-side launcher for the 2D kernel of naive.h
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 2, Index >;
+   // setup: fills `configuration` - no dynamic shared memory, block extent = kernel extent, grid derived from `dimensions`.
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      configuration.dynamicSharedMemorySize = 0;
+
+      configuration.blockSize.x = kernelSize.x();
+      configuration.blockSize.y = kernelSize.y();
+      // Per axis: the smaller of getMaxGridSize() and getNumberOfBlocks( extent, block extent ).
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+      configuration.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
+   }
+   // execute: builds a launch configuration with setup() and launches convolution2D with the supplied functors.
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+      // NOTE(review): the `true` template flag and the literal 0 are launchKernel options - confirm their meaning in TNL.
+      TNL::Cuda::launchKernel< true >( kernel,
+                                       0,
+                                       configuration,
+                                       kernelSize.x(),
+                                       kernelSize.y(),
+                                       dimensions.x(),
+                                       dimensions.y(),
+                                       fetchData,
+                                       fetchBoundary,
+                                       fetchKernel,
+                                       convolve,
+                                       store );
+   };
+};
+
+template<>
+struct Convolution< 3, TNL::Devices::Cuda >  // host-side launcher for the 3D kernel of naive.h
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 3, Index >;
+   // setup: fills `configuration` - no dynamic shared memory, block extent = kernel extent, grid derived from `dimensions`.
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      configuration.dynamicSharedMemorySize = 0;
+
+      // TODO: - Benchmark the best value
+      configuration.blockSize.x = kernelSize.x();
+      configuration.blockSize.y = kernelSize.y();
+      configuration.blockSize.z = kernelSize.z();
+      // One grid extent per axis; the z block count belongs in gridSize.z and must not overwrite gridSize.y.
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+      configuration.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
+      configuration.gridSize.z =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
+   }
+   // execute: builds a launch configuration with setup() and launches convolution3D with the supplied functors.
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+      // NOTE(review): the `true` template flag and the literal 0 are launchKernel options - confirm their meaning in TNL.
+      TNL::Cuda::launchKernel< true >( kernel,
+                                       0,
+                                       configuration,
+                                       kernelSize.x(),
+                                       kernelSize.y(),
+                                       kernelSize.z(),
+                                       dimensions.x(),
+                                       dimensions.y(),
+                                       dimensions.z(),
+                                       fetchData,
+                                       fetchBoundary,
+                                       fetchKernel,
+                                       convolve,
+                                       store );
+   }
+};
+
 #endif
diff --git a/src/Benchmarks/Convolution/kernels/sharedData.h b/src/Benchmarks/Convolution/kernels/sharedData.h
index f6dbe48fb..0c0e0a2a6 100644
--- a/src/Benchmarks/Convolution/kernels/sharedData.h
+++ b/src/Benchmarks/Convolution/kernels/sharedData.h
@@ -14,33 +14,6 @@
    #include <TNL/Cuda/LaunchHelpers.h>
    #include <TNL/Cuda/SharedMemory.h>
 
-template< int Dimension, typename Device >
-struct Convolution;
-
-template<>
-struct Convolution< 1, TNL::Devices::Cuda >
-{
-public:
-   template< typename Index >
-   using Vector = TNL::Containers::StaticVector< 1, Index >;
-
-   template< typename Index, typename Real >
-   static void
-   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
-   {
-      Index kernelElementCount = 1;
-
-      for( Index i = 0; i < kernelSize.getSize(); i++ )
-         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
-
-      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );
-
-      configuration.blockSize.x = kernelSize.x();
-      configuration.gridSize.x =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
-   }
-};
-
 template< typename Index,
           typename Real,
           typename FetchData,
@@ -100,34 +73,6 @@ convolution1D( Index kernelWidth,
    store( ix, result );
 }
 
-template<>
-struct Convolution< 2, TNL::Devices::Cuda >
-{
-public:
-   template< typename Index >
-   using Vector = TNL::Containers::StaticVector< 2, Index >;
-
-   template< typename Index, typename Real >
-   static void
-   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
-   {
-      Index kernelElementCount = 1;
-
-      for( Index i = 0; i < kernelSize.getSize(); i++ )
-         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
-
-      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );
-
-      configuration.blockSize.x = kernelSize.x();
-      configuration.blockSize.y = kernelSize.y();
-
-      configuration.gridSize.x =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
-      configuration.gridSize.y =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
-   }
-};
-
 template< typename Index,
           typename Real,
           typename FetchData,
@@ -229,37 +174,6 @@ convolution2D( Index kernelWidth,
    store( ix, iy, result );
 }
 
-template<>
-struct Convolution< 3, TNL::Devices::Cuda >
-{
-public:
-   template< typename Index >
-   using Vector = TNL::Containers::StaticVector< 3, Index >;
-
-   template< typename Index, typename Real >
-   static void
-   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
-   {
-      Index kernelElementCount = 1;
-
-      for( Index i = 0; i < kernelSize.getSize(); i++ )
-         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
-
-      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );
-
-      configuration.blockSize.x = kernelSize.x();
-      configuration.blockSize.y = kernelSize.y();
-      configuration.blockSize.z = kernelSize.z();
-
-      configuration.gridSize.x =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
-      configuration.gridSize.y =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
-      configuration.gridSize.y =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
-   }
-};
-
 template< typename Index,
           typename Real,
           typename FetchData,
@@ -429,4 +343,190 @@ convolution3D( Index kernelWidth,
    store( ix, iy, iz, result );
 }
 
+template< int Dimension, typename Device >
+struct Convolution;
+
+template<>
+struct Convolution< 1, TNL::Devices::Cuda >  // host-side launcher for the 1D kernel of sharedData.h
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 1, Index >;
+   // setup: fills `configuration` - dynamic shared memory size, block extent = kernel extent, grid derived from `dimensions`.
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      Index kernelElementCount = 1;
+      // Despite its name, this becomes the product of ( 2 * kernelSize[ i ] - 1 ) over all axes.
+      for( Index i = 0; i < kernelSize.getSize(); i++ )
+         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
+
+      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );  // one Real per counted element
+
+      configuration.blockSize.x = kernelSize.x();
+      configuration.gridSize.x =  // smaller of getMaxGridSize() and getNumberOfBlocks( dimensions.x(), blockSize.x )
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+   }
+   // execute: builds a launch configuration with setup() and launches convolution1D with the supplied functors.
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+      // NOTE(review): the `true` template flag and the literal 0 are launchKernel options - confirm their meaning in TNL.
+      TNL::Cuda::launchKernel< true >(
+         kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store );
+   };
+};
+
+template<>
+struct Convolution< 2, TNL::Devices::Cuda >  // host-side launcher for the 2D kernel of sharedData.h
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 2, Index >;
+   // setup: fills `configuration` - dynamic shared memory size, block extent = kernel extent, grid derived from `dimensions`.
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      Index kernelElementCount = 1;
+      // Despite its name, this becomes the product of ( 2 * kernelSize[ i ] - 1 ) over all axes.
+      for( Index i = 0; i < kernelSize.getSize(); i++ )
+         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
+
+      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );  // one Real per counted element
+
+      configuration.blockSize.x = kernelSize.x();
+      configuration.blockSize.y = kernelSize.y();
+      // Per axis: the smaller of getMaxGridSize() and getNumberOfBlocks( extent, block extent ).
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+      configuration.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
+   }
+   // execute: builds a launch configuration with setup() and launches convolution2D with the supplied functors.
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+      // NOTE(review): the `true` template flag and the literal 0 are launchKernel options - confirm their meaning in TNL.
+      TNL::Cuda::launchKernel< true >( kernel,
+                                       0,
+                                       configuration,
+                                       kernelSize.x(),
+                                       kernelSize.y(),
+                                       dimensions.x(),
+                                       dimensions.y(),
+                                       fetchData,
+                                       fetchBoundary,
+                                       fetchKernel,
+                                       convolve,
+                                       store );
+   };
+};
+
+template<>
+struct Convolution< 3, TNL::Devices::Cuda >  // host-side launcher for the 3D kernel of sharedData.h
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 3, Index >;
+   // setup: fills `configuration` - dynamic shared memory size, block extent = kernel extent, grid derived from `dimensions`.
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      Index kernelElementCount = 1;
+      // Despite its name, this becomes the product of ( 2 * kernelSize[ i ] - 1 ) over all axes.
+      for( Index i = 0; i < kernelSize.getSize(); i++ )
+         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
+
+      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );  // one Real per counted element
+
+      configuration.blockSize.x = kernelSize.x();
+      configuration.blockSize.y = kernelSize.y();
+      configuration.blockSize.z = kernelSize.z();
+      // One grid extent per axis; the z block count belongs in gridSize.z and must not overwrite gridSize.y.
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+      configuration.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
+      configuration.gridSize.z =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
+   }
+   // execute: builds a launch configuration with setup() and launches convolution3D with the supplied functors.
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+      // NOTE(review): the `true` template flag and the literal 0 are launchKernel options - confirm their meaning in TNL.
+      TNL::Cuda::launchKernel< true >( kernel,
+                                       0,
+                                       configuration,
+                                       kernelSize.x(),
+                                       kernelSize.y(),
+                                       kernelSize.z(),
+                                       dimensions.x(),
+                                       dimensions.y(),
+                                       dimensions.z(),
+                                       fetchData,
+                                       fetchBoundary,
+                                       fetchKernel,
+                                       convolve,
+                                       store );
+   }
+};
+
 #endif
diff --git a/src/Benchmarks/Convolution/kernels/sharedKernel.h b/src/Benchmarks/Convolution/kernels/sharedKernel.h
index eb6537270..d3e1e4da3 100644
--- a/src/Benchmarks/Convolution/kernels/sharedKernel.h
+++ b/src/Benchmarks/Convolution/kernels/sharedKernel.h
@@ -15,30 +15,6 @@
 template< int Dimension, typename Device >
 struct Convolution;
 
-template<>
-struct Convolution< 1, TNL::Devices::Cuda >
-{
-public:
-   template< typename Index >
-   using Vector = TNL::Containers::StaticVector< 1, Index >;
-
-   template< typename Index, typename Real >
-   static void
-   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
-   {
-      Index kernelElementCount = 1;
-
-      for( Index i = 0; i < kernelSize.getSize(); i++ )
-         kernelElementCount *= kernelSize[ i ];
-
-      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );
-
-      configuration.blockSize.x = kernelSize.x();
-      configuration.gridSize.x =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
-   }
-};
-
 template< typename Index,
           typename Real,
           typename FetchData,
@@ -87,34 +63,6 @@ convolution1D( Index kernelWidth,
    store( ix, result );
 }
 
-template<>
-struct Convolution< 2, TNL::Devices::Cuda >
-{
-public:
-   template< typename Index >
-   using Vector = TNL::Containers::StaticVector< 2, Index >;
-
-   template< typename Index, typename Real >
-   static void
-   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
-   {
-      Index kernelElementCount = 1;
-
-      for( Index i = 0; i < kernelSize.getSize(); i++ )
-         kernelElementCount *= kernelSize[ i ];
-
-      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );
-
-      configuration.blockSize.x = kernelSize.x();
-      configuration.blockSize.y = kernelSize.y();
-
-      configuration.gridSize.x =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
-      configuration.gridSize.y =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
-   }
-};
-
 template< typename Index,
           typename Real,
           typename FetchData,
@@ -176,37 +124,6 @@ convolution2D( Index kernelWidth,
    store( ix, iy, result );
 }
 
-template<>
-struct Convolution< 3, TNL::Devices::Cuda >
-{
-public:
-   template< typename Index >
-   using Vector = TNL::Containers::StaticVector< 3, Index >;
-
-   template< typename Index, typename Real >
-   static void
-   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
-   {
-      Index kernelElementCount = 1;
-
-      for( Index i = 0; i < kernelSize.getSize(); i++ )
-         kernelElementCount *= kernelSize[ i ];
-
-      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );
-
-      configuration.blockSize.x = kernelSize.x();
-      configuration.blockSize.y = kernelSize.y();
-      configuration.blockSize.z = kernelSize.z();
-
-      configuration.gridSize.x =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
-      configuration.gridSize.y =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
-      configuration.gridSize.y =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
-   }
-};
-
 template< typename Index,
           typename Real,
           typename FetchData,
@@ -281,4 +198,187 @@ convolution3D( Index kernelWidth,
    store( ix, iy, iz, result );
 }
 
+template<>
+struct Convolution< 1, TNL::Devices::Cuda >
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 1, Index >;
+
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      Index kernelElementCount = 1;
+
+      for( Index i = 0; i < kernelSize.getSize(); i++ )
+         kernelElementCount *= kernelSize[ i ];
+
+      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );
+
+      configuration.blockSize.x = kernelSize.x();
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+   }
+
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+
+      TNL::Cuda::launchKernel< true >(
+         kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store );
+   };
+};
+
+template<>
+struct Convolution< 2, TNL::Devices::Cuda >
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 2, Index >;
+
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      Index kernelElementCount = 1;
+
+      for( Index i = 0; i < kernelSize.getSize(); i++ )
+         kernelElementCount *= kernelSize[ i ];
+
+      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );
+
+      configuration.blockSize.x = kernelSize.x();
+      configuration.blockSize.y = kernelSize.y();
+
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+      configuration.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
+   }
+
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+
+      TNL::Cuda::launchKernel< true >( kernel,
+                                       0,
+                                       configuration,
+                                       kernelSize.x(),
+                                       kernelSize.y(),
+                                       dimensions.x(),
+                                       dimensions.y(),
+                                       fetchData,
+                                       fetchBoundary,
+                                       fetchKernel,
+                                       convolve,
+                                       store );
+   };
+};
+
+template<>
+struct Convolution< 3, TNL::Devices::Cuda >
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 3, Index >;
+
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      Index kernelElementCount = 1;
+
+      for( Index i = 0; i < kernelSize.getSize(); i++ )
+         kernelElementCount *= kernelSize[ i ];
+
+      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );
+
+      configuration.blockSize.x = kernelSize.x();
+      configuration.blockSize.y = kernelSize.y();
+      configuration.blockSize.z = kernelSize.z();
+
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+      configuration.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
+      configuration.gridSize.z =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
+   }
+
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+
+      TNL::Cuda::launchKernel< true >( kernel,
+                                       0,
+                                       configuration,
+                                       kernelSize.x(),
+                                       kernelSize.y(),
+                                       kernelSize.z(),
+                                       dimensions.x(),
+                                       dimensions.y(),
+                                       dimensions.z(),
+                                       fetchData,
+                                       fetchBoundary,
+                                       fetchKernel,
+                                       convolve,
+                                       store );
+   };
+};
+
 #endif
diff --git a/src/Benchmarks/Convolution/support/DummyTask.h b/src/Benchmarks/Convolution/support/DummyTask.h
index f7db47e34..e850b64c0 100644
--- a/src/Benchmarks/Convolution/support/DummyTask.h
+++ b/src/Benchmarks/Convolution/support/DummyTask.h
@@ -1,7 +1,28 @@
 
 #pragma once
 
-#include "Launcher.h"
+template< int Dimension, typename Device >
+struct Convolution
+{
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< Dimension, Index >;
+
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store );
+};
 
 template< typename Index, typename Real, int Dimension, typename Device >
 struct DummyTask;
@@ -14,7 +35,7 @@ public:
    using Device = TNL::Devices::Cuda;
    using Vector = TNL::Containers::StaticVector< Dimension, Index >;
    using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
-   using Launcher = Launcher< Dimension, Device >;
+   using ConvolutionLauncher = Convolution< Dimension, Device >;
 
    static void
    exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
@@ -44,13 +65,13 @@ public:
          result[ i ] = resultValue;
       };
 
-      Launcher::exec< Index, Real >( dimensions,
-                                     kernelSize,
-                                     std::forward< decltype( fetchData ) >( fetchData ),
-                                     std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
-                                     std::forward< decltype( fetchKernel ) >( fetchKernel ),
-                                     std::forward< decltype( convolve ) >( convolve ),
-                                     std::forward< decltype( store ) >( store ) );
+      ConvolutionLauncher::execute< Index, Real >( dimensions,
+                                                   kernelSize,
+                                                   std::forward< decltype( fetchData ) >( fetchData ),
+                                                   std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
+                                                   std::forward< decltype( fetchKernel ) >( fetchKernel ),
+                                                   std::forward< decltype( convolve ) >( convolve ),
+                                                   std::forward< decltype( store ) >( store ) );
    }
 };
 
@@ -62,7 +83,7 @@ public:
    using Device = TNL::Devices::Cuda;
    using Vector = TNL::Containers::StaticVector< Dimension, Index >;
    using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
-   using Launcher = Launcher< Dimension, Device >;
+   using ConvolutionLauncher = Convolution< Dimension, Device >;
 
    static void
    exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
@@ -98,13 +119,13 @@ public:
          result[ index ] = resultValue;
       };
 
-      Launcher::exec< Index, Real >( dimensions,
-                                     kernelSize,
-                                     std::forward< decltype( fetchData ) >( fetchData ),
-                                     std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
-                                     std::forward< decltype( fetchKernel ) >( fetchKernel ),
-                                     std::forward< decltype( convolve ) >( convolve ),
-                                     std::forward< decltype( store ) >( store ) );
+      ConvolutionLauncher::execute< Index, Real >( dimensions,
+                                                   kernelSize,
+                                                   std::forward< decltype( fetchData ) >( fetchData ),
+                                                   std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
+                                                   std::forward< decltype( fetchKernel ) >( fetchKernel ),
+                                                   std::forward< decltype( convolve ) >( convolve ),
+                                                   std::forward< decltype( store ) >( store ) );
    }
 };
 
@@ -116,7 +137,7 @@ public:
    using Device = TNL::Devices::Cuda;
    using Vector = TNL::Containers::StaticVector< Dimension, Index >;
    using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
-   using Launcher = Launcher< Dimension, Device >;
+   using ConvolutionLauncher = Convolution< Dimension, Device >;
 
    static void
    exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
@@ -125,7 +146,7 @@ public:
       {
          auto index = i + j * dimensions.x() + k * dimensions.x() * dimensions.y();
 
-         return input[index];
+         return input[ index ];
       };
 
       auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j, Index k )
@@ -152,12 +173,12 @@ public:
          result[ index ] = resultValue;
       };
 
-      Launcher::exec< Index, Real >( dimensions,
-                                     kernelSize,
-                                     std::forward< decltype( fetchData ) >( fetchData ),
-                                     std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
-                                     std::forward< decltype( fetchKernel ) >( fetchKernel ),
-                                     std::forward< decltype( convolve ) >( convolve ),
-                                     std::forward< decltype( store ) >( store ) );
+      ConvolutionLauncher::execute< Index, Real >( dimensions,
+                                                   kernelSize,
+                                                   std::forward< decltype( fetchData ) >( fetchData ),
+                                                   std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
+                                                   std::forward< decltype( fetchKernel ) >( fetchKernel ),
+                                                   std::forward< decltype( convolve ) >( convolve ),
+                                                   std::forward< decltype( store ) >( store ) );
    }
 };
diff --git a/src/Benchmarks/Convolution/support/Launcher.h b/src/Benchmarks/Convolution/support/Launcher.h
deleted file mode 100644
index c336dc8e5..000000000
--- a/src/Benchmarks/Convolution/support/Launcher.h
+++ /dev/null
@@ -1,136 +0,0 @@
-
-#pragma once
-
-#include <TNL/Containers/StaticVector.h>
-#include <TNL/Cuda/KernelLaunch.h>
-
-template< int Dimension, typename Device >
-struct Convolution {
-   template< typename Index >
-   using Vector = TNL::Containers::StaticVector< 1, Index >;
-
-   template< typename Index, typename Real >
-   static void
-   setup(TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize);
-};
-
-template< int Dimension, typename Device >
-struct Launcher;
-
-template<>
-struct Launcher< 1, TNL::Devices::Cuda >
-{
-public:
-   using Vector = TNL::Containers::StaticVector< 1, int >;
-   using ConvolutionKernel = Convolution< 1, TNL::Devices::Cuda >;
-
-   template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store >
-   static inline void
-   exec( const Vector& dimensions,
-         const Vector& kernelSize,
-         FetchData&& fetchData,
-         FetchBoundary&& fetchBoundary,
-         FetchKernel&& fetchKernel,
-         Convolve&& convolve,
-         Store&& store )
-   {
-      TNL::Cuda::LaunchConfiguration launchConfig;
-
-      ConvolutionKernel::setup<Index, Real>(launchConfig, dimensions, kernelSize);
-
-      constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
-
-      TNL::Cuda::launchKernel< true >( kernel,
-                                       0,
-                                       launchConfig,
-                                       kernelSize.x(),
-                                       dimensions.x(),
-                                       fetchData,
-                                       fetchBoundary,
-                                       fetchKernel,
-                                       convolve,
-                                       store );
-   }
-};
-
-template<>
-struct Launcher< 2, TNL::Devices::Cuda >
-{
-public:
-   using Vector = TNL::Containers::StaticVector< 2, int >;
-   using ConvolutionKernel = Convolution< 2, TNL::Devices::Cuda >;
-
-   template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store >
-   static inline void
-   exec( const Vector& dimensions,
-         const Vector& kernelSize,
-         FetchData&& fetchData,
-         FetchBoundary&& fetchBoundary,
-         FetchKernel&& fetchKernel,
-         Convolve&& convolve,
-         Store&& store )
-   {
-      TNL::Cuda::LaunchConfiguration launchConfig;
-
-      ConvolutionKernel::setup<Index, Real>(launchConfig, dimensions, kernelSize);
-
-      constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
-
-      TNL::Cuda::launchKernel< true >( kernel,
-                                       0,
-                                       launchConfig,
-                                       kernelSize.x(),
-                                       kernelSize.y(),
-                                       dimensions.x(),
-                                       dimensions.y(),
-                                       fetchData,
-                                       fetchBoundary,
-                                       fetchKernel,
-                                       convolve,
-                                       store );
-   }
-};
-
-template<>
-struct Launcher< 3, TNL::Devices::Cuda >
-{
-public:
-   using Vector = TNL::Containers::StaticVector< 3, int >;
-   using ConvolutionKernel = Convolution< 3, TNL::Devices::Cuda >;
-
-   template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store >
-   static inline void
-   exec( const Vector& dimensions,
-         const Vector& kernelSize,
-         FetchData&& fetchData,
-         FetchBoundary&& fetchBoundary,
-         FetchKernel&& fetchKernel,
-         Convolve&& convolve,
-         Store&& store )
-   {
-      const Index sizeX = dimensions.x();
-      const Index sizeY = dimensions.y();
-      const Index sizeZ = dimensions.z();
-
-      TNL::Cuda::LaunchConfiguration launchConfig;
-
-      ConvolutionKernel::setup<Index, Real>(launchConfig, dimensions, kernelSize);
-
-      constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
-
-      TNL::Cuda::launchKernel< true >( kernel,
-                                       0,
-                                       launchConfig,
-                                       kernelSize.x(),
-                                       kernelSize.y(),
-                                       kernelSize.z(),
-                                       dimensions.x(),
-                                       dimensions.y(),
-                                       dimensions.z(),
-                                       fetchData,
-                                       fetchBoundary,
-                                       fetchKernel,
-                                       convolve,
-                                       store );
-   }
-};
diff --git a/src/Benchmarks/Convolution/support/Solver.h b/src/Benchmarks/Convolution/support/Solver.h
index a6b1d2c91..c0c0b31e2 100644
--- a/src/Benchmarks/Convolution/support/Solver.h
+++ b/src/Benchmarks/Convolution/support/Solver.h
@@ -6,8 +6,6 @@
 #include <TNL/Containers/StaticVector.h>
 #include <TNL/Containers/Array.h>
 
-#include "Launcher.h"
-
 template< int Dimension, typename Device >
 class Solver
 {
-- 
GitLab


From 5d0dac3964d679ee48f2ed5c47b9c91bed52ed60 Mon Sep 17 00:00:00 2001
From: hayeuyur <hayeuyur@fit.cvut.cz>
Date: Sun, 3 Apr 2022 15:04:57 +0200
Subject: [PATCH 10/19] Implement shared data and kernel kernel

---
 src/Benchmarks/Convolution/CMakeLists.txt     |   8 +
 .../Convolution/kernels/sharedData.h          |   4 +-
 .../Convolution/kernels/sharedDataAndKernel.h | 577 ++++++++++++++++++
 .../Convolution/support/Benchmark.h           |  13 +-
 .../Convolution/support/DummyBenchmark.h      |  70 +--
 src/Benchmarks/Convolution/support/Solver.h   |  11 +-
 6 files changed, 610 insertions(+), 73 deletions(-)
 create mode 100644 src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h

diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt
index b51e7de7d..0569e1013 100644
--- a/src/Benchmarks/Convolution/CMakeLists.txt
+++ b/src/Benchmarks/Convolution/CMakeLists.txt
@@ -46,3 +46,11 @@ GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/shar
 GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedData.h")
 GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedData.h")
 GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedData.h")
+
+GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/sharedDataAndKernel.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/sharedDataAndKernel.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/sharedDataAndKernel.h")
+
+GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h")
+GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h")
diff --git a/src/Benchmarks/Convolution/kernels/sharedData.h b/src/Benchmarks/Convolution/kernels/sharedData.h
index 0c0e0a2a6..dc173703e 100644
--- a/src/Benchmarks/Convolution/kernels/sharedData.h
+++ b/src/Benchmarks/Convolution/kernels/sharedData.h
@@ -162,7 +162,7 @@ convolution2D( Index kernelWidth,
    Real result = 0;
 
    for( Index j = 0; j <= radiusY; j++ ) {
-      Index align = ( j + threadIdx.y ) * blockDim.y;
+      Index align = ( j + threadIdx.y ) * blockDim.x;
 
       for( Index i = 0; i <= radiusX; i++ ) {
          Index index = i + threadIdx.x + align;
@@ -330,7 +330,7 @@ convolution3D( Index kernelWidth,
       Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x;
 
       for( Index j = 0; j <= radiusY; j++ ) {
-         Index xAlign = ( j + threadIdx.y ) * blockDim.y;
+         Index xAlign = ( j + threadIdx.y ) * blockDim.x;
 
          for( Index i = 0; i <= radiusX; i++ ) {
             Index index = i + threadIdx.x + xAlign + xyAlign;
diff --git a/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h b/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h
new file mode 100644
index 000000000..8cbc26aa4
--- /dev/null
+++ b/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h
@@ -0,0 +1,577 @@
+#pragma once
+
+#ifdef HAVE_CUDA
+
+   #include <TNL/Devices/Cuda.h>
+   #include <TNL/Containers/StaticVector.h>
+   #include <TNL/Cuda/LaunchHelpers.h>
+   #include <TNL/Cuda/SharedMemory.h>
+
+/**
+ * This method stores kernel and data in the shared memory to reduce amount of loads.
+ *
+ * We can calculate the size of shared memory needed the next way:
+ * 1. We need to store in shared memory:
+ *      * for 1D -> (2 * kernelWidth) - 1 < 2 * kernelWidth
+ *      * for 2D -> ( (2 * kernelWidth) - 1 ) * ( (2 * kernelHeight) - 1 ) < 4 * kernelWidth * kernelHeight
+ *      * for 3D -> ( (2 * kernelWidth) - 1 ) * ( (2 * kernelHeight) - 1 ) * ( (2 * kernelDepth) - 1 ) < 8 * kernelWidth *
+ * kernelHeight * kernelDepth
+ * 2. We take into account, that the maximal block size is 1024, so the maximum volume of kernel is 1024.
+ *    Then the maximal amount of shared memory is:
+ *      * for 1D -> 2 * 1024 -> 2048 elements (Note, that even if we take long double (16B) we still can fit in the shared
+ * memory)
+ *      * for 2D -> 4 * 1024 -> 4096 elements
+ *      * for 3D -> 8 * 1024 -> 8192 elements (Note, that if double takes 8 bytes, then we can't fit tile into shared memory,
+ * because we have 64 KB of data)
+ * 3. The last thing is, that even if we take 1D and 2D case we have enough space to store 1024 kernel element.
+ *    Then the maximal amount of shared memory is:
+ *      * for 1D -> 3 * 1024 -> can use long double, double, float
+ *      * for 2D -> 5 * 1024 -> can use double, float
+ *      * for 3D -> 9 * 1024 -> can use float
+ */
+
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename FetchKernel,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution1D( Index kernelWidth,
+               Index endX,
+               FetchData fetchData,
+               FetchBoundary fetchBoundary,
+               FetchKernel fetchKernel,
+               Convolve convolve,
+               Store store )
+{
+   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+   if( ix >= endX )
+      return;
+
+   Index kernelOffset = 2 * kernelWidth;
+
+   Real* data = TNL::Cuda::getSharedMemory< Real >();
+   Real* kernel = data + kernelOffset;
+
+   Index radius = kernelWidth >> 1;
+
+   // Left
+   Index lhs = ix - radius;
+
+   if( lhs < 0 ) {
+      data[ threadIdx.x ] = fetchBoundary( lhs );
+   }
+   else {
+      data[ threadIdx.x ] = fetchData( lhs );
+   }
+
+   // Right
+   Index rhs = ix + radius;
+
+   if( rhs >= endX ) {
+      data[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs );
+   }
+   else {
+      data[ threadIdx.x + blockDim.x ] = fetchData( rhs );
+   }
+
+   kernel[ threadIdx.x ] = fetchKernel( threadIdx.x );
+
+   __syncthreads();
+
+   Real result = 0;
+
+   #pragma unroll
+   for( Index i = 0; i < kernelWidth; i++ ) {
+      Index elementIndex = i + threadIdx.x;
+
+      result = convolve( result, data[ elementIndex ], kernel[ i ] );
+   }
+
+   store( ix, result );
+}
+
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename FetchKernel,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution2D( Index kernelWidth,
+               Index kernelHeight,
+               Index endX,
+               Index endY,
+               FetchData fetchData,
+               FetchBoundary fetchBoundary,
+               FetchKernel fetchKernel,
+               Convolve convolve,
+               Store store )
+{
+   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
+   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+   if( ix >= endX || iy >= endY )
+      return;
+
+   Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 );
+
+   Real* data = TNL::Cuda::getSharedMemory< Real >();
+   Real* kernel = data + kernelOffset;
+
+   Index radiusY = kernelHeight >> 1;
+   Index radiusX = kernelWidth >> 1;
+
+   Index x, y, index;
+
+   // Top Left
+   x = ix - radiusX;
+   y = iy - radiusY;
+
+   index = threadIdx.x + threadIdx.y * blockDim.x;
+
+   kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y );
+
+   if( x < 0 || y < 0 ) {
+      data[ index ] = fetchBoundary( x, y );
+   }
+   else {
+      data[ index ] = fetchData( x, y );
+   }
+
+   // Top right
+   x = ix + radiusX;
+   y = iy - radiusY;
+
+   index = radiusX + threadIdx.x + threadIdx.y * blockDim.x;
+
+   if( x >= endX || y < 0 ) {
+      data[ index ] = fetchBoundary( x, y );
+   }
+   else {
+      data[ index ] = fetchData( x, y );
+   }
+
+   // Bottom Left
+   x = ix - radiusX;
+   y = iy + radiusY;
+
+   index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x;
+
+   if( x < 0 || y >= endY ) {
+      data[ index ] = fetchBoundary( x, y );
+   }
+   else {
+      data[ index ] = fetchData( x, y );
+   }
+
+   // Bottom Right
+   x = ix + radiusX;
+   y = iy + radiusY;
+
+   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x;
+
+   if( x >= endX || y >= endY ) {
+      data[ index ] = fetchBoundary( x, y );
+   }
+   else {
+      data[ index ] = fetchData( x, y );
+   }
+
+   __syncthreads();
+
+   Real result = 0;
+
+   #pragma unroll
+   for( Index j = 0; j <= radiusY; j++ ) {
+      Index elementAlign = ( j + threadIdx.y ) * blockDim.x;
+      Index kernelAlign = j * blockDim.x;
+
+   #pragma unroll
+      for( Index i = 0; i <= radiusX; i++ ) {
+         Index elementIndex = i + threadIdx.x + elementAlign;
+         Index kernelIndex = i + kernelAlign;
+
+         result = convolve( result, data[ elementIndex ], kernel[ kernelIndex ] );
+      }
+   }
+
+   store( ix, iy, result );
+}
+
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename FetchKernel,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution3D( Index kernelWidth,
+               Index kernelHeight,
+               Index kernelDepth,
+               Index endX,
+               Index endY,
+               Index endZ,
+               FetchData fetchData,
+               FetchBoundary fetchBoundary,
+               FetchKernel fetchKernel,
+               Convolve convolve,
+               Store store )
+{
+   Index iz = threadIdx.z + blockIdx.z * blockDim.z;
+   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
+   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+   if( ix >= endX || iy >= endY || iz >= endZ )
+      return;
+
+   Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ) * ( 2 * kernelDepth - 1 );
+
+   Real* data = TNL::Cuda::getSharedMemory< Real >();
+   Real* kernel = data + kernelOffset;
+
+   Index radiusZ = kernelDepth >> 1;
+   Index radiusY = kernelHeight >> 1;
+   Index radiusX = kernelWidth >> 1;
+
+   Index x, y, z, index;
+
+   // Z: 0 Y: 0 X: 0
+   x = ix - radiusX;
+   y = iy - radiusY;
+   z = iz - radiusZ;
+
+   index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
+
+   kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z );
+
+   if( x < 0 || y < 0 || z < 0 ) {
+      data[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      data[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 0 Y: 0 X: 1
+   x = ix + radiusX;
+   y = iy - radiusY;
+   z = iz - radiusZ;
+
+   index = radiusX + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
+
+   if( x >= endX || y < 0 || z < 0 ) {
+      data[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      data[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 0 Y: 1 X: 0
+   x = ix - radiusX;
+   y = iy + radiusY;
+   z = iz - radiusZ;
+
+   index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
+
+   if( x < 0 || y >= endY || z < 0 ) {
+      data[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      data[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 1 Y: 0 X: 0
+   x = ix - radiusX;
+   y = iy - radiusY;
+   z = iz + radiusZ;
+
+   index = threadIdx.x + threadIdx.y * blockDim.x + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
+
+   if( x < 0 || y < 0 || z >= endZ ) {
+      data[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      data[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 0 Y: 1 X: 1
+   x = ix + radiusX;
+   y = iy + radiusY;
+   z = iz - radiusZ;
+
+   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
+
+   if( x >= endX || y >= endY || z < 0 ) {
+      data[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      data[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 1 Y: 0 X: 1
+   x = ix + radiusX;
+   y = iy - radiusY;
+   z = iz + radiusZ;
+
+   index = radiusX + threadIdx.x + threadIdx.y * blockDim.x + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
+
+   if( x >= endX || y < 0 || z >= endZ ) {
+      data[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      data[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 1 Y: 1 X: 0
+   x = ix - radiusX;
+   y = iy + radiusY;
+   z = iz + radiusZ;
+
+   index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
+
+   if( x < 0 || y >= endY || z >= endZ ) {
+      data[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      data[ index ] = fetchData( x, y, z );
+   }
+
+   // Z: 1 Y: 1 X: 1
+   x = ix + radiusX;
+   y = iy + radiusY;
+   z = iz + radiusZ;
+
+   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
+
+   if( x >= endX || y >= endY || z >= endZ ) {
+      data[ index ] = fetchBoundary( x, y, z );
+   }
+   else {
+      data[ index ] = fetchData( x, y, z );
+   }
+
+   __syncthreads();
+
+   Real result = 0;
+
+   for( Index k = 0; k <= radiusZ; k++ ) {
+      Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x;
+      Index xyKernelAlign = k * blockDim.x * blockDim.y;
+
+      for( Index j = 0; j <= radiusY; j++ ) {
+         Index xAlign = ( j + threadIdx.y ) * blockDim.x;
+         Index xKernelAlign = j * blockDim.x;
+
+         for( Index i = 0; i <= radiusX; i++ ) {
+            Index elementIndex = i + threadIdx.x + xAlign + xyAlign;
+            Index kernelIndex = i + xKernelAlign + xyKernelAlign;
+
+            result = convolve( result, data[ elementIndex ], kernel[ kernelIndex ] );
+         }
+      }
+   }
+
+   store( ix, iy, iz, result );
+}
+
+template< int Dimension, typename Device >  // primary template, declared only; explicit specializations for Dimension 1, 2 and 3 on TNL::Devices::Cuda follow
+struct Convolution;
+
+template<>
+struct Convolution< 1, TNL::Devices::Cuda >  // 1D CUDA specialization: prepares a launch configuration and launches convolution1D
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 1, Index >;  // one-component vector type used for both `dimensions` and `kernelSize`
+
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )  // writes blockSize.x, gridSize.x and dynamicSharedMemorySize = ( kernelSize.x() + ( 2 * kernelSize.x() - 1 ) ) * sizeof( Real )
+   {
+      Index kernelElementCount = 1;
+
+      for( Index i = 0; i < kernelSize.getSize(); i++ )
+         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
+
+      configuration.dynamicSharedMemorySize = ( kernelSize.x() + kernelElementCount ) * sizeof( Real );
+
+      configuration.blockSize.x = kernelSize.x();  // block width is set equal to the kernel width
+      configuration.gridSize.x =  // block count reported by getNumberOfBlocks for dimensions.x(), capped by getMaxGridSize()
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+   }
+
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,  // entry point: calls setup(), then launches convolution1D with the supplied callables
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;  // kernel instantiation for these template arguments
+
+      TNL::Cuda::launchKernel< true >(  // NOTE(review): `true` and the literal 0 are presumably the synchronous flag and the stream -- confirm against TNL::Cuda::launchKernel
+         kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store );
+   };  // NOTE(review): the ';' after the function body is redundant
+};
+
+template<>
+struct Convolution< 2, TNL::Devices::Cuda >  // 2D CUDA specialization: prepares a launch configuration and launches convolution2D
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 2, Index >;  // two-component vector type used for both `dimensions` and `kernelSize`
+
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )  // writes blockSize.{x,y}, gridSize.{x,y} and dynamicSharedMemorySize
+   {
+      Index kernelElementCount = 1;  // becomes the product of ( 2 * kernelSize[ i ] - 1 ) over both axes
+      Index kernelVolume = 1;  // becomes the product of kernelSize[ i ] over both axes
+
+      for( Index i = 0; i < kernelSize.getSize(); i++ ) {
+         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
+         kernelVolume *= kernelSize[ i ];
+      }
+
+      configuration.dynamicSharedMemorySize = ( kernelVolume + kernelElementCount ) * sizeof( Real );  // bytes for kernelVolume + kernelElementCount values of type Real
+
+      configuration.blockSize.x = kernelSize.x();  // block extent is set equal to the kernel extent on each axis
+      configuration.blockSize.y = kernelSize.y();
+
+      configuration.gridSize.x =  // per-axis block count from getNumberOfBlocks, capped by getMaxGridSize()
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+      configuration.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
+   }
+
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,  // entry point: calls setup(), then launches convolution2D with the supplied callables
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;  // kernel instantiation for these template arguments
+
+      TNL::Cuda::launchKernel< true >( kernel,  // NOTE(review): `true` and the literal 0 are presumably the synchronous flag and the stream -- confirm against TNL::Cuda::launchKernel
+                                       0,
+                                       configuration,
+                                       kernelSize.x(),  // kernel extents are passed first, then the data extents, then the callables
+                                       kernelSize.y(),
+                                       dimensions.x(),
+                                       dimensions.y(),
+                                       fetchData,
+                                       fetchBoundary,
+                                       fetchKernel,
+                                       convolve,
+                                       store );
+   };  // NOTE(review): the ';' after the function body is redundant
+};
+
+template<>
+struct Convolution< 3, TNL::Devices::Cuda >  // 3D CUDA specialization: prepares a launch configuration and launches convolution3D
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 3, Index >;  // three-component vector type used for both `dimensions` and `kernelSize`
+
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )  // writes blockSize.{x,y,z}, gridSize.x, gridSize.y and dynamicSharedMemorySize
+   {
+      Index kernelElementCount = 1;  // becomes the product of ( 2 * kernelSize[ i ] - 1 ) over the three axes
+      Index kernelVolume = 1;  // becomes the product of kernelSize[ i ] over the three axes
+
+      for( Index i = 0; i < kernelSize.getSize(); i++ ) {
+         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
+         kernelVolume *= kernelSize[ i ];
+      }
+
+      configuration.dynamicSharedMemorySize = ( kernelVolume + kernelElementCount ) * sizeof( Real );  // bytes for kernelVolume + kernelElementCount values of type Real
+
+      configuration.blockSize.x = kernelSize.x();  // block extent is set equal to the kernel extent on each axis
+      configuration.blockSize.y = kernelSize.y();
+      configuration.blockSize.z = kernelSize.z();
+
+      configuration.gridSize.x =  // NOTE(review): below, gridSize.y is assigned twice (the second time from dimensions.z()) and gridSize.z is never assigned -- the last assignment looks like it was meant for gridSize.z
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+      configuration.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
+      configuration.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
+   }
+
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename FetchKernel,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,  // entry point: calls setup(), then launches convolution3D with the supplied callables
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            FetchKernel&& fetchKernel,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;  // kernel instantiation for these template arguments
+
+      TNL::Cuda::launchKernel< true >( kernel,  // NOTE(review): `true` and the literal 0 are presumably the synchronous flag and the stream -- confirm against TNL::Cuda::launchKernel
+                                       0,
+                                       configuration,
+                                       kernelSize.x(),  // kernel extents are passed first, then the data extents, then the callables
+                                       kernelSize.y(),
+                                       kernelSize.z(),
+                                       dimensions.x(),
+                                       dimensions.y(),
+                                       dimensions.z(),
+                                       fetchData,
+                                       fetchBoundary,
+                                       fetchKernel,
+                                       convolve,
+                                       store );
+   };  // NOTE(review): the ';' after the function body is redundant
+};
+
+#endif
diff --git a/src/Benchmarks/Convolution/support/Benchmark.h b/src/Benchmarks/Convolution/support/Benchmark.h
index ce1b91b23..b489000d4 100644
--- a/src/Benchmarks/Convolution/support/Benchmark.h
+++ b/src/Benchmarks/Convolution/support/Benchmark.h
@@ -19,12 +19,12 @@ public:
    void
    run( const TNL::Config::ParameterContainer& parameters ) const
    {
-      if( ! TNL::Devices::Host::setup( parameters ) || ! TNL::Devices::Cuda::setup( parameters ) )
+      if( ! TNL::Devices::Cuda::setup( parameters ) )
          return;
 
       const TNL::String logFileName = parameters.getParameter< TNL::String >( "log-file" );
       const TNL::String outputMode = parameters.getParameter< TNL::String >( "output-mode" );
-      const TNL::String device = parameters.getParameter< TNL::String >( "device" );
+   
 
       const int verbose = parameters.getParameter< int >( "verbose" );
       const int loops = parameters.getParameter< int >( "loops" );
@@ -58,19 +58,10 @@ public:
       config.addEntryEnum( "append" );
       config.addEntryEnum( "overwrite" );
 
-      config.addEntry< TNL::String >( "device", "Device the computation will run on.", "cuda" );
-      config.addEntryEnum< TNL::String >( "all" );
-      config.addEntryEnum< TNL::String >( "host" );
-
-#ifdef HAVE_CUDA
-      config.addEntryEnum< TNL::String >( "cuda" );
-#endif
-
       config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
       config.addEntry< int >( "verbose", "Verbose mode.", 1 );
 
       config.addDelimiter( "Device settings:" );
-      TNL::Devices::Host::configSetup( config );
 
 #ifdef HAVE_CUDA
       TNL::Devices::Cuda::configSetup( config );
diff --git a/src/Benchmarks/Convolution/support/DummyBenchmark.h b/src/Benchmarks/Convolution/support/DummyBenchmark.h
index 005096d12..9b44d53dd 100644
--- a/src/Benchmarks/Convolution/support/DummyBenchmark.h
+++ b/src/Benchmarks/Convolution/support/DummyBenchmark.h
@@ -23,77 +23,52 @@ public:
    virtual void
    start( TNLBenchmark& benchmark, const TNL::Config::ParameterContainer& parameters ) const override
    {
-      Vector start;
-      Vector end;
+      Vector dimension;
       Vector minKernelSize;
       Vector maxKernelSize;
 
       for( int i = 0; i < Dimension; i++ ) {
-         start[ i ] = parameters.getParameter< int >( minDimensionIds[ i ] );
-         end[ i ] = parameters.getParameter< int >( maxDimensionIds[ i ] );
+         dimension[ i ] = parameters.getParameter< int >( dimensionIds[ i ] );
          minKernelSize[ i ] = parameters.getParameter< int >( minKernelSizeIds[ i ] );
          maxKernelSize[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] );
 
-         TNL_ASSERT_GT( start[ i ], 1, "Start dimension must be positive integer" );
-         TNL_ASSERT_GT( end[ i ], start[ i ], "End dimension must be greater than start dimension" );
-
          TNL_ASSERT_GE( minKernelSize[ i ], 1, "Minimal kernel size must be a positive number" );
          TNL_ASSERT_EQ( minKernelSize[ i ] % 2, 1, "Minimal kernel size must be odd" );
-         TNL_ASSERT_GT( maxKernelSize[ i ], minKernelSize[ i ], "End dimension must be greater than start dimension" );
-         TNL_ASSERT_GT( end[ i ], start[ i ], "End kernel size must be greater than start kernel size" );
+         TNL_ASSERT_GT( maxKernelSize[ i ], minKernelSize[ i ], "End kernel size must be greater than start kernel size" );
       }
 
-      int dimensionStep = parameters.getParameter< int >( "dimension-step" );
       int kernelStep = parameters.getParameter< int >( "kernel-step" );
 
-      TNL_ASSERT_GT( dimensionStep, 1, "Dimension step must be a positive number" );
       TNL_ASSERT_GT( kernelStep, 0, "Kernel step must be a positive number" );
       TNL_ASSERT_EQ( kernelStep % 2, 0, "Kernel step must be even" );
 
-      TNL::String id = parameters.getParameter<TNL::String>("id");
+      TNL::String id = parameters.getParameter< TNL::String >( "id" );
 
-      time( id, benchmark, start, end, dimensionStep, minKernelSize, maxKernelSize, kernelStep );
+      time( id, benchmark, dimension, minKernelSize, maxKernelSize, kernelStep );
    }
 
    virtual void
    time( const TNL::String& id,
          TNLBenchmark& benchmark,
-         const Vector& minDimension,
-         const Vector& maxDimension,
-         const int dimensionStep,
+         const Vector& dimension,
          const Vector& minKernelSize,
          const Vector& maxKernelSize,
          const int kernelStep ) const
    {
-      Vector currentDimension = minDimension;
-      Vector currentKernelSize;
+      Vector currentKernelSize = minKernelSize;
 
       do {
-         currentKernelSize = minKernelSize;
-
-         do {
-            timeConvolution( id, benchmark, currentDimension, currentKernelSize );
-
-            currentKernelSize[ 0 ] += kernelStep;
-
-            for( size_t i = 0; i < currentKernelSize.getSize() - 1; i++ ) {
-               if( currentKernelSize[ i ] >= maxKernelSize[ i ] ) {
-                  currentKernelSize[ i ] = minKernelSize[ i ];
-                  currentKernelSize[ i + 1 ] += kernelStep;
-               }
-            }
-         } while( currentKernelSize < maxKernelSize );
+         timeConvolution( id, benchmark, dimension, currentKernelSize );
 
-         currentDimension[ 0 ] *= dimensionStep;
+         currentKernelSize[ 0 ] += kernelStep;
 
-         for( size_t i = 0; i < currentDimension.getSize() - 1; i++ ) {
-            if( currentDimension[ i ] >= maxDimension[ i ] ) {
-               currentDimension[ i ] = minDimension[ i ];
-               currentDimension[ i + 1 ] *= dimensionStep;
+         for( size_t i = 0; i < currentKernelSize.getSize() - 1; i++ ) {
+            if( currentKernelSize[ i ] >= maxKernelSize[ i ] ) {
+               currentKernelSize[ i ] = minKernelSize[ i ];
+               currentKernelSize[ i + 1 ] += kernelStep;
             }
          }
-
-      } while( currentDimension < maxDimension );
+      } while( currentKernelSize < maxKernelSize );
    }
 
    void
@@ -101,7 +76,7 @@ public:
    {
       auto device = TNL::getType< Device >();
 
-      typename TNLBenchmark::MetadataColumns columns = {{ "id", id }};
+      typename TNLBenchmark::MetadataColumns columns = { { "id", id } };
 
       size_t elementsCount = 1;
       size_t kernelElementsCount = 1;
@@ -110,8 +85,8 @@ public:
          elementsCount *= dimension[ i ];
          kernelElementsCount *= kernelSize[ i ];
 
-         columns.push_back( { dimensionIds[ i ], TNL::convertToString(dimension[ i ]) } );
-         columns.push_back( { kernelSizeIds[ i ], TNL::convertToString(kernelSize[ i ]) } );
+         columns.push_back( { dimensionIds[ i ], TNL::convertToString( dimension[ i ] ) } );
+         columns.push_back( { kernelSizeIds[ i ], TNL::convertToString( kernelSize[ i ] ) } );
       }
 
       benchmark.setDatasetSize( ( elementsCount * 4 ) / 1.e9, 1.0 );
@@ -134,10 +109,10 @@ public:
 
       auto measure = [ & ]()
       {
-         DummyTask<int, float, Dimension, Device>::exec(dimension, kernelSize, inputView, resultView, kernelView);
+         DummyTask< int, float, Dimension, Device >::exec( dimension, kernelSize, inputView, resultView, kernelView );
       };
 
-      benchmark.template time<Device>( device, measure );
+      benchmark.template time< Device >( device, measure );
    }
 
    TNL::Config::ConfigDescription
@@ -148,12 +123,7 @@ public:
       config.addDelimiter( "Grid dimension settings:" );
 
       for( int i = 0; i < Dimension; i++ )
-         config.addEntry< int >( minDimensionIds[ i ], minDimensionIds[ i ], 16 );
-
-      for( int i = 0; i < Dimension; i++ )
-         config.addEntry< int >( maxDimensionIds[ i ], maxDimensionIds[ i ], 128 );
-
-      config.addEntry< int >( "dimension-step", "Step of kernel increase by which dimension is multiplied (must be even)", 2 );
+         config.addEntry< int >( dimensionIds[ i ], dimensionIds[ i ], 16 );
 
       config.addDelimiter( "Kernel settings:" );
 
diff --git a/src/Benchmarks/Convolution/support/Solver.h b/src/Benchmarks/Convolution/support/Solver.h
index c0c0b31e2..3fd56fb02 100644
--- a/src/Benchmarks/Convolution/support/Solver.h
+++ b/src/Benchmarks/Convolution/support/Solver.h
@@ -13,7 +13,7 @@ public:
    void
    solve( const TNL::Config::ParameterContainer& parameters ) const
    {
-      if( ! TNL::Devices::Host::setup( parameters ) || ! TNL::Devices::Cuda::setup( parameters ) )
+      if( ! TNL::Devices::Cuda::setup( parameters ) )
          return;
 
       start( parameters );
@@ -30,16 +30,7 @@ public:
    {
       TNL::Config::ConfigDescription config;
 
-      config.addEntry< TNL::String >( "device", "Device the computation will run on.", "cuda" );
-      config.addEntryEnum< TNL::String >( "all" );
-      config.addEntryEnum< TNL::String >( "host" );
-
-#ifdef HAVE_CUDA
-      config.addEntryEnum< TNL::String >( "cuda" );
-#endif
-
       config.addDelimiter( "Device settings:" );
-      TNL::Devices::Host::configSetup( config );
 
 #ifdef HAVE_CUDA
       TNL::Devices::Cuda::configSetup( config );
-- 
GitLab


From 6860696cdb39ab8290a3300f65e683362b7b5e6a Mon Sep 17 00:00:00 2001
From: hayeuyur <hayeuyur@fit.cvut.cz>
Date: Mon, 4 Apr 2022 08:40:08 +0200
Subject: [PATCH 11/19] Fix convolution kernel execution

---
 src/Benchmarks/Convolution/kernels/naive.h    |  2 +-
 .../Convolution/kernels/sharedData.h          | 85 +++++++++---------
 .../Convolution/kernels/sharedDataAndKernel.h | 87 ++++++++++---------
 .../Convolution/kernels/sharedKernel.h        | 28 +++---
 .../Convolution/support/DummySolver.h         |  9 +-
 .../Convolution/support/DummyTask.h           |  8 +-
 6 files changed, 116 insertions(+), 103 deletions(-)

diff --git a/src/Benchmarks/Convolution/kernels/naive.h b/src/Benchmarks/Convolution/kernels/naive.h
index 4326e0d74..5705deb04 100644
--- a/src/Benchmarks/Convolution/kernels/naive.h
+++ b/src/Benchmarks/Convolution/kernels/naive.h
@@ -305,7 +305,7 @@ public:
          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
       configuration.gridSize.y =
          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
-      configuration.gridSize.y =
+      configuration.gridSize.z =
          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
    }
 
diff --git a/src/Benchmarks/Convolution/kernels/sharedData.h b/src/Benchmarks/Convolution/kernels/sharedData.h
index dc173703e..8abc82fda 100644
--- a/src/Benchmarks/Convolution/kernels/sharedData.h
+++ b/src/Benchmarks/Convolution/kernels/sharedData.h
@@ -33,16 +33,13 @@ convolution1D( Index kernelWidth,
 {
    Index ix = threadIdx.x + blockIdx.x * blockDim.x;
 
-   if( ix >= endX )
-      return;
-
    Real* shared = TNL::Cuda::getSharedMemory< Real >();
    Index radius = kernelWidth >> 1;
 
    // Left
    Index lhs = ix - radius;
 
-   if( lhs < 0 ) {
+   if( lhs < 0 || lhs >= endX ) {
       shared[ threadIdx.x ] = fetchBoundary( lhs );
    }
    else {
@@ -52,7 +49,7 @@ convolution1D( Index kernelWidth,
    // Right
    Index rhs = ix + radius;
 
-   if( rhs >= endX ) {
+   if( rhs < 0 || rhs >= endX ) {
       shared[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs );
    }
    else {
@@ -61,6 +58,9 @@ convolution1D( Index kernelWidth,
 
    __syncthreads();
 
+   if( ix >= endX )
+      return;
+
    Real result = 0;
 
    #pragma unroll
@@ -95,9 +95,6 @@ convolution2D( Index kernelWidth,
    Index iy = threadIdx.y + blockIdx.y * blockDim.y;
    Index ix = threadIdx.x + blockIdx.x * blockDim.x;
 
-   if( ix >= endX || iy >= endY )
-      return;
-
    Real* shared = TNL::Cuda::getSharedMemory< Real >();
 
    Index radiusY = kernelHeight >> 1;
@@ -105,13 +102,16 @@ convolution2D( Index kernelWidth,
 
    Index x, y, index;
 
+   Index kernelHorizontalPadding = kernelWidth == 1 ? 0 : kernelWidth;
+   Index kernelVerticalPadding = kernelHeight == 1 ? 0 : kernelHeight;
+
    // Top Left
    x = ix - radiusX;
    y = iy - radiusY;
 
    index = threadIdx.x + threadIdx.y * blockDim.x;
 
-   if( x < 0 || y < 0 ) {
+   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
       shared[ index ] = fetchBoundary( x, y );
    }
    else {
@@ -122,9 +122,9 @@ convolution2D( Index kernelWidth,
    x = ix + radiusX;
    y = iy - radiusY;
 
-   index = radiusX + threadIdx.x + threadIdx.y * blockDim.x;
+   index = kernelHorizontalPadding + threadIdx.x + threadIdx.y * blockDim.x;
 
-   if( x >= endX || y < 0 ) {
+   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
       shared[ index ] = fetchBoundary( x, y );
    }
    else {
@@ -135,9 +135,9 @@ convolution2D( Index kernelWidth,
    x = ix - radiusX;
    y = iy + radiusY;
 
-   index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x;
+   index = threadIdx.x + ( kernelVerticalPadding + threadIdx.y ) * blockDim.x;
 
-   if( x < 0 || y >= endY ) {
+   if(x < 0 || y < 0 || x >= endX || y >= endY ) {
       shared[ index ] = fetchBoundary( x, y );
    }
    else {
@@ -148,9 +148,9 @@ convolution2D( Index kernelWidth,
    x = ix + radiusX;
    y = iy + radiusY;
 
-   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x;
+   index = kernelHorizontalPadding + threadIdx.x + ( kernelVerticalPadding + threadIdx.y ) * blockDim.x;
 
-   if( x >= endX || y >= endY ) {
+   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
       shared[ index ] = fetchBoundary( x, y );
    }
    else {
@@ -159,12 +159,15 @@ convolution2D( Index kernelWidth,
 
    __syncthreads();
 
+   if( ix >= endX || iy >= endY )
+      return;
+
    Real result = 0;
 
-   for( Index j = 0; j <= radiusY; j++ ) {
+   for( Index j = 0; j < kernelHeight; j++ ) {
       Index align = ( j + threadIdx.y ) * blockDim.x;
 
-      for( Index i = 0; i <= radiusX; i++ ) {
+      for( Index i = 0; i < kernelWidth; i++ ) {
          Index index = i + threadIdx.x + align;
 
          result = convolve( result, shared[ index ], fetchKernel( i, j ) );
@@ -199,9 +202,6 @@ convolution3D( Index kernelWidth,
    Index iy = threadIdx.y + blockIdx.y * blockDim.y;
    Index ix = threadIdx.x + blockIdx.x * blockDim.x;
 
-   if( ix >= endX || iy >= endY || iz >= endZ )
-      return;
-
    Real* shared = TNL::Cuda::getSharedMemory< Real >();
 
    Index radiusZ = kernelDepth >> 1;
@@ -215,9 +215,9 @@ convolution3D( Index kernelWidth,
    y = iy - radiusY;
    z = iz - radiusZ;
 
-   index = threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
+   index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
 
-   if( x < 0 || y < 0 || z < 0 ) {
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
       shared[ index ] = fetchBoundary( x, y, z );
    }
    else {
@@ -229,9 +229,9 @@ convolution3D( Index kernelWidth,
    y = iy - radiusY;
    z = iz - radiusZ;
 
-   index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
+   index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
 
-   if( x >= endX || y < 0 || z < 0 ) {
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
       shared[ index ] = fetchBoundary( x, y, z );
    }
    else {
@@ -243,9 +243,9 @@ convolution3D( Index kernelWidth,
    y = iy + radiusY;
    z = iz - radiusZ;
 
-   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
+   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
 
-   if( x < 0 || y >= endY || z < 0 ) {
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
       shared[ index ] = fetchBoundary( x, y, z );
    }
    else {
@@ -257,9 +257,9 @@ convolution3D( Index kernelWidth,
    y = iy - radiusY;
    z = iz + radiusZ;
 
-   index = threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
+   index = threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;
 
-   if( x < 0 || y < 0 || z >= endZ ) {
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
       shared[ index ] = fetchBoundary( x, y, z );
    }
    else {
@@ -271,9 +271,9 @@ convolution3D( Index kernelWidth,
    y = iy + radiusY;
    z = iz - radiusZ;
 
-   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
+   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
 
-   if( x >= endX || y >= endY || z < 0 ) {
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
       shared[ index ] = fetchBoundary( x, y, z );
    }
    else {
@@ -285,9 +285,9 @@ convolution3D( Index kernelWidth,
    y = iy - radiusY;
    z = iz + radiusZ;
 
-   index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
+   index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;
 
-   if( x >= endX || y < 0 || z >= endZ ) {
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
       shared[ index ] = fetchBoundary( x, y, z );
    }
    else {
@@ -299,9 +299,9 @@ convolution3D( Index kernelWidth,
    y = iy + radiusY;
    z = iz + radiusZ;
 
-   index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
+   index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;
 
-   if( x < 0 || y >= endY || z >= endZ ) {
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
       shared[ index ] = fetchBoundary( x, y, z );
    }
    else {
@@ -313,9 +313,9 @@ convolution3D( Index kernelWidth,
    y = iy + radiusY;
    z = iz + radiusZ;
 
-   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
+   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;
 
-   if( x >= endX || y >= endY || z >= endZ ) {
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
       shared[ index ] = fetchBoundary( x, y, z );
    }
    else {
@@ -324,15 +324,18 @@ convolution3D( Index kernelWidth,
 
    __syncthreads();
 
+   if( ix >= endX || iy >= endY || iz >= endZ )
+      return;
+
    Real result = 0;
 
-   for( Index k = 0; k <= radiusZ; k++ ) {
+   for( Index k = 0; k < kernelDepth; k++ ) {
       Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x;
 
-      for( Index j = 0; j <= radiusY; j++ ) {
+      for( Index j = 0; j < kernelHeight; j++ ) {
          Index xAlign = ( j + threadIdx.y ) * blockDim.x;
 
-         for( Index i = 0; i <= radiusX; i++ ) {
+         for( Index i = 0; i < kernelWidth; i++ ) {
             Index index = i + threadIdx.x + xAlign + xyAlign;
 
             result = convolve( result, shared[ index ], fetchKernel( i, j, k ) );
@@ -360,7 +363,7 @@ public:
       Index kernelElementCount = 1;
 
       for( Index i = 0; i < kernelSize.getSize(); i++ )
-         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
+         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1 ;
 
       configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );
 
@@ -486,7 +489,7 @@ public:
          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
       configuration.gridSize.y =
          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
-      configuration.gridSize.y =
+      configuration.gridSize.z =
          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
    }
 
diff --git a/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h b/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h
index 8cbc26aa4..70e1d58a9 100644
--- a/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h
+++ b/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h
@@ -49,9 +49,6 @@ convolution1D( Index kernelWidth,
 {
    Index ix = threadIdx.x + blockIdx.x * blockDim.x;
 
-   if( ix >= endX )
-      return;
-
    Index kernelOffset = 2 * kernelWidth;
 
    Real* data = TNL::Cuda::getSharedMemory< Real >();
@@ -62,7 +59,7 @@ convolution1D( Index kernelWidth,
    // Left
    Index lhs = ix - radius;
 
-   if( lhs < 0 ) {
+   if( lhs < 0 || lhs >= endX ) {
       data[ threadIdx.x ] = fetchBoundary( lhs );
    }
    else {
@@ -72,7 +69,7 @@ convolution1D( Index kernelWidth,
    // Right
    Index rhs = ix + radius;
 
-   if( rhs >= endX ) {
+   if( rhs < 0 || rhs >= endX ) {
       data[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs );
    }
    else {
@@ -83,6 +80,9 @@ convolution1D( Index kernelWidth,
 
    __syncthreads();
 
+   if( ix >= endX )
+      return;
+
    Real result = 0;
 
    #pragma unroll
@@ -117,9 +117,6 @@ convolution2D( Index kernelWidth,
    Index iy = threadIdx.y + blockIdx.y * blockDim.y;
    Index ix = threadIdx.x + blockIdx.x * blockDim.x;
 
-   if( ix >= endX || iy >= endY )
-      return;
-
    Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 );
 
    Real* data = TNL::Cuda::getSharedMemory< Real >();
@@ -138,7 +135,7 @@ convolution2D( Index kernelWidth,
 
    kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y );
 
-   if( x < 0 || y < 0 ) {
+   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
       data[ index ] = fetchBoundary( x, y );
    }
    else {
@@ -149,9 +146,9 @@ convolution2D( Index kernelWidth,
    x = ix + radiusX;
    y = iy - radiusY;
 
-   index = radiusX + threadIdx.x + threadIdx.y * blockDim.x;
+   index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x;
 
-   if( x >= endX || y < 0 ) {
+   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
       data[ index ] = fetchBoundary( x, y );
    }
    else {
@@ -162,9 +159,9 @@ convolution2D( Index kernelWidth,
    x = ix - radiusX;
    y = iy + radiusY;
 
-   index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x;
+   index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x;
 
-   if( x < 0 || y >= endY ) {
+   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
       data[ index ] = fetchBoundary( x, y );
    }
    else {
@@ -175,9 +172,9 @@ convolution2D( Index kernelWidth,
    x = ix + radiusX;
    y = iy + radiusY;
 
-   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x;
+   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x;
 
-   if( x >= endX || y >= endY ) {
+   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
       data[ index ] = fetchBoundary( x, y );
    }
    else {
@@ -186,15 +183,18 @@ convolution2D( Index kernelWidth,
 
    __syncthreads();
 
+    if( ix >= endX || iy >= endY )
+      return;
+
    Real result = 0;
 
    #pragma unroll
-   for( Index j = 0; j <= radiusY; j++ ) {
+   for( Index j = 0; j < kernelHeight; j++ ) {
       Index elementAlign = ( j + threadIdx.y ) * blockDim.x;
       Index kernelAlign = j * blockDim.x;
 
    #pragma unroll
-      for( Index i = 0; i <= radiusX; i++ ) {
+      for( Index i = 0; i < kernelWidth; i++ ) {
          Index elementIndex = i + threadIdx.x + elementAlign;
          Index kernelIndex = i + kernelAlign;
 
@@ -230,9 +230,6 @@ convolution3D( Index kernelWidth,
    Index iy = threadIdx.y + blockIdx.y * blockDim.y;
    Index ix = threadIdx.x + blockIdx.x * blockDim.x;
 
-   if( ix >= endX || iy >= endY || iz >= endZ )
-      return;
-
    Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ) * ( 2 * kernelDepth - 1 );
 
    Real* data = TNL::Cuda::getSharedMemory< Real >();
@@ -249,11 +246,11 @@ convolution3D( Index kernelWidth,
    y = iy - radiusY;
    z = iz - radiusZ;
 
-   index = threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
+   index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
 
    kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z );
 
-   if( x < 0 || y < 0 || z < 0 ) {
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
       data[ index ] = fetchBoundary( x, y, z );
    }
    else {
@@ -265,9 +262,9 @@ convolution3D( Index kernelWidth,
    y = iy - radiusY;
    z = iz - radiusZ;
 
-   index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
+   index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
 
-   if( x >= endX || y < 0 || z < 0 ) {
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
       data[ index ] = fetchBoundary( x, y, z );
    }
    else {
@@ -279,9 +276,9 @@ convolution3D( Index kernelWidth,
    y = iy + radiusY;
    z = iz - radiusZ;
 
-   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
+   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
 
-   if( x < 0 || y >= endY || z < 0 ) {
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
       data[ index ] = fetchBoundary( x, y, z );
    }
    else {
@@ -293,9 +290,9 @@ convolution3D( Index kernelWidth,
    y = iy - radiusY;
    z = iz + radiusZ;
 
-   index = threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
+   index = threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;
 
-   if( x < 0 || y < 0 || z >= endZ ) {
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
       data[ index ] = fetchBoundary( x, y, z );
    }
    else {
@@ -307,9 +304,9 @@ convolution3D( Index kernelWidth,
    y = iy + radiusY;
    z = iz - radiusZ;
 
-   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y;
+   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
 
-   if( x >= endX || y >= endY || z < 0 ) {
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ) {
       data[ index ] = fetchBoundary( x, y, z );
    }
    else {
@@ -321,9 +318,9 @@ convolution3D( Index kernelWidth,
    y = iy - radiusY;
    z = iz + radiusZ;
 
-   index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
+   index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;
 
-   if( x >= endX || y < 0 || z >= endZ ) {
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
       data[ index ] = fetchBoundary( x, y, z );
    }
    else {
@@ -335,9 +332,9 @@ convolution3D( Index kernelWidth,
    y = iy + radiusY;
    z = iz + radiusZ;
 
-   index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
+   index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;
 
-   if( x < 0 || y >= endY || z >= endZ ) {
+   if(x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
       data[ index ] = fetchBoundary( x, y, z );
    }
    else {
@@ -349,9 +346,9 @@ convolution3D( Index kernelWidth,
    y = iy + radiusY;
    z = iz + radiusZ;
 
-   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y;
+   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;
 
-   if( x >= endX || y >= endY || z >= endZ ) {
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
       data[ index ] = fetchBoundary( x, y, z );
    }
    else {
@@ -360,21 +357,25 @@ convolution3D( Index kernelWidth,
 
    __syncthreads();
 
+   if( ix >= endX || iy >= endY || iz >= endZ )
+      return;
+
    Real result = 0;
 
-   for( Index k = 0; k <= radiusZ; k++ ) {
+   #pragma unroll
+   for( Index k = 0; k < kernelDepth; k++ ) {
       Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x;
       Index xyKernelAlign = k * blockDim.x * blockDim.y;
-
-      for( Index j = 0; j <= radiusY; j++ ) {
+   #pragma unroll
+      for( Index j = 0; j < kernelHeight; j++ ) {
          Index xAlign = ( j + threadIdx.y ) * blockDim.x;
          Index xKernelAlign = j * blockDim.x;
-
-         for( Index i = 0; i <= radiusX; i++ ) {
+   #pragma unroll
+         for( Index i = 0; i < kernelWidth; i++ ) {
             Index elementIndex = i + threadIdx.x + xAlign + xyAlign;
             Index kernelIndex = i + xKernelAlign + xyKernelAlign;
 
-            result = convolve( result, data[ index ], kernel[ kernelIndex ] );
+            result = convolve( result, data[ elementIndex ], kernel[ kernelIndex ] );
          }
       }
    }
@@ -531,7 +532,7 @@ public:
          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
       configuration.gridSize.y =
          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
-      configuration.gridSize.y =
+      configuration.gridSize.z =
          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
    }
 
diff --git a/src/Benchmarks/Convolution/kernels/sharedKernel.h b/src/Benchmarks/Convolution/kernels/sharedKernel.h
index d3e1e4da3..c217cfb34 100644
--- a/src/Benchmarks/Convolution/kernels/sharedKernel.h
+++ b/src/Benchmarks/Convolution/kernels/sharedKernel.h
@@ -34,9 +34,6 @@ convolution1D( Index kernelWidth,
 {
    Index ix = threadIdx.x + blockIdx.x * blockDim.x;
 
-   if( ix >= endX )
-      return;
-
    Real* shared = TNL::Cuda::getSharedMemory< Real >();
 
    Index radius = kernelWidth >> 1;
@@ -46,8 +43,12 @@ convolution1D( Index kernelWidth,
 
    __syncthreads();
 
+   if( ix >= endX )
+      return;
+
    Real result = 0;
 
+   #pragma unroll
    for( Index i = -radius; i <= radius; i++ ) {
       Index elementIndex = i + ix;
       Index kernelIndex = i + radius;
@@ -85,9 +86,6 @@ convolution2D( Index kernelWidth,
    Index iy = threadIdx.y + blockIdx.y * blockDim.y;
    Index ix = threadIdx.x + blockIdx.x * blockDim.x;
 
-   if( ix >= endX || iy >= endY )
-      return;
-
    Real* shared = TNL::Cuda::getSharedMemory< Real >();
 
    Index radiusY = kernelHeight >> 1;
@@ -100,12 +98,17 @@ convolution2D( Index kernelWidth,
 
    __syncthreads();
 
+   if( ix >= endX || iy >= endY )
+      return;
+
    Real result = 0;
 
+   #pragma unroll
    for( Index j = -radiusY; j <= radiusY; j++ ) {
       Index elementIndexY = j + iy;
       Index kernelIndexY = j + radiusY;
 
+      #pragma unroll
       for( Index i = -radiusX; i <= radiusX; i++ ) {
          Index elementIndexX = i + ix;
          Index kernelIndexX = i + radiusX;
@@ -149,9 +152,6 @@ convolution3D( Index kernelWidth,
    Index iy = threadIdx.y + blockIdx.y * blockDim.y;
    Index ix = threadIdx.x + blockIdx.x * blockDim.x;
 
-   if( ix >= endX || iy >= endY || iz >= endZ )
-      return;
-
    Real* shared = TNL::Cuda::getSharedMemory< Real >();
 
    Index radiusZ = kernelDepth >> 1;
@@ -160,23 +160,27 @@ convolution3D( Index kernelWidth,
 
    Index threadIndex = threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z;
 
-   printf( "%d\n", threadIndex );
-
    // The size of the block is equal to the kernel size
    shared[ threadIndex ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z );
 
    __syncthreads();
 
+   if( ix >= endX || iy >= endY || iz >= endZ )
+      return;
+
    Real result = 0;
 
+   #pragma unroll
    for( Index k = -radiusZ; k <= radiusZ; k++ ) {
       Index elementIndexZ = k + iz;
       Index kernelIndexZ = k + radiusZ;
 
+      #pragma unroll
       for( Index j = -radiusY; j <= radiusY; j++ ) {
          Index elementIndexY = j + iy;
          Index kernelIndexY = j + radiusY;
 
+         #pragma unroll
          for( Index i = -radiusX; i <= radiusX; i++ ) {
             Index elementIndexX = i + ix;
             Index kernelIndexX = i + radiusX;
@@ -338,7 +342,7 @@ public:
          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
       configuration.gridSize.y =
          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
-      configuration.gridSize.y =
+      configuration.gridSize.z =
          TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
    }
 
diff --git a/src/Benchmarks/Convolution/support/DummySolver.h b/src/Benchmarks/Convolution/support/DummySolver.h
index a871c7f3f..82a8f6ad4 100644
--- a/src/Benchmarks/Convolution/support/DummySolver.h
+++ b/src/Benchmarks/Convolution/support/DummySolver.h
@@ -61,6 +61,11 @@ public:
 
       DummyTask<int, float, Dimension, Device>::exec(dimension, kernelSize, inputView, resultView, kernelView);
 
+      TNL::Containers::Array< float, TNL::Devices::Host, int > host(result);
+
+      for (int i = 0; i < host.getSize(); i++)
+         TNL_ASSERT_EQ(host[i], kernelElementsCount, "Dummy task always sets volume of kernel");
+
       std::cout << "Everything is fine" << std::endl;
    }
 
@@ -72,12 +77,12 @@ public:
       config.addDelimiter( "Grid dimension settings:" );
 
       for( int i = 0; i < Dimension; i++ )
-         config.addEntry< int >( dimensionIds[ i ], dimensionIds[ i ], 512 );
+         config.addEntry< int >( dimensionIds[ i ], dimensionIds[ i ], 64 );
 
       config.addDelimiter( "Kernel settings:" );
 
       for( int i = 0; i < Dimension; i++ )
-         config.addEntry< int >( kernelSizeIds[ i ], kernelSizeIds[ i ] + " (odd) :", 11 );
+         config.addEntry< int >( kernelSizeIds[ i ], kernelSizeIds[ i ] + " (odd) :", 9 );
 
       return config;
    }
diff --git a/src/Benchmarks/Convolution/support/DummyTask.h b/src/Benchmarks/Convolution/support/DummyTask.h
index e850b64c0..026575c38 100644
--- a/src/Benchmarks/Convolution/support/DummyTask.h
+++ b/src/Benchmarks/Convolution/support/DummyTask.h
@@ -55,7 +55,7 @@ public:
          return kernel[ i ];
       };
 
-      auto convolve = [ = ] __cuda_callable__( Real result, Index data, Index kernel )
+      auto convolve = [ = ] __cuda_callable__( Real result, Real data, Real kernel )
       {
          return result + data * kernel;
       };
@@ -97,7 +97,7 @@ public:
 
       auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j )
       {
-         return -1;
+         return 1;
       };
 
       auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j )
@@ -107,7 +107,7 @@ public:
          return kernel[ index ];
       };
 
-      auto convolve = [ = ] __cuda_callable__( Real result, Index data, Index kernel )
+      auto convolve = [ = ] __cuda_callable__( Real result, Real data, Real kernel )
       {
          return result + data * kernel;
       };
@@ -161,7 +161,7 @@ public:
          return kernel[ index ];
       };
 
-      auto convolve = [ = ] __cuda_callable__( float result, Index data, Index kernel )
+      auto convolve = [ = ] __cuda_callable__( Real result, Real data, Real kernel )
       {
          return result + data * kernel;
       };
-- 
GitLab


From 47036e5e6ab038b5ada97d91312d3a309b44e198 Mon Sep 17 00:00:00 2001
From: hayeuyur <hayeuyur@fit.cvut.cz>
Date: Mon, 4 Apr 2022 09:44:18 +0200
Subject: [PATCH 12/19] Add convolution directory in CMake

---
 src/Benchmarks/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt
index 50e467762..e3b14d851 100644
--- a/src/Benchmarks/CMakeLists.txt
+++ b/src/Benchmarks/CMakeLists.txt
@@ -1,3 +1,4 @@
+add_subdirectory( Convolution )
 add_subdirectory( HeatEquation )
 add_subdirectory( BLAS )
 add_subdirectory( NDArray )
-- 
GitLab


From 12827c2dae50b8eca0c3b11b4319a0a80f88acb6 Mon Sep 17 00:00:00 2001
From: hayeuyur <hayeuyur@fit.cvut.cz>
Date: Mon, 2 May 2022 18:55:39 +0200
Subject: [PATCH 13/19] Prefer shared memory in kernel cache configuration

---
 src/Benchmarks/Convolution/kernels/sharedData.h          | 2 +-
 src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h | 6 ++++++
 src/Benchmarks/Convolution/kernels/sharedKernel.h        | 6 ++++++
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/Benchmarks/Convolution/kernels/sharedData.h b/src/Benchmarks/Convolution/kernels/sharedData.h
index 8abc82fda..97f252b11 100644
--- a/src/Benchmarks/Convolution/kernels/sharedData.h
+++ b/src/Benchmarks/Convolution/kernels/sharedData.h
@@ -6,7 +6,7 @@
  * This method stores image tile into shared memory
  * and then calculates convolution.
  *
- * Thanks for the idea  https://www.evl.uic.edu/sjames/cs525/final.html
+ * Thanks for the idea https://www.evl.uic.edu/sjames/cs525/final.html
  */
 
    #include <TNL/Devices/Cuda.h>
diff --git a/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h b/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h
index 70e1d58a9..62276c3cf 100644
--- a/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h
+++ b/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h
@@ -431,6 +431,8 @@ public:
 
       constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
 
+      cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared);
+
       TNL::Cuda::launchKernel< true >(
          kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store );
    };
@@ -488,6 +490,8 @@ public:
 
       constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
 
+      cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared);
+
       TNL::Cuda::launchKernel< true >( kernel,
                                        0,
                                        configuration,
@@ -558,6 +562,8 @@ public:
 
       constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
 
+      cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared);
+
       TNL::Cuda::launchKernel< true >( kernel,
                                        0,
                                        configuration,
diff --git a/src/Benchmarks/Convolution/kernels/sharedKernel.h b/src/Benchmarks/Convolution/kernels/sharedKernel.h
index c217cfb34..ba98efe73 100644
--- a/src/Benchmarks/Convolution/kernels/sharedKernel.h
+++ b/src/Benchmarks/Convolution/kernels/sharedKernel.h
@@ -247,6 +247,8 @@ public:
 
       constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
 
+      cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared);
+
       TNL::Cuda::launchKernel< true >(
          kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store );
    };
@@ -301,6 +303,8 @@ public:
 
       constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
 
+      cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared);
+
       TNL::Cuda::launchKernel< true >( kernel,
                                        0,
                                        configuration,
@@ -368,6 +372,8 @@ public:
 
       constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
 
+      cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared);
+
       TNL::Cuda::launchKernel< true >( kernel,
                                        0,
                                        configuration,
-- 
GitLab


From 231c69bf9fff0feeb7f2c1a80773fec2731dcdf6 Mon Sep 17 00:00:00 2001
From: hayeuyur <hayeuyur@fit.cvut.cz>
Date: Tue, 3 May 2022 13:05:58 +0200
Subject: [PATCH 14/19] Add image convolution solver

---
 src/Benchmarks/Convolution/CMakeLists.txt     |   9 +
 .../Convolution/kernels/sharedData.h          | 141 +++++++-------
 .../Convolution/kernels/sharedDataAndKernel.h | 100 ++++++----
 .../Convolution/support/DummyBenchmark.h      |   6 +-
 .../Convolution/support/DummySolver.h         |   6 +-
 .../Convolution/support/DummyTask.h           |  17 +-
 .../Convolution/support/ImageSolver.h         | 184 ++++++++++++++++++
 src/Benchmarks/Convolution/support/Solver.h   |   2 +-
 .../Convolution/templates/main_image_solver.h |  26 +++
 src/TNL/Images/PNGImage_impl.h                |   2 +
 10 files changed, 371 insertions(+), 122 deletions(-)
 create mode 100644 src/Benchmarks/Convolution/support/ImageSolver.h
 create mode 100644 src/Benchmarks/Convolution/templates/main_image_solver.h

diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt
index 0569e1013..d34518dcc 100644
--- a/src/Benchmarks/Convolution/CMakeLists.txt
+++ b/src/Benchmarks/Convolution/CMakeLists.txt
@@ -17,6 +17,10 @@ if (${BUILD_CUDA})
    SET(EXECUTABLE_NAME "${PREFIX}_${DIMENSION}_${MODULE_NAME}_${TEMPLATE_NAME}")
 
    CUDA_ADD_EXECUTABLE(${EXECUTABLE_NAME} ${SOURCE_FILE})
+
+   if( PNG_FOUND )
+      target_link_libraries( ${EXECUTABLE_NAME} ${PNG_LIBRARIES} )
+   endif()
 else()
    MESSAGE(WARNING "Convolutions are not supported on CPU")
 endif()
@@ -54,3 +58,8 @@ GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/shar
 GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h")
 GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h")
 GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h")
+
+GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/naive.h")
+GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedData.h")
+GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedKernel.h")
+GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedDataAndKernel.h")
diff --git a/src/Benchmarks/Convolution/kernels/sharedData.h b/src/Benchmarks/Convolution/kernels/sharedData.h
index 97f252b11..dcaa5236e 100644
--- a/src/Benchmarks/Convolution/kernels/sharedData.h
+++ b/src/Benchmarks/Convolution/kernels/sharedData.h
@@ -33,27 +33,27 @@ convolution1D( Index kernelWidth,
 {
    Index ix = threadIdx.x + blockIdx.x * blockDim.x;
 
-   Real* shared = TNL::Cuda::getSharedMemory< Real >();
+   Real* data = TNL::Cuda::getSharedMemory< Real >();
    Index radius = kernelWidth >> 1;
 
    // Left
    Index lhs = ix - radius;
 
    if( lhs < 0 || lhs >= endX ) {
-      shared[ threadIdx.x ] = fetchBoundary( lhs );
+      data[ threadIdx.x ] = fetchBoundary( lhs );
    }
    else {
-      shared[ threadIdx.x ] = fetchData( lhs );
+      data[ threadIdx.x ] = fetchData( lhs );
    }
 
    // Right
    Index rhs = ix + radius;
 
    if( rhs < 0 || rhs >= endX ) {
-      shared[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs );
+      data[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs );
    }
    else {
-      shared[ threadIdx.x + blockDim.x ] = fetchData( rhs );
+      data[ threadIdx.x + blockDim.x ] = fetchData( rhs );
    }
 
    __syncthreads();
@@ -67,7 +67,7 @@ convolution1D( Index kernelWidth,
    for( Index i = 0; i < kernelWidth; i++ ) {
       Index elementIndex = i + threadIdx.x;
 
-      result = convolve( result, shared[ elementIndex ], fetchKernel( i ) );
+      result = convolve( result, data[ elementIndex ], fetchKernel( i ) );
    }
 
    store( ix, result );
@@ -92,69 +92,68 @@ convolution2D( Index kernelWidth,
                Convolve convolve,
                Store store )
 {
-   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
-   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+   Real* data = TNL::Cuda::getSharedMemory< Real >();
 
-   Real* shared = TNL::Cuda::getSharedMemory< Real >();
+   const Index iy = threadIdx.y + blockIdx.y * blockDim.y;
+   const Index ix = threadIdx.x + blockIdx.x * blockDim.x;
 
-   Index radiusY = kernelHeight >> 1;
-   Index radiusX = kernelWidth >> 1;
+   const Index radiusY = kernelHeight >> 1;
+   const Index radiusX = kernelWidth >> 1;
 
-   Index x, y, index;
+   const Index dataBlockWidth = 2 * kernelWidth - 1;
+   const Index dataBlockHeight = 2 * kernelHeight - 1;
+
+   const Index dataBlockRadiusX = dataBlockWidth >> 1;
+   const Index dataBlockRadiusY = dataBlockHeight >> 1;
 
-   Index kernelHorizontalPadding = kernelWidth == 1 ? 0 : kernelWidth;
-   Index kernelVerticalPadding = kernelHeight == 1 ? 0 : kernelHeight;
+   Index x, y, index;
 
    // Top Left
    x = ix - radiusX;
    y = iy - radiusY;
-
-   index = threadIdx.x + threadIdx.y * blockDim.x;
+   index = threadIdx.x + threadIdx.y * dataBlockWidth;
 
    if( x < 0 || y < 0 || x >= endX || y >= endY ) {
-      shared[ index ] = fetchBoundary( x, y );
+      data[ index ] = fetchBoundary( x, y );
    }
    else {
-      shared[ index ] = fetchData( x, y );
+      data[ index ] = fetchData( x, y );
    }
 
    // Top right
    x = ix + radiusX;
    y = iy - radiusY;
-
-   index = kernelHorizontalPadding + threadIdx.x + threadIdx.y * blockDim.x;
+   index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth;
 
    if( x < 0 || y < 0 || x >= endX || y >= endY ) {
-      shared[ index ] = fetchBoundary( x, y );
+      data[ index ] = fetchBoundary( x, y );
    }
    else {
-      shared[ index ] = fetchData( x, y );
+      data[ index ] = fetchData( x, y );
    }
 
    // Bottom Left
    x = ix - radiusX;
    y = iy + radiusY;
-
-   index = threadIdx.x + ( kernelVerticalPadding + threadIdx.y ) * blockDim.x;
+   index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth;
 
    if(x < 0 || y < 0 || x >= endX || y >= endY ) {
-      shared[ index ] = fetchBoundary( x, y );
+      data[ index ] = fetchBoundary( x, y );
    }
    else {
-      shared[ index ] = fetchData( x, y );
+      data[ index ] = fetchData( x, y );
    }
 
    // Bottom Right
    x = ix + radiusX;
    y = iy + radiusY;
-
-   index = kernelHorizontalPadding + threadIdx.x + ( kernelVerticalPadding + threadIdx.y ) * blockDim.x;
+   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth;
 
    if( x < 0 || y < 0 || x >= endX || y >= endY ) {
-      shared[ index ] = fetchBoundary( x, y );
+      data[ index ] = fetchBoundary( x, y );
    }
    else {
-      shared[ index ] = fetchData( x, y );
+      data[ index ] = fetchData( x, y );
    }
 
    __syncthreads();
@@ -165,12 +164,12 @@ convolution2D( Index kernelWidth,
    Real result = 0;
 
    for( Index j = 0; j < kernelHeight; j++ ) {
-      Index align = ( j + threadIdx.y ) * blockDim.x;
+      Index align = ( j + threadIdx.y ) * dataBlockWidth;
 
       for( Index i = 0; i < kernelWidth; i++ ) {
          Index index = i + threadIdx.x + align;
 
-         result = convolve( result, shared[ index ], fetchKernel( i, j ) );
+         result = convolve( result, data[ index ], fetchKernel( i, j ) );
       }
    }
 
@@ -198,15 +197,25 @@ convolution3D( Index kernelWidth,
                Convolve convolve,
                Store store )
 {
-   Index iz = threadIdx.z + blockIdx.z * blockDim.z;
-   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
-   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+   Real* data = TNL::Cuda::getSharedMemory< Real >();
+
+   const Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+   const Index iy = threadIdx.y + blockIdx.y * blockDim.y;
+   const Index iz = threadIdx.z + blockIdx.z * blockDim.z;
+
+   const Index radiusX = kernelWidth >> 1;
+   const Index radiusY = kernelHeight >> 1;
+   const Index radiusZ = kernelDepth >> 1;
+
+   const Index dataBlockWidth = 2 * kernelWidth - 1;
+   const Index dataBlockHeight = 2 * kernelHeight - 1;
+   const Index dataBlockDepth = 2 * kernelDepth - 1;
 
-   Real* shared = TNL::Cuda::getSharedMemory< Real >();
+   const Index dataBlockXYVolume = dataBlockWidth * dataBlockHeight;
 
-   Index radiusZ = kernelDepth >> 1;
-   Index radiusY = kernelHeight >> 1;
-   Index radiusX = kernelWidth >> 1;
+   const Index dataBlockRadiusX = dataBlockWidth >> 1;
+   const Index dataBlockRadiusY = dataBlockHeight >> 1;
+   const Index dataBlockRadiusZ = dataBlockDepth >> 1;
 
    Index x, y, z, index;
 
@@ -215,13 +224,13 @@ convolution3D( Index kernelWidth,
    y = iy - radiusY;
    z = iz - radiusZ;
 
-   index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
+   index = threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume;
 
    if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
-      shared[ index ] = fetchBoundary( x, y, z );
+      data[ index ] = fetchBoundary( x, y, z );
    }
    else {
-      shared[ index ] = fetchData( x, y, z );
+      data[ index ] = fetchData( x, y, z );
    }
 
    // Z: 0 Y: 0 X: 1
@@ -229,13 +238,13 @@ convolution3D( Index kernelWidth,
    y = iy - radiusY;
    z = iz - radiusZ;
 
-   index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
+   index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume;
 
    if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
-      shared[ index ] = fetchBoundary( x, y, z );
+      data[ index ] = fetchBoundary( x, y, z );
    }
    else {
-      shared[ index ] = fetchData( x, y, z );
+      data[ index ] = fetchData( x, y, z );
    }
 
    // Z: 0 Y: 1 X: 0
@@ -243,13 +252,13 @@ convolution3D( Index kernelWidth,
    y = iy + radiusY;
    z = iz - radiusZ;
 
-   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
+   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume;
 
    if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
-      shared[ index ] = fetchBoundary( x, y, z );
+      data[ index ] = fetchBoundary( x, y, z );
    }
    else {
-      shared[ index ] = fetchData( x, y, z );
+      data[ index ] = fetchData( x, y, z );
    }
 
    // Z: 1 Y: 0 X: 0
@@ -257,13 +266,13 @@ convolution3D( Index kernelWidth,
    y = iy - radiusY;
    z = iz + radiusZ;
 
-   index = threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;
+   index = threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume;
 
    if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
-      shared[ index ] = fetchBoundary( x, y, z );
+      data[ index ] = fetchBoundary( x, y, z );
    }
    else {
-      shared[ index ] = fetchData( x, y, z );
+      data[ index ] = fetchData( x, y, z );
    }
 
    // Z: 0 Y: 1 X: 1
@@ -271,13 +280,13 @@ convolution3D( Index kernelWidth,
    y = iy + radiusY;
    z = iz - radiusZ;
 
-   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
+   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume;
 
    if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
-      shared[ index ] = fetchBoundary( x, y, z );
+      data[ index ] = fetchBoundary( x, y, z );
    }
    else {
-      shared[ index ] = fetchData( x, y, z );
+      data[ index ] = fetchData( x, y, z );
    }
 
    // Z: 1 Y: 0 X: 1
@@ -285,13 +294,13 @@ convolution3D( Index kernelWidth,
    y = iy - radiusY;
    z = iz + radiusZ;
 
-   index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;
+   index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume;
 
    if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
-      shared[ index ] = fetchBoundary( x, y, z );
+      data[ index ] = fetchBoundary( x, y, z );
    }
    else {
-      shared[ index ] = fetchData( x, y, z );
+      data[ index ] = fetchData( x, y, z );
    }
 
    // Z: 1 Y: 1 X: 0
@@ -299,13 +308,13 @@ convolution3D( Index kernelWidth,
    y = iy + radiusY;
    z = iz + radiusZ;
 
-   index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;
+   index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume;
 
    if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
-      shared[ index ] = fetchBoundary( x, y, z );
+      data[ index ] = fetchBoundary( x, y, z );
    }
    else {
-      shared[ index ] = fetchData( x, y, z );
+      data[ index ] = fetchData( x, y, z );
    }
 
    // Z: 1 Y: 1 X: 1
@@ -313,13 +322,13 @@ convolution3D( Index kernelWidth,
    y = iy + radiusY;
    z = iz + radiusZ;
 
-   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;
+   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume;
 
    if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
-      shared[ index ] = fetchBoundary( x, y, z );
+      data[ index ] = fetchBoundary( x, y, z );
    }
    else {
-      shared[ index ] = fetchData( x, y, z );
+      data[ index ] = fetchData( x, y, z );
    }
 
    __syncthreads();
@@ -330,15 +339,15 @@ convolution3D( Index kernelWidth,
    Real result = 0;
 
    for( Index k = 0; k < kernelDepth; k++ ) {
-      Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x;
+      Index xyAlign = ( k + threadIdx.z ) * dataBlockXYVolume;
 
       for( Index j = 0; j < kernelHeight; j++ ) {
-         Index xAlign = ( j + threadIdx.y ) * blockDim.x;
+         Index xAlign = ( j + threadIdx.y ) * dataBlockWidth;
 
          for( Index i = 0; i < kernelWidth; i++ ) {
             Index index = i + threadIdx.x + xAlign + xyAlign;
 
-            result = convolve( result, shared[ index ], fetchKernel( i, j, k ) );
+            result = convolve( result, data[ index ], fetchKernel( i, j, k ) );
          }
       }
    }
diff --git a/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h b/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h
index 62276c3cf..b9d094203 100644
--- a/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h
+++ b/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h
@@ -8,7 +8,7 @@
    #include <TNL/Cuda/SharedMemory.h>
 
 /**
- * This method stores kernel and data in the shared memory to reduce amount of loads.
+ * This method stores both the kernel and the data in shared memory to reduce the number of global loads.
  *
  * We can calculate the size of shared memory needed the next way:
  * 1. We need to store in shared memory:
@@ -49,7 +49,7 @@ convolution1D( Index kernelWidth,
 {
    Index ix = threadIdx.x + blockIdx.x * blockDim.x;
 
-   Index kernelOffset = 2 * kernelWidth;
+   Index kernelOffset = 2 * kernelWidth - 1;
 
    Real* data = TNL::Cuda::getSharedMemory< Real >();
    Real* kernel = data + kernelOffset;
@@ -114,26 +114,29 @@ convolution2D( Index kernelWidth,
                Convolve convolve,
                Store store )
 {
-   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
-   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+   const Index iy = threadIdx.y + blockIdx.y * blockDim.y;
+   const Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+   const Index radiusY = kernelHeight >> 1;
+   const Index radiusX = kernelWidth >> 1;
+
+   const Index dataBlockWidth = 2 * kernelWidth - 1;
+   const Index dataBlockHeight = 2 * kernelHeight - 1;
+
+   const Index dataBlockRadiusX = dataBlockWidth >> 1;
+   const Index dataBlockRadiusY = dataBlockHeight >> 1;
 
-   Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 );
+   const Index kernelOffset = dataBlockWidth * dataBlockHeight;
 
    Real* data = TNL::Cuda::getSharedMemory< Real >();
    Real* kernel = data + kernelOffset;
 
-   Index radiusY = kernelHeight >> 1;
-   Index radiusX = kernelWidth >> 1;
-
    Index x, y, index;
 
    // Top Left
    x = ix - radiusX;
    y = iy - radiusY;
-
-   index = threadIdx.x + threadIdx.y * blockDim.x;
-
-   kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y );
+   index = threadIdx.x + threadIdx.y * dataBlockWidth;
 
    if( x < 0 || y < 0 || x >= endX || y >= endY ) {
       data[ index ] = fetchBoundary( x, y );
@@ -145,8 +148,7 @@ convolution2D( Index kernelWidth,
    // Top right
    x = ix + radiusX;
    y = iy - radiusY;
-
-   index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x;
+   index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth;
 
    if( x < 0 || y < 0 || x >= endX || y >= endY ) {
       data[ index ] = fetchBoundary( x, y );
@@ -158,10 +160,9 @@ convolution2D( Index kernelWidth,
    // Bottom Left
    x = ix - radiusX;
    y = iy + radiusY;
+   index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth;
 
-   index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x;
-
-   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
+   if(x < 0 || y < 0 || x >= endX || y >= endY ) {
       data[ index ] = fetchBoundary( x, y );
    }
    else {
@@ -171,8 +172,7 @@ convolution2D( Index kernelWidth,
    // Bottom Right
    x = ix + radiusX;
    y = iy + radiusY;
-
-   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x;
+   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth;
 
    if( x < 0 || y < 0 || x >= endX || y >= endY ) {
       data[ index ] = fetchBoundary( x, y );
@@ -181,6 +181,10 @@ convolution2D( Index kernelWidth,
       data[ index ] = fetchData( x, y );
    }
 
+   index = threadIdx.x + threadIdx.y * blockDim.x;
+
+   kernel[index] = fetchKernel( threadIdx.x, threadIdx.y );
+
    __syncthreads();
 
     if( ix >= endX || iy >= endY )
@@ -190,7 +194,7 @@ convolution2D( Index kernelWidth,
 
    #pragma unroll
    for( Index j = 0; j < kernelHeight; j++ ) {
-      Index elementAlign = ( j + threadIdx.y ) * blockDim.x;
+      Index elementAlign = ( j + threadIdx.y ) * dataBlockWidth;
       Index kernelAlign = j * blockDim.x;
 
    #pragma unroll
@@ -226,19 +230,29 @@ convolution3D( Index kernelWidth,
                Convolve convolve,
                Store store )
 {
-   Index iz = threadIdx.z + blockIdx.z * blockDim.z;
-   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
-   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+   const Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+   const Index iy = threadIdx.y + blockIdx.y * blockDim.y;
+   const Index iz = threadIdx.z + blockIdx.z * blockDim.z;
+
+   const Index radiusX = kernelWidth >> 1;
+   const Index radiusY = kernelHeight >> 1;
+   const Index radiusZ = kernelDepth >> 1;
+
+   const Index dataBlockWidth = 2 * kernelWidth - 1;
+   const Index dataBlockHeight = 2 * kernelHeight - 1;
+   const Index dataBlockDepth = 2 * kernelDepth - 1;
 
-   Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ) * ( 2 * kernelDepth - 1 );
+   const Index dataBlockXYVolume = dataBlockWidth * dataBlockHeight;
+
+   const Index dataBlockRadiusX = dataBlockWidth >> 1;
+   const Index dataBlockRadiusY = dataBlockHeight >> 1;
+   const Index dataBlockRadiusZ = dataBlockDepth >> 1;
+
+   const Index kernelOffset = dataBlockWidth * dataBlockHeight * dataBlockDepth;
 
    Real* data = TNL::Cuda::getSharedMemory< Real >();
    Real* kernel = data + kernelOffset;
 
-   Index radiusZ = kernelDepth >> 1;
-   Index radiusY = kernelHeight >> 1;
-   Index radiusX = kernelWidth >> 1;
-
    Index x, y, z, index;
 
    // Z: 0 Y: 0 X: 0
@@ -246,9 +260,7 @@ convolution3D( Index kernelWidth,
    y = iy - radiusY;
    z = iz - radiusZ;
 
-   index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
-
-   kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z );
+   index = threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume;
 
    if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
       data[ index ] = fetchBoundary( x, y, z );
@@ -262,7 +274,7 @@ convolution3D( Index kernelWidth,
    y = iy - radiusY;
    z = iz - radiusZ;
 
-   index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
+   index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume;
 
    if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
       data[ index ] = fetchBoundary( x, y, z );
@@ -276,7 +288,7 @@ convolution3D( Index kernelWidth,
    y = iy + radiusY;
    z = iz - radiusZ;
 
-   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
+   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume;
 
    if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
       data[ index ] = fetchBoundary( x, y, z );
@@ -290,7 +302,7 @@ convolution3D( Index kernelWidth,
    y = iy - radiusY;
    z = iz + radiusZ;
 
-   index = threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;
+   index = threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume;
 
    if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
       data[ index ] = fetchBoundary( x, y, z );
@@ -304,9 +316,9 @@ convolution3D( Index kernelWidth,
    y = iy + radiusY;
    z = iz - radiusZ;
 
-   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
+   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume;
 
-   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ) {
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
       data[ index ] = fetchBoundary( x, y, z );
    }
    else {
@@ -318,7 +330,7 @@ convolution3D( Index kernelWidth,
    y = iy - radiusY;
    z = iz + radiusZ;
 
-   index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;
+   index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume;
 
    if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
       data[ index ] = fetchBoundary( x, y, z );
@@ -332,9 +344,9 @@ convolution3D( Index kernelWidth,
    y = iy + radiusY;
    z = iz + radiusZ;
 
-   index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;
+   index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume;
 
-   if(x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
+   if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
       data[ index ] = fetchBoundary( x, y, z );
    }
    else {
@@ -346,7 +358,7 @@ convolution3D( Index kernelWidth,
    y = iy + radiusY;
    z = iz + radiusZ;
 
-   index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y;
+   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume;
 
    if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) {
       data[ index ] = fetchBoundary( x, y, z );
@@ -355,6 +367,10 @@ convolution3D( Index kernelWidth,
       data[ index ] = fetchData( x, y, z );
    }
 
+   index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
+
+   kernel[index] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z );
+
    __syncthreads();
 
    if( ix >= endX || iy >= endY || iz >= endZ )
@@ -364,11 +380,11 @@ convolution3D( Index kernelWidth,
 
    #pragma unroll
    for( Index k = 0; k < kernelDepth; k++ ) {
-      Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x;
+      Index xyAlign = ( k + threadIdx.z ) * dataBlockXYVolume;
       Index xyKernelAlign = k * blockDim.x * blockDim.y;
    #pragma unroll
       for( Index j = 0; j < kernelHeight; j++ ) {
-         Index xAlign = ( j + threadIdx.y ) * blockDim.x;
+         Index xAlign = ( j + threadIdx.y ) * dataBlockWidth;
          Index xKernelAlign = j * blockDim.x;
    #pragma unroll
          for( Index i = 0; i < kernelWidth; i++ ) {
diff --git a/src/Benchmarks/Convolution/support/DummyBenchmark.h b/src/Benchmarks/Convolution/support/DummyBenchmark.h
index 9b44d53dd..f8e2d4ae8 100644
--- a/src/Benchmarks/Convolution/support/DummyBenchmark.h
+++ b/src/Benchmarks/Convolution/support/DummyBenchmark.h
@@ -16,7 +16,7 @@ class DummyBenchmark : public Benchmark< Dimension, Device >
 {
 public:
    using Vector = TNL::Containers::StaticVector< Dimension, int >;
-   using DataStore = TNL::Containers::Array< float, Device, int >;
+   using DataStore = TNL::Containers::Vector< float, Device, int >;
    using Base = Benchmark< Dimension, Device >;
    using TNLBenchmark = typename Base::TNLBenchmark;
 
@@ -103,9 +103,9 @@ public:
       result = 1;
       kernel = 1;
 
-      auto inputView = input.getView();
+      auto inputView = input.getConstView();
+      auto kernelView = kernel.getConstView();
       auto resultView = result.getView();
-      auto kernelView = kernel.getView();
 
       auto measure = [ & ]()
       {
diff --git a/src/Benchmarks/Convolution/support/DummySolver.h b/src/Benchmarks/Convolution/support/DummySolver.h
index 82a8f6ad4..2b1e60041 100644
--- a/src/Benchmarks/Convolution/support/DummySolver.h
+++ b/src/Benchmarks/Convolution/support/DummySolver.h
@@ -13,7 +13,7 @@ class DummySolver : public Solver< Dimension, Device >
 public:
    using Base = Solver< Dimension, Device >;
    using Vector = TNL::Containers::StaticVector< Dimension, int >;
-   using DataStore = TNL::Containers::Array< float, Device, int >;
+   using DataStore = TNL::Containers::Vector< float, Device, int >;
 
    virtual void
    start( const TNL::Config::ParameterContainer& parameters ) const override
@@ -55,9 +55,9 @@ public:
       result = 1;
       kernel = 1;
 
-      auto inputView = input.getView();
+      auto inputView = input.getConstView();
+      auto kernelView = kernel.getConstView();
       auto resultView = result.getView();
-      auto kernelView = kernel.getView();
 
       DummyTask<int, float, Dimension, Device>::exec(dimension, kernelSize, inputView, resultView, kernelView);
 
diff --git a/src/Benchmarks/Convolution/support/DummyTask.h b/src/Benchmarks/Convolution/support/DummyTask.h
index 026575c38..d8e904896 100644
--- a/src/Benchmarks/Convolution/support/DummyTask.h
+++ b/src/Benchmarks/Convolution/support/DummyTask.h
@@ -34,11 +34,12 @@ public:
    static constexpr int Dimension = 1;
    using Device = TNL::Devices::Cuda;
    using Vector = TNL::Containers::StaticVector< Dimension, Index >;
-   using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
+   using ConstDataStore = typename TNL::Containers::Vector< Real, Device, Index >::ConstViewType;
+   using DataStore = typename TNL::Containers::Vector< Real, Device, Index >::ViewType;
    using ConvolutionLauncher = Convolution< Dimension, Device >;
 
    static void
-   exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
+   exec( const Vector& dimensions, const Vector& kernelSize, ConstDataStore& input, DataStore& result, ConstDataStore& kernel )
    {
       auto fetchData = [ = ] __cuda_callable__( Index i )
       {
@@ -82,11 +83,12 @@ public:
    static constexpr int Dimension = 2;
    using Device = TNL::Devices::Cuda;
    using Vector = TNL::Containers::StaticVector< Dimension, Index >;
-   using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
+   using ConstDataStore = typename TNL::Containers::Vector< Real, Device, Index >::ConstViewType;
+   using DataStore = typename TNL::Containers::Vector< Real, Device, Index >::ViewType;
    using ConvolutionLauncher = Convolution< Dimension, Device >;
 
    static void
-   exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
+   exec( const Vector& dimensions, const Vector& kernelSize, ConstDataStore& input, DataStore& result, ConstDataStore& kernel )
    {
       auto fetchData = [ = ] __cuda_callable__( Index i, Index j )
       {
@@ -116,7 +118,7 @@ public:
       {
          auto index = i + j * dimensions.x();
 
-         result[ index ] = resultValue;
+         result[ index ] = TNL::max(TNL::min(resultValue, 1.), 0.);
       };
 
       ConvolutionLauncher::execute< Index, Real >( dimensions,
@@ -136,11 +138,12 @@ public:
    static constexpr int Dimension = 3;
    using Device = TNL::Devices::Cuda;
    using Vector = TNL::Containers::StaticVector< Dimension, Index >;
-   using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
+   using ConstDataStore = typename TNL::Containers::Vector< Real, Device, Index >::ConstViewType;
+   using DataStore = typename TNL::Containers::Vector< Real, Device, Index >::ViewType;
    using ConvolutionLauncher = Convolution< Dimension, Device >;
 
    static void
-   exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
+   exec( const Vector& dimensions, const Vector& kernelSize, ConstDataStore& input, DataStore& result, ConstDataStore& kernel )
    {
       auto fetchData = [ = ] __cuda_callable__( Index i, Index j, Index k )
       {
diff --git a/src/Benchmarks/Convolution/support/ImageSolver.h b/src/Benchmarks/Convolution/support/ImageSolver.h
new file mode 100644
index 000000000..1e50ab535
--- /dev/null
+++ b/src/Benchmarks/Convolution/support/ImageSolver.h
@@ -0,0 +1,184 @@
+
+#pragma once
+
+#include "Solver.h"
+#include "DummyTask.h"
+
+#include <TNL/FileName.h>
+#include <TNL/Timer.h>
+#include <TNL/Meshes/Grid.h>
+#include <TNL/Pointers/SharedPointer.h>
+#include <TNL/Functions/MeshFunction.h>
+#include <TNL/Images/PNGImage.h>
+#include <TNL/Images/RegionOfInterest.h>
+
+static std::vector< TNL::String > dimensionIds = { "x-dimension", "y-dimension", "z-dimension" };
+static std::vector< TNL::String > kernelSizeIds = { "x-kernel-size", "y-kernel-size", "z-kernel-size" };
+
+class ImageSolver : public Solver< 2, TNL::Devices::Cuda >
+{
+public:
+   constexpr static int Dimension = 2;
+   using Device = TNL::Devices::Cuda;
+
+   using Base = Solver< Dimension, Device >;
+   using Vector = TNL::Containers::StaticVector< Dimension, int >;
+   using DataStore = TNL::Containers::Vector< float, Device, int >;
+   using HostDataStore = TNL::Containers::Vector< float, TNL::Devices::Host, int >;
+
+   using GridType = TNL::Meshes::Grid< 2, float, Device, int >;
+   using GridPointer = TNL::Pointers::SharedPointer< GridType >;
+   using MeshFunctionType = TNL::Functions::MeshFunction< GridType >;
+
+   virtual void
+   start( const TNL::Config::ParameterContainer& parameters ) const override
+   {
+      GridPointer grid;
+      MeshFunctionType meshFunction;
+      TNL::Images::PNGImage< int > image;
+      TNL::Images::RegionOfInterest< int > roi;
+
+      meshFunction.setMesh( grid );
+
+      auto output = parameters.getParameter< TNL::String >( "output" );
+
+      if (!this -> readImage(parameters, grid, meshFunction, image, roi) ||
+          !this -> convolve(parameters, meshFunction) ||
+          !this -> write(parameters, image, meshFunction))
+         return;
+   }
+
+   template<typename Image>
+   bool readImage(const TNL::Config::ParameterContainer& parameters,
+                  GridPointer & grid,
+                  MeshFunctionType& meshFunction,
+                  Image& image,
+                  TNL::Images::RegionOfInterest< int >& roi) const {
+      auto input = parameters.getParameter< TNL::String >( "input" );
+
+      if( image.openForRead( input ) ) {
+         if( ! roi.setup( parameters, &image ) ) {
+            std::cout << "Invalid image roi.";
+            image.close();
+            return false;
+         }
+
+         std::cout << image.getWidth() << " " << image.getHeight() << std::endl;
+
+         auto meshPointer = meshFunction.getMeshPointer();
+
+         meshPointer -> setDimensions(image.getWidth(), image.getHeight());
+
+         meshFunction.setMesh(meshPointer);
+
+         if( ! image.read( roi, meshFunction ) ) {
+            std::cout << "Invalid image size" << std::endl;;
+            image.close();
+            return false;
+         }
+
+         image.close();
+
+         std::cout << "Image read was successful: " << meshFunction.getData().getSize() << " elements count" << std::endl;
+         return true;
+      }
+
+      std::cout << "Image open for read failed. Please check file path" << std::endl;;
+
+      return false;
+   }
+
+   bool convolve(const TNL::Config::ParameterContainer& parameters, MeshFunctionType& meshFunction) const {
+      auto imageData = meshFunction.getData().getConstView();
+
+      Vector kernelSize;
+      DataStore kernel;
+
+      kernel = getKernel(parameters, kernelSize);
+
+      DataStore result;
+
+      result.setLike( imageData );
+      result = 0;
+
+      TNL::Timer timer;
+
+      timer.start();
+
+      std::cout << imageData.getSize() << " " << result.getSize() << std::endl;
+
+      launchConvolution( imageData,
+                         kernel.getConstView(),
+                         result.getView(),
+                         meshFunction.getMeshPointer() -> getDimensions(),
+                         kernelSize );
+
+      timer.stop();
+
+      meshFunction.getData() = result;
+
+      std::cout << "Image convolution was successful. Time: " << timer.getRealTime() << " sec" << std::endl;
+
+      return true;
+   }
+
+   template<typename Image>
+   bool write(const TNL::Config::ParameterContainer& parameters, Image& image, MeshFunctionType& meshFunction) const {
+      auto output = parameters.getParameter< TNL::String >( "output" );
+      GridType grid = meshFunction.getMesh();
+
+      if( image.openForWrite( output, grid ) ) {
+         if( ! image.write( meshFunction ) ) {
+            std::cout << "Image write failed" << std::endl;;
+            image.close();
+            return false;
+         }
+
+         image.close();
+
+         return true;
+      }
+
+      std::cout << "Image open for write failed. Please check file path" << std::endl;
+
+      return false;
+   }
+
+   HostDataStore getKernel( const TNL::Config::ParameterContainer& parameters, Vector& kernelDimension ) const {
+      kernelDimension = {3, 3};
+
+      return {-1, -1, -1,
+              -1, 8, -1,
+              -1, -1, -1};
+   }
+
+   void
+   launchConvolution( DataStore::ConstViewType image,
+                      DataStore::ConstViewType kernel,
+                      DataStore::ViewType result,
+                      const GridType::CoordinatesType& imageDimension,
+                      const GridType::CoordinatesType& kernelDimension) const
+   {
+      DummyTask<int, float, Dimension, Device>::exec(imageDimension, kernelDimension, image, result, kernel);
+   }
+
+   virtual TNL::Config::ConfigDescription
+   makeInputConfig() const override
+   {
+      TNL::Config::ConfigDescription config = Base::makeInputConfig();
+
+      config.addDelimiter( "Image settings:" );
+
+      config.addEntry< TNL::String >( "input", "PNG image" );
+      config.addEntry< TNL::String >( "output", "PNG image" );
+
+      config.addDelimiter( "Roi settings:" );
+
+      config.addEntry< int >( "roi-top", "Top (smaller number) line of the region of interest.", -1 );
+      config.addEntry< int >( "roi-bottom", "Bottom (larger number) line of the region of interest.", -1 );
+      config.addEntry< int >( "roi-left", "Left (smaller number) column of the region of interest.", -1 );
+      config.addEntry< int >( "roi-right", "Right (larger number) column of the region of interest.", -1 );
+
+      return config;
+   }
+};
diff --git a/src/Benchmarks/Convolution/support/Solver.h b/src/Benchmarks/Convolution/support/Solver.h
index 3fd56fb02..d80373f0d 100644
--- a/src/Benchmarks/Convolution/support/Solver.h
+++ b/src/Benchmarks/Convolution/support/Solver.h
@@ -4,7 +4,7 @@
 #include <vector>
 
 #include <TNL/Containers/StaticVector.h>
-#include <TNL/Containers/Array.h>
+#include <TNL/Containers/Vector.h>
 
 template< int Dimension, typename Device >
 class Solver
diff --git a/src/Benchmarks/Convolution/templates/main_image_solver.h b/src/Benchmarks/Convolution/templates/main_image_solver.h
new file mode 100644
index 000000000..c2b9c1440
--- /dev/null
+++ b/src/Benchmarks/Convolution/templates/main_image_solver.h
@@ -0,0 +1,26 @@
+
+#define KERNEL KERNEL_VALUE
+#define DIMENSION DIMENSION_VALUE
+
+#include KERNEL
+#include "../support/ImageSolver.h"
+
+#include <TNL/Config/parseCommandLine.h>
+
+using TaskSolver = ImageSolver;
+
+int main(int argc, char* argv[])
+{
+   TaskSolver solver;
+
+   auto config = solver.makeInputConfig();
+
+   TNL::Config::ParameterContainer parameters;
+
+   if( ! parseCommandLine( argc, argv, config, parameters ) )
+      return EXIT_FAILURE;
+
+   solver.solve( parameters );
+
+   return 0;
+}
diff --git a/src/TNL/Images/PNGImage_impl.h b/src/TNL/Images/PNGImage_impl.h
index 3b946ff82..5efd66c84 100644
--- a/src/TNL/Images/PNGImage_impl.h
+++ b/src/TNL/Images/PNGImage_impl.h
@@ -266,6 +266,7 @@ PNGImage< Index >::write( const Meshes::Grid< 2, Real, Device, Index >& grid, Ve
       for( j = 0; j < grid.getDimensions().x(); j++ ) {
          cell.getCoordinates().x() = j;
          cell.getCoordinates().y() = grid.getDimensions().y() - 1 - i;
+         cell.refresh();
 
          // Index cellIndex = grid.getCellIndex( CoordinatesType( j,
          //                                      grid.getDimensions().y() - 1 - i ) );
@@ -305,6 +306,7 @@ PNGImage< Index >::write( const Functions::MeshFunction< Meshes::Grid< 2, MeshRe
       for( j = 0; j < grid.getDimensions().x(); j++ ) {
          cell.getCoordinates().x() = j;
          cell.getCoordinates().y() = grid.getDimensions().y() - 1 - i;
+         cell.refresh();
 
          // Index cellIndex = grid.getCellIndex( CoordinatesType( j,
          //                                      grid.getDimensions().y() - 1 - i ) );
-- 
GitLab


From 8a23431f452248210c76567b8151e01f60529aba Mon Sep 17 00:00:00 2001
From: hayeuyur <hayeuyur@fit.cvut.cz>
Date: Wed, 4 May 2022 16:52:39 +0200
Subject: [PATCH 15/19] Add custom kernels to image solver

---
 .../Convolution/support/DummyTask.h           |   2 +-
 .../Convolution/support/ImageSolver.h         | 135 ++++++++++++++----
 2 files changed, 106 insertions(+), 31 deletions(-)

diff --git a/src/Benchmarks/Convolution/support/DummyTask.h b/src/Benchmarks/Convolution/support/DummyTask.h
index d8e904896..60d37f923 100644
--- a/src/Benchmarks/Convolution/support/DummyTask.h
+++ b/src/Benchmarks/Convolution/support/DummyTask.h
@@ -118,7 +118,7 @@ public:
       {
          auto index = i + j * dimensions.x();
 
-         result[ index ] = TNL::max(TNL::min(resultValue, 1.), 0.);
+         result[ index ] = resultValue;
       };
 
       ConvolutionLauncher::execute< Index, Real >( dimensions,
diff --git a/src/Benchmarks/Convolution/support/ImageSolver.h b/src/Benchmarks/Convolution/support/ImageSolver.h
index 1e50ab535..6d3d6b79d 100644
--- a/src/Benchmarks/Convolution/support/ImageSolver.h
+++ b/src/Benchmarks/Convolution/support/ImageSolver.h
@@ -14,6 +14,8 @@
 
 static std::vector< TNL::String > dimensionIds = { "x-dimension", "y-dimension", "z-dimension" };
 static std::vector< TNL::String > kernelSizeIds = { "x-kernel-size", "y-kernel-size", "z-kernel-size" };
+static std::vector< TNL::String > kernels = { "identity",        "gauss3x3",      "gauss5x5",
+                                              "sobelHorizontal", "sobelVertical", "edgeDetection" };
 
 class ImageSolver : public Solver< 2, TNL::Devices::Cuda >
 {
@@ -42,18 +44,19 @@ public:
 
       auto output = parameters.getParameter< TNL::String >( "output" );
 
-      if (!this -> readImage(parameters, grid, meshFunction, image, roi) ||
-          !this -> convolve(parameters, meshFunction) ||
-          !this -> write(parameters, image, meshFunction))
+      if( ! this->readImage( parameters, grid, meshFunction, image, roi ) || ! this->convolve( parameters, meshFunction )
+          || ! this->write( parameters, image, meshFunction ) )
          return;
    }
 
-   template<typename Image>
-   bool readImage(const TNL::Config::ParameterContainer& parameters,
-                  GridPointer & grid,
-                  MeshFunctionType& meshFunction,
-                  Image& image,
-                  TNL::Images::RegionOfInterest< int >& roi) const {
+   template< typename Image >
+   bool
+   readImage( const TNL::Config::ParameterContainer& parameters,
+              GridPointer& grid,
+              MeshFunctionType& meshFunction,
+              Image& image,
+              TNL::Images::RegionOfInterest< int >& roi ) const
+   {
       auto input = parameters.getParameter< TNL::String >( "input" );
 
       if( image.openForRead( input ) ) {
@@ -67,12 +70,13 @@ public:
 
          auto meshPointer = meshFunction.getMeshPointer();
 
-         meshPointer -> setDimensions(image.getWidth(), image.getHeight());
+         meshPointer->setDimensions( image.getWidth(), image.getHeight() );
 
-         meshFunction.setMesh(meshPointer);
+         meshFunction.setMesh( meshPointer );
 
          if( ! image.read( roi, meshFunction ) ) {
-            std::cout << "Invalid image size" << std::endl;;
+            std::cout << "Invalid image size" << std::endl;
+
             image.close();
             return false;
          }
@@ -83,18 +87,20 @@ public:
          return true;
       }
 
-      std::cout << "Image open for read failed. Please check file path" << std::endl;;
+      std::cout << "Image open for read failed. Please check file path" << std::endl;
 
       return false;
    }
 
-   bool convolve(const TNL::Config::ParameterContainer& parameters, MeshFunctionType& meshFunction) const {
+   bool
+   convolve( const TNL::Config::ParameterContainer& parameters, MeshFunctionType& meshFunction ) const
+   {
       auto imageData = meshFunction.getData().getConstView();
 
       Vector kernelSize;
       DataStore kernel;
 
-      kernel = getKernel(parameters, kernelSize);
+      kernel = getKernel( parameters, kernelSize );
 
       DataStore result;
 
@@ -107,14 +113,17 @@ public:
 
       std::cout << imageData.getSize() << " " << result.getSize() << std::endl;
 
-      launchConvolution( imageData,
-                         kernel.getConstView(),
-                         result.getView(),
-                         meshFunction.getMeshPointer() -> getDimensions(),
-                         kernelSize );
+      launchConvolution(
+         imageData, kernel.getConstView(), result.getView(), meshFunction.getMeshPointer()->getDimensions(), kernelSize );
 
       timer.stop();
 
+      result.forAllElements(
+         [] __cuda_callable__( int i, float& value )
+         {
+            value = TNL::max( TNL::min( value, 1.0 ), 0.0 );
+         } );
+
       meshFunction.getData() = result;
 
       std::cout << "Image convolution was successful. Time: " << timer.getRealTime() << " sec" << std::endl;
@@ -122,14 +131,17 @@ public:
       return true;
    }
 
-   template<typename Image>
-   bool write(const TNL::Config::ParameterContainer& parameters, Image& image, MeshFunctionType& meshFunction) const {
+   template< typename Image >
+   bool
+   write( const TNL::Config::ParameterContainer& parameters, Image& image, MeshFunctionType& meshFunction ) const
+   {
       auto output = parameters.getParameter< TNL::String >( "output" );
       GridType grid = meshFunction.getMesh();
 
       if( image.openForWrite( output, grid ) ) {
          if( ! image.write( meshFunction ) ) {
-            std::cout << "Image write failed" << std::endl;;
+            std::cout << "Image write failed" << std::endl;
+
             image.close();
             return false;
          }
@@ -144,12 +156,71 @@ public:
       return false;
    }
 
-   HostDataStore getKernel( const TNL::Config::ParameterContainer& parameters, Vector& kernelDimension ) const {
-      kernelDimension = {3, 3};
+   HostDataStore
+   getKernel( const TNL::Config::ParameterContainer& parameters, Vector& kernelDimension ) const
+   {
+      auto kernel = parameters.getParameter< TNL::String >( "kernel" );
+
+      if( kernel == "identity" ) {
+         kernelDimension = { 3, 3 };
+
+         return { 0, 0, 0,
+                  0, 1, 0,
+                  0, 0, 0 };
+      }
+
+      if( kernel == "gauss3x3" ) {
+         kernelDimension = { 3, 3 };
+
+         HostDataStore kernel = { 1, 2, 1,
+                                  2, 4, 2,
+                                  1, 2, 1 };
+
+         kernel /= 16;
+
+         return kernel;
+      }
+
+      if( kernel == "gauss5x5" ) {
+         kernelDimension = { 5, 5 };
+
+        HostDataStore kernel = { 1, 4, 7, 4, 1,
+                                 4, 16, 26, 16, 4,
+                                 7, 26, 41, 26, 7,
+                                 4, 16, 26, 16, 4,
+                                 1, 4, 7, 4, 1 };
+
+         kernel /= 273;
+
+         return kernel;
+      }
+
+      if( kernel == "sobelHorizontal" ) {
+         kernelDimension = { 3, 3 };
+
+         return { 1, 2, 1,
+                  0, 0, 0,
+                  -1, -2, -1 };
+      }
 
-      return {-1, -1, -1,
-              -1, 8, -1,
-              -1, -1, -1};
+      if( kernel == "sobelVertical" ) {
+         kernelDimension = { 3, 3 };
+
+         return { 1, 0, -1,
+                  2, 0, -2,
+                  1, 0, -1 };
+      }
+
+      if( kernel == "edgeDetection" ) {
+         kernelDimension = { 3, 3 };
+
+         return { -1, -1, -1,
+                  -1, 8, -1,
+                  -1, -1, -1 };
+      }
+
+      std::cout << "Unknown kernel " << kernel << ". Exit" << std::endl;
+      exit(1);
    }
 
    void
@@ -157,9 +228,9 @@ public:
                       DataStore::ConstViewType kernel,
                       DataStore::ViewType result,
                       const GridType::CoordinatesType& imageDimension,
-                      const GridType::CoordinatesType& kernelDimension) const
+                      const GridType::CoordinatesType& kernelDimension ) const
    {
-      DummyTask<int, float, Dimension, Device>::exec(imageDimension, kernelDimension, image, result, kernel);
+      DummyTask< int, float, Dimension, Device >::exec( imageDimension, kernelDimension, image, result, kernel );
    }
 
    virtual TNL::Config::ConfigDescription
@@ -171,6 +242,10 @@ public:
 
       config.addEntry< TNL::String >( "input", "PNG image" );
       config.addEntry< TNL::String >( "output", "PNG image" );
+      config.addEntry< TNL::String >( "kernel", "A kernel to apply", kernels[ 0 ] );
+
+      for( const auto& kernel : kernels )
+         config.addEntryEnum( kernel);
 
       config.addDelimiter( "Roi settings:" );
 
-- 
GitLab


From 053bc398a0dadcfc3db603f305967ee2e8ff518a Mon Sep 17 00:00:00 2001
From: hayeuyur <hayeuyur@fit.cvut.cz>
Date: Thu, 5 May 2022 19:21:02 +0200
Subject: [PATCH 16/19] Implement heat equation solver

---
 src/Benchmarks/Convolution/CMakeLists.txt     |   9 +-
 .../Convolution/support/DummyTask.h           |  12 +-
 .../Convolution/support/HeatEquationSolver.h  | 189 ++++++++++++++++++
 .../Convolution/support/ImageSolver.h         |  78 +++-----
 .../templates/main_heat_equation_solver.h     |  26 +++
 5 files changed, 261 insertions(+), 53 deletions(-)
 create mode 100644 src/Benchmarks/Convolution/support/HeatEquationSolver.h
 create mode 100644 src/Benchmarks/Convolution/templates/main_heat_equation_solver.h

diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt
index d34518dcc..6ac9ed64f 100644
--- a/src/Benchmarks/Convolution/CMakeLists.txt
+++ b/src/Benchmarks/Convolution/CMakeLists.txt
@@ -12,7 +12,16 @@ if (${BUILD_CUDA})
    STRING(REGEX REPLACE "DIMENSION_VALUE" ${DIMENSION} TEMPLATE_CONTENT "${TEMPLATE_CONTENT}")
    STRING(REGEX REPLACE "KERNEL_VALUE" "\"../${KERNEL_HEADER}\"" TEMPLATE_CONTENT "${TEMPLATE_CONTENT}")
 
-   FILE(WRITE ${SOURCE_FILE} "${TEMPLATE_CONTENT}")
+   # Write the generated source only when it is missing or its content changed.
+   # FILE(READ) is a hard error on a missing file, so it is guarded by EXISTS.
+   SET(SOURCE_FILE_CONTENT "")
+   if ( EXISTS "${SOURCE_FILE}" )
+      FILE(READ "${SOURCE_FILE}" SOURCE_FILE_CONTENT)
+   endif()
+
+   if ( NOT EXISTS "${SOURCE_FILE}" OR NOT "${SOURCE_FILE_CONTENT}" STREQUAL "${TEMPLATE_CONTENT}" )
+      FILE(WRITE ${SOURCE_FILE} "${TEMPLATE_CONTENT}")
+   endif()
 
    SET(EXECUTABLE_NAME "${PREFIX}_${DIMENSION}_${MODULE_NAME}_${TEMPLATE_NAME}")
 
@@ -63,3 +67,6 @@ GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "k
 GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedData.h")
 GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedKernel.h")
 GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedDataAndKernel.h")
+
+GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/naive.h")
+GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/sharedDataAndKernel.h")
diff --git a/src/Benchmarks/Convolution/support/DummyTask.h b/src/Benchmarks/Convolution/support/DummyTask.h
index 60d37f923..07a58e98d 100644
--- a/src/Benchmarks/Convolution/support/DummyTask.h
+++ b/src/Benchmarks/Convolution/support/DummyTask.h
@@ -39,7 +39,7 @@ public:
    using ConvolutionLauncher = Convolution< Dimension, Device >;
 
    static void
-   exec( const Vector& dimensions, const Vector& kernelSize, ConstDataStore& input, DataStore& result, ConstDataStore& kernel )
+   exec( const Vector& dimensions, const Vector& kernelSize, ConstDataStore& input, DataStore& result, ConstDataStore& kernel, int boundaryValue = 1 )
    {
       auto fetchData = [ = ] __cuda_callable__( Index i )
       {
@@ -48,7 +48,7 @@ public:
 
       auto fetchBoundary = [ = ] __cuda_callable__( Index i )
       {
-         return 1;
+         return boundaryValue;
       };
 
       auto fetchKernel = [ = ] __cuda_callable__( Index i )
@@ -88,7 +88,7 @@ public:
    using ConvolutionLauncher = Convolution< Dimension, Device >;
 
    static void
-   exec( const Vector& dimensions, const Vector& kernelSize, ConstDataStore& input, DataStore& result, ConstDataStore& kernel )
+   exec( const Vector& dimensions, const Vector& kernelSize, ConstDataStore& input, DataStore& result, ConstDataStore& kernel, int boundaryValue = 1 )
    {
       auto fetchData = [ = ] __cuda_callable__( Index i, Index j )
       {
@@ -99,7 +99,7 @@ public:
 
       auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j )
       {
-         return 1;
+         return boundaryValue;
       };
 
       auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j )
@@ -143,7 +143,7 @@ public:
    using ConvolutionLauncher = Convolution< Dimension, Device >;
 
    static void
-   exec( const Vector& dimensions, const Vector& kernelSize, ConstDataStore& input, DataStore& result, ConstDataStore& kernel )
+   exec( const Vector& dimensions, const Vector& kernelSize, ConstDataStore& input, DataStore& result, ConstDataStore& kernel, int boundaryValue = 1 )
    {
       auto fetchData = [ = ] __cuda_callable__( Index i, Index j, Index k )
       {
@@ -154,7 +154,7 @@ public:
 
       auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j, Index k )
       {
-         return 1;
+         return boundaryValue;
       };
 
       auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j, Index k )
diff --git a/src/Benchmarks/Convolution/support/HeatEquationSolver.h b/src/Benchmarks/Convolution/support/HeatEquationSolver.h
new file mode 100644
index 000000000..6ce6d5c09
--- /dev/null
+++ b/src/Benchmarks/Convolution/support/HeatEquationSolver.h
@@ -0,0 +1,189 @@
+
+#pragma once
+
+#include "Solver.h"
+#include "DummyTask.h"
+
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Timer.h>
+
+static std::vector< TNL::String > dimensionIds = { "grid-x-size", "grid-y-size" };
+static std::vector< TNL::String > kernelSizeIds = { "kernel-x-size", "kernel-y-size" };
+static std::vector< TNL::String > domainIds = { "domain-x-size", "domain-y-size" };
+static std::string sigmaKey = "sigma";
+static std::string timestepKey = "timeStep";
+static std::string finalTimeKey = "finalTime";
+static std::string outputFilenamePrefix = "outputFilenamePrefix";
+
+template< typename Real = double >
+class HeatEquationSolver : public Solver< 2, TNL::Devices::Cuda >
+{
+public:
+   constexpr static int Dimension = 2;
+   using Device = TNL::Devices::Cuda;
+
+   using Base = Solver< Dimension, Device >;
+   using Vector = TNL::Containers::StaticVector< Dimension, int >;
+   using Point = TNL::Containers::StaticVector< Dimension, Real >;
+   using DataStore = TNL::Containers::Vector< Real, Device, int >;
+   using HostDataStore = TNL::Containers::Vector< Real, TNL::Devices::Host, int >;
+
+   virtual void
+   start( const TNL::Config::ParameterContainer& parameters ) const override
+   {
+      int gridXSize = parameters.getParameter< int >( dimensionIds[ 0 ] );
+      int gridYSize = parameters.getParameter< int >( dimensionIds[ 1 ] );
+
+      int kernelXSize = parameters.getParameter< int >( kernelSizeIds[ 0 ] );
+      int kernelYSize = parameters.getParameter< int >( kernelSizeIds[ 1 ] );
+
+      Real xDomainSize = parameters.getParameter< Real >( domainIds[ 0 ] );
+      Real yDomainSize = parameters.getParameter< Real >( domainIds[ 1 ] );
+
+      Real hx = xDomainSize / (Real) gridXSize;
+      Real hy = yDomainSize / (Real) gridYSize;
+
+      Point domain = { xDomainSize, yDomainSize };
+      Point spaceSteps = { hx, hy };
+
+      Vector dimensions = { gridXSize, gridYSize };
+      Vector kernelSize = { kernelXSize, kernelYSize };
+
+      DataStore function = prepareFunction( parameters, dimensions, domain, spaceSteps );
+
+      auto filenamePrefix = parameters.getParameter< TNL::String >( outputFilenamePrefix );
+      auto initialFilename = filenamePrefix + "_initial.txt";
+
+      if( ! writeGNUPlot( initialFilename, dimensions, spaceSteps, domain, function.getConstView() ) ) {
+         std::cout << "Did fail during file write";
+         return;
+      }
+
+      DataStore result;
+
+      result.setLike( function );
+      result = 0;
+
+      auto finalTime = parameters.getParameter< Real >( finalTimeKey );
+
+      convolve( dimensions, domain, spaceSteps, kernelSize, function.getConstView(), result.getView(), finalTime );
+
+      auto finalFilename = filenamePrefix + "_final.txt";
+
+      if( ! writeGNUPlot( finalFilename, dimensions, spaceSteps, domain, result.getConstView() ) ) {
+         std::cout << "Did fail during file write";
+         return;
+      }
+   }
+
+   virtual TNL::Config::ConfigDescription
+   makeInputConfig() const override
+   {
+      TNL::Config::ConfigDescription config = Base::makeInputConfig();
+
+      config.addDelimiter( "Grid settings:" );
+      config.addEntry< int >( dimensionIds[ 0 ], "Grid size along x-axis.", 100 );
+      config.addEntry< int >( dimensionIds[ 1 ], "Grid size along y-axis.", 100 );
+
+      config.addDelimiter( "Kernel settings:" );
+      config.addEntry< int >( kernelSizeIds[ 0 ], "Kernel size along x-axis.", 3 );
+      config.addEntry< int >( kernelSizeIds[ 1 ], "Kernel size along y-axis.", 3 );
+
+      config.addDelimiter( "Problem settings:" );
+      config.addEntry< TNL::String >( outputFilenamePrefix, "The prefix in name of the output file", "data" );
+
+      config.addEntry< Real >( domainIds[ 0 ], "Domain size along x-axis.", 4.0 );
+      config.addEntry< Real >( domainIds[ 1 ], "Domain size along y-axis.", 4.0 );
+
+      config.addEntry< Real >( sigmaKey, "Sigma in exponential initial condition.", 0.5);
+
+      config.addEntry< Real >( finalTimeKey, "Final time of the simulation.", 0.12);
+
+      return config;
+   }
+
+   DataStore
+   prepareFunction( const TNL::Config::ParameterContainer& parameters,
+                    const Vector& dimensions,
+                    const Point& domain,
+                    const Point& spaceSteps ) const
+   {
+      DataStore function;
+
+      function.resize( dimensions.x() * dimensions.y() );
+
+      auto functionView = function.getView();
+
+      auto xDomainSize = parameters.getParameter< Real >( domainIds[ 0 ] );
+      auto yDomainSize = parameters.getParameter< Real >( domainIds[ 1 ] );
+      auto sigma = parameters.getParameter< Real >( sigmaKey );
+
+      auto init = [ = ] __cuda_callable__( int i, int j ) mutable
+      {
+         auto index = j * dimensions.x() + i;
+
+         auto x = i * spaceSteps.x() - domain.x() / 2.;
+         auto y = j * spaceSteps.y() - domain.y() / 2.;
+
+         functionView[ index ] = exp( sigma * ( x * x + y * y ) );
+      };
+
+      TNL::Algorithms::ParallelFor2D< Device >::exec( 0, 0, dimensions.x(), dimensions.y(), init );
+
+      return function;
+   }
+
+   void
+   convolve( const Vector& dimensions,
+             const Point& domain,
+             const Point& spaceSteps,
+             const Vector& kernelSize,
+             typename DataStore::ConstViewType input,
+             typename DataStore::ViewType result,
+             const Real time ) const
+   {
+      HostDataStore kernel;
+
+      kernel.resize( kernelSize.x() * kernelSize.y() );
+
+      for( int j = 0; j < kernelSize.y(); j++ ) {
+         for( int i = 0; i < kernelSize.x(); i++ ) {
+            int index = i + j * kernelSize.x();
+
+            auto x = i * spaceSteps.x() - domain.x() / 2.;
+            auto y = j * spaceSteps.y() - domain.y() / 2.;
+
+            kernel[ index ] = ( 1. / ( 4. * M_PI * time ) ) * exp( -( x * x + y * y ) / ( 4. * time ) );
+         }
+      }
+
+      std::cout << kernel << std::endl;
+
+      DataStore kernelDevice( kernel );
+
+      auto kernelView = kernelDevice.getConstView();
+
+      DummyTask< int, Real, Dimension, Device >::exec( dimensions, kernelSize, input, result, kernelView, 0);
+   }
+
+   bool
+   writeGNUPlot( const std::string& filename,
+                 const Vector& dimensions,
+                 const Point& spaceSteps,
+                 const Point& domain,
+                 const typename DataStore::ConstViewType& map ) const
+   {
+      std::ofstream out( filename, std::ios::out );
+
+      if( ! out.is_open() )
+         return false;
+
+      for( int j = 0; j < dimensions.y(); j++ )
+         for( int i = 0; i < dimensions.x(); i++ )
+            out << i * spaceSteps.x() - domain.x() / 2. << " "
+                << j * spaceSteps.y() - domain.y() / 2. << " "
+                << map.getElement( j * dimensions.x() + i ) << std::endl;
+
+      return out.good();
+   }
+};
diff --git a/src/Benchmarks/Convolution/support/ImageSolver.h b/src/Benchmarks/Convolution/support/ImageSolver.h
index 6d3d6b79d..069553573 100644
--- a/src/Benchmarks/Convolution/support/ImageSolver.h
+++ b/src/Benchmarks/Convolution/support/ImageSolver.h
@@ -13,7 +13,7 @@
 #include <TNL/Images/RegionOfInterest.h>
 
 static std::vector< TNL::String > dimensionIds = { "x-dimension", "y-dimension", "z-dimension" };
-static std::vector< TNL::String > kernelSizeIds = { "x-kernel-size", "y-kernel-size", "z-kernel-size" };
+static std::vector< TNL::String > kernelSizeIds = { "kernel-x-size", "kernel-y-size", "kernel-z-size" };
 static std::vector< TNL::String > kernels = { "identity",        "gauss3x3",      "gauss5x5",
                                               "sobelHorizontal", "sobelVertical", "edgeDetection" };
 
@@ -49,6 +49,30 @@ public:
          return;
    }
 
+   virtual TNL::Config::ConfigDescription
+   makeInputConfig() const override
+   {
+      TNL::Config::ConfigDescription config = Base::makeInputConfig();
+
+      config.addDelimiter( "Image settings:" );
+
+      config.addEntry< TNL::String >( "input", "PNG image" );
+      config.addEntry< TNL::String >( "output", "PNG image" );
+      config.addEntry< TNL::String >( "kernel", "A kernel to apply", kernels[ 0 ] );
+
+      for( const auto& kernel : kernels )
+         config.addEntryEnum( kernel );
+
+      config.addDelimiter( "Roi settings:" );
+
+      config.addEntry< int >( "roi-top", "Top (smaller number) line of the region of interest.", -1 );
+      config.addEntry< int >( "roi-bottom", "Bottom (larger number) line of the region of interest.", -1 );
+      config.addEntry< int >( "roi-left", "Left (smaller number) column of the region of interest.", -1 );
+      config.addEntry< int >( "roi-right", "Right (larger number) column of the region of interest.", -1 );
+
+      return config;
+   }
+
    template< typename Image >
    bool
    readImage( const TNL::Config::ParameterContainer& parameters,
@@ -164,17 +188,13 @@ public:
       if( kernel == "identity" ) {
          kernelDimension = { 3, 3 };
 
-         return { 0, 0, 0,
-                  0, 1, 0,
-                  0, 0, 0 };
+         return { 0, 0, 0, 0, 1, 0, 0, 0, 0 };
       }
 
       if( kernel == "gauss3x3" ) {
          kernelDimension = { 3, 3 };
 
-         HostDataStore kernel = { 1, 2, 1,
-                                  2, 4, 2,
-                                  1, 2, 1 };
+         HostDataStore kernel = { 1, 2, 1, 2, 4, 2, 1, 2, 1 };
 
          kernel /= 16;
 
@@ -184,11 +204,7 @@ public:
       if( kernel == "gauss5x5" ) {
          kernelDimension = { 5, 5 };
 
-        HostDataStore kernel = { 1, 4, 7, 4, 1,
-                                 4, 16, 26, 16, 4,
-                                 7, 26, 41, 26, 7,
-                                 4, 16, 26, 16, 4,
-                                 1, 4, 7, 4, 1 };
+         HostDataStore kernel = { 1, 4, 7, 4, 1, 4, 16, 26, 16, 4, 7, 26, 41, 26, 7, 4, 16, 26, 16, 4, 1, 4, 7, 4, 1 };
 
          kernel /= 273;
 
@@ -198,29 +214,23 @@ public:
       if( kernel == "sobelHorizontal" ) {
          kernelDimension = { 3, 3 };
 
-         return { 1, 2, 1,
-                  0, 0, 0,
-                  -1, -2, -1 };
+         return { 1, 2, 1, 0, 0, 0, -1, -2, -1 };
       }
 
       if( kernel == "sobelVertical" ) {
          kernelDimension = { 3, 3 };
 
-         return { 1, 0, -1,
-                  2, 0, -2,
-                  1, 0, -1 };
+         return { 1, 0, -1, 2, 0, -2, 1, 0, -1 };
       }
 
       if( kernel == "edgeDetection" ) {
          kernelDimension = { 3, 3 };
 
-         return { -1, -1, -1,
-                  -1, 8, -1,
-                  -1, -1, -1 };
+         return { -1, -1, -1, -1, 8, -1, -1, -1, -1 };
       }
 
       std::cout << "Unknown kernel " << kernel << ". Exit" << std::endl;
-      exit(1);
+      exit( 1 );
    }
 
    void
@@ -232,28 +242,4 @@ public:
    {
       DummyTask< int, float, Dimension, Device >::exec( imageDimension, kernelDimension, image, result, kernel );
    }
-
-   virtual TNL::Config::ConfigDescription
-   makeInputConfig() const override
-   {
-      TNL::Config::ConfigDescription config = Base::makeInputConfig();
-
-      config.addDelimiter( "Image settings:" );
-
-      config.addEntry< TNL::String >( "input", "PNG image" );
-      config.addEntry< TNL::String >( "output", "PNG image" );
-      config.addEntry< TNL::String >( "kernel", "A kernel to apply", kernels[ 0 ] );
-
-      for( const auto& kernel : kernels )
-         config.addEntryEnum( kernel);
-
-      config.addDelimiter( "Roi settings:" );
-
-      config.addEntry< int >( "roi-top", "Top (smaller number) line of the region of interest.", -1 );
-      config.addEntry< int >( "roi-bottom", "Bottom (larger number) line of the region of interest.", -1 );
-      config.addEntry< int >( "roi-left", "Left (smaller number) column of the region of interest.", -1 );
-      config.addEntry< int >( "roi-right", "Right (larger number) column of the region of interest.", -1 );
-
-      return config;
-   }
 };
diff --git a/src/Benchmarks/Convolution/templates/main_heat_equation_solver.h b/src/Benchmarks/Convolution/templates/main_heat_equation_solver.h
new file mode 100644
index 000000000..c258f0740
--- /dev/null
+++ b/src/Benchmarks/Convolution/templates/main_heat_equation_solver.h
@@ -0,0 +1,26 @@
+
+#define KERNEL KERNEL_VALUE
+#define DIMENSION DIMENSION_VALUE
+
+#include KERNEL
+#include "../support/HeatEquationSolver.h"
+
+#include <TNL/Config/parseCommandLine.h>
+
+using TaskSolver = HeatEquationSolver<>;
+
+int main(int argc, char* argv[])
+{
+   TaskSolver solver;
+
+   auto config = solver.makeInputConfig();
+
+   TNL::Config::ParameterContainer parameters;
+
+   if( ! parseCommandLine( argc, argv, config, parameters ) )
+      return EXIT_FAILURE;
+
+   solver.solve( parameters );
+
+   return 0;
+}
-- 
GitLab


From 405faef731a05d8e3dfa3decb4e568d293c108aa Mon Sep 17 00:00:00 2001
From: hayeuyur <hayeuyur@fit.cvut.cz>
Date: Fri, 6 May 2022 19:12:16 +0200
Subject: [PATCH 17/19] Update the heat equation solver

---
 src/Benchmarks/Convolution/CMakeLists.txt     |   3 +-
 .../kernels/heatEquationSharedData.h          | 182 ++++++++++++++++++
 .../Convolution/kernels/sharedData.h          |   8 +-
 .../Convolution/support/HeatEquationSolver.h  |  62 +++---
 .../Convolution/support/HeatEquationTask.h    |  94 +++++++++
 5 files changed, 308 insertions(+), 41 deletions(-)
 create mode 100644 src/Benchmarks/Convolution/kernels/heatEquationSharedData.h
 create mode 100644 src/Benchmarks/Convolution/support/HeatEquationTask.h

diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt
index 6ac9ed64f..8695a4048 100644
--- a/src/Benchmarks/Convolution/CMakeLists.txt
+++ b/src/Benchmarks/Convolution/CMakeLists.txt
@@ -68,5 +68,4 @@ GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "k
 GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedKernel.h")
 GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedDataAndKernel.h")
 
-GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/naive.h")
-GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/sharedDataAndKernel.h")
+GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/heatEquationSharedData.h")
diff --git a/src/Benchmarks/Convolution/kernels/heatEquationSharedData.h b/src/Benchmarks/Convolution/kernels/heatEquationSharedData.h
new file mode 100644
index 000000000..bf6fdccf5
--- /dev/null
+++ b/src/Benchmarks/Convolution/kernels/heatEquationSharedData.h
@@ -0,0 +1,182 @@
+
+#ifdef HAVE_CUDA
+
+/**
+ * This kernel stores an image tile in shared memory
+ * and then calculates the convolution from that tile.
+ * NOTE(review): the tile indexing appears to assume odd kernel sizes - confirm.
+ * Thanks for the idea https://www.evl.uic.edu/sjames/cs525/final.html
+ */
+
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Containers/StaticVector.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Cuda/SharedMemory.h>
+
+template< int Dimension, typename Device >
+struct Convolution;
+
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution2D( Index kernelWidth,
+               Index kernelHeight,
+               Index endX,
+               Index endY,
+               FetchData fetchData,
+               FetchBoundary fetchBoundary,
+               Convolve convolve,
+               Store store )
+{
+   Real* data = TNL::Cuda::getSharedMemory< Real >();
+
+   const Index iy = threadIdx.y + blockIdx.y * blockDim.y;
+   const Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+   const Index radiusY = kernelHeight >> 1;
+   const Index radiusX = kernelWidth >> 1;
+
+   const Index dataBlockWidth = 2 * kernelWidth - 1;
+   const Index dataBlockHeight = 2 * kernelHeight - 1;
+
+   const Index dataBlockRadiusX = dataBlockWidth >> 1;
+   const Index dataBlockRadiusY = dataBlockHeight >> 1;
+
+   Index x, y, index;
+
+   // Top Left
+   x = ix - radiusX;
+   y = iy - radiusY;
+   index = threadIdx.x + threadIdx.y * dataBlockWidth;
+
+   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
+      data[ index ] = fetchBoundary( x, y );
+   }
+   else {
+      data[ index ] = fetchData( x, y );
+   }
+
+   // Top right
+   x = ix + radiusX;
+   y = iy - radiusY;
+   index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth;
+
+   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
+      data[ index ] = fetchBoundary( x, y );
+   }
+   else {
+      data[ index ] = fetchData( x, y );
+   }
+
+   // Bottom Left
+   x = ix - radiusX;
+   y = iy + radiusY;
+   index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth;
+
+   if(x < 0 || y < 0 || x >= endX || y >= endY ) {
+      data[ index ] = fetchBoundary( x, y );
+   }
+   else {
+      data[ index ] = fetchData( x, y );
+   }
+
+   // Bottom Right
+   x = ix + radiusX;
+   y = iy + radiusY;
+   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth;
+
+   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
+      data[ index ] = fetchBoundary( x, y );
+   }
+   else {
+      data[ index ] = fetchData( x, y );
+   }
+
+   __syncthreads();
+
+   if( ix >= endX || iy >= endY )
+      return;
+
+   Real result = 0;
+
+   for( Index j = 0; j < kernelHeight; j++ ) {
+      Index align = ( j + threadIdx.y ) * dataBlockWidth;
+
+      for( Index i = 0; i < kernelWidth; i++ ) {
+         Index index = i + threadIdx.x + align;
+
+         result = convolve( result, ix, iy, i, j, data[ index ]);
+      }
+   }
+
+   store( ix, iy, result );
+}
+
+
+template<>
+struct Convolution< 2, TNL::Devices::Cuda >
+{
+public:
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< 2, Index >;
+
+   template< typename Index, typename Real >
+   static void
+   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
+   {
+      Index kernelElementCount = 1;
+
+      for( Index i = 0; i < kernelSize.getSize(); i++ )
+         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
+
+      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );
+
+      configuration.blockSize.x = kernelSize.x();
+      configuration.blockSize.y = kernelSize.y();
+
+      configuration.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
+      configuration.gridSize.y =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
+   }
+
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            Convolve&& convolve,
+            Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration configuration;
+
+      setup< Index, Real >( configuration, dimensions, kernelSize );
+
+      constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, Convolve, Store >;
+
+      TNL::Cuda::launchKernel< true >( kernel,
+                                       0,
+                                       configuration,
+                                       kernelSize.x(),
+                                       kernelSize.y(),
+                                       dimensions.x(),
+                                       dimensions.y(),
+                                       fetchData,
+                                       fetchBoundary,
+                                       convolve,
+                                       store );
+   };
+};
+
+#endif
diff --git a/src/Benchmarks/Convolution/kernels/sharedData.h b/src/Benchmarks/Convolution/kernels/sharedData.h
index dcaa5236e..f1dfb9008 100644
--- a/src/Benchmarks/Convolution/kernels/sharedData.h
+++ b/src/Benchmarks/Convolution/kernels/sharedData.h
@@ -9,10 +9,10 @@
  * Thanks for the idea https://www.evl.uic.edu/sjames/cs525/final.html
  */
 
-   #include <TNL/Devices/Cuda.h>
-   #include <TNL/Containers/StaticVector.h>
-   #include <TNL/Cuda/LaunchHelpers.h>
-   #include <TNL/Cuda/SharedMemory.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Containers/StaticVector.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Cuda/SharedMemory.h>
 
 template< typename Index,
           typename Real,
diff --git a/src/Benchmarks/Convolution/support/HeatEquationSolver.h b/src/Benchmarks/Convolution/support/HeatEquationSolver.h
index 6ce6d5c09..539268d9b 100644
--- a/src/Benchmarks/Convolution/support/HeatEquationSolver.h
+++ b/src/Benchmarks/Convolution/support/HeatEquationSolver.h
@@ -2,7 +2,7 @@
 #pragma once
 
 #include "Solver.h"
-#include "DummyTask.h"
+#include "HeatEquationTask.h"
 
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Timer.h>
@@ -11,8 +11,8 @@ static std::vector< TNL::String > dimensionIds = { "grid-x-size", "grid-y-size"
 static std::vector< TNL::String > kernelSizeIds = { "kernel-x-size", "kernel-y-size" };
 static std::vector< TNL::String > domainIds = { "domain-x-size", "domain-y-size" };
 static std::string sigmaKey = "sigma";
-static std::string timestepKey = "timeStep";
-static std::string finalTimeKey = "finalTime";
+static std::string timeStepKey = "timeStep";
+static std::string timeKey = "time";
 static std::string outputFilenamePrefix = "outputFilenamePrefix";
 
 template< typename Real = double >
@@ -64,15 +64,28 @@ public:
       result.setLike( function );
       result = 0;
 
-      auto finalTime = parameters.getParameter< Real >( finalTimeKey );
+      auto timeStep = parameters.getParameter< Real >( timeStepKey );  // same type as the addEntry< Real > registration
+      auto finalTime = parameters.getParameter< Real >( timeKey );  // same type as the addEntry< Real > registration
 
-      convolve( dimensions, domain, spaceSteps, kernelSize, function.getConstView(), result.getView(), finalTime );
+      int iterationsCount = timeStep > 0 ? (int) ( finalTime / timeStep ) : 0;  // a non-positive step yields no iterations
 
-      auto finalFilename = filenamePrefix + "_final.txt";
+      double time = timeStep;
 
-      if( ! writeGNUPlot( finalFilename, dimensions, spaceSteps, domain, result.getConstView() ) ) {
-         std::cout << "Did fail during file write";
-         return;
+      for (int i = 1; i <= iterationsCount; i++) {
+         printf("Time: %lf\n", time);
+
+         convolve( dimensions, domain, kernelSize, function.getConstView(), result.getView(), time );
+
+         auto filename = filenamePrefix + "_" + TNL::convertToString( i ) + ".txt";  // honour the configured output prefix
+
+         if( ! writeGNUPlot( filename, dimensions, spaceSteps, domain, result.getConstView() ) ) {
+            std::cout << "Did fail during file write";
+            return;
+         }
+
+         result = 0;
+
+         time += timeStep;
       }
    }
 
@@ -82,8 +95,8 @@ public:
       TNL::Config::ConfigDescription config = Base::makeInputConfig();
 
       config.addDelimiter( "Grid settings:" );
-      config.addEntry< int >( dimensionIds[ 0 ], "Grid size along x-axis.", 100 );
-      config.addEntry< int >( dimensionIds[ 1 ], "Grid size along y-axis.", 100 );
+      config.addEntry< int >( dimensionIds[ 0 ], "Grid size along x-axis.", 200 );
+      config.addEntry< int >( dimensionIds[ 1 ], "Grid size along y-axis.", 200 );
 
       config.addDelimiter( "Kernel settings:" );
       config.addEntry< int >( kernelSizeIds[ 0 ], "Kernel size along x-axis.", 3 );
@@ -97,7 +110,8 @@ public:
 
       config.addEntry< Real >( sigmaKey, "Sigma in exponential initial condition.", 0.5);
 
-      config.addEntry< Real >( finalTimeKey, "Final time of the simulation.", 0.12);
+      config.addEntry< Real >( timeStepKey, "Time step of the simulation.", 0.005);
+      config.addEntry< Real >( timeKey, "Final time of the simulation.", 0.36);
 
       return config;
    }
@@ -136,34 +150,12 @@ public:
    void
    convolve( const Vector& dimensions,
              const Point& domain,
-             const Point& spaceSteps,
              const Vector& kernelSize,
              typename DataStore::ConstViewType input,
              typename DataStore::ViewType result,
              const Real time ) const
    {
-      HostDataStore kernel;
-
-      kernel.resize( kernelSize.x() * kernelSize.y() );
-
-      for( int j = 0; j < kernelSize.y(); j++ ) {
-         for( int i = 0; i < kernelSize.x(); i++ ) {
-            int index = i + j * kernelSize.x();
-
-            auto x = i * spaceSteps.x() - domain.x() / 2.;
-            auto y = j * spaceSteps.y() - domain.y() / 2.;
-
-            kernel[ index ] = ( 1. / ( 4. * M_PI * time ) ) * exp( -( x * x + y * y ) / ( 4. * time ) );
-         }
-      }
-
-      std::cout << kernel << std::endl;
-
-      DataStore kernelDevice( kernel );
-
-      auto kernelView = kernelDevice.getConstView();
-
-      DummyTask< int, Real, Dimension, Device >::exec( dimensions, kernelSize, input, result, kernelView, 0);
+      HeatEquationTask< int, Real, Dimension, Device >::exec( dimensions, kernelSize, domain, { 3., 3. }, time, input, result);
    }
 
    bool
diff --git a/src/Benchmarks/Convolution/support/HeatEquationTask.h b/src/Benchmarks/Convolution/support/HeatEquationTask.h
new file mode 100644
index 000000000..c4b4f5546
--- /dev/null
+++ b/src/Benchmarks/Convolution/support/HeatEquationTask.h
@@ -0,0 +1,94 @@
+
+#pragma once
+
+template< int Dimension, typename Device >
+struct Convolution
+{
+   template< typename Index >
+   using Vector = TNL::Containers::StaticVector< Dimension, Index >;
+
+   template< typename Index,
+             typename Real,
+             typename FetchData,
+             typename FetchBoundary,
+             typename Convolve,
+             typename Store >
+   static void
+   execute( const Vector< Index >& dimensions,
+            const Vector< Index >& kernelSize,
+            FetchData&& fetchData,
+            FetchBoundary&& fetchBoundary,
+            Convolve&& convolve,
+            Store&& store );
+};
+
+template< typename Index, typename Real, int Dimension, typename Device >
+struct HeatEquationTask;
+
+template< typename Index, typename Real >
+struct HeatEquationTask< Index, Real, 2, TNL::Devices::Cuda >
+{
+public:
+   static constexpr int Dimension = 2;
+   using Device = TNL::Devices::Cuda;
+   using Vector = TNL::Containers::StaticVector< Dimension, Index >;
+   using Point = TNL::Containers::StaticVector< Dimension, Real >;
+   using ConstDataStore = typename TNL::Containers::Vector< Real, Device, Index >::ConstViewType;
+   using DataStore = typename TNL::Containers::Vector< Real, Device, Index >::ViewType;
+   using ConvolutionLauncher = Convolution< Dimension, Device >;
+
+   static void
+   exec( const Vector& dimensions,
+         const Vector& kernelSize,
+         const Point& functionDomain,
+         const Point& kernelDomain,
+         const Real time,
+         ConstDataStore& input,
+         DataStore& result)
+   {
+      auto functionSpaceSteps = Point(functionDomain.x() / dimensions.x(), functionDomain.y() / dimensions.y());
+      auto kernelSpaceSteps = Point(kernelDomain.x() / kernelSize.x(), kernelDomain.y() / kernelSize.y());
+
+      auto fetchData = [ = ] __cuda_callable__( Index i, Index j )
+      {
+         auto index = i + j * dimensions.x();
+
+         return input[ index ];
+      };
+
+      auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j )
+      {
+         return 0;
+      };
+
+      auto convolve = [ = ] __cuda_callable__( Real result, Index dataX, Index dataY, Index kernelX, Index kernelY, Real data )
+      {
+         auto functionXPos = dataX * functionSpaceSteps.x() - (functionDomain.x() / 2),
+              functionYPos = dataY * functionSpaceSteps.y() - (functionDomain.y() / 2);
+
+         auto kernelXPos = (kernelX - kernelSize.x() / 2) * kernelSpaceSteps.x(),
+              kernelYPos = (kernelY - kernelSize.y() / 2) * kernelSpaceSteps.y();
+
+         auto deltaXPos = kernelXPos - functionXPos,
+              deltaYPos = kernelYPos - functionYPos;
+
+         auto kernel = kernelSpaceSteps.x() * kernelSpaceSteps.y() * ( (Real)1 / ( (Real)4 * M_PI * time ) ) * exp( - ( pow(deltaXPos, 2.) + pow(deltaYPos, 2.)  ) / ( (Real)4 * time ) );
+
+         return result + data * kernel;
+      };
+
+      auto store = [ = ] __cuda_callable__( Index i, Index j, Real resultValue ) mutable
+      {
+         auto index = i + j * dimensions.x();
+
+         result[ index ] = resultValue;
+      };
+
+      ConvolutionLauncher::execute< Index, Real >( dimensions,
+                                                   kernelSize,
+                                                   std::forward< decltype( fetchData ) >( fetchData ),
+                                                   std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
+                                                   std::forward< decltype( convolve ) >( convolve ),
+                                                   std::forward< decltype( store ) >( store ) );
+   }
+};
-- 
GitLab


From 82c7ee07cbf521637e3da170c99185626c7caff8 Mon Sep 17 00:00:00 2001
From: hayeuyur <hayeuyur@fit.cvut.cz>
Date: Sun, 8 May 2022 11:57:38 +0200
Subject: [PATCH 18/19] Add possibility to specify domain and start time of the
 iteration

---
 src/Benchmarks/Convolution/CMakeLists.txt     |  6 ++++
 .../Convolution/support/HeatEquationSolver.h  | 29 ++++++++++++++-----
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt
index 8695a4048..65e7fe897 100644
--- a/src/Benchmarks/Convolution/CMakeLists.txt
+++ b/src/Benchmarks/Convolution/CMakeLists.txt
@@ -12,6 +12,12 @@ if (${BUILD_CUDA})
    STRING(REGEX REPLACE "DIMENSION_VALUE" ${DIMENSION} TEMPLATE_CONTENT "${TEMPLATE_CONTENT}")
    STRING(REGEX REPLACE "KERNEL_VALUE" "\"../${KERNEL_HEADER}\"" TEMPLATE_CONTENT "${TEMPLATE_CONTENT}")
 
+   get_filename_component(ABSOLUTE_SUPPORT_PATH ${SOURCE_FILE} ABSOLUTE)
+
+   if(NOT EXISTS ${ABSOLUTE_SUPPORT_PATH})
+      FILE(WRITE ${ABSOLUTE_SUPPORT_PATH} "")
+   endif()
+
    FILE(READ ${SOURCE_FILE} SOURCE_FILE_CONTENT)
 
    if ( NOT "${SOURCE_FILE_CONTENT}" STREQUAL "${TEMPLATE_CONTENT}" )
diff --git a/src/Benchmarks/Convolution/support/HeatEquationSolver.h b/src/Benchmarks/Convolution/support/HeatEquationSolver.h
index 539268d9b..57ecd21a0 100644
--- a/src/Benchmarks/Convolution/support/HeatEquationSolver.h
+++ b/src/Benchmarks/Convolution/support/HeatEquationSolver.h
@@ -10,9 +10,11 @@
 static std::vector< TNL::String > dimensionIds = { "grid-x-size", "grid-y-size" };
 static std::vector< TNL::String > kernelSizeIds = { "kernel-x-size", "kernel-y-size" };
 static std::vector< TNL::String > domainIds = { "domain-x-size", "domain-y-size" };
+static std::vector< TNL::String > kernelDomainIds = { "kernel-domain-x-size", "kernel-domain-y-size" };
 static std::string sigmaKey = "sigma";
 static std::string timeStepKey = "timeStep";
-static std::string timeKey = "time";
+static std::string startTimeKey = "startTime";
+static std::string finalTimeKey = "finalTime";
 static std::string outputFilenamePrefix = "outputFilenamePrefix";
 
 template< typename Real = double >
@@ -40,10 +42,14 @@ public:
       Real xDomainSize = parameters.getParameter< Real >( domainIds[ 0 ] );
       Real yDomainSize = parameters.getParameter< Real >( domainIds[ 1 ] );
 
+      Real kernelXDomainSize = parameters.getParameter< Real >( kernelDomainIds[ 0 ] );
+      Real kernelYDomainSize = parameters.getParameter< Real >( kernelDomainIds[ 1 ] );
+
       Real hx = xDomainSize / (Real) gridXSize;
       Real hy = yDomainSize / (Real) gridYSize;
 
       Point domain = { xDomainSize, yDomainSize };
+      Point kernelDomain = { kernelXDomainSize, kernelYDomainSize };
       Point spaceSteps = { hx, hy };
 
       Vector dimensions = { gridXSize, gridYSize };
@@ -65,16 +71,18 @@ public:
       result = 0;
 
       auto timeStep = parameters.getParameter< double >( timeStepKey );
-      auto finalTime = parameters.getParameter< double >( timeKey );
+      auto startTime = parameters.getParameter< double >( startTimeKey );
+      auto finalTime = parameters.getParameter< double >( finalTimeKey );
 
-      int iterationsCount = finalTime / timeStep;
+      int iteration = (startTime / timeStep) + 1;
+      int finalIteration = finalTime / timeStep;
 
-      double time = timeStep;
+      double time = iteration * timeStep;
 
-      for (int i = 1; i <= iterationsCount; i++) {
+      for (int i = iteration; i <= finalIteration; i++) {
          printf("Time: %lf\n", time);
 
-         convolve( dimensions, domain, kernelSize, function.getConstView(), result.getView(), time );
+         convolve( dimensions, domain, kernelSize, kernelDomain, function.getConstView(), result.getView(), time );
 
          auto filename = TNL::String("data_") + TNL::convertToString(i) + ".txt";
 
@@ -108,10 +116,14 @@ public:
       config.addEntry< Real >( domainIds[ 0 ], "Domain size along x-axis.", 4.0 );
       config.addEntry< Real >( domainIds[ 1 ], "Domain size along y-axis.", 4.0 );
 
+      config.addEntry< Real >( kernelDomainIds[ 0 ], "Kernel domain size along x-axis.", 3.0 );
+      config.addEntry< Real >( kernelDomainIds[ 1 ], "Kernel domain size along y-axis.", 3.0 );
+
       config.addEntry< Real >( sigmaKey, "Sigma in exponential initial condition.", 0.5);
 
+      config.addEntry< Real >( startTimeKey, "Final time of the simulation.", 0.0);
       config.addEntry< Real >( timeStepKey, "Time step of the simulation.", 0.005);
-      config.addEntry< Real >( timeKey, "Final time of the simulation.", 0.36);
+      config.addEntry< Real >( finalTimeKey, "Final time of the simulation.", 0.36);
 
       return config;
    }
@@ -151,11 +163,12 @@ public:
    convolve( const Vector& dimensions,
              const Point& domain,
              const Vector& kernelSize,
+             const Point& kernelDomain,
              typename DataStore::ConstViewType input,
              typename DataStore::ViewType result,
              const Real time ) const
    {
-      HeatEquationTask< int, Real, Dimension, Device >::exec( dimensions, kernelSize, domain, { 3., 3. }, time, input, result);
+      HeatEquationTask< int, Real, Dimension, Device >::exec( dimensions, kernelSize, domain, kernelDomain, time, input, result);
    }
 
    bool
-- 
GitLab


From 6cc6bffe2ef5cb0ebb5fbad1e3ce2dd15ceb2d51 Mon Sep 17 00:00:00 2001
From: hayeuyur <hayeuyur@fit.cvut.cz>
Date: Mon, 9 May 2022 14:27:08 +0200
Subject: [PATCH 19/19] Fix heat equation solver

---
 src/Benchmarks/Convolution/CMakeLists.txt     |   2 +-
 .../kernels/heatEquationSharedData.h          | 182 ------------------
 .../Convolution/support/HeatEquationSolver.h  |  62 +++++-
 .../Convolution/support/HeatEquationTask.h    |  94 ---------
 4 files changed, 55 insertions(+), 285 deletions(-)
 delete mode 100644 src/Benchmarks/Convolution/kernels/heatEquationSharedData.h
 delete mode 100644 src/Benchmarks/Convolution/support/HeatEquationTask.h

diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt
index 65e7fe897..31d46cbf7 100644
--- a/src/Benchmarks/Convolution/CMakeLists.txt
+++ b/src/Benchmarks/Convolution/CMakeLists.txt
@@ -74,4 +74,4 @@ GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "k
 GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedKernel.h")
 GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedDataAndKernel.h")
 
-GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/heatEquationSharedData.h")
+GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/sharedDataAndKernel.h")
diff --git a/src/Benchmarks/Convolution/kernels/heatEquationSharedData.h b/src/Benchmarks/Convolution/kernels/heatEquationSharedData.h
deleted file mode 100644
index bf6fdccf5..000000000
--- a/src/Benchmarks/Convolution/kernels/heatEquationSharedData.h
+++ /dev/null
@@ -1,182 +0,0 @@
-
-#ifdef HAVE_CUDA
-
-/**
- * This method stores image tile into shared memory
- * and then calculates convolution.
- *
- * Thanks for the idea https://www.evl.uic.edu/sjames/cs525/final.html
- */
-
-#include <TNL/Devices/Cuda.h>
-#include <TNL/Containers/StaticVector.h>
-#include <TNL/Cuda/LaunchHelpers.h>
-#include <TNL/Cuda/SharedMemory.h>
-
-template< int Dimension, typename Device >
-struct Convolution;
-
-template< typename Index,
-          typename Real,
-          typename FetchData,
-          typename FetchBoundary,
-          typename Convolve,
-          typename Store >
-__global__
-static void
-convolution2D( Index kernelWidth,
-               Index kernelHeight,
-               Index endX,
-               Index endY,
-               FetchData fetchData,
-               FetchBoundary fetchBoundary,
-               Convolve convolve,
-               Store store )
-{
-   Real* data = TNL::Cuda::getSharedMemory< Real >();
-
-   const Index iy = threadIdx.y + blockIdx.y * blockDim.y;
-   const Index ix = threadIdx.x + blockIdx.x * blockDim.x;
-
-   const Index radiusY = kernelHeight >> 1;
-   const Index radiusX = kernelWidth >> 1;
-
-   const Index dataBlockWidth = 2 * kernelWidth - 1;
-   const Index dataBlockHeight = 2 * kernelHeight - 1;
-
-   const Index dataBlockRadiusX = dataBlockWidth >> 1;
-   const Index dataBlockRadiusY = dataBlockHeight >> 1;
-
-   Index x, y, index;
-
-   // Top Left
-   x = ix - radiusX;
-   y = iy - radiusY;
-   index = threadIdx.x + threadIdx.y * dataBlockWidth;
-
-   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
-      data[ index ] = fetchBoundary( x, y );
-   }
-   else {
-      data[ index ] = fetchData( x, y );
-   }
-
-   // Top right
-   x = ix + radiusX;
-   y = iy - radiusY;
-   index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth;
-
-   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
-      data[ index ] = fetchBoundary( x, y );
-   }
-   else {
-      data[ index ] = fetchData( x, y );
-   }
-
-   // Bottom Left
-   x = ix - radiusX;
-   y = iy + radiusY;
-   index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth;
-
-   if(x < 0 || y < 0 || x >= endX || y >= endY ) {
-      data[ index ] = fetchBoundary( x, y );
-   }
-   else {
-      data[ index ] = fetchData( x, y );
-   }
-
-   // Bottom Right
-   x = ix + radiusX;
-   y = iy + radiusY;
-   index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth;
-
-   if( x < 0 || y < 0 || x >= endX || y >= endY ) {
-      data[ index ] = fetchBoundary( x, y );
-   }
-   else {
-      data[ index ] = fetchData( x, y );
-   }
-
-   __syncthreads();
-
-   if( ix >= endX || iy >= endY )
-      return;
-
-   Real result = 0;
-
-   for( Index j = 0; j < kernelHeight; j++ ) {
-      Index align = ( j + threadIdx.y ) * dataBlockWidth;
-
-      for( Index i = 0; i < kernelWidth; i++ ) {
-         Index index = i + threadIdx.x + align;
-
-         result = convolve( result, ix, iy, i, j, data[ index ]);
-      }
-   }
-
-   store( ix, iy, result );
-}
-
-
-template<>
-struct Convolution< 2, TNL::Devices::Cuda >
-{
-public:
-   template< typename Index >
-   using Vector = TNL::Containers::StaticVector< 2, Index >;
-
-   template< typename Index, typename Real >
-   static void
-   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
-   {
-      Index kernelElementCount = 1;
-
-      for( Index i = 0; i < kernelSize.getSize(); i++ )
-         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
-
-      configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real );
-
-      configuration.blockSize.x = kernelSize.x();
-      configuration.blockSize.y = kernelSize.y();
-
-      configuration.gridSize.x =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
-      configuration.gridSize.y =
-         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
-   }
-
-   template< typename Index,
-             typename Real,
-             typename FetchData,
-             typename FetchBoundary,
-             typename Convolve,
-             typename Store >
-   static void
-   execute( const Vector< Index >& dimensions,
-            const Vector< Index >& kernelSize,
-            FetchData&& fetchData,
-            FetchBoundary&& fetchBoundary,
-            Convolve&& convolve,
-            Store&& store )
-   {
-      TNL::Cuda::LaunchConfiguration configuration;
-
-      setup< Index, Real >( configuration, dimensions, kernelSize );
-
-      constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, Convolve, Store >;
-
-      TNL::Cuda::launchKernel< true >( kernel,
-                                       0,
-                                       configuration,
-                                       kernelSize.x(),
-                                       kernelSize.y(),
-                                       dimensions.x(),
-                                       dimensions.y(),
-                                       fetchData,
-                                       fetchBoundary,
-                                       convolve,
-                                       store );
-   };
-};
-
-#endif
diff --git a/src/Benchmarks/Convolution/support/HeatEquationSolver.h b/src/Benchmarks/Convolution/support/HeatEquationSolver.h
index 57ecd21a0..eebe89c4b 100644
--- a/src/Benchmarks/Convolution/support/HeatEquationSolver.h
+++ b/src/Benchmarks/Convolution/support/HeatEquationSolver.h
@@ -2,7 +2,7 @@
 #pragma once
 
 #include "Solver.h"
-#include "HeatEquationTask.h"
+#include "DummyTask.h"
 
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Timer.h>
@@ -11,7 +11,11 @@ static std::vector< TNL::String > dimensionIds = { "grid-x-size", "grid-y-size"
 static std::vector< TNL::String > kernelSizeIds = { "kernel-x-size", "kernel-y-size" };
 static std::vector< TNL::String > domainIds = { "domain-x-size", "domain-y-size" };
 static std::vector< TNL::String > kernelDomainIds = { "kernel-domain-x-size", "kernel-domain-y-size" };
-static std::string sigmaKey = "sigma";
+
+static std::string alphaKey = "alpha";
+static std::string betaKey = "beta";
+static std::string gammaKey = "gamma";
+
 static std::string timeStepKey = "timeStep";
 static std::string startTimeKey = "startTime";
 static std::string finalTimeKey = "finalTime";
@@ -116,11 +120,15 @@ public:
       config.addEntry< Real >( domainIds[ 0 ], "Domain size along x-axis.", 4.0 );
       config.addEntry< Real >( domainIds[ 1 ], "Domain size along y-axis.", 4.0 );
 
-      config.addEntry< Real >( kernelDomainIds[ 0 ], "Kernel domain size along x-axis.", 3.0 );
-      config.addEntry< Real >( kernelDomainIds[ 1 ], "Kernel domain size along y-axis.", 3.0 );
+      config.addEntry< Real >( kernelDomainIds[ 0 ], "Kernel domain size along x-axis.", 4.0 );
+      config.addEntry< Real >( kernelDomainIds[ 1 ], "Kernel domain size along y-axis.", 4.0 );
 
-      config.addEntry< Real >( sigmaKey, "Sigma in exponential initial condition.", 0.5);
+      config.addDelimiter( "Initial condition settings ( max( x^2/alpha + y^2/beta + gamma, 0 ) ):" );
+      config.addEntry< Real >( alphaKey, "Alpha value in initial condition", -0.05 );
+      config.addEntry< Real >( betaKey, "Beta value in initial condition", -0.05 );
+      config.addEntry< Real >( gammaKey, "Gamma value in initial condition", 15 );
 
+      config.addDelimiter( "Time settings:" );
-      config.addEntry< Real >( startTimeKey, "Final time of the simulation.", 0.0);
+      config.addEntry< Real >( startTimeKey, "Start time of the simulation.", 0.0);
       config.addEntry< Real >( timeStepKey, "Time step of the simulation.", 0.005);
       config.addEntry< Real >( finalTimeKey, "Final time of the simulation.", 0.36);
@@ -142,7 +150,10 @@ public:
 
       auto xDomainSize = parameters.getParameter< Real >( domainIds[ 0 ] );
       auto yDomainSize = parameters.getParameter< Real >( domainIds[ 1 ] );
-      auto sigma = parameters.getParameter< Real >( sigmaKey );
+
+      auto alpha = parameters.getParameter< Real >( alphaKey );
+      auto beta = parameters.getParameter< Real >( betaKey );
+      auto gamma = parameters.getParameter< Real >( gammaKey );
 
       auto init = [ = ] __cuda_callable__( int i, int j ) mutable
       {
@@ -151,7 +162,7 @@ public:
          auto x = i * spaceSteps.x() - domain.x() / 2.;
          auto y = j * spaceSteps.y() - domain.y() / 2.;
 
-         functionView[ index ] = exp( sigma * ( x * x + y * y ) );
+         functionView[ index ] = TNL::max((x * x / alpha)  + (y * y / beta) + gamma, 0);
       };
 
       TNL::Algorithms::ParallelFor2D< Device >::exec( 0, 0, dimensions.x(), dimensions.y(), init );
@@ -168,7 +179,55 @@ public:
              typename DataStore::ViewType result,
              const Real time ) const
    {
-      HeatEquationTask< int, Real, Dimension, Device >::exec( dimensions, kernelSize, domain, kernelDomain, time, input, result);
+      // Samples the heat kernel exp( -( x^2 + y^2 ) / ( 4 * time ) ) / ( 4 * pi * time ) on a
+      // kernelSize.x() x kernelSize.y() grid spanning kernelDomain (centered at the origin) and
+      // convolves `input` with it. `time` must be positive, otherwise the kernel is undefined.
+      DataStore kernel;
+      kernel.resize( kernelSize.x() * kernelSize.y() );
+
+      auto kernelView = kernel.getView();
+      auto domainSpaceSteps = Point( domain.x() / dimensions.x(), domain.y() / dimensions.y() );
+
+      // A kernel with a single sample along an axis has no spacing along that axis: use a zero step
+      // and a zero offset so that the only sample sits at the kernel center instead of dividing by zero.
+      const bool hasSpanX = kernelSize.x() > 1;
+      const bool hasSpanY = kernelSize.y() > 1;
+
+      auto kernelSpaceSteps = Point( hasSpanX ? kernelDomain.x() / ( kernelSize.x() - 1 ) : (Real) 0,
+                                     hasSpanY ? kernelDomain.y() / ( kernelSize.y() - 1 ) : (Real) 0 );
+      auto kernelOffset = Point( hasSpanX ? kernelDomain.x() / (Real) 2 : (Real) 0,
+                                 hasSpanY ? kernelDomain.y() / (Real) 2 : (Real) 0 );
+
+      auto init = [ = ] __cuda_callable__( int i, int j ) mutable
+      {
+         auto index = j * kernelSize.x() + i;
+
+         auto x = i * kernelSpaceSteps.x() - kernelOffset.x();
+         auto y = j * kernelSpaceSteps.y() - kernelOffset.y();
+
+         // The kernel is weighted by the cell area of the function grid. Because the kernel
+         // is limited to 31x31 samples, the user can specify a custom kernel domain from
+         // which the samples are taken.
+         kernelView[ index ] = domainSpaceSteps.x() * domainSpaceSteps.y() * ( (Real) 1 / ( (Real) 4 * M_PI * time ) )
+                             * exp( -( x * x + y * y ) / ( (Real) 4 * time ) );
+      };
+
+      TNL::Algorithms::ParallelFor2D< Device >::exec( 0, 0, kernelSize.x(), kernelSize.y(), init );
+
+      // Prints the kernel values, one output row per x index.
+      for( int i = 0; i < kernelSize.x(); i++ ) {
+         for( int j = 0; j < kernelSize.y(); j++ ) {
+            auto index = j * kernelSize.x() + i;
+
+            printf( "%lf ", kernelView.getElement( index ) );
+         }
+
+         printf( "\n" );
+      }
+
+      auto kernelConstView = kernel.getConstView();
+
+      DummyTask< int, Real, Dimension, Device >::exec( dimensions, kernelSize, input, result, kernelConstView, 0 );
    }
 
    bool
diff --git a/src/Benchmarks/Convolution/support/HeatEquationTask.h b/src/Benchmarks/Convolution/support/HeatEquationTask.h
deleted file mode 100644
index c4b4f5546..000000000
--- a/src/Benchmarks/Convolution/support/HeatEquationTask.h
+++ /dev/null
@@ -1,94 +0,0 @@
-
-#pragma once
-
-template< int Dimension, typename Device >
-struct Convolution
-{
-   template< typename Index >
-   using Vector = TNL::Containers::StaticVector< Dimension, Index >;
-
-   template< typename Index,
-             typename Real,
-             typename FetchData,
-             typename FetchBoundary,
-             typename Convolve,
-             typename Store >
-   static void
-   execute( const Vector< Index >& dimensions,
-            const Vector< Index >& kernelSize,
-            FetchData&& fetchData,
-            FetchBoundary&& fetchBoundary,
-            Convolve&& convolve,
-            Store&& store );
-};
-
-template< typename Index, typename Real, int Dimension, typename Device >
-struct HeatEquationTask;
-
-template< typename Index, typename Real >
-struct HeatEquationTask< Index, Real, 2, TNL::Devices::Cuda >
-{
-public:
-   static constexpr int Dimension = 2;
-   using Device = TNL::Devices::Cuda;
-   using Vector = TNL::Containers::StaticVector< Dimension, Index >;
-   using Point = TNL::Containers::StaticVector< Dimension, Real >;
-   using ConstDataStore = typename TNL::Containers::Vector< Real, Device, Index >::ConstViewType;
-   using DataStore = typename TNL::Containers::Vector< Real, Device, Index >::ViewType;
-   using ConvolutionLauncher = Convolution< Dimension, Device >;
-
-   static void
-   exec( const Vector& dimensions,
-         const Vector& kernelSize,
-         const Point& functionDomain,
-         const Point& kernelDomain,
-         const Real time,
-         ConstDataStore& input,
-         DataStore& result)
-   {
-      auto functionSpaceSteps = Point(functionDomain.x() / dimensions.x(), functionDomain.y() / dimensions.y());
-      auto kernelSpaceSteps = Point(kernelDomain.x() / kernelSize.x(), kernelDomain.y() / kernelSize.y());
-
-      auto fetchData = [ = ] __cuda_callable__( Index i, Index j )
-      {
-         auto index = i + j * dimensions.x();
-
-         return input[ index ];
-      };
-
-      auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j )
-      {
-         return 0;
-      };
-
-      auto convolve = [ = ] __cuda_callable__( Real result, Index dataX, Index dataY, Index kernelX, Index kernelY, Real data )
-      {
-         auto functionXPos = dataX * functionSpaceSteps.x() - (functionDomain.x() / 2),
-              functionYPos = dataY * functionSpaceSteps.y() - (functionDomain.y() / 2);
-
-         auto kernelXPos = (kernelX - kernelSize.x() / 2) * kernelSpaceSteps.x(),
-              kernelYPos = (kernelY - kernelSize.y() / 2) * kernelSpaceSteps.y();
-
-         auto deltaXPos = kernelXPos - functionXPos,
-              deltaYPos = kernelYPos - functionYPos;
-
-         auto kernel = kernelSpaceSteps.x() * kernelSpaceSteps.y() * ( (Real)1 / ( (Real)4 * M_PI * time ) ) * exp( - ( pow(deltaXPos, 2.) + pow(deltaYPos, 2.)  ) / ( (Real)4 * time ) );
-
-         return result + data * kernel;
-      };
-
-      auto store = [ = ] __cuda_callable__( Index i, Index j, Real resultValue ) mutable
-      {
-         auto index = i + j * dimensions.x();
-
-         result[ index ] = resultValue;
-      };
-
-      ConvolutionLauncher::execute< Index, Real >( dimensions,
-                                                   kernelSize,
-                                                   std::forward< decltype( fetchData ) >( fetchData ),
-                                                   std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
-                                                   std::forward< decltype( convolve ) >( convolve ),
-                                                   std::forward< decltype( store ) >( store ) );
-   }
-};
-- 
GitLab