Commit 5d0dac39 authored by Yury Hayeu's avatar Yury Hayeu
Browse files

Implement shared-data-and-kernel convolution kernel

parent 3571b27e
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -46,3 +46,11 @@ GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/shar
GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedData.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedData.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedData.h")

GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/sharedDataAndKernel.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/sharedDataAndKernel.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/sharedDataAndKernel.h")

GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h")
GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h")
+2 −2
Original line number Diff line number Diff line
@@ -162,7 +162,7 @@ convolution2D( Index kernelWidth,
   Real result = 0;

   for( Index j = 0; j <= radiusY; j++ ) {
      Index align = ( j + threadIdx.y ) * blockDim.y;
      Index align = ( j + threadIdx.y ) * blockDim.x;

      for( Index i = 0; i <= radiusX; i++ ) {
         Index index = i + threadIdx.x + align;
@@ -330,7 +330,7 @@ convolution3D( Index kernelWidth,
      Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x;

      for( Index j = 0; j <= radiusY; j++ ) {
         Index xAlign = ( j + threadIdx.y ) * blockDim.y;
         Index xAlign = ( j + threadIdx.y ) * blockDim.x;

         for( Index i = 0; i <= radiusX; i++ ) {
            Index index = i + threadIdx.x + xAlign + xyAlign;
+577 −0
Original line number Diff line number Diff line
#pragma once

#ifdef HAVE_CUDA

   #include <TNL/Devices/Cuda.h>
   #include <TNL/Containers/StaticVector.h>
   #include <TNL/Cuda/LaunchHelpers.h>
   #include <TNL/Cuda/SharedMemory.h>

/**
 * This method stores kernel and data in the shared memory to reduce amount of loads.
 *
 * We can calculate the size of shared memory needed the next way:
 * 1. We need to store in shared memory:
 *      * for 1D -> (2 * kernelWidth) - 1 < 2 * kernelWidth
 *      * for 2D -> ( (2 * kernelWidth) - 1 ) * ( (2 * kernelHeight) - 1 ) < 4 * kernelWidth * kernelHeight
 *      * for 3D -> ( (2 * kernelWidth) - 1 ) * ( (2 * kernelHeight) - 1 ) * ( (2 * kernelDepth) - 1 ) < 8 * kernelWidth *
 * kernelHeight * kernelDepth
 * 2. We take into account, that the maximal block size is 1024, so the maximum volume of kernel is 1024.
 *    Then the maximal amount of shared memory is:
 *      * for 1D -> 2 * 1024 -> 2048 elements (Note, that even if we take long double (16B) we still can fit in the shared
 * memory)
 *      * for 2D -> 4 * 1024 -> 4096 elements
 *      * for 3D -> 8 * 1024 -> 8192 elements (Note, that if double takes 8 bytes, then we can't fit tile into shared memory,
 * because we have 64 KB of data)
 * 3. The last thing is, that even if we take 1D and 2D case we have enough space to store 1024 kernel element.
 *    Then the maximal amount of shared memory is:
 *      * for 1D -> 3 * 1024 -> can use long double, double, float
 *      * for 2D -> 5 * 1024 -> can use double, float
 *      * for 3D -> 9 * 1024 -> can use float
 */

/**
 * 1D convolution with both the data tile and the kernel staged in shared memory.
 *
 * Launch contract (see Convolution< 1, Cuda >::setup): blockDim.x == kernelWidth
 * and the dynamic shared memory holds 2 * kernelWidth data elements followed by
 * kernelWidth kernel elements.
 */
template< typename Index,
          typename Real,
          typename FetchData,
          typename FetchBoundary,
          typename FetchKernel,
          typename Convolve,
          typename Store >
__global__
static void
convolution1D( Index kernelWidth,
               Index endX,
               FetchData fetchData,
               FetchBoundary fetchBoundary,
               FetchKernel fetchKernel,
               Convolve convolve,
               Store store )
{
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   // Shared memory layout: [ 2 * kernelWidth data | kernelWidth kernel ].
   Index kernelOffset = 2 * kernelWidth;

   Real* data = TNL::Cuda::getSharedMemory< Real >();
   Real* kernel = data + kernelOffset;

   Index radius = kernelWidth >> 1;

   // NOTE: every thread of the block must take part in the shared-memory loads
   // and reach __syncthreads(). The previous version returned early for
   // ix >= endX, which left parts of the tile uninitialized in boundary blocks
   // and made the barrier divergent. Out-of-domain threads are masked out only
   // after the barrier.

   // Left halo / interior element. The lhs >= endX check is needed because
   // out-of-domain threads now participate in the loads.
   Index lhs = ix - radius;

   if( lhs < 0 || lhs >= endX ) {
      data[ threadIdx.x ] = fetchBoundary( lhs );
   }
   else {
      data[ threadIdx.x ] = fetchData( lhs );
   }

   // Right halo / interior element
   Index rhs = ix + radius;

   if( rhs < 0 || rhs >= endX ) {
      data[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs );
   }
   else {
      data[ threadIdx.x + blockDim.x ] = fetchData( rhs );
   }

   // Each thread loads one kernel element (blockDim.x == kernelWidth).
   kernel[ threadIdx.x ] = fetchKernel( threadIdx.x );

   __syncthreads();

   if( ix >= endX )
      return;

   Real result = 0;

   #pragma unroll
   for( Index i = 0; i < kernelWidth; i++ ) {
      Index elementIndex = i + threadIdx.x;

      result = convolve( result, data[ elementIndex ], kernel[ i ] );
   }

   store( ix, result );
}

/**
 * 2D convolution with both the data tile and the kernel staged in shared memory.
 *
 * Launch contract (see Convolution< 2, Cuda >::setup): blockDim.{x,y} equal the
 * kernel width/height; the dynamic shared memory holds the data tile followed
 * by the kernel elements.
 */
template< typename Index,
          typename Real,
          typename FetchData,
          typename FetchBoundary,
          typename FetchKernel,
          typename Convolve,
          typename Store >
__global__
static void
convolution2D( Index kernelWidth,
               Index kernelHeight,
               Index endX,
               Index endY,
               FetchData fetchData,
               FetchBoundary fetchBoundary,
               FetchKernel fetchKernel,
               Convolve convolve,
               Store store )
{
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   // Shared memory layout: data tile first, then the convolution kernel.
   Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 );

   Real* data = TNL::Cuda::getSharedMemory< Real >();
   Real* kernel = data + kernelOffset;

   Index radiusY = kernelHeight >> 1;
   Index radiusX = kernelWidth >> 1;

   Index x, y, index;

   // NOTE: every thread of the block must take part in the shared-memory loads
   // and reach __syncthreads(). The previous version returned early for
   // out-of-domain threads, which left parts of the tile uninitialized in
   // boundary blocks and made the barrier divergent. Each load therefore uses
   // the full range predicate; for in-domain threads the added checks can never
   // trigger, so in-domain behavior is unchanged.

   // Each thread loads one kernel element (blockDim == kernel size).
   index = threadIdx.x + threadIdx.y * blockDim.x;

   kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y );

   // Top Left
   x = ix - radiusX;
   y = iy - radiusY;

   if( x < 0 || x >= endX || y < 0 || y >= endY ) {
      data[ index ] = fetchBoundary( x, y );
   }
   else {
      data[ index ] = fetchData( x, y );
   }

   // Top Right
   x = ix + radiusX;
   y = iy - radiusY;

   index = radiusX + threadIdx.x + threadIdx.y * blockDim.x;

   if( x < 0 || x >= endX || y < 0 || y >= endY ) {
      data[ index ] = fetchBoundary( x, y );
   }
   else {
      data[ index ] = fetchData( x, y );
   }

   // Bottom Left
   x = ix - radiusX;
   y = iy + radiusY;

   index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x;

   if( x < 0 || x >= endX || y < 0 || y >= endY ) {
      data[ index ] = fetchBoundary( x, y );
   }
   else {
      data[ index ] = fetchData( x, y );
   }

   // Bottom Right
   x = ix + radiusX;
   y = iy + radiusY;

   index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x;

   if( x < 0 || x >= endX || y < 0 || y >= endY ) {
      data[ index ] = fetchBoundary( x, y );
   }
   else {
      data[ index ] = fetchData( x, y );
   }

   __syncthreads();

   // Out-of-domain threads are only masked out after the barrier.
   if( ix >= endX || iy >= endY )
      return;

   Real result = 0;

   #pragma unroll
   for( Index j = 0; j <= radiusY; j++ ) {
      Index elementAlign = ( j + threadIdx.y ) * blockDim.x;
      Index kernelAlign = j * blockDim.x;

   #pragma unroll
      for( Index i = 0; i <= radiusX; i++ ) {
         Index elementIndex = i + threadIdx.x + elementAlign;
         Index kernelIndex = i + kernelAlign;

         result = convolve( result, data[ elementIndex ], kernel[ kernelIndex ] );
      }
   }

   store( ix, iy, result );
}

/**
 * 3D convolution with both the data tile and the kernel staged in shared memory.
 *
 * Launch contract (see Convolution< 3, Cuda >::setup): blockDim.{x,y,z} equal
 * the kernel width/height/depth; the dynamic shared memory holds the data tile
 * followed by the kernel elements.
 *
 * Fixes relative to the previous version:
 *  - the compute loop read data[ index ] (a stale variable from the load
 *    phase) instead of data[ elementIndex ];
 *  - the halo loads used blockDim.y as the row stride while the compute loop
 *    used blockDim.x (wrong for non-cubic blocks), and the "Y:1 X:0" octant
 *    carried a stray radiusX offset;
 *  - out-of-domain threads returned before the shared loads and
 *    __syncthreads(), leaving the tile partially uninitialized in boundary
 *    blocks and making the barrier divergent.
 */
template< typename Index,
          typename Real,
          typename FetchData,
          typename FetchBoundary,
          typename FetchKernel,
          typename Convolve,
          typename Store >
__global__
static void
convolution3D( Index kernelWidth,
               Index kernelHeight,
               Index kernelDepth,
               Index endX,
               Index endY,
               Index endZ,
               FetchData fetchData,
               FetchBoundary fetchBoundary,
               FetchKernel fetchKernel,
               Convolve convolve,
               Store store )
{
   Index iz = threadIdx.z + blockIdx.z * blockDim.z;
   Index iy = threadIdx.y + blockIdx.y * blockDim.y;
   Index ix = threadIdx.x + blockIdx.x * blockDim.x;

   // Shared memory layout: data tile first, then the convolution kernel.
   Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ) * ( 2 * kernelDepth - 1 );

   Real* data = TNL::Cuda::getSharedMemory< Real >();
   Real* kernel = data + kernelOffset;

   Index radiusZ = kernelDepth >> 1;
   Index radiusY = kernelHeight >> 1;
   Index radiusX = kernelWidth >> 1;

   // Row/plane strides of the shared tile — must match the compute loop below.
   Index rowStride = blockDim.x;
   Index planeStride = blockDim.x * blockDim.y;

   // Each thread loads one kernel element (blockDim == kernel size).
   Index kernelLoadIndex = threadIdx.x + threadIdx.y * rowStride + threadIdx.z * planeStride;

   kernel[ kernelLoadIndex ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z );

   // Load the eight halo octants of the shared data tile. Every thread of the
   // block participates so the tile is fully initialized before the barrier.
   for( Index oz = 0; oz < 2; oz++ ) {
      Index z = oz == 0 ? iz - radiusZ : iz + radiusZ;
      Index zAlign = ( oz * radiusZ + threadIdx.z ) * planeStride;

      for( Index oy = 0; oy < 2; oy++ ) {
         Index y = oy == 0 ? iy - radiusY : iy + radiusY;
         Index yAlign = ( oy * radiusY + threadIdx.y ) * rowStride;

         for( Index ox = 0; ox < 2; ox++ ) {
            Index x = ox == 0 ? ix - radiusX : ix + radiusX;
            Index index = ox * radiusX + threadIdx.x + yAlign + zAlign;

            if( x < 0 || x >= endX || y < 0 || y >= endY || z < 0 || z >= endZ ) {
               data[ index ] = fetchBoundary( x, y, z );
            }
            else {
               data[ index ] = fetchData( x, y, z );
            }
         }
      }
   }

   __syncthreads();

   // Out-of-domain threads are only masked out after the barrier.
   if( ix >= endX || iy >= endY || iz >= endZ )
      return;

   Real result = 0;

   for( Index k = 0; k <= radiusZ; k++ ) {
      Index xyAlign = ( k + threadIdx.z ) * planeStride;
      Index xyKernelAlign = k * planeStride;

      for( Index j = 0; j <= radiusY; j++ ) {
         Index xAlign = ( j + threadIdx.y ) * rowStride;
         Index xKernelAlign = j * rowStride;

         for( Index i = 0; i <= radiusX; i++ ) {
            Index elementIndex = i + threadIdx.x + xAlign + xyAlign;
            Index kernelIndex = i + xKernelAlign + xyKernelAlign;

            // Was data[ index ] — a leftover of the load phase.
            result = convolve( result, data[ elementIndex ], kernel[ kernelIndex ] );
         }
      }
   }

   store( ix, iy, iz, result );
}

template< int Dimension, typename Device >
struct Convolution;

template<>
struct Convolution< 1, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 1, Index >;

   /**
    * Configures block/grid sizes and dynamic shared memory for convolution1D.
    *
    * The kernel addresses 2 * kernelWidth shared data elements (its
    * kernelOffset) plus kernelWidth kernel elements, i.e. 3 * kernelWidth
    * elements in total. The previous formula
    * kernelWidth + ( 2 * kernelWidth - 1 ) was one element short, causing an
    * out-of-bounds shared-memory access.
    */
   template< typename Index, typename Real >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
      configuration.dynamicSharedMemorySize = 3 * kernelSize.x() * sizeof( Real );

      configuration.blockSize.x = kernelSize.x();
      configuration.gridSize.x =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
   }

   /**
    * Launches convolution1D over the whole 1D domain.
    */
   template< typename Index,
             typename Real,
             typename FetchData,
             typename FetchBoundary,
             typename FetchKernel,
             typename Convolve,
             typename Store >
   static void
   execute( const Vector< Index >& dimensions,
            const Vector< Index >& kernelSize,
            FetchData&& fetchData,
            FetchBoundary&& fetchBoundary,
            FetchKernel&& fetchKernel,
            Convolve&& convolve,
            Store&& store )
   {
      TNL::Cuda::LaunchConfiguration configuration;

      setup< Index, Real >( configuration, dimensions, kernelSize );

      constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;

      TNL::Cuda::launchKernel< true >(
         kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store );
   }
};

template<>
struct Convolution< 2, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 2, Index >;

   /**
    * Configures block/grid sizes and dynamic shared memory for convolution2D.
    *
    * Shared memory holds the data tile of ( 2w - 1 ) * ( 2h - 1 ) elements
    * followed by the w * h kernel elements; the block matches the kernel size.
    */
   template< typename Index, typename Real >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
      Index tileElements = 1;
      Index kernelElements = 1;

      for( Index i = 0; i < kernelSize.getSize(); i++ ) {
         tileElements *= 2 * kernelSize[ i ] - 1;
         kernelElements *= kernelSize[ i ];
      }

      configuration.dynamicSharedMemorySize = ( kernelElements + tileElements ) * sizeof( Real );

      configuration.blockSize.x = kernelSize.x();
      configuration.blockSize.y = kernelSize.y();

      configuration.gridSize.x =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
      configuration.gridSize.y =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
   }

   /**
    * Launches convolution2D over the whole 2D domain.
    */
   template< typename Index,
             typename Real,
             typename FetchData,
             typename FetchBoundary,
             typename FetchKernel,
             typename Convolve,
             typename Store >
   static void
   execute( const Vector< Index >& dimensions,
            const Vector< Index >& kernelSize,
            FetchData&& fetchData,
            FetchBoundary&& fetchBoundary,
            FetchKernel&& fetchKernel,
            Convolve&& convolve,
            Store&& store )
   {
      TNL::Cuda::LaunchConfiguration configuration;

      setup< Index, Real >( configuration, dimensions, kernelSize );

      constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;

      TNL::Cuda::launchKernel< true >( kernel,
                                       0,
                                       configuration,
                                       kernelSize.x(),
                                       kernelSize.y(),
                                       dimensions.x(),
                                       dimensions.y(),
                                       fetchData,
                                       fetchBoundary,
                                       fetchKernel,
                                       convolve,
                                       store );
   }
};

template<>
struct Convolution< 3, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   using Vector = TNL::Containers::StaticVector< 3, Index >;

   /**
    * Configures block/grid sizes and dynamic shared memory for convolution3D.
    *
    * Fix: the z grid size was assigned to gridSize.y, which both clobbered the
    * y dimension and left gridSize.z unset.
    */
   template< typename Index, typename Real >
   static void
   setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize )
   {
      Index kernelElementCount = 1;
      Index kernelVolume = 1;

      for( Index i = 0; i < kernelSize.getSize(); i++ ) {
         kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1;
         kernelVolume *= kernelSize[ i ];
      }

      // Data tile of (2w-1)(2h-1)(2d-1) elements followed by the kernel.
      configuration.dynamicSharedMemorySize = ( kernelVolume + kernelElementCount ) * sizeof( Real );

      configuration.blockSize.x = kernelSize.x();
      configuration.blockSize.y = kernelSize.y();
      configuration.blockSize.z = kernelSize.z();

      configuration.gridSize.x =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) );
      configuration.gridSize.y =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) );
      configuration.gridSize.z =
         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) );
   }

   /**
    * Launches convolution3D over the whole 3D domain.
    */
   template< typename Index,
             typename Real,
             typename FetchData,
             typename FetchBoundary,
             typename FetchKernel,
             typename Convolve,
             typename Store >
   static void
   execute( const Vector< Index >& dimensions,
            const Vector< Index >& kernelSize,
            FetchData&& fetchData,
            FetchBoundary&& fetchBoundary,
            FetchKernel&& fetchKernel,
            Convolve&& convolve,
            Store&& store )
   {
      TNL::Cuda::LaunchConfiguration configuration;

      setup< Index, Real >( configuration, dimensions, kernelSize );

      constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;

      TNL::Cuda::launchKernel< true >( kernel,
                                       0,
                                       configuration,
                                       kernelSize.x(),
                                       kernelSize.y(),
                                       kernelSize.z(),
                                       dimensions.x(),
                                       dimensions.y(),
                                       dimensions.z(),
                                       fetchData,
                                       fetchBoundary,
                                       fetchKernel,
                                       convolve,
                                       store );
   }
};

#endif
+2 −11
Original line number Diff line number Diff line
@@ -19,12 +19,12 @@ public:
   void
   run( const TNL::Config::ParameterContainer& parameters ) const
   {
      if( ! TNL::Devices::Host::setup( parameters ) || ! TNL::Devices::Cuda::setup( parameters ) )
      if( ! TNL::Devices::Cuda::setup( parameters ) )
         return;

      const TNL::String logFileName = parameters.getParameter< TNL::String >( "log-file" );
      const TNL::String outputMode = parameters.getParameter< TNL::String >( "output-mode" );
      const TNL::String device = parameters.getParameter< TNL::String >( "device" );
   

      const int verbose = parameters.getParameter< int >( "verbose" );
      const int loops = parameters.getParameter< int >( "loops" );
@@ -58,19 +58,10 @@ public:
      config.addEntryEnum( "append" );
      config.addEntryEnum( "overwrite" );

      config.addEntry< TNL::String >( "device", "Device the computation will run on.", "cuda" );
      config.addEntryEnum< TNL::String >( "all" );
      config.addEntryEnum< TNL::String >( "host" );

#ifdef HAVE_CUDA
      config.addEntryEnum< TNL::String >( "cuda" );
#endif

      config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
      config.addEntry< int >( "verbose", "Verbose mode.", 1 );

      config.addDelimiter( "Device settings:" );
      TNL::Devices::Host::configSetup( config );

#ifdef HAVE_CUDA
      TNL::Devices::Cuda::configSetup( config );
+20 −50
Original line number Diff line number Diff line
@@ -23,56 +23,42 @@ public:
   virtual void
   start( TNLBenchmark& benchmark, const TNL::Config::ParameterContainer& parameters ) const override
   {
      Vector start;
      Vector end;
      Vector dimension;
      Vector minKernelSize;
      Vector maxKernelSize;

      for( int i = 0; i < Dimension; i++ ) {
         start[ i ] = parameters.getParameter< int >( minDimensionIds[ i ] );
         end[ i ] = parameters.getParameter< int >( maxDimensionIds[ i ] );
         dimension[ i ] = parameters.getParameter< int >( dimensionIds[ i ] );
         minKernelSize[ i ] = parameters.getParameter< int >( minKernelSizeIds[ i ] );
         maxKernelSize[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] );

         TNL_ASSERT_GT( start[ i ], 1, "Start dimension must be positive integer" );
         TNL_ASSERT_GT( end[ i ], start[ i ], "End dimension must be greater than start dimension" );

         TNL_ASSERT_GE( minKernelSize[ i ], 1, "Minimal kernel size must be a positive number" );
         TNL_ASSERT_EQ( minKernelSize[ i ] % 2, 1, "Minimal kernel size must be odd" );
         TNL_ASSERT_GT( maxKernelSize[ i ], minKernelSize[ i ], "End dimension must be greater than start dimension" );
         TNL_ASSERT_GT( end[ i ], start[ i ], "End kernel size must be greater than start kernel size" );
         TNL_ASSERT_GT( maxKernelSize[ i ], minKernelSize[ i ], "End kernel size must be greater than start kernel size" );
      }

      int dimensionStep = parameters.getParameter< int >( "dimension-step" );
      int kernelStep = parameters.getParameter< int >( "kernel-step" );

      TNL_ASSERT_GT( dimensionStep, 1, "Dimension step must be a positive number" );
      TNL_ASSERT_GT( kernelStep, 0, "Kernel step must be a positive number" );
      TNL_ASSERT_EQ( kernelStep % 2, 0, "Kernel step must be even" );

      TNL::String id = parameters.getParameter< TNL::String >( "id" );

      time( id, benchmark, start, end, dimensionStep, minKernelSize, maxKernelSize, kernelStep );
      time( id, benchmark, dimension, minKernelSize, maxKernelSize, kernelStep );
   }

   virtual void
   time( const TNL::String& id,
         TNLBenchmark& benchmark,
         const Vector& minDimension,
         const Vector& maxDimension,
         const int dimensionStep,
         const Vector& dimension,
         const Vector& minKernelSize,
         const Vector& maxKernelSize,
         const int kernelStep ) const
   {
      Vector currentDimension = minDimension;
      Vector currentKernelSize;

      do {
         currentKernelSize = minKernelSize;
      Vector currentKernelSize = minKernelSize;

      do {
            timeConvolution( id, benchmark, currentDimension, currentKernelSize );
         timeConvolution( id, benchmark, dimension, currentKernelSize );

         currentKernelSize[ 0 ] += kernelStep;

@@ -83,17 +69,6 @@ public:
            }
         }
      } while( currentKernelSize < maxKernelSize );

         currentDimension[ 0 ] *= dimensionStep;

         for( size_t i = 0; i < currentDimension.getSize() - 1; i++ ) {
            if( currentDimension[ i ] >= maxDimension[ i ] ) {
               currentDimension[ i ] = minDimension[ i ];
               currentDimension[ i + 1 ] *= dimensionStep;
            }
         }

      } while( currentDimension < maxDimension );
   }

   void
@@ -148,12 +123,7 @@ public:
      config.addDelimiter( "Grid dimension settings:" );

      for( int i = 0; i < Dimension; i++ )
         config.addEntry< int >( minDimensionIds[ i ], minDimensionIds[ i ], 16 );

      for( int i = 0; i < Dimension; i++ )
         config.addEntry< int >( maxDimensionIds[ i ], maxDimensionIds[ i ], 128 );

      config.addEntry< int >( "dimension-step", "Step of kernel increase by which dimension is multiplied (must be even)", 2 );
         config.addEntry< int >( dimensionIds[ i ], dimensionIds[ i ], 16 );

      config.addDelimiter( "Kernel settings:" );

Loading