Commit 77b4e081 authored by Yury Hayeu's avatar Yury Hayeu
Browse files

Implement naive convolution for 1D kernel

parent 4403b2dc
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
generated
+25 −0
Original line number Diff line number Diff line

function(generate_cuda_executable PREFIX DIMENSION TEMPLATE KERNEL_HEADER)

get_filename_component(MODULE_NAME ${KERNEL_HEADER} NAME_WE)
get_filename_component(TEMPLATE_NAME ${TEMPLATE} NAME_WE)

if (${BUILD_CUDA})
   SET(SOURCE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/generated/${MODULE_NAME}_${DIMENSION}_${TEMPLATE_NAME}.cu")

   FILE(READ ${TEMPLATE} TEMPLATE_CONTENT)

   STRING(REGEX REPLACE "DIMENSION_VALUE" ${DIMENSION} TEMPLATE_CONTENT "${TEMPLATE_CONTENT}")

   FILE(WRITE ${SOURCE_FILE} "${TEMPLATE_CONTENT}")

   SET(EXECUTABLE_NAME "${PREFIX}_${DIMENSION}_${MODULE_NAME}")

   CUDA_ADD_EXECUTABLE(${EXECUTABLE_NAME} ${SOURCE_FILE})
else()
   MESSAGE(WARNING "Convolutions are not supported on CPU")
endif()

endfunction()

GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/naive.h")
+164 −0
Original line number Diff line number Diff line

#ifdef HAVE_CUDA

#include <TNL/Devices/Cuda.h>
#include <TNL/Cuda/LaunchHelpers.h>

template< int Dimension, typename Device >
struct Convolution;

template<>
struct Convolution< 1, TNL::Devices::Cuda >
{
public:
   template< typename Index >
   static size_t
   getDynamicSharedMemorySize( Index kernelWidth, Index endX )
   {
      return 0;
   }
};

template< typename Index,
          typename Real,
          typename FetchData,
          typename FetchBoundary,
          typename FetchKernel,
          typename Convolve,
          typename Store >
__global__
static void
convolution1D( Index kernelWidth,
               Index endX,
               FetchData fetchData,
               FetchBoundary fetchBoundary,
               FetchKernel fetchKernel,
               Convolve convolve,
               Store store )
{
   Index ix =  threadIdx.x + blockIdx.x * blockDim.x;
   Index radius = kernelWidth >> 1;

   Real result = 0;

   for( Index i = -radius; i <= radius; i++ ) {
      Index elementIndex = i + ix;
      Index kernelIndex = i + radius;

      if( elementIndex < 0 || elementIndex >= endX ) {
         result = convolve( result, fetchBoundary( elementIndex ), fetchKernel( kernelIndex ) );
      }
      else {
         result = convolve( result, fetchData( elementIndex ), fetchKernel( kernelIndex ) );
      }
   }

   store( ix, result );
}

// template<>
// struct Convolution< 2, TNL::Devices::Cuda >
// {
// public:
//    template< typename Index >
//    static size_t
//    getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index endX, Index endY )
//    {
//       return 0;
//    }
// };

// template< typename Index,
//           typename Real,
//           typename FetchData,
//           typename FetchBoundary,
//           typename FetchKernel,
//           typename Convolve,
//           typename Store >
// __global__
// static void
// convolution2D( Index kernelWidth,
//                Index kernelHeight,
//                Index endX,
//                Index endY,
//                FetchData& fetchData,
//                FetchBoundary& fetchBoundary,
//                FetchKernel& fetchKernel,
//                Convolve& convolve,
//                Store& store )
// {
//    int iy = threadIdx.y + blockIdx.y * blockDim.y;
//    int ix = threadIdx.x + blockIdx.x * blockDim.x;

//    Real result = 0;

//    for( Index j = iy - kernelHeight; j <= iy + kernelHeight; j++ ) {
//       for( Index i = ix - kernelWidth; i <= ix + kernelWidth; i++ ) {
//          if( i < 0 || i >= endX || j < 0 || j >= endY ) {
//             result = convolve( result, fetchBoundary( i, j ) );
//          }
//          else {
//             result = convolve( result, fetchData( i, j ), fetchKernel( i, j ) );
//          }
//       }
//    }

//    store( ix, iy, result );
// }

// template<>
// struct Convolution< 3, TNL::Devices::Cuda >
// {
// public:
//    template< typename Index >
//    static size_t
//    getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ )
//    {
//       return 0;
//    }
// };

// template< typename Index,
//           typename Real,
//           typename FetchData,
//           typename FetchBoundary,
//           typename FetchKernel,
//           typename Convolve,
//           typename Store >
// __global__
// static void
// convolution3D( Index kernelWidth,
//                Index kernelHeight,
//                Index kernelDepth,
//                Index endX,
//                Index endY,
//                Index endZ,
//                FetchData& fetchData,
//                FetchBoundary& fetchBoundary,
//                FetchKernel& fetchKernel,
//                Convolve& convolve,
//                Store& store )
// {
//    int ix = threadIdx.x + blockIdx.x * blockDim.x;
//    int iy = threadIdx.y + blockIdx.y * blockDim.y;
//    int iz = threadIdx.z + blockIdx.z * blockDim.z;

//    Real result = 0;

//    for( Index k = iz - kernelDepth; k <= iz + kernelDepth; k++ ) {
//       for( Index j = iy - kernelHeight; j <= iy + kernelHeight; j++ ) {
//          for( Index i = ix - kernelWidth; i <= ix + kernelWidth; i++ ) {
//             if( i < 0 || i >= endX || j < 0 || j >= endY || k < 0 || k >= endZ ) {
//                result = convolve( result, fetchBoundary( i, j, k ) );
//             }
//             else {
//                result = convolve( result, fetchData( i, j, k ), fetchKernel( i, j, k ) );
//             }
//          }
//       }
//    }

//    store( ix, iy, iz, result );
// }

#endif
+81 −0
Original line number Diff line number Diff line

#pragma once

#include <TNL/Config/parseCommandLine.h>

#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>

#include <TNL/Benchmarks/Benchmarks.h>
#include <TNL/Containers/StaticVector.h>
#include <TNL/Containers/Array.h>

template< int Dimension, typename Device >
class Benchmark
{
public:
   using Benchmark = typename TNL::Benchmarks::Benchmark<>;

   void
   runBenchmark( const TNL::Config::ParameterContainer& parameters ) const
   {
      if( ! TNL::Devices::Host::setup( parameters ) || ! TNL::Devices::Cuda::setup( parameters ) )
         return;

      const TNL::String logFileName = parameters.getParameter< TNL::String >( "log-file" );
      const TNL::String outputMode = parameters.getParameter< TNL::String >( "output-mode" );
      const TNL::String device = parameters.getParameter< TNL::String >( "device" );

      const int verbose = parameters.getParameter< int >( "verbose" );
      const int loops = parameters.getParameter< int >( "loops" );

      auto mode = std::ios::out;

      if( outputMode == "append" )
         mode |= std::ios::app;

      std::ofstream logFile( logFileName.getString(), mode );

      Benchmark benchmark( logFile, loops, verbose );

      std::map< std::string, std::string > metadata = TNL::Benchmarks::getHardwareMetadata();
      TNL::Benchmarks::writeMapAsJson( metadata, logFileName, ".metadata.json" );

      start(benchmark, parameters);
   }

   virtual void start(const Benchmark& benchmark, const TNL::Config::ParameterContainer& parameters) const {
      TNL_ASSERT_TRUE(false, << "Should be overriden");
   }

   virtual TNL::Config::ConfigDescription makeInputConfig() const {
      TNL::Config::ConfigDescription config;

      config.addDelimiter( "Benchmark settings:" );
      config.addEntry< TNL::String >( "id", "Identifier of the run", "unknown" );
      config.addEntry< TNL::String >( "log-file", "Log file name.", "output.log" );
      config.addEntry< TNL::String >( "output-mode", "Mode for opening the log file.", "overwrite" );
      config.addEntryEnum( "append" );
      config.addEntryEnum( "overwrite" );

      config.addEntry< TNL::String >( "device", "Device the computation will run on.", "cuda" );
      config.addEntryEnum< TNL::String >( "all" );
      config.addEntryEnum< TNL::String >( "host" );

#ifdef HAVE_CUDA
      config.addEntryEnum< TNL::String >( "cuda" );
#endif

      config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
      config.addEntry< int >( "verbose", "Verbose mode.", 1 );


      config.addDelimiter( "Device settings:" );
      TNL::Devices::Host::configSetup( config );

#ifdef HAVE_CUDA
      TNL::Devices::Cuda::configSetup( config );
#endif
      return config;
   }
};
+165 −0
Original line number Diff line number Diff line

#pragma once

#include "Benchmark.h"
#include "DummyTask.h"

static std::vector< TNL::String > minDimensionIds = { "min-x-dimension", "min-y-dimension", "min-z-dimension" };
static std::vector< TNL::String > dimensionIds = { "x-dimension", "y-dimension", "z-dimension" };
static std::vector< TNL::String > maxDimensionIds = { "max-x-dimension", "max-y-dimension", "max-z-dimension" };
static std::vector< TNL::String > minKernelSizeIds = { "min-kernel-width", "min-kernel-height", "min-kernel-depth" };
static std::vector< TNL::String > kernelSizeIds = { "x-kernelSize", "y-kernelSize", "z-kernelSize" };
static std::vector< TNL::String > maxKernelSizeIds = { "max-kernel-width", "max-kernel-height", "max-kernel-depth" };

template< int Dimension, typename Device >
class DummyBenchmark : public Benchmark< Dimension, Device >
{
public:
   using Vector = TNL::Containers::StaticVector< Dimension, int >;
   using DataStore = TNL::Containers::Array< int, Device, float >;
   using Benchmark = Base::Benchmark;
   using Base = Benchmark< Dimension, Device >;

   virtual void
   start( const Benchmark& benchmark, const TNL::Config::ParameterContainer& parameters ) const override
   {
      Vector start;
      Vector end;
      Vector minKernelSize;
      Vector maxKernelSize;

      for( int i = 0; i < Dimension; i++ ) {
         start[ i ] = parameters.getParameter< int >( minDimensionIds[ i ] );
         end[ i ] = parameters.getParameter< int >( maxDimensionIds[ i ] );
         minKernelSize[ i ] = parameters.getParameter< int >( minKernelSizeIds[ i ] );
         maxKernelSizeIds[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] );

         TNL_ASSERT_GT( start[ i ], 1, "Start dimension must be positive integer" );
         TNL_ASSERT_GT( end[ i ], start[ i ], "End dimension must be greater than start dimension" );

         TNL_ASSERT_GE( minKernelSize[ i ], 1, "Minimal kernel size must be a positive number" );
         TNL_ASSERT_EQ( minKernelSize[ i ] % 2, 1, "Minimal kernel size must be odd" );
         TNL_ASSERT_GT( end[ i ], start[ i ], "End kernel size must be greater than start kernel size" );
      }

      int dimensionStep = parameters.getParameter< int >( "dimension-step" );
      int kernelStep = parameters.getParameter< int >( "kernel-step" );

      TNL_ASSERT_GT( dimensionStep, 1, "Dimension step must be a positive number" );
      TNL_ASSERT_GT( kernelStep, 0, "Kernel step must be a positive number" );
      TNL_ASSERT_EQ( kernelStep % 2, 0, "Kernel step must be even" );

      time( benchmark, start, end, dimensionStep, minKernelSize, maxKernelSize, kernelStep );
   }

   virtual void
   time( Benchmark& bencmark,
         const Vector& minDimension,
         const Vector& maxDimension,
         const int dimensionStep,
         const Vector& minKernelSize,
         const Vector& maxKernelSize,
         const int kernelStep ) const
   {
      Vector currentDimension = minDimension;
      Vector currentKernelSize;

      do {
         currentKernelSize = minKernelSize;

         do {
            time( benchmark, currentDimension, currentKernelSize );

            currentKernelSize[ 0 ] += kernelStep;

            for( size_t i = 0; i < currentKernelSize.getSize() - 1; i++ ) {
               if( currentKernelSize[ i ] >= maxKernelSize[ i ] ) {
                  currentKernelSize[ i ] = minKernelSize[ i ];
                  maxKernelSize[ i + 1 ] += kernelStep;
               }
            }
         } while( currentKernelSize < maxKernelSize );

         currentDimension[ 0 ] *= dimensionStep;

         for( size_t i = 0; i < currentDimension.getSize() - 1; i++ ) {
            if( currentDimension[ i ] >= maxDimension[ i ] ) {
               currentDimension[ i ] = minDimension[ i ];
               maxDimension[ i ] = maxDimension[ i ];
            }
         }

      } while( currentDimension < maxDimension );
   }

   void
   timeConvolution( Benchmark& benchmark, const Vector& dimension, const Vector& kernelSize ) const
   {
      auto device = TNL::getType< Device >();

      Benchmark::MetadataColumns columns = {};

      size_t elementsCount = 1;
      size_t kernelElementsCount = 1;

      for( size_t i = 0; i < dimension.getSize(); i++ ) {
         elementsCount *= dimension[ i ];
         kernelElementsCount *= kernelSize[ i ];

         columns.insert( { dimensionIds[ i ], dimension[ i ] } );
         columns.insert( { kernelSizeIds[ i ], kernelSize[ i ] } );
      }

      benchmark.setDatasetSize( ( elementsCount * 4 ) / 1.e9, 1.0 );

      // Setup input data
      DataStore input, result, kernel;

      input.resize( elementsCount );
      result.resize( elementsCount );
      kernel.resize( kernelSize );

      input = 1;
      result = 1;
      kernel = 1;

      auto inputView = input.getView();
      auto resultView = result.getView();
      auto kernelView = kernel.getView();

      auto measure = [ & ]()
      {
         DummyTask<Dimension, Device>::exec(dimension, kernelSize, inputView, resultView, kernelView);
      };

      benchmark.time< Device >( device, measure );
   }

   TNL::Config::ConfigDescription
   makeInputConfig() const override
   {
      auto config = Base::makeInputConfig();

      config.addDelimiter( "Grid dimension settings:" );

      for( int i = 0; i < Dimension; i++ )
         config.addEntry< int >( minDimensionIds[ i ], minDimensionIds[ i ], 512 );

      for( int i = 0; i < Dimension; i++ )
         config.addEntry< int >( maxDimensionIds[ i ], maxDimensionIds[ i ], 512 );

      config.addEntry< int >( "dimension-step", "Step of kernel increase by which dimension is multiplied (must be even)", 2 );

      config.addDelimiter( "Kernel settings:" );

      for( int i = 0; i < Dimension; i++ )
         config.addEntry< int >( minKernelSizeIds[ i ], minKernelSizeIds[ i ] + " (odd) :", 1 );

      for( int i = 0; i < Dimension; i++ )
         config.addEntry< int >( minKernelSizeIds[ i ], minKernelSizeIds[ i ] + " (odd) :", 11 );

      config.addEntry< int >( "kernel-step", "Step of kernel increase which is added to kernel (must be even)", 2 );

      return config;
   }
};
Loading