Loading src/Benchmarks/Convolution/.gitignore 0 → 100644 +1 −0 Original line number Diff line number Diff line generated src/Benchmarks/Convolution/CMakeLists.txt 0 → 100644 +25 −0 Original line number Diff line number Diff line function(generate_cuda_executable PREFIX DIMENSION TEMPLATE KERNEL_HEADER) get_filename_component(MODULE_NAME ${KERNEL_HEADER} NAME_WE) get_filename_component(TEMPLATE_NAME ${TEMPLATE} NAME_WE) if (${BUILD_CUDA}) SET(SOURCE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/generated/${MODULE_NAME}_${DIMENSION}_${TEMPLATE_NAME}.cu") FILE(READ ${TEMPLATE} TEMPLATE_CONTENT) STRING(REGEX REPLACE "DIMENSION_VALUE" ${DIMENSION} TEMPLATE_CONTENT "${TEMPLATE_CONTENT}") FILE(WRITE ${SOURCE_FILE} "${TEMPLATE_CONTENT}") SET(EXECUTABLE_NAME "${PREFIX}_${DIMENSION}_${MODULE_NAME}") CUDA_ADD_EXECUTABLE(${EXECUTABLE_NAME} ${SOURCE_FILE}) else() MESSAGE(WARNING "Convolutions are not supported on CPU") endif() endfunction() GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/naive.h") src/Benchmarks/Convolution/kernels/naive.h 0 → 100644 +164 −0 Original line number Diff line number Diff line #ifdef HAVE_CUDA #include <TNL/Devices/Cuda.h> #include <TNL/Cuda/LaunchHelpers.h> template< int Dimension, typename Device > struct Convolution; template<> struct Convolution< 1, TNL::Devices::Cuda > { public: template< typename Index > static size_t getDynamicSharedMemorySize( Index kernelWidth, Index endX ) { return 0; } }; template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > __global__ static void convolution1D( Index kernelWidth, Index endX, FetchData fetchData, FetchBoundary fetchBoundary, FetchKernel fetchKernel, Convolve convolve, Store store ) { Index ix = threadIdx.x + blockIdx.x * blockDim.x; Index radius = kernelWidth >> 1; Real result = 0; for( Index i = -radius; i <= radius; i++ ) { Index elementIndex = i + ix; Index kernelIndex = i + radius; if( elementIndex < 0 || elementIndex >= endX ) { result = convolve( result, fetchBoundary( elementIndex ), fetchKernel( kernelIndex ) ); } else { result = convolve( result, fetchData( elementIndex ), fetchKernel( kernelIndex ) ); } } store( ix, result ); } // template<> // struct Convolution< 2, TNL::Devices::Cuda > // { // public: // template< typename Index > // static size_t // getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index endX, Index endY ) // { // return 0; // } // }; // template< typename Index, // typename Real, // typename FetchData, // typename FetchBoundary, // typename FetchKernel, // typename Convolve, // typename Store > // __global__ // static void // convolution2D( Index kernelWidth, // Index kernelHeight, // Index endX, // Index endY, // FetchData& fetchData, // FetchBoundary& fetchBoundary, // FetchKernel& fetchKernel, // Convolve& convolve, // Store& store ) // { // int iy = threadIdx.y + blockIdx.y * blockDim.y; // int ix = threadIdx.x + blockIdx.x * blockDim.x; // Real result = 0; // for( Index j = iy - kernelHeight; j <= iy + kernelHeight; j++ ) { // for( Index i = ix - kernelWidth; i <= ix + kernelWidth; i++ ) { // if( i < 0 || i >= endX || j < 0 || j >= endY ) { // result = convolve( result, fetchBoundary( i, j ) ); // } // else { // result = convolve( result, fetchData( i, j ), fetchKernel( i, j ) ); // } // } // } // store( ix, iy, result ); // } // template<> // struct Convolution< 3, TNL::Devices::Cuda > // { // public: // template< typename Index > // static size_t // getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ ) // { // return 0; // } // }; // template< typename Index, // typename Real, // typename FetchData, // typename FetchBoundary, // typename FetchKernel, // typename Convolve, // typename Store > // __global__ // static void // convolution3D( Index kernelWidth, // Index kernelHeight, // Index kernelDepth, // Index endX, // Index endY, // Index endZ, // FetchData& fetchData, // FetchBoundary& fetchBoundary, // FetchKernel& fetchKernel, // Convolve& convolve, // Store& store ) // { // int ix = threadIdx.x + blockIdx.x * blockDim.x; // int iy = threadIdx.y + blockIdx.y * blockDim.y; // int iz = threadIdx.z + blockIdx.z * blockDim.z; // Real result = 0; // for( Index k = iz - kernelDepth; k <= iz + kernelDepth; k++ ) { // for( Index j = iy - kernelHeight; j <= iy + kernelHeight; j++ ) { // for( Index i = ix - kernelWidth; i <= ix + kernelWidth; i++ ) { // if( i < 0 || i >= endX || j < 0 || j >= endY || k < 0 || k >= endZ ) { // result = convolve( result, fetchBoundary( i, j, k ) ); // } // else { // result = convolve( result, fetchData( i, j, k ), fetchKernel( i, j, k ) ); // } // } // } // } // store( ix, iy, iz, result ); // } #endif src/Benchmarks/Convolution/support/Benchmark.h 0 → 100644 +81 −0 Original line number Diff line number Diff line #pragma once #include <TNL/Config/parseCommandLine.h> #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> #include <TNL/Benchmarks/Benchmarks.h> #include <TNL/Containers/StaticVector.h> #include <TNL/Containers/Array.h> template< int Dimension, typename Device > class Benchmark { public: using Benchmark = typename TNL::Benchmarks::Benchmark<>; void runBenchmark( const TNL::Config::ParameterContainer& parameters ) const { if( ! TNL::Devices::Host::setup( parameters ) || ! TNL::Devices::Cuda::setup( parameters ) ) return; const TNL::String logFileName = parameters.getParameter< TNL::String >( "log-file" ); const TNL::String outputMode = parameters.getParameter< TNL::String >( "output-mode" ); const TNL::String device = parameters.getParameter< TNL::String >( "device" ); const int verbose = parameters.getParameter< int >( "verbose" ); const int loops = parameters.getParameter< int >( "loops" ); auto mode = std::ios::out; if( outputMode == "append" ) mode |= std::ios::app; std::ofstream logFile( logFileName.getString(), mode ); Benchmark benchmark( logFile, loops, verbose ); std::map< std::string, std::string > metadata = TNL::Benchmarks::getHardwareMetadata(); TNL::Benchmarks::writeMapAsJson( metadata, logFileName, ".metadata.json" ); start(benchmark, parameters); } virtual void start(const Benchmark& benchmark, const TNL::Config::ParameterContainer& parameters) const { TNL_ASSERT_TRUE(false, << "Should be overriden"); } virtual TNL::Config::ConfigDescription makeInputConfig() const { TNL::Config::ConfigDescription config; config.addDelimiter( "Benchmark settings:" ); config.addEntry< TNL::String >( "id", "Identifier of the run", "unknown" ); config.addEntry< TNL::String >( "log-file", "Log file name.", "output.log" ); config.addEntry< TNL::String >( "output-mode", "Mode for opening the log file.", "overwrite" ); config.addEntryEnum( "append" ); config.addEntryEnum( "overwrite" ); config.addEntry< TNL::String >( "device", "Device the computation will run on.", "cuda" ); config.addEntryEnum< TNL::String >( "all" ); config.addEntryEnum< TNL::String >( "host" ); #ifdef HAVE_CUDA config.addEntryEnum< TNL::String >( "cuda" ); #endif config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); config.addEntry< int >( "verbose", "Verbose mode.", 1 ); config.addDelimiter( "Device settings:" ); TNL::Devices::Host::configSetup( config ); #ifdef HAVE_CUDA TNL::Devices::Cuda::configSetup( config ); #endif return config; } }; src/Benchmarks/Convolution/support/DummyBenchmark.h 0 → 100644 +165 −0 Original line number Diff line number Diff line #pragma once #include "Benchmark.h" #include "DummyTask.h" static std::vector< TNL::String > minDimensionIds = { "min-x-dimension", "min-y-dimension", "min-z-dimension" }; static std::vector< TNL::String > dimensionIds = { "x-dimension", "y-dimension", "z-dimension" }; static std::vector< TNL::String > maxDimensionIds = { "max-x-dimension", "max-y-dimension", "max-z-dimension" }; static std::vector< TNL::String > minKernelSizeIds = { "min-kernel-width", "min-kernel-height", "min-kernel-depth" }; static std::vector< TNL::String > kernelSizeIds = { "x-kernelSize", "y-kernelSize", "z-kernelSize" }; static std::vector< TNL::String > maxKernelSizeIds = { "max-kernel-width", "max-kernel-height", "max-kernel-depth" }; template< int Dimension, typename Device > class DummyBenchmark : public Benchmark< Dimension, Device > { public: using Vector = TNL::Containers::StaticVector< Dimension, int >; using DataStore = TNL::Containers::Array< int, Device, float >; using Benchmark = Base::Benchmark; using Base = Benchmark< Dimension, Device >; virtual void start( const Benchmark& benchmark, const TNL::Config::ParameterContainer& parameters ) const override { Vector start; Vector end; Vector minKernelSize; Vector maxKernelSize; for( int i = 0; i < Dimension; i++ ) { start[ i ] = parameters.getParameter< int >( minDimensionIds[ i ] ); end[ i ] = parameters.getParameter< int >( maxDimensionIds[ i ] ); minKernelSize[ i ] = parameters.getParameter< int >( minKernelSizeIds[ i ] ); maxKernelSizeIds[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] ); TNL_ASSERT_GT( start[ i ], 1, "Start dimension must be positive integer" ); TNL_ASSERT_GT( end[ i ], start[ i ], "End dimension must be greater than start dimension" ); TNL_ASSERT_GE( minKernelSize[ i ], 1, "Minimal kernel size must be a positive number" ); TNL_ASSERT_EQ( minKernelSize[ i ] % 2, 1, "Minimal kernel size must be odd" ); TNL_ASSERT_GT( end[ i ], start[ i ], "End kernel size must be greater than start kernel size" ); } int dimensionStep = parameters.getParameter< int >( "dimension-step" ); int kernelStep = parameters.getParameter< int >( "kernel-step" ); TNL_ASSERT_GT( dimensionStep, 1, "Dimension step must be a positive number" ); TNL_ASSERT_GT( kernelStep, 0, "Kernel step must be a positive number" ); TNL_ASSERT_EQ( kernelStep % 2, 0, "Kernel step must be even" ); time( benchmark, start, end, dimensionStep, minKernelSize, maxKernelSize, kernelStep ); } virtual void time( Benchmark& bencmark, const Vector& minDimension, const Vector& maxDimension, const int dimensionStep, const Vector& minKernelSize, const Vector& maxKernelSize, const int kernelStep ) const { Vector currentDimension = minDimension; Vector currentKernelSize; do { currentKernelSize = minKernelSize; do { time( benchmark, currentDimension, currentKernelSize ); currentKernelSize[ 0 ] += kernelStep; for( size_t i = 0; i < currentKernelSize.getSize() - 1; i++ ) { if( currentKernelSize[ i ] >= maxKernelSize[ i ] ) { currentKernelSize[ i ] = minKernelSize[ i ]; maxKernelSize[ i + 1 ] += kernelStep; } } } while( currentKernelSize < maxKernelSize ); currentDimension[ 0 ] *= dimensionStep; for( size_t i = 0; i < currentDimension.getSize() - 1; i++ ) { if( currentDimension[ i ] >= maxDimension[ i ] ) { currentDimension[ i ] = minDimension[ i ]; maxDimension[ i ] = maxDimension[ i ]; } } } while( currentDimension < maxDimension ); } void timeConvolution( Benchmark& benchmark, const Vector& dimension, const Vector& kernelSize ) const { auto device = TNL::getType< Device >(); Benchmark::MetadataColumns columns = {}; size_t elementsCount = 1; size_t kernelElementsCount = 1; for( size_t i = 0; i < dimension.getSize(); i++ ) { elementsCount *= dimension[ i ]; kernelElementsCount *= kernelSize[ i ]; columns.insert( { dimensionIds[ i ], dimension[ i ] } ); columns.insert( { kernelSizeIds[ i ], kernelSize[ i ] } ); } benchmark.setDatasetSize( ( elementsCount * 4 ) / 1.e9, 1.0 ); // Setup input data DataStore input, result, kernel; input.resize( elementsCount ); result.resize( elementsCount ); kernel.resize( kernelSize ); input = 1; result = 1; kernel = 1; auto inputView = input.getView(); auto resultView = result.getView(); auto kernelView = kernel.getView(); auto measure = [ & ]() { DummyTask<Dimension, Device>::exec(dimension, kernelSize, inputView, resultView, kernelView); }; benchmark.time< Device >( device, measure ); } TNL::Config::ConfigDescription makeInputConfig() const override { auto config = Base::makeInputConfig(); config.addDelimiter( "Grid dimension settings:" ); for( int i = 0; i < Dimension; i++ ) config.addEntry< int >( minDimensionIds[ i ], minDimensionIds[ i ], 512 ); for( int i = 0; i < Dimension; i++ ) config.addEntry< int >( maxDimensionIds[ i ], maxDimensionIds[ i ], 512 ); config.addEntry< int >( "dimension-step", "Step of kernel increase by which dimension is multiplied (must be even)", 2 ); config.addDelimiter( "Kernel settings:" ); for( int i = 0; i < Dimension; i++ ) config.addEntry< int >( minKernelSizeIds[ i ], minKernelSizeIds[ i ] + " (odd) :", 1 ); for( int i = 0; i < Dimension; i++ ) config.addEntry< int >( minKernelSizeIds[ i ], minKernelSizeIds[ i ] + " (odd) :", 11 ); config.addEntry< int >( "kernel-step", "Step of kernel increase which is added to kernel (must be even)", 2 ); return config; } }; Loading
src/Benchmarks/Convolution/.gitignore 0 → 100644 +1 −0 Original line number Diff line number Diff line generated
src/Benchmarks/Convolution/CMakeLists.txt 0 → 100644 +25 −0 Original line number Diff line number Diff line function(generate_cuda_executable PREFIX DIMENSION TEMPLATE KERNEL_HEADER) get_filename_component(MODULE_NAME ${KERNEL_HEADER} NAME_WE) get_filename_component(TEMPLATE_NAME ${TEMPLATE} NAME_WE) if (${BUILD_CUDA}) SET(SOURCE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/generated/${MODULE_NAME}_${DIMENSION}_${TEMPLATE_NAME}.cu") FILE(READ ${TEMPLATE} TEMPLATE_CONTENT) STRING(REGEX REPLACE "DIMENSION_VALUE" ${DIMENSION} TEMPLATE_CONTENT "${TEMPLATE_CONTENT}") FILE(WRITE ${SOURCE_FILE} "${TEMPLATE_CONTENT}") SET(EXECUTABLE_NAME "${PREFIX}_${DIMENSION}_${MODULE_NAME}") CUDA_ADD_EXECUTABLE(${EXECUTABLE_NAME} ${SOURCE_FILE}) else() MESSAGE(WARNING "Convolutions are not supported on CPU") endif() endfunction() GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/naive.h")
src/Benchmarks/Convolution/kernels/naive.h 0 → 100644 +164 −0 Original line number Diff line number Diff line #ifdef HAVE_CUDA #include <TNL/Devices/Cuda.h> #include <TNL/Cuda/LaunchHelpers.h> template< int Dimension, typename Device > struct Convolution; template<> struct Convolution< 1, TNL::Devices::Cuda > { public: template< typename Index > static size_t getDynamicSharedMemorySize( Index kernelWidth, Index endX ) { return 0; } }; template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > __global__ static void convolution1D( Index kernelWidth, Index endX, FetchData fetchData, FetchBoundary fetchBoundary, FetchKernel fetchKernel, Convolve convolve, Store store ) { Index ix = threadIdx.x + blockIdx.x * blockDim.x; Index radius = kernelWidth >> 1; Real result = 0; for( Index i = -radius; i <= radius; i++ ) { Index elementIndex = i + ix; Index kernelIndex = i + radius; if( elementIndex < 0 || elementIndex >= endX ) { result = convolve( result, fetchBoundary( elementIndex ), fetchKernel( kernelIndex ) ); } else { result = convolve( result, fetchData( elementIndex ), fetchKernel( kernelIndex ) ); } } store( ix, result ); } // template<> // struct Convolution< 2, TNL::Devices::Cuda > // { // public: // template< typename Index > // static size_t // getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index endX, Index endY ) // { // return 0; // } // }; // template< typename Index, // typename Real, // typename FetchData, // typename FetchBoundary, // typename FetchKernel, // typename Convolve, // typename Store > // __global__ // static void // convolution2D( Index kernelWidth, // Index kernelHeight, // Index endX, // Index endY, // FetchData& fetchData, // FetchBoundary& fetchBoundary, // FetchKernel& fetchKernel, // Convolve& convolve, // Store& store ) // { // int iy = threadIdx.y + blockIdx.y * blockDim.y; // int ix = threadIdx.x + blockIdx.x * blockDim.x; // Real result = 0; // for( Index j = iy - kernelHeight; j <= iy + kernelHeight; j++ ) { // for( Index i = ix - kernelWidth; i <= ix + kernelWidth; i++ ) { // if( i < 0 || i >= endX || j < 0 || j >= endY ) { // result = convolve( result, fetchBoundary( i, j ) ); // } // else { // result = convolve( result, fetchData( i, j ), fetchKernel( i, j ) ); // } // } // } // store( ix, iy, result ); // } // template<> // struct Convolution< 3, TNL::Devices::Cuda > // { // public: // template< typename Index > // static size_t // getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ ) // { // return 0; // } // }; // template< typename Index, // typename Real, // typename FetchData, // typename FetchBoundary, // typename FetchKernel, // typename Convolve, // typename Store > // __global__ // static void // convolution3D( Index kernelWidth, // Index kernelHeight, // Index kernelDepth, // Index endX, // Index endY, // Index endZ, // FetchData& fetchData, // FetchBoundary& fetchBoundary, // FetchKernel& fetchKernel, // Convolve& convolve, // Store& store ) // { // int ix = threadIdx.x + blockIdx.x * blockDim.x; // int iy = threadIdx.y + blockIdx.y * blockDim.y; // int iz = threadIdx.z + blockIdx.z * blockDim.z; // Real result = 0; // for( Index k = iz - kernelDepth; k <= iz + kernelDepth; k++ ) { // for( Index j = iy - kernelHeight; j <= iy + kernelHeight; j++ ) { // for( Index i = ix - kernelWidth; i <= ix + kernelWidth; i++ ) { // if( i < 0 || i >= endX || j < 0 || j >= endY || k < 0 || k >= endZ ) { // result = convolve( result, fetchBoundary( i, j, k ) ); // } // else { // result = convolve( result, fetchData( i, j, k ), fetchKernel( i, j, k ) ); // } // } // } // } // store( ix, iy, iz, result ); // } #endif
src/Benchmarks/Convolution/support/Benchmark.h 0 → 100644 +81 −0 Original line number Diff line number Diff line #pragma once #include <TNL/Config/parseCommandLine.h> #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> #include <TNL/Benchmarks/Benchmarks.h> #include <TNL/Containers/StaticVector.h> #include <TNL/Containers/Array.h> template< int Dimension, typename Device > class Benchmark { public: using Benchmark = typename TNL::Benchmarks::Benchmark<>; void runBenchmark( const TNL::Config::ParameterContainer& parameters ) const { if( ! TNL::Devices::Host::setup( parameters ) || ! TNL::Devices::Cuda::setup( parameters ) ) return; const TNL::String logFileName = parameters.getParameter< TNL::String >( "log-file" ); const TNL::String outputMode = parameters.getParameter< TNL::String >( "output-mode" ); const TNL::String device = parameters.getParameter< TNL::String >( "device" ); const int verbose = parameters.getParameter< int >( "verbose" ); const int loops = parameters.getParameter< int >( "loops" ); auto mode = std::ios::out; if( outputMode == "append" ) mode |= std::ios::app; std::ofstream logFile( logFileName.getString(), mode ); Benchmark benchmark( logFile, loops, verbose ); std::map< std::string, std::string > metadata = TNL::Benchmarks::getHardwareMetadata(); TNL::Benchmarks::writeMapAsJson( metadata, logFileName, ".metadata.json" ); start(benchmark, parameters); } virtual void start(const Benchmark& benchmark, const TNL::Config::ParameterContainer& parameters) const { TNL_ASSERT_TRUE(false, << "Should be overriden"); } virtual TNL::Config::ConfigDescription makeInputConfig() const { TNL::Config::ConfigDescription config; config.addDelimiter( "Benchmark settings:" ); config.addEntry< TNL::String >( "id", "Identifier of the run", "unknown" ); config.addEntry< TNL::String >( "log-file", "Log file name.", "output.log" ); config.addEntry< TNL::String >( "output-mode", "Mode for opening the log file.", "overwrite" ); config.addEntryEnum( "append" ); config.addEntryEnum( "overwrite" ); config.addEntry< TNL::String >( "device", "Device the computation will run on.", "cuda" ); config.addEntryEnum< TNL::String >( "all" ); config.addEntryEnum< TNL::String >( "host" ); #ifdef HAVE_CUDA config.addEntryEnum< TNL::String >( "cuda" ); #endif config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); config.addEntry< int >( "verbose", "Verbose mode.", 1 ); config.addDelimiter( "Device settings:" ); TNL::Devices::Host::configSetup( config ); #ifdef HAVE_CUDA TNL::Devices::Cuda::configSetup( config ); #endif return config; } };
src/Benchmarks/Convolution/support/DummyBenchmark.h 0 → 100644 +165 −0 Original line number Diff line number Diff line #pragma once #include "Benchmark.h" #include "DummyTask.h" static std::vector< TNL::String > minDimensionIds = { "min-x-dimension", "min-y-dimension", "min-z-dimension" }; static std::vector< TNL::String > dimensionIds = { "x-dimension", "y-dimension", "z-dimension" }; static std::vector< TNL::String > maxDimensionIds = { "max-x-dimension", "max-y-dimension", "max-z-dimension" }; static std::vector< TNL::String > minKernelSizeIds = { "min-kernel-width", "min-kernel-height", "min-kernel-depth" }; static std::vector< TNL::String > kernelSizeIds = { "x-kernelSize", "y-kernelSize", "z-kernelSize" }; static std::vector< TNL::String > maxKernelSizeIds = { "max-kernel-width", "max-kernel-height", "max-kernel-depth" }; template< int Dimension, typename Device > class DummyBenchmark : public Benchmark< Dimension, Device > { public: using Vector = TNL::Containers::StaticVector< Dimension, int >; using DataStore = TNL::Containers::Array< int, Device, float >; using Benchmark = Base::Benchmark; using Base = Benchmark< Dimension, Device >; virtual void start( const Benchmark& benchmark, const TNL::Config::ParameterContainer& parameters ) const override { Vector start; Vector end; Vector minKernelSize; Vector maxKernelSize; for( int i = 0; i < Dimension; i++ ) { start[ i ] = parameters.getParameter< int >( minDimensionIds[ i ] ); end[ i ] = parameters.getParameter< int >( maxDimensionIds[ i ] ); minKernelSize[ i ] = parameters.getParameter< int >( minKernelSizeIds[ i ] ); maxKernelSizeIds[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] ); TNL_ASSERT_GT( start[ i ], 1, "Start dimension must be positive integer" ); TNL_ASSERT_GT( end[ i ], start[ i ], "End dimension must be greater than start dimension" ); TNL_ASSERT_GE( minKernelSize[ i ], 1, "Minimal kernel size must be a positive number" ); TNL_ASSERT_EQ( minKernelSize[ i ] % 2, 1, "Minimal kernel size must be odd" ); TNL_ASSERT_GT( end[ i ], start[ i ], "End kernel size must be greater than start kernel size" ); } int dimensionStep = parameters.getParameter< int >( "dimension-step" ); int kernelStep = parameters.getParameter< int >( "kernel-step" ); TNL_ASSERT_GT( dimensionStep, 1, "Dimension step must be a positive number" ); TNL_ASSERT_GT( kernelStep, 0, "Kernel step must be a positive number" ); TNL_ASSERT_EQ( kernelStep % 2, 0, "Kernel step must be even" ); time( benchmark, start, end, dimensionStep, minKernelSize, maxKernelSize, kernelStep ); } virtual void time( Benchmark& bencmark, const Vector& minDimension, const Vector& maxDimension, const int dimensionStep, const Vector& minKernelSize, const Vector& maxKernelSize, const int kernelStep ) const { Vector currentDimension = minDimension; Vector currentKernelSize; do { currentKernelSize = minKernelSize; do { time( benchmark, currentDimension, currentKernelSize ); currentKernelSize[ 0 ] += kernelStep; for( size_t i = 0; i < currentKernelSize.getSize() - 1; i++ ) { if( currentKernelSize[ i ] >= maxKernelSize[ i ] ) { currentKernelSize[ i ] = minKernelSize[ i ]; maxKernelSize[ i + 1 ] += kernelStep; } } } while( currentKernelSize < maxKernelSize ); currentDimension[ 0 ] *= dimensionStep; for( size_t i = 0; i < currentDimension.getSize() - 1; i++ ) { if( currentDimension[ i ] >= maxDimension[ i ] ) { currentDimension[ i ] = minDimension[ i ]; maxDimension[ i ] = maxDimension[ i ]; } } } while( currentDimension < maxDimension ); } void timeConvolution( Benchmark& benchmark, const Vector& dimension, const Vector& kernelSize ) const { auto device = TNL::getType< Device >(); Benchmark::MetadataColumns columns = {}; size_t elementsCount = 1; size_t kernelElementsCount = 1; for( size_t i = 0; i < dimension.getSize(); i++ ) { elementsCount *= dimension[ i ]; kernelElementsCount *= kernelSize[ i ]; columns.insert( { dimensionIds[ i ], dimension[ i ] } ); columns.insert( { kernelSizeIds[ i ], kernelSize[ i ] } ); } benchmark.setDatasetSize( ( elementsCount * 4 ) / 1.e9, 1.0 ); // Setup input data DataStore input, result, kernel; input.resize( elementsCount ); result.resize( elementsCount ); kernel.resize( kernelSize ); input = 1; result = 1; kernel = 1; auto inputView = input.getView(); auto resultView = result.getView(); auto kernelView = kernel.getView(); auto measure = [ & ]() { DummyTask<Dimension, Device>::exec(dimension, kernelSize, inputView, resultView, kernelView); }; benchmark.time< Device >( device, measure ); } TNL::Config::ConfigDescription makeInputConfig() const override { auto config = Base::makeInputConfig(); config.addDelimiter( "Grid dimension settings:" ); for( int i = 0; i < Dimension; i++ ) config.addEntry< int >( minDimensionIds[ i ], minDimensionIds[ i ], 512 ); for( int i = 0; i < Dimension; i++ ) config.addEntry< int >( maxDimensionIds[ i ], maxDimensionIds[ i ], 512 ); config.addEntry< int >( "dimension-step", "Step of kernel increase by which dimension is multiplied (must be even)", 2 ); config.addDelimiter( "Kernel settings:" ); for( int i = 0; i < Dimension; i++ ) config.addEntry< int >( minKernelSizeIds[ i ], minKernelSizeIds[ i ] + " (odd) :", 1 ); for( int i = 0; i < Dimension; i++ ) config.addEntry< int >( minKernelSizeIds[ i ], minKernelSizeIds[ i ] + " (odd) :", 11 ); config.addEntry< int >( "kernel-step", "Step of kernel increase which is added to kernel (must be even)", 2 ); return config; } };