From 77b4e0813cd1f1401a2566db1a14725d6aa28f8f Mon Sep 17 00:00:00 2001 From: hayeuyur Date: Sat, 2 Apr 2022 13:05:51 +0200 Subject: [PATCH 01/19] Implement naive convolution for 1D kernel --- src/Benchmarks/Convolution/.gitignore | 1 + src/Benchmarks/Convolution/CMakeLists.txt | 25 ++ src/Benchmarks/Convolution/kernels/naive.h | 164 +++++++++++++ .../Convolution/support/Benchmark.h | 81 +++++++ .../Convolution/support/DummyBenchmark.h | 165 +++++++++++++ .../Convolution/support/DummySolver.h | 84 +++++++ .../Convolution/support/DummyTask.h | 153 ++++++++++++ src/Benchmarks/Convolution/support/Launcher.h | 218 ++++++++++++++++++ src/Benchmarks/Convolution/support/Solver.h | 52 +++++ .../Convolution/templates/main_benchmark.h | 0 .../Convolution/templates/main_solver.h | 25 ++ 11 files changed, 968 insertions(+) create mode 100644 src/Benchmarks/Convolution/.gitignore create mode 100644 src/Benchmarks/Convolution/CMakeLists.txt create mode 100644 src/Benchmarks/Convolution/kernels/naive.h create mode 100644 src/Benchmarks/Convolution/support/Benchmark.h create mode 100644 src/Benchmarks/Convolution/support/DummyBenchmark.h create mode 100644 src/Benchmarks/Convolution/support/DummySolver.h create mode 100644 src/Benchmarks/Convolution/support/DummyTask.h create mode 100644 src/Benchmarks/Convolution/support/Launcher.h create mode 100644 src/Benchmarks/Convolution/support/Solver.h create mode 100644 src/Benchmarks/Convolution/templates/main_benchmark.h create mode 100644 src/Benchmarks/Convolution/templates/main_solver.h diff --git a/src/Benchmarks/Convolution/.gitignore b/src/Benchmarks/Convolution/.gitignore new file mode 100644 index 000000000..86d4c2dd3 --- /dev/null +++ b/src/Benchmarks/Convolution/.gitignore @@ -0,0 +1 @@ +generated diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt new file mode 100644 index 000000000..22d0876bc --- /dev/null +++ b/src/Benchmarks/Convolution/CMakeLists.txt @@ -0,0 +1,25 @@ + 
+function(generate_cuda_executable PREFIX DIMENSION TEMPLATE KERNEL_HEADER)
+# Copies TEMPLATE to generated/ with DIMENSION_VALUE replaced by DIMENSION and builds it with CUDA; KERNEL_HEADER only contributes its base name.
+get_filename_component(MODULE_NAME ${KERNEL_HEADER} NAME_WE)
+get_filename_component(TEMPLATE_NAME ${TEMPLATE} NAME_WE)
+
+if (${BUILD_CUDA})
+  SET(SOURCE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/generated/${MODULE_NAME}_${DIMENSION}_${TEMPLATE_NAME}.cu")
+
+  FILE(READ ${TEMPLATE} TEMPLATE_CONTENT)
+
+  STRING(REGEX REPLACE "DIMENSION_VALUE" ${DIMENSION} TEMPLATE_CONTENT "${TEMPLATE_CONTENT}")
+
+  FILE(WRITE ${SOURCE_FILE} "${TEMPLATE_CONTENT}")
+
+  SET(EXECUTABLE_NAME "${PREFIX}_${DIMENSION}_${MODULE_NAME}")
+
+  CUDA_ADD_EXECUTABLE(${EXECUTABLE_NAME} ${SOURCE_FILE})
+else()
+  MESSAGE(WARNING "Convolutions are not supported on CPU")
+endif()
+
+endfunction()
+
+GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/naive.h")
diff --git a/src/Benchmarks/Convolution/kernels/naive.h b/src/Benchmarks/Convolution/kernels/naive.h
new file mode 100644
index 000000000..a9e00d890
--- /dev/null
+++ b/src/Benchmarks/Convolution/kernels/naive.h
@@ -0,0 +1,164 @@
+
+#ifdef HAVE_CUDA
+
+#include
+#include
+
+template< int Dimension, typename Device >
+struct Convolution;
+// Launch traits of the naive 1D CUDA convolution; this kernel requests no dynamic shared memory.
+template<>
+struct Convolution< 1, TNL::Devices::Cuda >
+{
+public:
+   template< typename Index >
+   static size_t
+   getDynamicSharedMemorySize( Index kernelWidth, Index endX )
+   {
+      return 0;
+   }
+};
+// Naive 1D convolution: thread ix folds the taps [ix - radius, ix + radius] with convolve() and stores one result.
+template< typename Index,
+          typename Real,
+          typename FetchData,
+          typename FetchBoundary,
+          typename FetchKernel,
+          typename Convolve,
+          typename Store >
+__global__
+static void
+convolution1D( Index kernelWidth,            // number of kernel taps; the radius is kernelWidth >> 1
+               Index endX,                   // number of data elements, valid indices are [0, endX)
+               FetchData fetchData,          // fetchData( i ): value of the in-range element i
+               FetchBoundary fetchBoundary,  // fetchBoundary( i ): value substituted for an out-of-range i
+               FetchKernel fetchKernel,      // fetchKernel( k ): weight of tap k, k in [0, 2 * radius]
+               Convolve convolve,            // convolve( acc, value, weight ): returns the updated accumulator
+               Store store )                 // store( i, acc ): consumes the result of element i
+{
+   Index ix = threadIdx.x + blockIdx.x * blockDim.x;
+   Index radius = kernelWidth >> 1;
+   if( ix >= endX ) return;  // threads past the end of the data own no element and must not call store()
+   Real result = 0;
+
+   for( Index i = -radius; i <= radius; i++ ) {
+      Index elementIndex = i + ix;
+      Index kernelIndex = i + radius;
+
+      if( elementIndex < 0 || elementIndex >= endX ) {
+         result = convolve( result, fetchBoundary( elementIndex ), fetchKernel( kernelIndex ) );
+      }
+      else {
+         result = convolve( result, fetchData( elementIndex ), fetchKernel( kernelIndex ) );
+      }
+   }
+
+   store( ix, result );
+}
+
+// template<>
+// struct Convolution< 2, TNL::Devices::Cuda >
+// {
+// public:
+//    template< typename Index >
+//    static size_t
+//    getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index endX, Index endY )
+//    {
+//       return 0;
+//    }
+// };
+
+// template< typename Index,
+//           typename Real,
+//           typename FetchData,
+//           typename FetchBoundary,
+//           typename FetchKernel,
+//           typename Convolve,
+//           typename Store >
+// __global__
+// static void
+// convolution2D( Index kernelWidth,
+//                Index kernelHeight,
+//                Index endX,
+//                Index endY,
+//                FetchData& fetchData,
+//                FetchBoundary& fetchBoundary,
+//                FetchKernel& fetchKernel,
+//                Convolve& convolve,
+//                Store& store )
+// {
+//    int iy = threadIdx.y + blockIdx.y * blockDim.y;
+//    int ix = threadIdx.x + blockIdx.x * blockDim.x;
+
+//    Real result = 0;
+
+//    for( Index j = iy - kernelHeight; j <= iy + kernelHeight; j++ ) {
+//       for( Index i = ix - kernelWidth; i <= ix + kernelWidth; i++ ) {
+//          if( i < 0 || i >= endX || j < 0 || j >= endY ) {
+//             result = convolve( result, fetchBoundary( i, j ) );
+//          }
+//          else {
+//             result = convolve( result, fetchData( i, j ), fetchKernel( i, j ) );
+//          }
+//       }
+//    }
+
+//    store( ix, iy, result );
+// }
+
+// template<>
+// struct Convolution< 3, TNL::Devices::Cuda >
+// {
+// public:
+//    template< typename Index >
+//    static size_t
+//    getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ )
+//    {
+//       return 0;
+//    }
+// };
+
+// template< typename Index,
+//           typename Real,
+//           typename FetchData,
+//           typename FetchBoundary,
+//           typename FetchKernel,
+//           typename Convolve,
+//           typename Store >
+// __global__
+// static void
+// convolution3D( Index kernelWidth,
+//                Index kernelHeight,
+//                Index kernelDepth,
+//                Index endX,
+//                Index endY,
+//                Index endZ,
+//                FetchData& fetchData,
+//                FetchBoundary& fetchBoundary,
+//                FetchKernel& fetchKernel,
+//                Convolve& convolve,
+//                Store& store )
+// {
+//    int ix = threadIdx.x + blockIdx.x * blockDim.x;
+//    int iy = threadIdx.y + blockIdx.y * blockDim.y;
+//    int iz = threadIdx.z + blockIdx.z * blockDim.z;
+
+//    Real result = 0;
+
+//    for( Index k = iz - kernelDepth; k <= iz + kernelDepth; k++ ) {
+//       for( Index j = iy - kernelHeight; j <= iy + kernelHeight; j++ ) {
+//          for( Index i = ix - kernelWidth; i <= ix + kernelWidth; i++ ) {
+//             if( i < 0 || i >= endX || j < 0 || j >= endY || k < 0 || k >= endZ ) {
+//                result = convolve( result, fetchBoundary( i, j, k ) );
+//             }
+//             else {
+//                result = convolve( result, fetchData( i, j, k ), fetchKernel( i, j, k ) );
+//             }
+//          }
+//       }
+//    }
+
+//    store( ix, iy, iz, result );
+// }
+
+#endif
diff --git a/src/Benchmarks/Convolution/support/Benchmark.h b/src/Benchmarks/Convolution/support/Benchmark.h
new file mode 100644
index 000000000..f5671a06b
--- /dev/null
+++ b/src/Benchmarks/Convolution/support/Benchmark.h
@@ -0,0 +1,81 @@
+
+#pragma once
+
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+// Common driver of the convolution benchmarks: parses the shared options, opens the log and hands over to start().
+template< int Dimension, typename Device >
+class Benchmark
+{
+public:
+   using TNLBenchmark = TNL::Benchmarks::Benchmark<>;  // a member type may not carry the name of its own class
+   // Sets up the devices, opens the log file, writes the hardware metadata next to it and runs start().
+   void
+   runBenchmark( const TNL::Config::ParameterContainer& parameters ) const
+   {
+      if( ! TNL::Devices::Host::setup( parameters ) || ! TNL::Devices::Cuda::setup( parameters ) )
+         return;
+
+      const TNL::String logFileName = parameters.getParameter< TNL::String >( "log-file" );
+      const TNL::String outputMode = parameters.getParameter< TNL::String >( "output-mode" );
+      const TNL::String device = parameters.getParameter< TNL::String >( "device" );
+
+      const int verbose = parameters.getParameter< int >( "verbose" );
+      const int loops = parameters.getParameter< int >( "loops" );
+
+      auto mode = std::ios::out;
+
+      if( outputMode == "append" )
+         mode |= std::ios::app;
+
+      std::ofstream logFile( logFileName.getString(), mode );
+
+      TNLBenchmark benchmark( logFile, loops, verbose );
+
+      std::map< std::string, std::string > metadata = TNL::Benchmarks::getHardwareMetadata();
+      TNL::Benchmarks::writeMapAsJson( metadata, logFileName, ".metadata.json" );
+
+      start(benchmark, parameters);
+   }
+   // Hook with the actual measurements; the benchmark is passed mutable because timing and dataset setup modify it.
+   virtual void start(TNLBenchmark& benchmark, const TNL::Config::ParameterContainer& parameters) const {
+      TNL_ASSERT_TRUE(false, "Should be overriden");
+   }
+   // Describes the command-line options shared by all benchmarks; subclasses extend the returned description.
+   virtual TNL::Config::ConfigDescription makeInputConfig() const {
+      TNL::Config::ConfigDescription config;
+
+      config.addDelimiter( "Benchmark settings:" );
+      config.addEntry< TNL::String >( "id", "Identifier of the run", "unknown" );
+      config.addEntry< TNL::String >( "log-file", "Log file name.", "output.log" );
+      config.addEntry< TNL::String >( "output-mode", "Mode for opening the log file.", "overwrite" );
+      config.addEntryEnum( "append" );
+      config.addEntryEnum( "overwrite" );
+
+      config.addEntry< TNL::String >( "device", "Device the computation will run on.", "cuda" );
+      config.addEntryEnum< TNL::String >( "all" );
+      config.addEntryEnum< TNL::String >( "host" );
+
+#ifdef HAVE_CUDA
+      config.addEntryEnum< TNL::String >( "cuda" );
+#endif
+
+      config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
+      config.addEntry< int >( "verbose", "Verbose mode.", 1 );
+
+
+      config.addDelimiter( "Device settings:" );
+      TNL::Devices::Host::configSetup( config );
+
+#ifdef HAVE_CUDA
+      TNL::Devices::Cuda::configSetup( config );
+#endif
+      return config;
+   }
+};
diff --git a/src/Benchmarks/Convolution/support/DummyBenchmark.h b/src/Benchmarks/Convolution/support/DummyBenchmark.h
new file mode 100644
index 000000000..1830e7484
--- /dev/null
+++ b/src/Benchmarks/Convolution/support/DummyBenchmark.h
@@ -0,0 +1,165 @@
+
+#pragma once
+
+#include "Benchmark.h"
+#include "DummyTask.h"
+
+static std::vector< TNL::String > minDimensionIds = { "min-x-dimension", "min-y-dimension", "min-z-dimension" };
+static std::vector< TNL::String > dimensionIds = { "x-dimension", "y-dimension", "z-dimension" };
+static std::vector< TNL::String > maxDimensionIds = { "max-x-dimension", "max-y-dimension", "max-z-dimension" };
+static std::vector< TNL::String > minKernelSizeIds = { "min-kernel-width", "min-kernel-height", "min-kernel-depth" };
+static std::vector< TNL::String > kernelSizeIds = { "x-kernelSize", "y-kernelSize", "z-kernelSize" };
+static std::vector< TNL::String > maxKernelSizeIds = { "max-kernel-width", "max-kernel-height", "max-kernel-depth" };
+// Benchmark that times DummyTask over a sweep of grid sizes and kernel sizes.
+template< int Dimension, typename Device >
+class DummyBenchmark : public Benchmark< Dimension, Device >
+{
+public:
+   using Vector = TNL::Containers::StaticVector< Dimension, int >;
+   using DataStore = TNL::Containers::Array< float, Device, int >;  // float values indexed by int, as in DummySolver
+   using Base = ::Benchmark< Dimension, Device >;                   // qualified: `Benchmark` is re-aliased on the next line
+   using Benchmark = TNL::Benchmarks::Benchmark<>;                  // the TNL benchmark object handed to start()
+   // Reads the sweep ranges and steps from the parameters, validates them and runs the sweep.
+   virtual void
+   start( Benchmark& benchmark, const TNL::Config::ParameterContainer& parameters ) const override
+   {
+      Vector start;
+      Vector end;
+      Vector minKernelSize;
+      Vector maxKernelSize;
+
+      for( int i = 0; i < Dimension; i++ ) {
+         start[ i ] = parameters.getParameter< int >( minDimensionIds[ i ] );
+         end[ i ] = parameters.getParameter< int >( maxDimensionIds[ i ] );
+         minKernelSize[ i ] = parameters.getParameter< int >( minKernelSizeIds[ i ] );
+         maxKernelSize[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] );
+
+         TNL_ASSERT_GT( start[ i ], 1, "Start dimension must be positive integer" );
+         TNL_ASSERT_GT( end[ i ], start[ i ], "End dimension must be greater than start dimension" );
+
+         TNL_ASSERT_GE( minKernelSize[ i ], 1, "Minimal kernel size must be a positive number" );
+         TNL_ASSERT_EQ( minKernelSize[ i ] % 2, 1, "Minimal kernel size must be odd" );
+         TNL_ASSERT_GT( maxKernelSize[ i ], minKernelSize[ i ], "End kernel size must be greater than start kernel size" );
+      }
+
+      int dimensionStep = parameters.getParameter< int >( "dimension-step" );
+      int kernelStep = parameters.getParameter< int >( "kernel-step" );
+
+      TNL_ASSERT_GT( dimensionStep, 1, "Dimension step must be a positive number" );
+      TNL_ASSERT_GT( kernelStep, 0, "Kernel step must be a positive number" );
+      TNL_ASSERT_EQ( kernelStep % 2, 0, "Kernel step must be even" );
+
+      time( benchmark, start, end, dimensionStep, minKernelSize, maxKernelSize, kernelStep );
+   }
+   // Calls timeConvolution() for grid sizes multiplied by dimensionStep and kernel sizes advanced by kernelStep.
+   virtual void
+   time( Benchmark& benchmark,
+         const Vector& minDimension,
+         const Vector& maxDimension,
+         const int dimensionStep,
+         const Vector& minKernelSize,
+         const Vector& maxKernelSize,
+         const int kernelStep ) const
+   {
+      Vector currentDimension = minDimension;
+      Vector currentKernelSize;
+
+      do {
+         currentKernelSize = minKernelSize;
+
+         do {
+            timeConvolution( benchmark, currentDimension, currentKernelSize );
+
+            currentKernelSize[ 0 ] += kernelStep;
+
+            for( size_t i = 0; i < currentKernelSize.getSize() - 1; i++ ) {
+               if( currentKernelSize[ i ] >= maxKernelSize[ i ] ) {
+                  currentKernelSize[ i ] = minKernelSize[ i ];
+                  currentKernelSize[ i + 1 ] += kernelStep;  // carry into the next axis; the limits stay untouched
+               }
+            }
+         } while( currentKernelSize < maxKernelSize );  // NOTE(review): relies on StaticVector operator< - confirm its semantics
+
+         currentDimension[ 0 ] *= dimensionStep;
+
+         for( size_t i = 0; i < currentDimension.getSize() - 1; i++ ) {
+            if( currentDimension[ i ] >= maxDimension[ i ] ) {
+               currentDimension[ i ] = minDimension[ i ];
+               currentDimension[ i + 1 ] *= dimensionStep;  // carry into the next axis
+            }
+         }
+
+      } while( currentDimension < maxDimension );
+   }
+   // Allocates input, result and kernel arrays filled with ones and times one DummyTask run on them.
+   void
+   timeConvolution( Benchmark& benchmark, const Vector& dimension, const Vector& kernelSize ) const
+   {
+      auto device = TNL::getType< Device >();
+
+      Benchmark::MetadataColumns columns = {};  // NOTE(review): filled below but never handed to benchmark - confirm intent
+
+      size_t elementsCount = 1;
+      size_t kernelElementsCount = 1;
+
+      for( size_t i = 0; i < dimension.getSize(); i++ ) {
+         elementsCount *= dimension[ i ];
+         kernelElementsCount *= kernelSize[ i ];
+
+         columns.insert( { dimensionIds[ i ], dimension[ i ] } );
+         columns.insert( { kernelSizeIds[ i ], kernelSize[ i ] } );
+      }
+
+      benchmark.setDatasetSize( ( elementsCount * 4 ) / 1.e9, 1.0 );
+
+      // Setup input data
+      DataStore input, result, kernel;
+
+      input.resize( elementsCount );
+      result.resize( elementsCount );
+      kernel.resize( kernelElementsCount );  // total number of taps, not the per-axis size vector
+
+      input = 1;
+      result = 1;
+      kernel = 1;
+
+      auto inputView = input.getView();
+      auto resultView = result.getView();
+      auto kernelView = kernel.getView();
+
+      auto measure = [ & ]()
+      {
+         DummyTask< int, float, Dimension, Device >::exec(dimension, kernelSize, inputView, resultView, kernelView);
+      };
+
+      benchmark.time< Device >( device, measure );
+   }
+   // Adds the grid-size and kernel-size sweep options to the common benchmark options.
+   TNL::Config::ConfigDescription
+   makeInputConfig() const override
+   {
+      auto config = Base::makeInputConfig();
+
+      config.addDelimiter( "Grid dimension settings:" );
+
+      for( int i = 0; i < Dimension; i++ )
+         config.addEntry< int >( minDimensionIds[ i ], minDimensionIds[ i ], 512 );
+
+      for( int i = 0; i < Dimension; i++ )
+         config.addEntry< int >( maxDimensionIds[ i ], maxDimensionIds[ i ], 512 );
+
+      config.addEntry< int >( "dimension-step", "Step of kernel increase by which dimension is multiplied (must be even)", 2 );
+
+      config.addDelimiter( "Kernel settings:" );
+
+      for( int i = 0; i < Dimension; i++ )
+         config.addEntry< int >( minKernelSizeIds[ i ], minKernelSizeIds[ i ] + " (odd) :", 1 );
+
+      for( int i = 0; i < Dimension; i++ )
+         config.addEntry< int >( maxKernelSizeIds[ i ], maxKernelSizeIds[ i ] + " (odd) :", 11 );
+
+      config.addEntry< int >( "kernel-step", "Step of kernel increase which is added to kernel (must be even)", 2 );
+
+      return config;
+   }
+};
diff --git a/src/Benchmarks/Convolution/support/DummySolver.h b/src/Benchmarks/Convolution/support/DummySolver.h
new file mode 100644
index 000000000..a871c7f3f
--- /dev/null
+++ b/src/Benchmarks/Convolution/support/DummySolver.h
@@ -0,0 +1,84 @@
+
+#pragma once
+
+#include "Solver.h"
+#include "DummyTask.h"
+
+static std::vector< TNL::String > dimensionIds = { "x-dimension", "y-dimension", "z-dimension" };
+static std::vector< TNL::String > kernelSizeIds = { "x-kernel-size", "y-kernel-size", "z-kernel-size" };
+// Solver that runs DummyTask once on arrays filled with ones, sized from the command-line parameters.
+template< int Dimension, typename Device >
+class DummySolver : public Solver< Dimension, Device >
+{
+public:
+   using Base = Solver< Dimension, Device >;
+   using Vector = TNL::Containers::StaticVector< Dimension, int >;
+   using DataStore = TNL::Containers::Array< float, Device, int >;
+   // Reads and validates the grid and kernel sizes, then launches a single convolution.
+   virtual void
+   start( const TNL::Config::ParameterContainer& parameters ) const override
+   {
+      Vector dimensions;
+      Vector kernelSize;
+
+      for( int i = 0; i < Dimension; i++ ) {
+         dimensions[ i ] = parameters.getParameter< int >( dimensionIds[ i ] );
+         kernelSize[ i ] = parameters.getParameter< int >( kernelSizeIds[ i ] );
+
+         TNL_ASSERT_GT( dimensions[ i ], 1, "Start dimension must be positive integer" );
+
+         TNL_ASSERT_GE( kernelSize[ i ], 1, "Minimal kernel size must be a positive number" );
+         TNL_ASSERT_EQ( kernelSize[ i ] % 2, 1, "Minimal kernel size must be odd" );
+      }
+
+      launchConvolution( dimensions, kernelSize );
+   }
+   // Allocates input, result and kernel arrays (all set to 1) and executes DummyTask on their views.
+   void
+   launchConvolution( const Vector& dimension, const Vector& kernelSize ) const
+   {
+      DataStore input, result, kernel;
+
+      size_t elementsCount = 1;
+      size_t kernelElementsCount = 1;
+
+      for( size_t i = 0; i < (size_t) dimension.getSize(); i++ ) {
+         elementsCount *= dimension[ i ];
+         kernelElementsCount *= kernelSize[ i ];
+      }
+
+      input.resize( elementsCount );
+      result.resize( elementsCount );
+      kernel.resize( kernelElementsCount );
+
+      input = 1;
+      result = 1;
+      kernel = 1;
+
+      auto inputView = input.getView();
+      auto resultView = result.getView();
+      auto kernelView = kernel.getView();
+
+      DummyTask< int, float, Dimension, Device >::exec(dimension, kernelSize, inputView, resultView, kernelView);  // index/value types of DataStore
+
+      std::cout << "Everything is fine" << std::endl;
+   }
+   // Extends the common solver options with the grid and kernel sizes.
+   virtual TNL::Config::ConfigDescription
+   makeInputConfig() const override
+   {
+      TNL::Config::ConfigDescription config = Base::makeInputConfig();
+
+      config.addDelimiter( "Grid dimension settings:" );
+
+      for( int i = 0; i < Dimension; i++ )
+         config.addEntry< int >( dimensionIds[ i ], dimensionIds[ i ], 512 );
+
+      config.addDelimiter( "Kernel settings:" );
+
+      for( int i = 0; i < Dimension; i++ )
+         config.addEntry< int >( kernelSizeIds[ i ], kernelSizeIds[ i ] + " (odd) :", 11 );
+
+      return config;
+   }
+};
diff --git a/src/Benchmarks/Convolution/support/DummyTask.h b/src/Benchmarks/Convolution/support/DummyTask.h
new file mode 100644
index 000000000..22565ac1b
--- /dev/null
+++ b/src/Benchmarks/Convolution/support/DummyTask.h
@@ -0,0 +1,153 @@
+
+#pragma once
+
+#include "Launcher.h"
+
+template< typename Index, typename Real, int Dimension, typename Device >
+struct DummyTask;
+// 1D multiply-accumulate convolution on CUDA: wires array views into the fetch/convolve/store callbacks of Launcher.
+template< typename Index, typename Real >
+struct DummyTask< Index, Real, 1, TNL::Devices::Cuda >
+{
+public:
+   static constexpr int Dimension = 1;
+   using Device = TNL::Devices::Cuda;
+   using Vector = TNL::Containers::StaticVector< Dimension, Index >;
+   using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
+   using Launcher = ::Launcher< Dimension, Device >;  // qualified so the alias does not change the meaning of `Launcher`
+   // Views are taken by value so the lambdas below can capture copies of them for the device.
+   static void
+   exec( const Vector& dimensions, const Vector& kernelSize, DataStore input, DataStore result, DataStore kernel )
+   {
+      auto fetchData = [ = ] __cuda_callable__( Index i )
+      {
+         return input[ i ];
+      };
+
+      auto fetchBoundary = [ = ] __cuda_callable__( Index i )
+      {
+         return 1;
+      };
+
+      auto fetchKernel = [ = ] __cuda_callable__( Index i )
+      {
+         return kernel[ i ];
+      };
+      // data and kernel are Real: taking them as Index would truncate the fetched samples and weights
+      auto convolve = [ = ] __cuda_callable__( Real result, Real data, Real kernel )
+      {
+         return result + data * kernel;
+      };
+
+      auto store = [ = ] __cuda_callable__( Index i, Real resultValue ) mutable
+      {
+         result[i] = resultValue;
+      };
+
+      Launcher::exec< Index, Real >( dimensions,
+                                     kernelSize,
+                                     std::forward< decltype( fetchData ) >( fetchData ),
+                                     std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
+                                     std::forward< decltype( fetchKernel ) >( fetchKernel ),
+                                     std::forward< decltype( convolve ) >( convolve ),
+                                     std::forward< decltype( store ) >( store ) );
+   }
+};
+
+// template< typename Index, typename Real >
+// struct DummyTask< Index, Real, 2, TNL::Devices::Cuda >
+// {
+// public:
+//    static constexpr int Dimension = 2;
+//    using Device = TNL::Devices::Cuda;
+//    using Vector = TNL::Containers::StaticVector< Dimension, Index >;
+//    using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
+//    using Launcher = Launcher< Dimension, Device >;
+
+//    static void
+//    exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
+//    {
+//       auto fetchData = [ = ] __cuda_callable__( Index i, Index j )
+//       {
+//          auto index = i + j * dimensions.x();
+
+//          return input[ index ];
+//       };
+
+//       auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j )
+//       {
+//          return -1;
+//       };
+
+//       auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j )
+//       {
+//          auto index = i + j * kernel.x();
+
+//          return kernel[ index ];
+//       };
+
+//       auto convolve = [ = ] __cuda_callable__( Real result, Index data, Index kernel )
+//       {
+//          return result + data * kernel;
+//       };
+
+//       auto store = [ = ] __cuda_callable__( Index i, Index j, Real resultValue )
+//       {
+//          auto index = i + j * dimensions.x();
+
+//          result[ index ] = resultValue;
+//       };
+
+//       Launcher::exec< Index >( dimensions,
+//                                kernelSize,
+//                                std::forward< decltype( fetchData ) >( fetchData ),
+//                                std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
+//                                std::forward< decltype( fetchKernel ) >( fetchKernel ),
+//                                std::forward< decltype(
convolve ) >( convolve ),
+//                                std::forward< decltype( store ) >( store ) );
+//    }
+// };
+
+// template< typename Index, typename Real >
+// struct DummyTask< Index, Real, 3, TNL::Devices::Cuda >
+// {
+// public:
+//    static constexpr int Dimension = 3;
+//    using Device = TNL::Devices::Cuda;
+//    using Vector = TNL::Containers::StaticVector< Dimension, Index >;
+//    using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType;
+//    using Launcher = Launcher< Dimension, Device >;
+
+//    static void
+//    exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel )
+//    {
+//       auto fetchData = [ = ] __cuda_callable__( Index i, Index j, Index k ) {
+
+//       };
+
+//       auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j, Index k ) {
+
+//       };
+
+//       auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j, Index k ) {
+
+//       };
+
+//       auto convolve = [ = ] __cuda_callable__( float result, Index data, Index kernel )
+//       {
+//          return result + data * kernel;
+//       };
+
+//       auto store = [ = ] __cuda_callable__( Index i, Index j, Index k, Real result ) {
+
+//       };
+
+//       Launcher::exec< Index >( dimensions,
+//                                kernelSize,
+//                                std::forward< decltype( fetchData ) >( fetchData ),
+//                                std::forward< decltype( fetchBoundary ) >( fetchBoundary ),
+//                                std::forward< decltype( fetchKernel ) >( fetchKernel ),
+//                                std::forward< decltype( convolve ) >( convolve ),
+//                                std::forward< decltype( store ) >( store ) );
+//    }
+// };
diff --git a/src/Benchmarks/Convolution/support/Launcher.h b/src/Benchmarks/Convolution/support/Launcher.h
new file mode 100644
index 000000000..c86ed2057
--- /dev/null
+++ b/src/Benchmarks/Convolution/support/Launcher.h
@@ -0,0 +1,218 @@
+
+#pragma once
+
+#include
+#include
+
+template< int Dimension, typename Device >
+struct Convolution;
+
+template< int Dimension, typename Device >
+struct Launcher;
+// Host-side launcher of the 1D CUDA convolution kernel: picks the launch configuration and starts convolution1D.
+template<>
+struct Launcher< 1, TNL::Devices::Cuda >
+{
+public:
+   using Vector = TNL::Containers::StaticVector< 1, int >;
+   using ConvolutionKernel = Convolution< 1, TNL::Devices::Cuda >;
+   // Launches convolution1D for dimensions.x() elements and kernelSize.x() taps; the callbacks are passed to the kernel by value.
+   template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store >
+   static inline void
+   exec( const Vector& dimensions,
+         const Vector& kernelSize,
+         FetchData&& fetchData,
+         FetchBoundary&& fetchBoundary,
+         FetchKernel&& fetchKernel,
+         Convolve&& convolve,
+         Store&& store )
+   {
+      TNL::Cuda::LaunchConfiguration launchConfig;
+
+      launchConfig.dynamicSharedMemorySize =
+         ConvolutionKernel::getDynamicSharedMemorySize< Index >( kernelSize.x(), dimensions.x() );
+
+      // TODO: - Benchmark the best value
+      launchConfig.blockSize.x = 256;
+      launchConfig.gridSize.x =
+         TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), launchConfig.blockSize.x ) );
+      // NOTE(review): this fallback shrinks a grid that is already too small; the kernel is launched once and is not passed any index offset - confirm every element is still processed.
+      if( (std::size_t) launchConfig.blockSize.x * launchConfig.gridSize.x < (std::size_t) dimensions.x() ) {
+         const int desGridSize = 32 * TNL::Cuda::DeviceInfo::getCudaMultiprocessors( TNL::Cuda::DeviceInfo::getActiveDevice() );
+
+         launchConfig.gridSize.x =
+            TNL::min( desGridSize, TNL::Cuda::getNumberOfBlocks( dimensions.x(), launchConfig.blockSize.x ) );
+      }
+
+      constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >;
+
+      TNL::Cuda::launchKernel< true >( kernel,
+                                       0,
+                                       launchConfig,
+                                       kernelSize.x(),   // forwarded as kernelWidth
+                                       dimensions.x(),   // forwarded as endX
+                                       fetchData,
+                                       fetchBoundary,
+                                       fetchKernel,
+                                       convolve,
+                                       store );
+   }
+};
+
+// template<>
+// struct Launcher< 2, TNL::Devices::Cuda >
+// {
+// public:
+//    using Vector = TNL::Containers::StaticVector< 2, int >;
+//    using ConvolutionKernel = Convolution< 2, TNL::Devices::Cuda >;
+
+//    template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store >
+//    static inline void
+//    exec( const Vector& dimensions,
+//          const Vector& kernelSize,
+//          FetchData&& fetchData,
+// FetchBoundary&& fetchBoundary, +// FetchKernel&& fetchKernel, +// Convolve&& convolve, +// Store&& store ) +// { +// TNL::Cuda::LaunchConfiguration launchConfig; + +// launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >( +// kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y() ); + +// const Index sizeX = dimensions.x(); +// const Index sizeY = dimensions.y(); + +// if( sizeX >= sizeY * sizeY ) { +// launchConfig.blockSize.x = TNL::min( 256, sizeX ); +// launchConfig.blockSize.y = 1; +// } +// else if( sizeY >= sizeX * sizeX ) { +// launchConfig.blockSize.x = 1; +// launchConfig.blockSize.y = TNL::min( 256, sizeY ); +// } +// else { +// launchConfig.blockSize.x = TNL::min( 32, sizeX ); +// launchConfig.blockSize.y = TNL::min( 8, sizeY ); +// } + +// launchConfig.gridSize.x = +// TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) ); +// launchConfig.gridSize.y = +// TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) ); + +// dim3 gridCount; + +// gridCount.x = roundUpDivision( sizeX, launchConfig.blockSize.x * launchConfig.gridSize.x ); +// gridCount.y = roundUpDivision( sizeY, launchConfig.blockSize.y * launchConfig.gridSize.y ); + +// constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + +// TNL::Cuda::launchKernel< true >( kernel, +// 0, +// launchConfig, +// kernelSize.x(), +// kernelSize.y(), +// dimensions.x(), +// dimensions.y(), +// std::forward< FetchData >( fetchData ), +// std::forward< FetchBoundary >( fetchBoundary ), +// std::forward< FetchKernel >( fetchKernel ), +// std::forward< Convolve >( convolve ), +// std::forward< Store >( store ) ); +// } +// }; + +// template<> +// struct Launcher< 3, TNL::Devices::Cuda > +// { +// public: +// using Vector = TNL::Containers::StaticVector< 3, int >; +// using ConvolutionKernel = Convolution< 3, 
TNL::Devices::Cuda >; + +// template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > +// static inline void +// exec( const Vector& dimensions, +// const Vector& kernelSize, +// FetchData&& fetchData, +// FetchBoundary&& fetchBoundary, +// FetchKernel&& fetchKernel, +// Convolve&& convolve, +// Store&& store ) +// { +// const Index sizeX = dimensions.x(); +// const Index sizeY = dimensions.y(); +// const Index sizeZ = dimensions.z(); + +// TNL::Cuda::LaunchConfiguration launchConfig; + +// launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >( +// kernelSize.x(), kernelSize.y(), kernelSize.z(), dimensions.x(), dimensions.y(), dimensions.z() ); + +// if( sizeX >= sizeY * sizeY * sizeZ * sizeZ ) { +// launchConfig.blockSize.x = TNL::min( 256, sizeX ); +// launchConfig.blockSize.y = 1; +// launchConfig.blockSize.z = 1; +// } +// else if( sizeY >= sizeX * sizeX * sizeZ * sizeZ ) { +// launchConfig.blockSize.x = 1; +// launchConfig.blockSize.y = TNL::min( 256, sizeY ); +// launchConfig.blockSize.z = 1; +// } +// else if( sizeZ >= sizeX * sizeX * sizeY * sizeY ) { +// launchConfig.blockSize.x = TNL::min( 2, sizeX ); +// launchConfig.blockSize.y = TNL::min( 2, sizeY ); +// // CUDA allows max 64 for launchConfig.blockSize.z +// launchConfig.blockSize.z = TNL::min( 64, sizeZ ); +// } +// else if( sizeX >= sizeZ * sizeZ && sizeY >= sizeZ * sizeZ ) { +// launchConfig.blockSize.x = TNL::min( 32, sizeX ); +// launchConfig.blockSize.y = TNL::min( 8, sizeY ); +// launchConfig.blockSize.z = 1; +// } +// else if( sizeX >= sizeY * sizeY && sizeZ >= sizeY * sizeY ) { +// launchConfig.blockSize.x = TNL::min( 32, sizeX ); +// launchConfig.blockSize.y = 1; +// launchConfig.blockSize.z = TNL::min( 8, sizeZ ); +// } +// else if( sizeY >= sizeX * sizeX && sizeZ >= sizeX * sizeX ) { +// launchConfig.blockSize.x = 1; +// launchConfig.blockSize.y = TNL::min( 32, sizeY 
); +// launchConfig.blockSize.z = TNL::min( 8, sizeZ ); +// } +// else { +// launchConfig.blockSize.x = TNL::min( 16, sizeX ); +// launchConfig.blockSize.y = TNL::min( 4, sizeY ); +// launchConfig.blockSize.z = TNL::min( 4, sizeZ ); +// } +// launchConfig.gridSize.x = +// TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) ); +// launchConfig.gridSize.y = +// TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) ); +// launchConfig.gridSize.z = +// TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeZ, launchConfig.blockSize.z ) ); + +// dim3 gridCount; +// gridCount.x = roundUpDivision( sizeX, launchConfig.blockSize.x * launchConfig.gridSize.x ); +// gridCount.y = roundUpDivision( sizeY, launchConfig.blockSize.y * launchConfig.gridSize.y ); +// gridCount.z = roundUpDivision( sizeZ, launchConfig.blockSize.z * launchConfig.gridSize.z ); + +// constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + +// TNL::Cuda::launchKernel< true >( kernel, +// 0, +// launchConfig, +// kernelSize.x(), +// kernelSize.y(), +// kernelSize.z(), +// dimensions.x(), +// dimensions.y(), +// dimensions.z(), +// std::forward< FetchData >( fetchData ), +// std::forward< FetchBoundary >( fetchBoundary ), +// std::forward< FetchKernel >( fetchKernel ), +// std::forward< Convolve >( convolve ), +// std::forward< Store >( store ) ); +// } +// }; diff --git a/src/Benchmarks/Convolution/support/Solver.h b/src/Benchmarks/Convolution/support/Solver.h new file mode 100644 index 000000000..a6b1d2c91 --- /dev/null +++ b/src/Benchmarks/Convolution/support/Solver.h @@ -0,0 +1,52 @@ + +#pragma once + +#include + +#include +#include + +#include "Launcher.h" + +template< int Dimension, typename Device > +class Solver +{ +public: + void + solve( const TNL::Config::ParameterContainer& parameters ) const + { + if( ! 
TNL::Devices::Host::setup( parameters ) || ! TNL::Devices::Cuda::setup( parameters ) ) + return; + + start( parameters ); + } + + virtual void + start( const TNL::Config::ParameterContainer& parameters ) const + { + TNL_ASSERT_TRUE( false, "Should be overriden" ); + } + + virtual TNL::Config::ConfigDescription + makeInputConfig() const + { + TNL::Config::ConfigDescription config; + + config.addEntry< TNL::String >( "device", "Device the computation will run on.", "cuda" ); + config.addEntryEnum< TNL::String >( "all" ); + config.addEntryEnum< TNL::String >( "host" ); + +#ifdef HAVE_CUDA + config.addEntryEnum< TNL::String >( "cuda" ); +#endif + + config.addDelimiter( "Device settings:" ); + TNL::Devices::Host::configSetup( config ); + +#ifdef HAVE_CUDA + TNL::Devices::Cuda::configSetup( config ); +#endif + + return config; + } +}; diff --git a/src/Benchmarks/Convolution/templates/main_benchmark.h b/src/Benchmarks/Convolution/templates/main_benchmark.h new file mode 100644 index 000000000..e69de29bb diff --git a/src/Benchmarks/Convolution/templates/main_solver.h b/src/Benchmarks/Convolution/templates/main_solver.h new file mode 100644 index 000000000..1a6c33a9b --- /dev/null +++ b/src/Benchmarks/Convolution/templates/main_solver.h @@ -0,0 +1,25 @@ + +#include "../kernels/naive.h" +#include "../support/DummySolver.h" + +#include + +#define DIMENSION DIMENSION_VALUE + +using TaskSolver = DummySolver< DIMENSION, TNL::Devices::Cuda >; + +int main(int argc, char* argv[]) +{ + TaskSolver solver; + + auto config = solver.makeInputConfig(); + + TNL::Config::ParameterContainer parameters; + + if( ! 
parseCommandLine( argc, argv, config, parameters ) ) + return EXIT_FAILURE; + + solver.solve( parameters ); + + return 0; +} -- GitLab From 5a5a294c8ee811687be19bd7cfd05832d57decbe Mon Sep 17 00:00:00 2001 From: hayeuyur Date: Sat, 2 Apr 2022 13:15:37 +0200 Subject: [PATCH 02/19] Implement naive 2D kernel --- src/Benchmarks/Convolution/CMakeLists.txt | 1 + src/Benchmarks/Convolution/kernels/naive.h | 99 ++++++++-------- .../Convolution/support/DummyTask.h | 88 +++++++------- src/Benchmarks/Convolution/support/Launcher.h | 109 +++++++++--------- 4 files changed, 151 insertions(+), 146 deletions(-) diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt index 22d0876bc..6e31beaeb 100644 --- a/src/Benchmarks/Convolution/CMakeLists.txt +++ b/src/Benchmarks/Convolution/CMakeLists.txt @@ -23,3 +23,4 @@ endif() endfunction() GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/naive.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/naive.h") diff --git a/src/Benchmarks/Convolution/kernels/naive.h b/src/Benchmarks/Convolution/kernels/naive.h index a9e00d890..76c73237d 100644 --- a/src/Benchmarks/Convolution/kernels/naive.h +++ b/src/Benchmarks/Convolution/kernels/naive.h @@ -56,55 +56,64 @@ convolution1D( Index kernelWidth, store( ix, result ); } -// template<> -// struct Convolution< 2, TNL::Devices::Cuda > -// { -// public: -// template< typename Index > -// static size_t -// getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index endX, Index endY ) -// { -// return 0; -// } -// }; +template<> +struct Convolution< 2, TNL::Devices::Cuda > +{ +public: + template< typename Index > + static size_t + getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index endX, Index endY ) + { + return 0; + } +}; -// template< typename Index, -// typename Real, -// typename FetchData, -// typename FetchBoundary, -// typename FetchKernel, -// typename Convolve, 
-// typename Store > -// __global__ -// static void -// convolution2D( Index kernelWidth, -// Index kernelHeight, -// Index endX, -// Index endY, -// FetchData& fetchData, -// FetchBoundary& fetchBoundary, -// FetchKernel& fetchKernel, -// Convolve& convolve, -// Store& store ) -// { -// int iy = threadIdx.y + blockIdx.y * blockDim.y; -// int ix = threadIdx.x + blockIdx.x * blockDim.x; +template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > +__global__ +static void +convolution2D( Index kernelWidth, + Index kernelHeight, + Index endX, + Index endY, + FetchData fetchData, + FetchBoundary fetchBoundary, + FetchKernel fetchKernel, + Convolve convolve, + Store store ) +{ + Index iy = threadIdx.y + blockIdx.y * blockDim.y; + Index ix = threadIdx.x + blockIdx.x * blockDim.x; -// Real result = 0; + Index radiusY = kernelHeight >> 1; + Index radiusX = kernelHeight >> 1; -// for( Index j = iy - kernelHeight; j <= iy + kernelHeight; j++ ) { -// for( Index i = ix - kernelWidth; i <= ix + kernelWidth; i++ ) { -// if( i < 0 || i >= endX || j < 0 || j >= endY ) { -// result = convolve( result, fetchBoundary( i, j ) ); -// } -// else { -// result = convolve( result, fetchData( i, j ), fetchKernel( i, j ) ); -// } -// } -// } + Real result = 0; -// store( ix, iy, result ); -// } + for( Index j = - radiusY; j <= radiusY; j++ ) { + Index elementIndexY = j + iy; + Index kernelIndexY = j + radiusY; + + for( Index i = - radiusX; i <= radiusX; i++ ) { + Index elementIndexX = i + ix; + Index kernelIndexX = i + radiusX; + + if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY ) { + result = convolve( result, fetchBoundary( elementIndexX, elementIndexY ), fetchKernel ( kernelIndexX, kernelIndexY ) ); + } + else { + result = convolve( result, fetchData( elementIndexX, elementIndexY ), fetchKernel( kernelIndexX, kernelIndexY ) ); + } + } + } + + store( 
ix, iy, result ); +} // template<> // struct Convolution< 3, TNL::Devices::Cuda > diff --git a/src/Benchmarks/Convolution/support/DummyTask.h b/src/Benchmarks/Convolution/support/DummyTask.h index 22565ac1b..f92a5c2fc 100644 --- a/src/Benchmarks/Convolution/support/DummyTask.h +++ b/src/Benchmarks/Convolution/support/DummyTask.h @@ -54,59 +54,59 @@ public: } }; -// template< typename Index, typename Real > -// struct DummyTask< Index, Real, 2, TNL::Devices::Cuda > -// { -// public: -// static constexpr int Dimension = 2; -// using Device = TNL::Devices::Cuda; -// using Vector = TNL::Containers::StaticVector< Dimension, Index >; -// using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType; -// using Launcher = Launcher< Dimension, Device >; +template< typename Index, typename Real > +struct DummyTask< Index, Real, 2, TNL::Devices::Cuda > +{ +public: + static constexpr int Dimension = 2; + using Device = TNL::Devices::Cuda; + using Vector = TNL::Containers::StaticVector< Dimension, Index >; + using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType; + using Launcher = Launcher< Dimension, Device >; -// static void -// exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel ) -// { -// auto fetchData = [ = ] __cuda_callable__( Index i, Index j ) -// { -// auto index = i + j * dimensions.x(); + static void + exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel ) + { + auto fetchData = [ = ] __cuda_callable__( Index i, Index j ) + { + auto index = i + j * dimensions.x(); -// return input[ index ]; -// }; + return input[ index ]; + }; -// auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j ) -// { -// return -1; -// }; + auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j ) + { + return -1; + }; -// auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j ) -// { -// auto index = 
i + j * kernel.x(); + auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j ) + { + auto index = i + j * kernelSize.x(); -// return kernel[ index ]; -// }; + return kernel[ index ]; + }; -// auto convolve = [ = ] __cuda_callable__( Real result, Index data, Index kernel ) -// { -// return result + data * kernel; -// }; + auto convolve = [ = ] __cuda_callable__( Real result, Index data, Index kernel ) + { + return result + data * kernel; + }; -// auto store = [ = ] __cuda_callable__( Index i, Index j, Real resultValue ) -// { -// auto index = i + j * dimensions.x(); + auto store = [ = ] __cuda_callable__( Index i, Index j, Real resultValue ) mutable + { + auto index = i + j * dimensions.x(); -// result[ index ] = resultValue; -// }; + result[ index ] = resultValue; + }; -// Launcher::exec< Index >( dimensions, -// kernelSize, -// std::forward< decltype( fetchData ) >( fetchData ), -// std::forward< decltype( fetchBoundary ) >( fetchBoundary ), -// std::forward< decltype( fetchKernel ) >( fetchKernel ), -// std::forward< decltype( convolve ) >( convolve ), -// std::forward< decltype( store ) >( store ) ); -// } -// }; + Launcher::exec< Index, Real >( dimensions, + kernelSize, + std::forward< decltype( fetchData ) >( fetchData ), + std::forward< decltype( fetchBoundary ) >( fetchBoundary ), + std::forward< decltype( fetchKernel ) >( fetchKernel ), + std::forward< decltype( convolve ) >( convolve ), + std::forward< decltype( store ) >( store ) ); + } +}; // template< typename Index, typename Real > // struct DummyTask< Index, Real, 3, TNL::Devices::Cuda > diff --git a/src/Benchmarks/Convolution/support/Launcher.h b/src/Benchmarks/Convolution/support/Launcher.h index c86ed2057..fde1e91ab 100644 --- a/src/Benchmarks/Convolution/support/Launcher.h +++ b/src/Benchmarks/Convolution/support/Launcher.h @@ -59,70 +59,65 @@ public: } }; -// template<> -// struct Launcher< 2, TNL::Devices::Cuda > -// { -// public: -// using Vector = TNL::Containers::StaticVector< 2, int >; 
-// using ConvolutionKernel = Convolution< 2, TNL::Devices::Cuda >; - -// template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > -// static inline void -// exec( const Vector& dimensions, -// const Vector& kernelSize, -// FetchData&& fetchData, -// FetchBoundary&& fetchBoundary, -// FetchKernel&& fetchKernel, -// Convolve&& convolve, -// Store&& store ) -// { -// TNL::Cuda::LaunchConfiguration launchConfig; - -// launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >( -// kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y() ); +template<> +struct Launcher< 2, TNL::Devices::Cuda > +{ +public: + using Vector = TNL::Containers::StaticVector< 2, int >; + using ConvolutionKernel = Convolution< 2, TNL::Devices::Cuda >; -// const Index sizeX = dimensions.x(); -// const Index sizeY = dimensions.y(); + template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > + static inline void + exec( const Vector& dimensions, + const Vector& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration launchConfig; -// if( sizeX >= sizeY * sizeY ) { -// launchConfig.blockSize.x = TNL::min( 256, sizeX ); -// launchConfig.blockSize.y = 1; -// } -// else if( sizeY >= sizeX * sizeX ) { -// launchConfig.blockSize.x = 1; -// launchConfig.blockSize.y = TNL::min( 256, sizeY ); -// } -// else { -// launchConfig.blockSize.x = TNL::min( 32, sizeX ); -// launchConfig.blockSize.y = TNL::min( 8, sizeY ); -// } + launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >( + kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y() ); -// launchConfig.gridSize.x = -// TNL::min( TNL::Cuda::getMaxGridSize(), 
TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) ); -// launchConfig.gridSize.y = -// TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) ); + const Index sizeX = dimensions.x(); + const Index sizeY = dimensions.y(); -// dim3 gridCount; + if( sizeX >= sizeY * sizeY ) { + launchConfig.blockSize.x = TNL::min( 256, sizeX ); + launchConfig.blockSize.y = 1; + } + else if( sizeY >= sizeX * sizeX ) { + launchConfig.blockSize.x = 1; + launchConfig.blockSize.y = TNL::min( 256, sizeY ); + } + else { + launchConfig.blockSize.x = TNL::min( 32, sizeX ); + launchConfig.blockSize.y = TNL::min( 8, sizeY ); + } -// gridCount.x = roundUpDivision( sizeX, launchConfig.blockSize.x * launchConfig.gridSize.x ); -// gridCount.y = roundUpDivision( sizeY, launchConfig.blockSize.y * launchConfig.gridSize.y ); + launchConfig.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) ); + launchConfig.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) ); -// constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; -// TNL::Cuda::launchKernel< true >( kernel, -// 0, -// launchConfig, -// kernelSize.x(), -// kernelSize.y(), -// dimensions.x(), -// dimensions.y(), -// std::forward< FetchData >( fetchData ), -// std::forward< FetchBoundary >( fetchBoundary ), -// std::forward< FetchKernel >( fetchKernel ), -// std::forward< Convolve >( convolve ), -// std::forward< Store >( store ) ); -// } -// }; + TNL::Cuda::launchKernel< true >( kernel, + 0, + launchConfig, + kernelSize.x(), + kernelSize.y(), + dimensions.x(), + dimensions.y(), + fetchData, + fetchBoundary, + fetchKernel, + convolve, + store ); + } +}; // template<> // struct Launcher< 3, 
TNL::Devices::Cuda > -- GitLab From c710d0b218937bc01e84419f4894ba009d9c87ef Mon Sep 17 00:00:00 2001 From: hayeuyur Date: Sat, 2 Apr 2022 14:22:51 +0200 Subject: [PATCH 03/19] Implement naive 3D kernel --- src/Benchmarks/Convolution/CMakeLists.txt | 2 +- src/Benchmarks/Convolution/kernels/naive.h | 133 +++++++------ .../Convolution/support/DummyTask.h | 84 ++++---- src/Benchmarks/Convolution/support/Launcher.h | 180 +++++++++--------- 4 files changed, 214 insertions(+), 185 deletions(-) diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt index 6e31beaeb..4c80ff07e 100644 --- a/src/Benchmarks/Convolution/CMakeLists.txt +++ b/src/Benchmarks/Convolution/CMakeLists.txt @@ -13,7 +13,7 @@ if (${BUILD_CUDA}) FILE(WRITE ${SOURCE_FILE} "${TEMPLATE_CONTENT}") - SET(EXECUTABLE_NAME "${PREFIX}_${DIMENSION}_${MODULE_NAME}") + SET(EXECUTABLE_NAME "${PREFIX}_${DIMENSION}_${MODULE_NAME}_${TEMPLATE_NAME}") CUDA_ADD_EXECUTABLE(${EXECUTABLE_NAME} ${SOURCE_FILE}) else() diff --git a/src/Benchmarks/Convolution/kernels/naive.h b/src/Benchmarks/Convolution/kernels/naive.h index 76c73237d..2a8cf47ca 100644 --- a/src/Benchmarks/Convolution/kernels/naive.h +++ b/src/Benchmarks/Convolution/kernels/naive.h @@ -37,6 +37,10 @@ convolution1D( Index kernelWidth, Store store ) { Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + if (ix >= endX) + return; + Index radius = kernelWidth >> 1; Real result = 0; @@ -90,8 +94,11 @@ convolution2D( Index kernelWidth, Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; + if (ix >= endX || iy >= endY) + return; + Index radiusY = kernelHeight >> 1; - Index radiusX = kernelHeight >> 1; + Index radiusX = kernelWidth >> 1; Real result = 0; @@ -115,59 +122,75 @@ convolution2D( Index kernelWidth, store( ix, iy, result ); } -// template<> -// struct Convolution< 3, TNL::Devices::Cuda > -// { -// public: -// template< typename Index > -// static size_t -// 
getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ ) -// { -// return 0; -// } -// }; - -// template< typename Index, -// typename Real, -// typename FetchData, -// typename FetchBoundary, -// typename FetchKernel, -// typename Convolve, -// typename Store > -// __global__ -// static void -// convolution3D( Index kernelWidth, -// Index kernelHeight, -// Index kernelDepth, -// Index endX, -// Index endY, -// Index endZ, -// FetchData& fetchData, -// FetchBoundary& fetchBoundary, -// FetchKernel& fetchKernel, -// Convolve& convolve, -// Store& store ) -// { -// int ix = threadIdx.x + blockIdx.x * blockDim.x; -// int iy = threadIdx.y + blockIdx.y * blockDim.y; -// int iz = threadIdx.z + blockIdx.z * blockDim.z; - -// Real result = 0; - -// for( Index k = iz - kernelDepth; k <= iz + kernelDepth; k++ ) { -// for( Index j = iy - kernelHeight; j <= iy + kernelHeight; j++ ) { -// for( Index i = ix - kernelWidth; i <= ix + kernelWidth; i++ ) { -// if( i < 0 || i >= endX || j < 0 || j >= endY || k < 0 || k >= endZ ) { -// result = convolve( result, fetchBoundary( i, j, k ) ); -// } -// else { -// result = convolve( result, fetchData( i, j, k ), fetchKernel( i, j, k ) ); -// } -// } -// } -// } - -// store( ix, iy, iz, result ); -// } +template<> +struct Convolution< 3, TNL::Devices::Cuda > +{ +public: + template< typename Index > + static size_t + getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ ) + { + return 0; + } +}; + +template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > +__global__ +static void +convolution3D( Index kernelWidth, + Index kernelHeight, + Index kernelDepth, + Index endX, + Index endY, + Index endZ, + FetchData fetchData, + FetchBoundary fetchBoundary, + FetchKernel fetchKernel, + Convolve convolve, + Store store ) 
+{ + Index iz = threadIdx.z + blockIdx.z * blockDim.z; + Index iy = threadIdx.y + blockIdx.y * blockDim.y; + Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + if (ix >= endX || iy >= endY || iz >= endZ) + return; + + Index radiusZ = kernelDepth >> 1; + Index radiusY = kernelHeight >> 1; + Index radiusX = kernelWidth >> 1; + + Real result = 0; + + for( Index k = -radiusZ; k <= radiusZ; k++ ) { + Index elementIndexZ = k + iz; + Index kernelIndexZ = k + radiusZ; + + for( Index j = -radiusY; j <= radiusY; j++ ) { + Index elementIndexY = j + iy; + Index kernelIndexY = j + radiusY; + + for( Index i = -radiusX; i <= radiusX; i++ ) { + Index elementIndexX = i + ix; + Index kernelIndexX = i + radiusX; + + if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0 || elementIndexZ >= endZ ) { + result = convolve( result, fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) ); + } + else { + result = convolve( result, fetchData( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) ); + } + } + } + } + + store( ix, iy, iz, result ); +} #endif diff --git a/src/Benchmarks/Convolution/support/DummyTask.h b/src/Benchmarks/Convolution/support/DummyTask.h index f92a5c2fc..f7db47e34 100644 --- a/src/Benchmarks/Convolution/support/DummyTask.h +++ b/src/Benchmarks/Convolution/support/DummyTask.h @@ -17,7 +17,7 @@ public: using Launcher = Launcher< Dimension, Device >; static void - exec( const Vector& dimensions, const Vector& kernelSize, DataStore input, DataStore result, DataStore kernel ) + exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel ) { auto fetchData = [ = ] __cuda_callable__( Index i ) { @@ -36,12 +36,12 @@ public: auto convolve = [ = ] __cuda_callable__( Real result, Index data, Index kernel ) { - return result + data * kernel; + return 
result + data * kernel; }; auto store = [ = ] __cuda_callable__( Index i, Real resultValue ) mutable { - result[i] = resultValue; + result[ i ] = resultValue; }; Launcher::exec< Index, Real >( dimensions, @@ -108,46 +108,56 @@ public: } }; -// template< typename Index, typename Real > -// struct DummyTask< Index, Real, 3, TNL::Devices::Cuda > -// { -// public: -// static constexpr int Dimension = 3; -// using Device = TNL::Devices::Cuda; -// using Vector = TNL::Containers::StaticVector< Dimension, Index >; -// using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType; -// using Launcher = Launcher< Dimension, Device >; - -// static void -// exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel ) -// { -// auto fetchData = [ = ] __cuda_callable__( Index i, Index j, Index k ) { +template< typename Index, typename Real > +struct DummyTask< Index, Real, 3, TNL::Devices::Cuda > +{ +public: + static constexpr int Dimension = 3; + using Device = TNL::Devices::Cuda; + using Vector = TNL::Containers::StaticVector< Dimension, Index >; + using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType; + using Launcher = Launcher< Dimension, Device >; -// }; + static void + exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel ) + { + auto fetchData = [ = ] __cuda_callable__( Index i, Index j, Index k ) + { + auto index = i + j * dimensions.x() + k * dimensions.x() * dimensions.y(); -// auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j, Index k ) { + return input[index]; + }; -// }; + auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j, Index k ) + { + return 1; + }; -// auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j, Index k ) { + auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j, Index k ) + { + auto index = i + j * kernelSize.x() + k * kernelSize.x() * 
kernelSize.y(); -// }; + return kernel[ index ]; + }; -// auto convolve = [ = ] __cuda_callable__( float result, Index data, Index kernel ) -// { -// return result + data * kernel; -// }; + auto convolve = [ = ] __cuda_callable__( float result, Index data, Index kernel ) + { + return result + data * kernel; + }; -// auto store = [ = ] __cuda_callable__( Index i, Index j, Index k, Real result ) { + auto store = [ = ] __cuda_callable__( Index i, Index j, Index k, Real resultValue ) mutable + { + auto index = i + j * dimensions.x() + k * dimensions.x() * dimensions.y(); -// }; + result[ index ] = resultValue; + }; -// Launcher::exec< Index >( dimensions, -// kernelSize, -// std::forward< decltype( fetchData ) >( fetchData ), -// std::forward< decltype( fetchBoundary ) >( fetchBoundary ), -// std::forward< decltype( fetchKernel ) >( fetchKernel ), -// std::forward< decltype( convolve ) >( convolve ), -// std::forward< decltype( store ) >( store ) ); -// } -// }; + Launcher::exec< Index, Real >( dimensions, + kernelSize, + std::forward< decltype( fetchData ) >( fetchData ), + std::forward< decltype( fetchBoundary ) >( fetchBoundary ), + std::forward< decltype( fetchKernel ) >( fetchKernel ), + std::forward< decltype( convolve ) >( convolve ), + std::forward< decltype( store ) >( store ) ); + } +}; diff --git a/src/Benchmarks/Convolution/support/Launcher.h b/src/Benchmarks/Convolution/support/Launcher.h index fde1e91ab..94e9b096b 100644 --- a/src/Benchmarks/Convolution/support/Launcher.h +++ b/src/Benchmarks/Convolution/support/Launcher.h @@ -119,95 +119,91 @@ public: } }; -// template<> -// struct Launcher< 3, TNL::Devices::Cuda > -// { -// public: -// using Vector = TNL::Containers::StaticVector< 3, int >; -// using ConvolutionKernel = Convolution< 3, TNL::Devices::Cuda >; - -// template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > -// static inline void -// exec( const Vector& 
dimensions, -// const Vector& kernelSize, -// FetchData&& fetchData, -// FetchBoundary&& fetchBoundary, -// FetchKernel&& fetchKernel, -// Convolve&& convolve, -// Store&& store ) -// { -// const Index sizeX = dimensions.x(); -// const Index sizeY = dimensions.y(); -// const Index sizeZ = dimensions.z(); - -// TNL::Cuda::LaunchConfiguration launchConfig; - -// launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >( -// kernelSize.x(), kernelSize.y(), kernelSize.z(), dimensions.x(), dimensions.y(), dimensions.z() ); - -// if( sizeX >= sizeY * sizeY * sizeZ * sizeZ ) { -// launchConfig.blockSize.x = TNL::min( 256, sizeX ); -// launchConfig.blockSize.y = 1; -// launchConfig.blockSize.z = 1; -// } -// else if( sizeY >= sizeX * sizeX * sizeZ * sizeZ ) { -// launchConfig.blockSize.x = 1; -// launchConfig.blockSize.y = TNL::min( 256, sizeY ); -// launchConfig.blockSize.z = 1; -// } -// else if( sizeZ >= sizeX * sizeX * sizeY * sizeY ) { -// launchConfig.blockSize.x = TNL::min( 2, sizeX ); -// launchConfig.blockSize.y = TNL::min( 2, sizeY ); -// // CUDA allows max 64 for launchConfig.blockSize.z -// launchConfig.blockSize.z = TNL::min( 64, sizeZ ); -// } -// else if( sizeX >= sizeZ * sizeZ && sizeY >= sizeZ * sizeZ ) { -// launchConfig.blockSize.x = TNL::min( 32, sizeX ); -// launchConfig.blockSize.y = TNL::min( 8, sizeY ); -// launchConfig.blockSize.z = 1; -// } -// else if( sizeX >= sizeY * sizeY && sizeZ >= sizeY * sizeY ) { -// launchConfig.blockSize.x = TNL::min( 32, sizeX ); -// launchConfig.blockSize.y = 1; -// launchConfig.blockSize.z = TNL::min( 8, sizeZ ); -// } -// else if( sizeY >= sizeX * sizeX && sizeZ >= sizeX * sizeX ) { -// launchConfig.blockSize.x = 1; -// launchConfig.blockSize.y = TNL::min( 32, sizeY ); -// launchConfig.blockSize.z = TNL::min( 8, sizeZ ); -// } -// else { -// launchConfig.blockSize.x = TNL::min( 16, sizeX ); -// launchConfig.blockSize.y = TNL::min( 4, sizeY ); -// launchConfig.blockSize.z = 
TNL::min( 4, sizeZ ); -// } -// launchConfig.gridSize.x = -// TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) ); -// launchConfig.gridSize.y = -// TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) ); -// launchConfig.gridSize.z = -// TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeZ, launchConfig.blockSize.z ) ); - -// dim3 gridCount; -// gridCount.x = roundUpDivision( sizeX, launchConfig.blockSize.x * launchConfig.gridSize.x ); -// gridCount.y = roundUpDivision( sizeY, launchConfig.blockSize.y * launchConfig.gridSize.y ); -// gridCount.z = roundUpDivision( sizeZ, launchConfig.blockSize.z * launchConfig.gridSize.z ); - -// constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; - -// TNL::Cuda::launchKernel< true >( kernel, -// 0, -// launchConfig, -// kernelSize.x(), -// kernelSize.y(), -// kernelSize.z(), -// dimensions.x(), -// dimensions.y(), -// dimensions.z(), -// std::forward< FetchData >( fetchData ), -// std::forward< FetchBoundary >( fetchBoundary ), -// std::forward< FetchKernel >( fetchKernel ), -// std::forward< Convolve >( convolve ), -// std::forward< Store >( store ) ); -// } -// }; +template<> +struct Launcher< 3, TNL::Devices::Cuda > +{ +public: + using Vector = TNL::Containers::StaticVector< 3, int >; + using ConvolutionKernel = Convolution< 3, TNL::Devices::Cuda >; + + template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > + static inline void + exec( const Vector& dimensions, + const Vector& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + const Index sizeX = dimensions.x(); + const Index sizeY = dimensions.y(); + const Index sizeZ = dimensions.z(); + + 
TNL::Cuda::LaunchConfiguration launchConfig; + + launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >( + kernelSize.x(), kernelSize.y(), kernelSize.z(), dimensions.x(), dimensions.y(), dimensions.z() ); + + if( sizeX >= sizeY * sizeY * sizeZ * sizeZ ) { + launchConfig.blockSize.x = TNL::min( 256, sizeX ); + launchConfig.blockSize.y = 1; + launchConfig.blockSize.z = 1; + } + else if( sizeY >= sizeX * sizeX * sizeZ * sizeZ ) { + launchConfig.blockSize.x = 1; + launchConfig.blockSize.y = TNL::min( 256, sizeY ); + launchConfig.blockSize.z = 1; + } + else if( sizeZ >= sizeX * sizeX * sizeY * sizeY ) { + launchConfig.blockSize.x = TNL::min( 2, sizeX ); + launchConfig.blockSize.y = TNL::min( 2, sizeY ); + // CUDA allows max 64 for launchConfig.blockSize.z + launchConfig.blockSize.z = TNL::min( 64, sizeZ ); + } + else if( sizeX >= sizeZ * sizeZ && sizeY >= sizeZ * sizeZ ) { + launchConfig.blockSize.x = TNL::min( 32, sizeX ); + launchConfig.blockSize.y = TNL::min( 8, sizeY ); + launchConfig.blockSize.z = 1; + } + else if( sizeX >= sizeY * sizeY && sizeZ >= sizeY * sizeY ) { + launchConfig.blockSize.x = TNL::min( 32, sizeX ); + launchConfig.blockSize.y = 1; + launchConfig.blockSize.z = TNL::min( 8, sizeZ ); + } + else if( sizeY >= sizeX * sizeX && sizeZ >= sizeX * sizeX ) { + launchConfig.blockSize.x = 1; + launchConfig.blockSize.y = TNL::min( 32, sizeY ); + launchConfig.blockSize.z = TNL::min( 8, sizeZ ); + } + else { + launchConfig.blockSize.x = TNL::min( 16, sizeX ); + launchConfig.blockSize.y = TNL::min( 4, sizeY ); + launchConfig.blockSize.z = TNL::min( 4, sizeZ ); + } + + launchConfig.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) ); + launchConfig.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) ); + launchConfig.gridSize.z = + TNL::min( TNL::Cuda::getMaxGridSize(), 
TNL::Cuda::getNumberOfBlocks( sizeZ, launchConfig.blockSize.z ) ); + + constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + TNL::Cuda::launchKernel< true >( kernel, + 0, + launchConfig, + kernelSize.x(), + kernelSize.y(), + kernelSize.z(), + dimensions.x(), + dimensions.y(), + dimensions.z(), + fetchData, + fetchBoundary, + fetchKernel, + convolve, + store ); + } +}; -- GitLab From 868f233e64c3a6bf1c4c4057a42dbac3e93878e9 Mon Sep 17 00:00:00 2001 From: hayeuyur Date: Sat, 2 Apr 2022 14:23:20 +0200 Subject: [PATCH 04/19] Implement benchmarks for naive kernel --- src/Benchmarks/Convolution/CMakeLists.txt | 5 +++ .../Convolution/support/Benchmark.h | 11 +++--- .../Convolution/support/DummyBenchmark.h | 37 ++++++++++--------- .../Convolution/templates/main_benchmark.h | 25 +++++++++++++ 4 files changed, 54 insertions(+), 24 deletions(-) diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt index 4c80ff07e..d8a0c683c 100644 --- a/src/Benchmarks/Convolution/CMakeLists.txt +++ b/src/Benchmarks/Convolution/CMakeLists.txt @@ -24,3 +24,8 @@ endfunction() GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/naive.h") GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/naive.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/naive.h") + +GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/naive.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/naive.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/naive.h") diff --git a/src/Benchmarks/Convolution/support/Benchmark.h b/src/Benchmarks/Convolution/support/Benchmark.h index f5671a06b..ce1b91b23 100644 --- a/src/Benchmarks/Convolution/support/Benchmark.h +++ b/src/Benchmarks/Convolution/support/Benchmark.h @@ -14,10 +14,10 @@ template< int Dimension, 
typename Device > class Benchmark { public: - using Benchmark = typename TNL::Benchmarks::Benchmark<>; + using TNLBenchmark = typename TNL::Benchmarks::Benchmark<>; void - runBenchmark( const TNL::Config::ParameterContainer& parameters ) const + run( const TNL::Config::ParameterContainer& parameters ) const { if( ! TNL::Devices::Host::setup( parameters ) || ! TNL::Devices::Cuda::setup( parameters ) ) return; @@ -36,7 +36,7 @@ public: std::ofstream logFile( logFileName.getString(), mode ); - Benchmark benchmark( logFile, loops, verbose ); + TNLBenchmark benchmark( logFile, loops, verbose ); std::map< std::string, std::string > metadata = TNL::Benchmarks::getHardwareMetadata(); TNL::Benchmarks::writeMapAsJson( metadata, logFileName, ".metadata.json" ); @@ -44,8 +44,8 @@ public: start(benchmark, parameters); } - virtual void start(const Benchmark& benchmark, const TNL::Config::ParameterContainer& parameters) const { - TNL_ASSERT_TRUE(false, << "Should be overriden"); + virtual void start( TNLBenchmark& benchmark, const TNL::Config::ParameterContainer& parameters) const { + TNL_ASSERT_TRUE(false, "Should be overriden"); } virtual TNL::Config::ConfigDescription makeInputConfig() const { @@ -69,7 +69,6 @@ public: config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); config.addEntry< int >( "verbose", "Verbose mode.", 1 ); - config.addDelimiter( "Device settings:" ); TNL::Devices::Host::configSetup( config ); diff --git a/src/Benchmarks/Convolution/support/DummyBenchmark.h b/src/Benchmarks/Convolution/support/DummyBenchmark.h index 1830e7484..8c8e60be7 100644 --- a/src/Benchmarks/Convolution/support/DummyBenchmark.h +++ b/src/Benchmarks/Convolution/support/DummyBenchmark.h @@ -16,12 +16,12 @@ class DummyBenchmark : public Benchmark< Dimension, Device > { public: using Vector = TNL::Containers::StaticVector< Dimension, int >; - using DataStore = TNL::Containers::Array< int, Device, float >; - using Benchmark = Base::Benchmark; + using 
DataStore = TNL::Containers::Array< float, Device, int >; using Base = Benchmark< Dimension, Device >; + using TNLBenchmark = typename Base::TNLBenchmark; virtual void - start( const Benchmark& benchmark, const TNL::Config::ParameterContainer& parameters ) const override + start( TNLBenchmark& benchmark, const TNL::Config::ParameterContainer& parameters ) const override { Vector start; Vector end; @@ -53,7 +53,7 @@ public: } virtual void - time( Benchmark& bencmark, + time( TNLBenchmark& benchmark, const Vector& minDimension, const Vector& maxDimension, const int dimensionStep, @@ -68,14 +68,14 @@ public: currentKernelSize = minKernelSize; do { - time( benchmark, currentDimension, currentKernelSize ); + timeConvolution( benchmark, currentDimension, currentKernelSize ); currentKernelSize[ 0 ] += kernelStep; for( size_t i = 0; i < currentKernelSize.getSize() - 1; i++ ) { if( currentKernelSize[ i ] >= maxKernelSize[ i ] ) { currentKernelSize[ i ] = minKernelSize[ i ]; - maxKernelSize[ i + 1 ] += kernelStep; + currentKernelSize[ i + 1 ] += kernelStep; } } } while( currentKernelSize < maxKernelSize ); @@ -85,7 +85,7 @@ public: for( size_t i = 0; i < currentDimension.getSize() - 1; i++ ) { if( currentDimension[ i ] >= maxDimension[ i ] ) { currentDimension[ i ] = minDimension[ i ]; - maxDimension[ i ] = maxDimension[ i ]; + currentDimension[ i ] = maxDimension[ i ]; } } @@ -93,11 +93,11 @@ public: } void - timeConvolution( Benchmark& benchmark, const Vector& dimension, const Vector& kernelSize ) const + timeConvolution( TNLBenchmark& benchmark, const Vector& dimension, const Vector& kernelSize ) const { auto device = TNL::getType< Device >(); - Benchmark::MetadataColumns columns = {}; + typename TNLBenchmark::MetadataColumns columns; size_t elementsCount = 1; size_t kernelElementsCount = 1; @@ -106,18 +106,19 @@ public: elementsCount *= dimension[ i ]; kernelElementsCount *= kernelSize[ i ]; - columns.insert( { dimensionIds[ i ], dimension[ i ] } ); - columns.insert( { 
kernelSizeIds[ i ], kernelSize[ i ] } ); + columns.push_back( { dimensionIds[ i ], TNL::convertToString(dimension[ i ]) } ); + columns.push_back( { kernelSizeIds[ i ], TNL::convertToString(kernelSize[ i ]) } ); } benchmark.setDatasetSize( ( elementsCount * 4 ) / 1.e9, 1.0 ); + benchmark.setMetadataColumns( columns ); // Setup input data DataStore input, result, kernel; input.resize( elementsCount ); result.resize( elementsCount ); - kernel.resize( kernelSize ); + kernel.resize( kernelElementsCount ); input = 1; result = 1; @@ -129,24 +130,24 @@ public: auto measure = [ & ]() { - DummyTask::exec(dimension, kernelSize, inputView, resultView, kernelView); + DummyTask::exec(dimension, kernelSize, inputView, resultView, kernelView); }; - benchmark.time< Device >( device, measure ); + benchmark.template time( device, measure ); } TNL::Config::ConfigDescription makeInputConfig() const override { - auto config = Base::makeInputConfig(); + TNL::Config::ConfigDescription config = Base::makeInputConfig(); config.addDelimiter( "Grid dimension settings:" ); for( int i = 0; i < Dimension; i++ ) - config.addEntry< int >( minDimensionIds[ i ], minDimensionIds[ i ], 512 ); + config.addEntry< int >( minDimensionIds[ i ], minDimensionIds[ i ], 16 ); for( int i = 0; i < Dimension; i++ ) - config.addEntry< int >( maxDimensionIds[ i ], maxDimensionIds[ i ], 512 ); + config.addEntry< int >( maxDimensionIds[ i ], maxDimensionIds[ i ], 128 ); config.addEntry< int >( "dimension-step", "Step of kernel increase by which dimension is multiplied (must be even)", 2 ); @@ -156,7 +157,7 @@ public: config.addEntry< int >( minKernelSizeIds[ i ], minKernelSizeIds[ i ] + " (odd) :", 1 ); for( int i = 0; i < Dimension; i++ ) - config.addEntry< int >( minKernelSizeIds[ i ], minKernelSizeIds[ i ] + " (odd) :", 11 ); + config.addEntry< int >( maxKernelSizeIds[ i ], maxKernelSizeIds[ i ] + " (odd) :", 11 ); config.addEntry< int >( "kernel-step", "Step of kernel increase which is added to kernel (must be 
even)", 2 ); diff --git a/src/Benchmarks/Convolution/templates/main_benchmark.h b/src/Benchmarks/Convolution/templates/main_benchmark.h index e69de29bb..5124922e6 100644 --- a/src/Benchmarks/Convolution/templates/main_benchmark.h +++ b/src/Benchmarks/Convolution/templates/main_benchmark.h @@ -0,0 +1,25 @@ + +#include "../kernels/naive.h" +#include "../support/DummyBenchmark.h" + +#include + +#define DIMENSION DIMENSION_VALUE + +using TaskBenchmark = DummyBenchmark< DIMENSION, TNL::Devices::Cuda >; + +int main(int argc, char* argv[]) +{ + TaskBenchmark benchmark; + + auto config = benchmark.makeInputConfig(); + + TNL::Config::ParameterContainer parameters; + + if( ! parseCommandLine( argc, argv, config, parameters ) ) + return EXIT_FAILURE; + + benchmark.run( parameters ); + + return 0; +} -- GitLab From 3781cb67e0da09317df14979e9295443a9abfdc4 Mon Sep 17 00:00:00 2001 From: hayeuyur Date: Sat, 2 Apr 2022 16:49:21 +0200 Subject: [PATCH 05/19] Move out kernel coniguration to convolution task --- src/Benchmarks/Convolution/kernels/naive.h | 85 +++++++++++++----- .../Convolution/support/DummyBenchmark.h | 5 +- src/Benchmarks/Convolution/support/Launcher.h | 86 +------------------ 3 files changed, 70 insertions(+), 106 deletions(-) diff --git a/src/Benchmarks/Convolution/kernels/naive.h b/src/Benchmarks/Convolution/kernels/naive.h index 2a8cf47ca..f8b5966e2 100644 --- a/src/Benchmarks/Convolution/kernels/naive.h +++ b/src/Benchmarks/Convolution/kernels/naive.h @@ -1,8 +1,9 @@ #ifdef HAVE_CUDA -#include -#include + #include + #include + #include template< int Dimension, typename Device > struct Convolution; @@ -12,10 +13,18 @@ struct Convolution< 1, TNL::Devices::Cuda > { public: template< typename Index > - static size_t - getDynamicSharedMemorySize( Index kernelWidth, Index endX ) + using Vector = TNL::Containers::StaticVector< 1, Index >; + + template< typename Index > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& 
dimensions, const Vector< Index >& kernelSize ) { - return 0; + configuration.dynamicSharedMemorySize = 0; + + // TODO: - Benchmark the best value + configuration.blockSize.x = kernelSize.x(); + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); } }; @@ -36,9 +45,9 @@ convolution1D( Index kernelWidth, Convolve convolve, Store store ) { - Index ix = threadIdx.x + blockIdx.x * blockDim.x; + Index ix = threadIdx.x + blockIdx.x * blockDim.x; - if (ix >= endX) + if( ix >= endX ) return; Index radius = kernelWidth >> 1; @@ -65,10 +74,22 @@ struct Convolution< 2, TNL::Devices::Cuda > { public: template< typename Index > - static size_t - getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index endX, Index endY ) + using Vector = TNL::Containers::StaticVector< 2, Index >; + + template< typename Index > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { - return 0; + configuration.dynamicSharedMemorySize = 0; + + // TODO: - Benchmark the best value + configuration.blockSize.x = kernelSize.x(); + configuration.blockSize.y = kernelSize.y(); + + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); } }; @@ -94,7 +115,7 @@ convolution2D( Index kernelWidth, Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; - if (ix >= endX || iy >= endY) + if( ix >= endX || iy >= endY ) return; Index radiusY = kernelHeight >> 1; @@ -102,16 +123,17 @@ convolution2D( Index kernelWidth, Real result = 0; - for( Index j = - radiusY; j <= radiusY; j++ ) { + for( Index j = -radiusY; j <= radiusY; j++ ) { Index 
elementIndexY = j + iy; Index kernelIndexY = j + radiusY; - for( Index i = - radiusX; i <= radiusX; i++ ) { + for( Index i = -radiusX; i <= radiusX; i++ ) { Index elementIndexX = i + ix; Index kernelIndexX = i + radiusX; if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY ) { - result = convolve( result, fetchBoundary( elementIndexX, elementIndexY ), fetchKernel ( kernelIndexX, kernelIndexY ) ); + result = + convolve( result, fetchBoundary( elementIndexX, elementIndexY ), fetchKernel( kernelIndexX, kernelIndexY ) ); } else { result = convolve( result, fetchData( elementIndexX, elementIndexY ), fetchKernel( kernelIndexX, kernelIndexY ) ); @@ -127,10 +149,25 @@ struct Convolution< 3, TNL::Devices::Cuda > { public: template< typename Index > - static size_t - getDynamicSharedMemorySize( Index kernelWidth, Index kernelHeight, Index kernelDepth, Index endX, Index endY, Index endZ ) + using Vector = TNL::Containers::StaticVector< 3, Index >; + + template< typename Index > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { - return 0; + configuration.dynamicSharedMemorySize = 0; + + // TODO: - Benchmark the best value + configuration.blockSize.x = kernelSize.x(); + configuration.blockSize.y = kernelSize.y(); + configuration.blockSize.z = kernelSize.z(); + + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); } }; @@ -159,7 +196,7 @@ convolution3D( Index kernelWidth, Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * 
blockDim.x; - if (ix >= endX || iy >= endY || iz >= endZ) + if( ix >= endX || iy >= endY || iz >= endZ ) return; Index radiusZ = kernelDepth >> 1; @@ -180,11 +217,17 @@ convolution3D( Index kernelWidth, Index elementIndexX = i + ix; Index kernelIndexX = i + radiusX; - if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0 || elementIndexZ >= endZ ) { - result = convolve( result, fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) ); + if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0 + || elementIndexZ >= endZ ) + { + result = convolve( result, + fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), + fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) ); } else { - result = convolve( result, fetchData( elementIndexX, elementIndexY, elementIndexZ ), fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) ); + result = convolve( result, + fetchData( elementIndexX, elementIndexY, elementIndexZ ), + fetchKernel( kernelIndexX, kernelIndexY, kernelIndexZ ) ); } } } diff --git a/src/Benchmarks/Convolution/support/DummyBenchmark.h b/src/Benchmarks/Convolution/support/DummyBenchmark.h index 8c8e60be7..804ca03d7 100644 --- a/src/Benchmarks/Convolution/support/DummyBenchmark.h +++ b/src/Benchmarks/Convolution/support/DummyBenchmark.h @@ -32,13 +32,14 @@ public: start[ i ] = parameters.getParameter< int >( minDimensionIds[ i ] ); end[ i ] = parameters.getParameter< int >( maxDimensionIds[ i ] ); minKernelSize[ i ] = parameters.getParameter< int >( minKernelSizeIds[ i ] ); - maxKernelSizeIds[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] ); + maxKernelSize[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] ); TNL_ASSERT_GT( start[ i ], 1, "Start dimension must be positive integer" ); TNL_ASSERT_GT( end[ i ], start[ i ], "End dimension must be greater than 
start dimension" ); TNL_ASSERT_GE( minKernelSize[ i ], 1, "Minimal kernel size must be a positive number" ); TNL_ASSERT_EQ( minKernelSize[ i ] % 2, 1, "Minimal kernel size must be odd" ); + TNL_ASSERT_GT( maxKernelSize[ i ], minKernelSize[ i ], "End dimension must be greater than start dimension" ); TNL_ASSERT_GT( end[ i ], start[ i ], "End kernel size must be greater than start kernel size" ); } @@ -85,7 +86,7 @@ public: for( size_t i = 0; i < currentDimension.getSize() - 1; i++ ) { if( currentDimension[ i ] >= maxDimension[ i ] ) { currentDimension[ i ] = minDimension[ i ]; - currentDimension[ i ] = maxDimension[ i ]; + currentDimension[ i + 1 ] *= dimensionStep; } } diff --git a/src/Benchmarks/Convolution/support/Launcher.h b/src/Benchmarks/Convolution/support/Launcher.h index 94e9b096b..208deb080 100644 --- a/src/Benchmarks/Convolution/support/Launcher.h +++ b/src/Benchmarks/Convolution/support/Launcher.h @@ -29,20 +29,7 @@ public: { TNL::Cuda::LaunchConfiguration launchConfig; - launchConfig.dynamicSharedMemorySize = - ConvolutionKernel::getDynamicSharedMemorySize< Index >( kernelSize.x(), dimensions.x() ); - - // TODO: - Benchmark the best value - launchConfig.blockSize.x = 256; - launchConfig.gridSize.x = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), launchConfig.blockSize.x ) ); - - if( (std::size_t) launchConfig.blockSize.x * launchConfig.gridSize.x < (std::size_t) dimensions.x() ) { - const int desGridSize = 32 * TNL::Cuda::DeviceInfo::getCudaMultiprocessors( TNL::Cuda::DeviceInfo::getActiveDevice() ); - - launchConfig.gridSize.x = - TNL::min( desGridSize, TNL::Cuda::getNumberOfBlocks( dimensions.x(), launchConfig.blockSize.x ) ); - } + ConvolutionKernel::setup(launchConfig, dimensions, kernelSize); constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; @@ -78,29 +65,7 @@ public: { TNL::Cuda::LaunchConfiguration launchConfig; - 
launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >( - kernelSize.x(), kernelSize.y(), dimensions.x(), dimensions.y() ); - - const Index sizeX = dimensions.x(); - const Index sizeY = dimensions.y(); - - if( sizeX >= sizeY * sizeY ) { - launchConfig.blockSize.x = TNL::min( 256, sizeX ); - launchConfig.blockSize.y = 1; - } - else if( sizeY >= sizeX * sizeX ) { - launchConfig.blockSize.x = 1; - launchConfig.blockSize.y = TNL::min( 256, sizeY ); - } - else { - launchConfig.blockSize.x = TNL::min( 32, sizeX ); - launchConfig.blockSize.y = TNL::min( 8, sizeY ); - } - - launchConfig.gridSize.x = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) ); - launchConfig.gridSize.y = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) ); + ConvolutionKernel::setup(launchConfig, dimensions, kernelSize); constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; @@ -142,52 +107,7 @@ public: TNL::Cuda::LaunchConfiguration launchConfig; - launchConfig.dynamicSharedMemorySize = ConvolutionKernel::getDynamicSharedMemorySize< Index >( - kernelSize.x(), kernelSize.y(), kernelSize.z(), dimensions.x(), dimensions.y(), dimensions.z() ); - - if( sizeX >= sizeY * sizeY * sizeZ * sizeZ ) { - launchConfig.blockSize.x = TNL::min( 256, sizeX ); - launchConfig.blockSize.y = 1; - launchConfig.blockSize.z = 1; - } - else if( sizeY >= sizeX * sizeX * sizeZ * sizeZ ) { - launchConfig.blockSize.x = 1; - launchConfig.blockSize.y = TNL::min( 256, sizeY ); - launchConfig.blockSize.z = 1; - } - else if( sizeZ >= sizeX * sizeX * sizeY * sizeY ) { - launchConfig.blockSize.x = TNL::min( 2, sizeX ); - launchConfig.blockSize.y = TNL::min( 2, sizeY ); - // CUDA allows max 64 for launchConfig.blockSize.z - launchConfig.blockSize.z = TNL::min( 64, sizeZ ); - } - else if( sizeX >= sizeZ * sizeZ && sizeY >= sizeZ * 
sizeZ ) { - launchConfig.blockSize.x = TNL::min( 32, sizeX ); - launchConfig.blockSize.y = TNL::min( 8, sizeY ); - launchConfig.blockSize.z = 1; - } - else if( sizeX >= sizeY * sizeY && sizeZ >= sizeY * sizeY ) { - launchConfig.blockSize.x = TNL::min( 32, sizeX ); - launchConfig.blockSize.y = 1; - launchConfig.blockSize.z = TNL::min( 8, sizeZ ); - } - else if( sizeY >= sizeX * sizeX && sizeZ >= sizeX * sizeX ) { - launchConfig.blockSize.x = 1; - launchConfig.blockSize.y = TNL::min( 32, sizeY ); - launchConfig.blockSize.z = TNL::min( 8, sizeZ ); - } - else { - launchConfig.blockSize.x = TNL::min( 16, sizeX ); - launchConfig.blockSize.y = TNL::min( 4, sizeY ); - launchConfig.blockSize.z = TNL::min( 4, sizeZ ); - } - - launchConfig.gridSize.x = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeX, launchConfig.blockSize.x ) ); - launchConfig.gridSize.y = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeY, launchConfig.blockSize.y ) ); - launchConfig.gridSize.z = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( sizeZ, launchConfig.blockSize.z ) ); + ConvolutionKernel::setup(launchConfig, dimensions, kernelSize); constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; -- GitLab From be08382dc41aa2720465424fe2f44adf200908a8 Mon Sep 17 00:00:00 2001 From: hayeuyur Date: Sat, 2 Apr 2022 18:31:24 +0200 Subject: [PATCH 06/19] Implement convolution with storing kernel in shared memory --- src/Benchmarks/Convolution/CMakeLists.txt | 9 + src/Benchmarks/Convolution/kernels/naive.h | 22 +- .../Convolution/kernels/sharedKernel.h | 274 ++++++++++++++++++ src/Benchmarks/Convolution/support/Launcher.h | 15 +- .../Convolution/templates/main_benchmark.h | 7 +- .../Convolution/templates/main_solver.h | 6 +- 6 files changed, 317 insertions(+), 16 deletions(-) create mode 100644 src/Benchmarks/Convolution/kernels/sharedKernel.h diff --git 
a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt index d8a0c683c..ec637f6dd 100644 --- a/src/Benchmarks/Convolution/CMakeLists.txt +++ b/src/Benchmarks/Convolution/CMakeLists.txt @@ -10,6 +10,7 @@ if (${BUILD_CUDA}) FILE(READ ${TEMPLATE} TEMPLATE_CONTENT) STRING(REGEX REPLACE "DIMENSION_VALUE" ${DIMENSION} TEMPLATE_CONTENT "${TEMPLATE_CONTENT}") + STRING(REGEX REPLACE "KERNEL_VALUE" "\"../${KERNEL_HEADER}\"" TEMPLATE_CONTENT "${TEMPLATE_CONTENT}") FILE(WRITE ${SOURCE_FILE} "${TEMPLATE_CONTENT}") @@ -29,3 +30,11 @@ GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/naiv GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/naive.h") GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/naive.h") GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/naive.h") + +GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/sharedKernel.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/sharedKernel.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/sharedKernel.h") + +GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedKernel.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedKernel.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedKernel.h") diff --git a/src/Benchmarks/Convolution/kernels/naive.h b/src/Benchmarks/Convolution/kernels/naive.h index f8b5966e2..d69e219d2 100644 --- a/src/Benchmarks/Convolution/kernels/naive.h +++ b/src/Benchmarks/Convolution/kernels/naive.h @@ -1,9 +1,18 @@ +#pragma once + #ifdef HAVE_CUDA - #include - #include - #include +#include +#include +#include + +/** + * There are several pitfalls with such configuration. + * + * 1. At first we don't use shared memory + * 2. 
At second we don't control block size, so we may launch extremely small kernels or otherwise we can launch extremely large kernels. + */ template< int Dimension, typename Device > struct Convolution; @@ -15,7 +24,7 @@ public: template< typename Index > using Vector = TNL::Containers::StaticVector< 1, Index >; - template< typename Index > + template< typename Index, typename Real > static void setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { @@ -76,13 +85,12 @@ public: template< typename Index > using Vector = TNL::Containers::StaticVector< 2, Index >; - template< typename Index > + template< typename Index, typename Real > static void setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { configuration.dynamicSharedMemorySize = 0; - // TODO: - Benchmark the best value configuration.blockSize.x = kernelSize.x(); configuration.blockSize.y = kernelSize.y(); @@ -151,7 +159,7 @@ public: template< typename Index > using Vector = TNL::Containers::StaticVector< 3, Index >; - template< typename Index > + template< typename Index, typename Real > static void setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) { diff --git a/src/Benchmarks/Convolution/kernels/sharedKernel.h b/src/Benchmarks/Convolution/kernels/sharedKernel.h new file mode 100644 index 000000000..de76dd32c --- /dev/null +++ b/src/Benchmarks/Convolution/kernels/sharedKernel.h @@ -0,0 +1,274 @@ + +#pragma once + +#ifdef HAVE_CUDA + + #include + #include + #include + #include + +template< int Dimension, typename Device > +struct Convolution; + +template<> +struct Convolution< 1, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 1, Index >; + + template< typename Index, typename Real > + static void + setup( 
TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + + for( Index i = 0; i < kernelSize.getSize(); i++ ) + kernelElementCount *= kernelSize[ i ]; + + configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); + + configuration.blockSize.x = kernelSize.x(); + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + } +}; + +template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > +__global__ +static void +convolution1D( Index kernelWidth, + Index endX, + FetchData fetchData, + FetchBoundary fetchBoundary, + FetchKernel fetchKernel, + Convolve convolve, + Store store ) +{ + Real* shared = TNL::Cuda::getSharedMemory< Real >(); + + Index radius = kernelWidth >> 1; + Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + // The size of the block is equal to the kernel size + shared[ threadIdx.x ] = fetchKernel( threadIdx.x ); + + __syncthreads(); + + Real result = 0; + + for( Index i = -radius; i <= radius; i++ ) { + Index elementIndex = i + ix; + Index kernelIndex = i + radius; + + if( elementIndex < 0 || elementIndex >= endX ) { + result = convolve( result, fetchBoundary( elementIndex ), shared[ kernelIndex ] ); + } + else { + result = convolve( result, fetchData( elementIndex ), shared[ kernelIndex ] ); + } + } + + store( ix, result ); +} + +template<> +struct Convolution< 2, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 2, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + + for( Index i = 0; i < kernelSize.getSize(); i++ 
) + kernelElementCount *= kernelSize[ i ]; + + configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); + + configuration.blockSize.x = kernelSize.x(); + configuration.blockSize.y = kernelSize.y(); + + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); + } +}; + +template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > +__global__ +static void +convolution2D( Index kernelWidth, + Index kernelHeight, + Index endX, + Index endY, + FetchData fetchData, + FetchBoundary fetchBoundary, + FetchKernel fetchKernel, + Convolve convolve, + Store store ) +{ + Real* shared = TNL::Cuda::getSharedMemory< Real >(); + + Index radiusY = kernelHeight >> 1; + Index radiusX = kernelWidth >> 1; + + Index iy = threadIdx.y + blockIdx.y * blockDim.y; + Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + Index threadIndex = threadIdx.x + blockDim.x * threadIdx.y; + + // The size of the block is equal to the kernel size + shared[ threadIndex ] = fetchKernel( threadIdx.x, threadIdx.y ); + + __syncthreads(); + + Real result = 0; + + for( Index j = -radiusY; j <= radiusY; j++ ) { + Index elementIndexY = j + iy; + Index kernelIndexY = j + radiusY; + + for( Index i = -radiusX; i <= radiusX; i++ ) { + Index elementIndexX = i + ix; + Index kernelIndexX = i + radiusX; + + Index threadIndex = kernelIndexX + kernelWidth * kernelIndexY; + + if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY ) { + result = convolve( result, fetchBoundary( elementIndexX, elementIndexY ), shared[ threadIndex ] ); + } + else { + result = convolve( result, fetchData( elementIndexX, elementIndexY ), shared[ threadIndex ] ); + } + 
} + } + + store( ix, iy, result ); +} + +template<> +struct Convolution< 3, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 3, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + + for( Index i = 0; i < kernelSize.getSize(); i++ ) + kernelElementCount *= kernelSize[ i ]; + + configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); + + configuration.blockSize.x = kernelSize.x(); + configuration.blockSize.y = kernelSize.y(); + configuration.blockSize.z = kernelSize.z(); + + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); + } +}; + +template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > +__global__ +static void +convolution3D( Index kernelWidth, + Index kernelHeight, + Index kernelDepth, + Index endX, + Index endY, + Index endZ, + FetchData fetchData, + FetchBoundary fetchBoundary, + FetchKernel fetchKernel, + Convolve convolve, + Store store ) +{ + Real* shared = TNL::Cuda::getSharedMemory< Real >(); + + Index iz = threadIdx.z + blockIdx.z * blockDim.z; + Index iy = threadIdx.y + blockIdx.y * blockDim.y; + Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + Index radiusZ = kernelDepth >> 1; + Index radiusY = kernelHeight >> 1; + Index radiusX = kernelWidth >> 1; + + Index threadIndex = threadIdx.x + blockDim.x * threadIdx.y 
+ blockDim.x * blockDim.y * threadIdx.z; + + printf( "%d\n", threadIndex ); + + // The size of the block is equal to the kernel size + shared[ threadIndex ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z ); + + __syncthreads(); + + Real result = 0; + + for( Index k = -radiusZ; k <= radiusZ; k++ ) { + Index elementIndexZ = k + iz; + Index kernelIndexZ = k + radiusZ; + + for( Index j = -radiusY; j <= radiusY; j++ ) { + Index elementIndexY = j + iy; + Index kernelIndexY = j + radiusY; + + for( Index i = -radiusX; i <= radiusX; i++ ) { + Index elementIndexX = i + ix; + Index kernelIndexX = i + radiusX; + + Index threadIndex = kernelIndexX + kernelWidth * kernelIndexY + kernelWidth * kernelHeight * kernelIndexZ; + + if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0 + || elementIndexZ >= endZ ) + { + result = convolve( result, + fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), + shared[threadIndex] ); + } + else { + result = convolve( result, + fetchData( elementIndexX, elementIndexY, elementIndexZ ), + shared[threadIndex] ); + } + } + } + } + + store( ix, iy, iz, result ); +} + +#endif diff --git a/src/Benchmarks/Convolution/support/Launcher.h b/src/Benchmarks/Convolution/support/Launcher.h index 208deb080..c336dc8e5 100644 --- a/src/Benchmarks/Convolution/support/Launcher.h +++ b/src/Benchmarks/Convolution/support/Launcher.h @@ -5,7 +5,14 @@ #include template< int Dimension, typename Device > -struct Convolution; +struct Convolution { + template< typename Index > + using Vector = TNL::Containers::StaticVector< 1, Index >; + + template< typename Index, typename Real > + static void + setup(TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize); +}; template< int Dimension, typename Device > struct Launcher; @@ -29,7 +36,7 @@ public: { TNL::Cuda::LaunchConfiguration launchConfig; - ConvolutionKernel::setup(launchConfig, 
dimensions, kernelSize); + ConvolutionKernel::setup(launchConfig, dimensions, kernelSize); constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; @@ -65,7 +72,7 @@ public: { TNL::Cuda::LaunchConfiguration launchConfig; - ConvolutionKernel::setup(launchConfig, dimensions, kernelSize); + ConvolutionKernel::setup(launchConfig, dimensions, kernelSize); constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; @@ -107,7 +114,7 @@ public: TNL::Cuda::LaunchConfiguration launchConfig; - ConvolutionKernel::setup(launchConfig, dimensions, kernelSize); + ConvolutionKernel::setup(launchConfig, dimensions, kernelSize); constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; diff --git a/src/Benchmarks/Convolution/templates/main_benchmark.h b/src/Benchmarks/Convolution/templates/main_benchmark.h index 5124922e6..4be1e80e5 100644 --- a/src/Benchmarks/Convolution/templates/main_benchmark.h +++ b/src/Benchmarks/Convolution/templates/main_benchmark.h @@ -1,11 +1,12 @@ -#include "../kernels/naive.h" +#define KERNEL KERNEL_VALUE +#define DIMENSION DIMENSION_VALUE + +#include KERNEL_VALUE #include "../support/DummyBenchmark.h" #include -#define DIMENSION DIMENSION_VALUE - using TaskBenchmark = DummyBenchmark< DIMENSION, TNL::Devices::Cuda >; int main(int argc, char* argv[]) diff --git a/src/Benchmarks/Convolution/templates/main_solver.h b/src/Benchmarks/Convolution/templates/main_solver.h index 1a6c33a9b..ab2bc8699 100644 --- a/src/Benchmarks/Convolution/templates/main_solver.h +++ b/src/Benchmarks/Convolution/templates/main_solver.h @@ -1,10 +1,12 @@ -#include "../kernels/naive.h" +#define KERNEL KERNEL_VALUE +#define DIMENSION DIMENSION_VALUE + +#include KERNEL #include "../support/DummySolver.h" #include -#define DIMENSION DIMENSION_VALUE using TaskSolver = DummySolver< DIMENSION, TNL::Devices::Cuda >; -- GitLab From 
d3b6676ebf2911d6360ca65f0d55c4a0c2dc1ee2 Mon Sep 17 00:00:00 2001 From: hayeuyur Date: Sat, 2 Apr 2022 18:38:43 +0200 Subject: [PATCH 07/19] Add loggin of the id to the benchmark output --- src/Benchmarks/Convolution/kernels/sharedKernel.h | 8 ++++---- src/Benchmarks/Convolution/support/DummyBenchmark.h | 13 ++++++++----- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/Benchmarks/Convolution/kernels/sharedKernel.h b/src/Benchmarks/Convolution/kernels/sharedKernel.h index de76dd32c..c4f7f0199 100644 --- a/src/Benchmarks/Convolution/kernels/sharedKernel.h +++ b/src/Benchmarks/Convolution/kernels/sharedKernel.h @@ -3,10 +3,10 @@ #ifdef HAVE_CUDA - #include - #include - #include - #include +#include +#include +#include +#include template< int Dimension, typename Device > struct Convolution; diff --git a/src/Benchmarks/Convolution/support/DummyBenchmark.h b/src/Benchmarks/Convolution/support/DummyBenchmark.h index 804ca03d7..005096d12 100644 --- a/src/Benchmarks/Convolution/support/DummyBenchmark.h +++ b/src/Benchmarks/Convolution/support/DummyBenchmark.h @@ -50,11 +50,14 @@ public: TNL_ASSERT_GT( kernelStep, 0, "Kernel step must be a positive number" ); TNL_ASSERT_EQ( kernelStep % 2, 0, "Kernel step must be even" ); - time( benchmark, start, end, dimensionStep, minKernelSize, maxKernelSize, kernelStep ); + TNL::String id = parameters.getParameter("id"); + + time( id, benchmark, start, end, dimensionStep, minKernelSize, maxKernelSize, kernelStep ); } virtual void - time( TNLBenchmark& benchmark, + time( const TNL::String& id, + TNLBenchmark& benchmark, const Vector& minDimension, const Vector& maxDimension, const int dimensionStep, @@ -69,7 +72,7 @@ public: currentKernelSize = minKernelSize; do { - timeConvolution( benchmark, currentDimension, currentKernelSize ); + timeConvolution( id, benchmark, currentDimension, currentKernelSize ); currentKernelSize[ 0 ] += kernelStep; @@ -94,11 +97,11 @@ public: } void - timeConvolution( TNLBenchmark& benchmark, 
const Vector& dimension, const Vector& kernelSize ) const + timeConvolution( const TNL::String& id, TNLBenchmark& benchmark, const Vector& dimension, const Vector& kernelSize ) const { auto device = TNL::getType< Device >(); - typename TNLBenchmark::MetadataColumns columns; + typename TNLBenchmark::MetadataColumns columns = {{ "id", id }}; size_t elementsCount = 1; size_t kernelElementsCount = 1; -- GitLab From 9728480563a24e4ee9eef4e1ab16355514318c87 Mon Sep 17 00:00:00 2001 From: hayeuyur Date: Sat, 2 Apr 2022 21:03:49 +0200 Subject: [PATCH 08/19] Implement kernel, which loads data in shared memory --- src/Benchmarks/Convolution/CMakeLists.txt | 8 + .../Convolution/kernels/sharedData.h | 432 ++++++++++++++++++ .../Convolution/kernels/sharedKernel.h | 42 +- 3 files changed, 466 insertions(+), 16 deletions(-) create mode 100644 src/Benchmarks/Convolution/kernels/sharedData.h diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt index ec637f6dd..b51e7de7d 100644 --- a/src/Benchmarks/Convolution/CMakeLists.txt +++ b/src/Benchmarks/Convolution/CMakeLists.txt @@ -38,3 +38,11 @@ GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/shar GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedKernel.h") + +GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/sharedData.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/sharedData.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/sharedData.h") + +GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedData.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedData.h") 
+GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedData.h") diff --git a/src/Benchmarks/Convolution/kernels/sharedData.h b/src/Benchmarks/Convolution/kernels/sharedData.h new file mode 100644 index 000000000..f6dbe48fb --- /dev/null +++ b/src/Benchmarks/Convolution/kernels/sharedData.h @@ -0,0 +1,432 @@ +#pragma once + +#ifdef HAVE_CUDA + +/** + * This method stores image tile into shared memory + * and then calculates convolution. + * + * Thanks for the idea https://www.evl.uic.edu/sjames/cs525/final.html + */ + + #include + #include + #include + #include + +template< int Dimension, typename Device > +struct Convolution; + +template<> +struct Convolution< 1, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 1, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + + for( Index i = 0; i < kernelSize.getSize(); i++ ) + kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1; + + configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); + + configuration.blockSize.x = kernelSize.x(); + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + } +}; + +template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > +__global__ +static void +convolution1D( Index kernelWidth, + Index endX, + FetchData fetchData, + FetchBoundary fetchBoundary, + FetchKernel fetchKernel, + Convolve convolve, + Store store ) +{ + Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + if( ix >= endX ) + return; + + Real* shared = TNL::Cuda::getSharedMemory< Real >(); + Index radius = kernelWidth >> 1; + + // Left + Index lhs = ix - 
radius; + + if( lhs < 0 ) { + shared[ threadIdx.x ] = fetchBoundary( lhs ); + } + else { + shared[ threadIdx.x ] = fetchData( lhs ); + } + + // Right + Index rhs = ix + radius; + + if( rhs >= endX ) { + shared[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs ); + } + else { + shared[ threadIdx.x + blockDim.x ] = fetchData( rhs ); + } + + __syncthreads(); + + Real result = 0; + + #pragma unroll + for( Index i = 0; i < kernelWidth; i++ ) { + Index elementIndex = i + threadIdx.x; + + result = convolve( result, shared[ elementIndex ], fetchKernel( i ) ); + } + + store( ix, result ); +} + +template<> +struct Convolution< 2, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 2, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + + for( Index i = 0; i < kernelSize.getSize(); i++ ) + kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1; + + configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); + + configuration.blockSize.x = kernelSize.x(); + configuration.blockSize.y = kernelSize.y(); + + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); + } +}; + +template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > +__global__ +static void +convolution2D( Index kernelWidth, + Index kernelHeight, + Index endX, + Index endY, + FetchData fetchData, + FetchBoundary fetchBoundary, + FetchKernel fetchKernel, + Convolve convolve, + Store store ) +{ + Index iy = threadIdx.y + blockIdx.y * blockDim.y; 
+ Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + if( ix >= endX || iy >= endY ) + return; + + Real* shared = TNL::Cuda::getSharedMemory< Real >(); + + Index radiusY = kernelHeight >> 1; + Index radiusX = kernelWidth >> 1; + + Index x, y, index; + + // Top Left + x = ix - radiusX; + y = iy - radiusY; + + index = threadIdx.x + threadIdx.y * blockDim.x; + + if( x < 0 || y < 0 ) { + shared[ index ] = fetchBoundary( x, y ); + } + else { + shared[ index ] = fetchData( x, y ); + } + + // Top right + x = ix + radiusX; + y = iy - radiusY; + + index = radiusX + threadIdx.x + threadIdx.y * blockDim.x; + + if( x >= endX || y < 0 ) { + shared[ index ] = fetchBoundary( x, y ); + } + else { + shared[ index ] = fetchData( x, y ); + } + + // Bottom Left + x = ix - radiusX; + y = iy + radiusY; + + index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x; + + if( x < 0 || y >= endY ) { + shared[ index ] = fetchBoundary( x, y ); + } + else { + shared[ index ] = fetchData( x, y ); + } + + // Bottom Right + x = ix + radiusX; + y = iy + radiusY; + + index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x; + + if( x >= endX || y >= endY ) { + shared[ index ] = fetchBoundary( x, y ); + } + else { + shared[ index ] = fetchData( x, y ); + } + + __syncthreads(); + + Real result = 0; + + for( Index j = 0; j <= radiusY; j++ ) { + Index align = ( j + threadIdx.y ) * blockDim.y; + + for( Index i = 0; i <= radiusX; i++ ) { + Index index = i + threadIdx.x + align; + + result = convolve( result, shared[ index ], fetchKernel( i, j ) ); + } + } + + store( ix, iy, result ); +} + +template<> +struct Convolution< 3, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 3, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + + for( Index i = 0; i 
< kernelSize.getSize(); i++ ) + kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1; + + configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); + + configuration.blockSize.x = kernelSize.x(); + configuration.blockSize.y = kernelSize.y(); + configuration.blockSize.z = kernelSize.z(); + + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); + } +}; + +template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > +__global__ +static void +convolution3D( Index kernelWidth, + Index kernelHeight, + Index kernelDepth, + Index endX, + Index endY, + Index endZ, + FetchData fetchData, + FetchBoundary fetchBoundary, + FetchKernel fetchKernel, + Convolve convolve, + Store store ) +{ + Index iz = threadIdx.z + blockIdx.z * blockDim.z; + Index iy = threadIdx.y + blockIdx.y * blockDim.y; + Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + if( ix >= endX || iy >= endY || iz >= endZ ) + return; + + Real* shared = TNL::Cuda::getSharedMemory< Real >(); + + Index radiusZ = kernelDepth >> 1; + Index radiusY = kernelHeight >> 1; + Index radiusX = kernelWidth >> 1; + + Index x, y, z, index; + + // Z: 0 Y: 0 X: 0 + x = ix - radiusX; + y = iy - radiusY; + z = iz - radiusZ; + + index = threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; + + if( x < 0 || y < 0 || z < 0 ) { + shared[ index ] = fetchBoundary( x, y, z ); + } + else { + shared[ index ] = fetchData( x, y, z ); + } + + // Z: 0 Y: 0 X: 1 + x = ix + radiusX; + y = iy - radiusY; + z = iz - radiusZ; + + index = 
radiusX + threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; + + if( x >= endX || y < 0 || z < 0 ) { + shared[ index ] = fetchBoundary( x, y, z ); + } + else { + shared[ index ] = fetchData( x, y, z ); + } + + // Z: 0 Y: 1 X: 0 + x = ix - radiusX; + y = iy + radiusY; + z = iz - radiusZ; + + index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; + + if( x < 0 || y >= endY || z < 0 ) { + shared[ index ] = fetchBoundary( x, y, z ); + } + else { + shared[ index ] = fetchData( x, y, z ); + } + + // Z: 1 Y: 0 X: 0 + x = ix - radiusX; + y = iy - radiusY; + z = iz + radiusZ; + + index = threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; + + if( x < 0 || y < 0 || z >= endZ ) { + shared[ index ] = fetchBoundary( x, y, z ); + } + else { + shared[ index ] = fetchData( x, y, z ); + } + + // Z: 0 Y: 1 X: 1 + x = ix + radiusX; + y = iy + radiusY; + z = iz - radiusZ; + + index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; + + if( x >= endX || y >= endY || z < 0 ) { + shared[ index ] = fetchBoundary( x, y, z ); + } + else { + shared[ index ] = fetchData( x, y, z ); + } + + // Z: 1 Y: 0 X: 1 + x = ix + radiusX; + y = iy - radiusY; + z = iz + radiusZ; + + index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; + + if( x >= endX || y < 0 || z >= endZ ) { + shared[ index ] = fetchBoundary( x, y, z ); + } + else { + shared[ index ] = fetchData( x, y, z ); + } + + // Z: 1 Y: 1 X: 0 + x = ix - radiusX; + y = iy + radiusY; + z = iz + radiusZ; + + index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; + + if( x < 0 || y >= endY || z >= endZ ) { + shared[ index ] = fetchBoundary( x, y, z ); + } + else { + shared[ index ] = fetchData( x, y, z ); + } + + // Z: 1 Y: 1 X: 1 + x = ix + radiusX; + 
y = iy + radiusY; + z = iz + radiusZ; + + index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; + + if( x >= endX || y >= endY || z >= endZ ) { + shared[ index ] = fetchBoundary( x, y, z ); + } + else { + shared[ index ] = fetchData( x, y, z ); + } + + __syncthreads(); + + Real result = 0; + + for( Index k = 0; k <= radiusZ; k++ ) { + Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x; + + for( Index j = 0; j <= radiusY; j++ ) { + Index xAlign = ( j + threadIdx.y ) * blockDim.y; + + for( Index i = 0; i <= radiusX; i++ ) { + Index index = i + threadIdx.x + xAlign + xyAlign; + + result = convolve( result, shared[ index ], fetchKernel( i, j, k ) ); + } + } + } + + store( ix, iy, iz, result ); +} + +#endif diff --git a/src/Benchmarks/Convolution/kernels/sharedKernel.h b/src/Benchmarks/Convolution/kernels/sharedKernel.h index c4f7f0199..eb6537270 100644 --- a/src/Benchmarks/Convolution/kernels/sharedKernel.h +++ b/src/Benchmarks/Convolution/kernels/sharedKernel.h @@ -3,10 +3,14 @@ #ifdef HAVE_CUDA -#include -#include -#include -#include + #include + #include + #include + #include + +/** + * This method stores kernel in the shared memory to reduce amount of loads. 
+ */ template< int Dimension, typename Device > struct Convolution; @@ -52,10 +56,14 @@ convolution1D( Index kernelWidth, Convolve convolve, Store store ) { + Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + if( ix >= endX ) + return; + Real* shared = TNL::Cuda::getSharedMemory< Real >(); Index radius = kernelWidth >> 1; - Index ix = threadIdx.x + blockIdx.x * blockDim.x; // The size of the block is equal to the kernel size shared[ threadIdx.x ] = fetchKernel( threadIdx.x ); @@ -126,14 +134,17 @@ convolution2D( Index kernelWidth, Convolve convolve, Store store ) { + Index iy = threadIdx.y + blockIdx.y * blockDim.y; + Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + if( ix >= endX || iy >= endY ) + return; + Real* shared = TNL::Cuda::getSharedMemory< Real >(); Index radiusY = kernelHeight >> 1; Index radiusX = kernelWidth >> 1; - Index iy = threadIdx.y + blockIdx.y * blockDim.y; - Index ix = threadIdx.x + blockIdx.x * blockDim.x; - Index threadIndex = threadIdx.x + blockDim.x * threadIdx.y; // The size of the block is equal to the kernel size @@ -217,12 +228,15 @@ convolution3D( Index kernelWidth, Convolve convolve, Store store ) { - Real* shared = TNL::Cuda::getSharedMemory< Real >(); - Index iz = threadIdx.z + blockIdx.z * blockDim.z; Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; + if( ix >= endX || iy >= endY || iz >= endZ ) + return; + + Real* shared = TNL::Cuda::getSharedMemory< Real >(); + Index radiusZ = kernelDepth >> 1; Index radiusY = kernelHeight >> 1; Index radiusX = kernelWidth >> 1; @@ -255,14 +269,10 @@ convolution3D( Index kernelWidth, if( elementIndexX < 0 || elementIndexX >= endX || elementIndexY < 0 || elementIndexY >= endY || elementIndexZ < 0 || elementIndexZ >= endZ ) { - result = convolve( result, - fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), - shared[threadIndex] ); + result = convolve( result, fetchBoundary( elementIndexX, elementIndexY, elementIndexZ ), 
shared[ threadIndex ] ); } else { - result = convolve( result, - fetchData( elementIndexX, elementIndexY, elementIndexZ ), - shared[threadIndex] ); + result = convolve( result, fetchData( elementIndexX, elementIndexY, elementIndexZ ), shared[ threadIndex ] ); } } } -- GitLab From 3571b27e99d774d874142950feb3f2aa8cd9a91d Mon Sep 17 00:00:00 2001 From: hayeuyur Date: Sun, 3 Apr 2022 11:20:48 +0200 Subject: [PATCH 09/19] Move kernel launching in kernel definition --- src/Benchmarks/Convolution/kernels/naive.h | 252 +++++++++++----- .../Convolution/kernels/sharedData.h | 272 ++++++++++++------ .../Convolution/kernels/sharedKernel.h | 266 +++++++++++------ .../Convolution/support/DummyTask.h | 73 +++-- src/Benchmarks/Convolution/support/Launcher.h | 136 --------- src/Benchmarks/Convolution/support/Solver.h | 2 - 6 files changed, 594 insertions(+), 407 deletions(-) delete mode 100644 src/Benchmarks/Convolution/support/Launcher.h diff --git a/src/Benchmarks/Convolution/kernels/naive.h b/src/Benchmarks/Convolution/kernels/naive.h index d69e219d2..4326e0d74 100644 --- a/src/Benchmarks/Convolution/kernels/naive.h +++ b/src/Benchmarks/Convolution/kernels/naive.h @@ -3,40 +3,21 @@ #ifdef HAVE_CUDA -#include -#include -#include + #include + #include + #include /** * There are several pitfalls with such configuration. * * 1. At first we don't use shared memory - * 2. At second we don't control block size, so we may launch extremely small kernels or otherwise we can launch extremely large kernels. + * 2. At second we don't control block size, so we may launch extremely small kernels or otherwise we can launch extremely large + * kernels. 
*/ template< int Dimension, typename Device > struct Convolution; -template<> -struct Convolution< 1, TNL::Devices::Cuda > -{ -public: - template< typename Index > - using Vector = TNL::Containers::StaticVector< 1, Index >; - - template< typename Index, typename Real > - static void - setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) - { - configuration.dynamicSharedMemorySize = 0; - - // TODO: - Benchmark the best value - configuration.blockSize.x = kernelSize.x(); - configuration.gridSize.x = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); - } -}; - template< typename Index, typename Real, typename FetchData, @@ -78,29 +59,6 @@ convolution1D( Index kernelWidth, store( ix, result ); } -template<> -struct Convolution< 2, TNL::Devices::Cuda > -{ -public: - template< typename Index > - using Vector = TNL::Containers::StaticVector< 2, Index >; - - template< typename Index, typename Real > - static void - setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) - { - configuration.dynamicSharedMemorySize = 0; - - configuration.blockSize.x = kernelSize.x(); - configuration.blockSize.y = kernelSize.y(); - - configuration.gridSize.x = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); - configuration.gridSize.y = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); - } -}; - template< typename Index, typename Real, typename FetchData, @@ -152,33 +110,6 @@ convolution2D( Index kernelWidth, store( ix, iy, result ); } -template<> -struct Convolution< 3, TNL::Devices::Cuda > -{ -public: - template< typename Index > - using Vector = TNL::Containers::StaticVector< 3, Index >; - - template< typename Index, typename Real > - static void - 
setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) - { - configuration.dynamicSharedMemorySize = 0; - - // TODO: - Benchmark the best value - configuration.blockSize.x = kernelSize.x(); - configuration.blockSize.y = kernelSize.y(); - configuration.blockSize.z = kernelSize.z(); - - configuration.gridSize.x = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); - configuration.gridSize.y = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); - configuration.gridSize.y = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); - } -}; - template< typename Index, typename Real, typename FetchData, @@ -244,4 +175,177 @@ convolution3D( Index kernelWidth, store( ix, iy, iz, result ); } +template< int Dimension, typename Device > +struct Convolution; + +template<> +struct Convolution< 1, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 1, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + configuration.dynamicSharedMemorySize = 0; + + // TODO: - Benchmark the best value + configuration.blockSize.x = kernelSize.x(); + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + 
Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration configuration; + + setup< Index, Real >( configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + TNL::Cuda::launchKernel< true >( + kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store ); + }; +}; + +template<> +struct Convolution< 2, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 2, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + configuration.dynamicSharedMemorySize = 0; + + configuration.blockSize.x = kernelSize.x(); + configuration.blockSize.y = kernelSize.y(); + + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration configuration; + + setup< Index, Real >( configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + TNL::Cuda::launchKernel< true >( kernel, + 0, + configuration, + kernelSize.x(), + kernelSize.y(), + dimensions.x(), + dimensions.y(), + fetchData, + 
fetchBoundary, + fetchKernel, + convolve, + store ); + }; +}; + +template<> +struct Convolution< 3, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 3, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + configuration.dynamicSharedMemorySize = 0; + + // TODO: - Benchmark the best value + configuration.blockSize.x = kernelSize.x(); + configuration.blockSize.y = kernelSize.y(); + configuration.blockSize.z = kernelSize.z(); + + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration configuration; + + setup< Index, Real >( configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + TNL::Cuda::launchKernel< true >( kernel, + 0, + configuration, + kernelSize.x(), + kernelSize.y(), + kernelSize.z(), + dimensions.x(), + dimensions.y(), + dimensions.z(), + fetchData, + fetchBoundary, + fetchKernel, + convolve, + store ); + }; +}; + #endif diff --git 
a/src/Benchmarks/Convolution/kernels/sharedData.h b/src/Benchmarks/Convolution/kernels/sharedData.h index f6dbe48fb..0c0e0a2a6 100644 --- a/src/Benchmarks/Convolution/kernels/sharedData.h +++ b/src/Benchmarks/Convolution/kernels/sharedData.h @@ -14,33 +14,6 @@ #include #include -template< int Dimension, typename Device > -struct Convolution; - -template<> -struct Convolution< 1, TNL::Devices::Cuda > -{ -public: - template< typename Index > - using Vector = TNL::Containers::StaticVector< 1, Index >; - - template< typename Index, typename Real > - static void - setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) - { - Index kernelElementCount = 1; - - for( Index i = 0; i < kernelSize.getSize(); i++ ) - kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1; - - configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); - - configuration.blockSize.x = kernelSize.x(); - configuration.gridSize.x = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); - } -}; - template< typename Index, typename Real, typename FetchData, @@ -100,34 +73,6 @@ convolution1D( Index kernelWidth, store( ix, result ); } -template<> -struct Convolution< 2, TNL::Devices::Cuda > -{ -public: - template< typename Index > - using Vector = TNL::Containers::StaticVector< 2, Index >; - - template< typename Index, typename Real > - static void - setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) - { - Index kernelElementCount = 1; - - for( Index i = 0; i < kernelSize.getSize(); i++ ) - kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1; - - configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); - - configuration.blockSize.x = kernelSize.x(); - configuration.blockSize.y = kernelSize.y(); - - configuration.gridSize.x = - TNL::min( TNL::Cuda::getMaxGridSize(), 
TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); - configuration.gridSize.y = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); - } -}; - template< typename Index, typename Real, typename FetchData, @@ -229,37 +174,6 @@ convolution2D( Index kernelWidth, store( ix, iy, result ); } -template<> -struct Convolution< 3, TNL::Devices::Cuda > -{ -public: - template< typename Index > - using Vector = TNL::Containers::StaticVector< 3, Index >; - - template< typename Index, typename Real > - static void - setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) - { - Index kernelElementCount = 1; - - for( Index i = 0; i < kernelSize.getSize(); i++ ) - kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1; - - configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); - - configuration.blockSize.x = kernelSize.x(); - configuration.blockSize.y = kernelSize.y(); - configuration.blockSize.z = kernelSize.z(); - - configuration.gridSize.x = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); - configuration.gridSize.y = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); - configuration.gridSize.y = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); - } -}; - template< typename Index, typename Real, typename FetchData, @@ -429,4 +343,190 @@ convolution3D( Index kernelWidth, store( ix, iy, iz, result ); } +template< int Dimension, typename Device > +struct Convolution; + +template<> +struct Convolution< 1, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 1, Index >; + + template< typename Index, typename Real > + static void + setup( 
TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + + for( Index i = 0; i < kernelSize.getSize(); i++ ) + kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1; + + configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); + + configuration.blockSize.x = kernelSize.x(); + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration configuration; + + setup< Index, Real >( configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + TNL::Cuda::launchKernel< true >( + kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store ); + }; +}; + +template<> +struct Convolution< 2, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 2, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + + for( Index i = 0; i < kernelSize.getSize(); i++ ) + kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1; + + configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); + + configuration.blockSize.x = kernelSize.x(); + configuration.blockSize.y = kernelSize.y(); + + 
configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration configuration; + + setup< Index, Real >( configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + TNL::Cuda::launchKernel< true >( kernel, + 0, + configuration, + kernelSize.x(), + kernelSize.y(), + dimensions.x(), + dimensions.y(), + fetchData, + fetchBoundary, + fetchKernel, + convolve, + store ); + }; +}; + +template<> +struct Convolution< 3, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 3, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + + for( Index i = 0; i < kernelSize.getSize(); i++ ) + kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1; + + configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); + + configuration.blockSize.x = kernelSize.x(); + configuration.blockSize.y = kernelSize.y(); + configuration.blockSize.z = kernelSize.z(); + + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + 
configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); + configuration.gridSize.z = // grid depth comes from the z extent + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration configuration; + + setup< Index, Real >( configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + TNL::Cuda::launchKernel< true >( kernel, + 0, + configuration, + kernelSize.x(), + kernelSize.y(), + kernelSize.z(), + dimensions.x(), + dimensions.y(), + dimensions.z(), + fetchData, + fetchBoundary, + fetchKernel, + convolve, + store ); + }; +}; + #endif diff --git a/src/Benchmarks/Convolution/kernels/sharedKernel.h b/src/Benchmarks/Convolution/kernels/sharedKernel.h index eb6537270..d3e1e4da3 100644 --- a/src/Benchmarks/Convolution/kernels/sharedKernel.h +++ b/src/Benchmarks/Convolution/kernels/sharedKernel.h @@ -15,30 +15,6 @@ template< int Dimension, typename Device > struct Convolution; -template<> -struct Convolution< 1, TNL::Devices::Cuda > -{ -public: - template< typename Index > - using Vector = TNL::Containers::StaticVector< 1, Index >; - - template< typename Index, typename Real > - static void - setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) - { - Index kernelElementCount = 1; - - for( Index i = 0; i < kernelSize.getSize(); i++ ) - kernelElementCount *= kernelSize[ i ]; - - 
configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); - - configuration.blockSize.x = kernelSize.x(); - configuration.gridSize.x = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); - } -}; - template< typename Index, typename Real, typename FetchData, @@ -87,34 +63,6 @@ convolution1D( Index kernelWidth, store( ix, result ); } -template<> -struct Convolution< 2, TNL::Devices::Cuda > -{ -public: - template< typename Index > - using Vector = TNL::Containers::StaticVector< 2, Index >; - - template< typename Index, typename Real > - static void - setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) - { - Index kernelElementCount = 1; - - for( Index i = 0; i < kernelSize.getSize(); i++ ) - kernelElementCount *= kernelSize[ i ]; - - configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); - - configuration.blockSize.x = kernelSize.x(); - configuration.blockSize.y = kernelSize.y(); - - configuration.gridSize.x = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); - configuration.gridSize.y = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); - } -}; - template< typename Index, typename Real, typename FetchData, @@ -176,37 +124,6 @@ convolution2D( Index kernelWidth, store( ix, iy, result ); } -template<> -struct Convolution< 3, TNL::Devices::Cuda > -{ -public: - template< typename Index > - using Vector = TNL::Containers::StaticVector< 3, Index >; - - template< typename Index, typename Real > - static void - setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) - { - Index kernelElementCount = 1; - - for( Index i = 0; i < kernelSize.getSize(); i++ ) - kernelElementCount *= kernelSize[ i ]; - - 
configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); - - configuration.blockSize.x = kernelSize.x(); - configuration.blockSize.y = kernelSize.y(); - configuration.blockSize.z = kernelSize.z(); - - configuration.gridSize.x = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); - configuration.gridSize.y = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); - configuration.gridSize.y = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); - } -}; - template< typename Index, typename Real, typename FetchData, @@ -281,4 +198,187 @@ convolution3D( Index kernelWidth, store( ix, iy, iz, result ); } +template<> +struct Convolution< 1, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 1, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + + for( Index i = 0; i < kernelSize.getSize(); i++ ) + kernelElementCount *= kernelSize[ i ]; + + configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); + + configuration.blockSize.x = kernelSize.x(); + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration 
configuration; + + setup< Index, Real >( configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + TNL::Cuda::launchKernel< true >( + kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store ); + }; +}; + +template<> +struct Convolution< 2, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 2, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + + for( Index i = 0; i < kernelSize.getSize(); i++ ) + kernelElementCount *= kernelSize[ i ]; + + configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); + + configuration.blockSize.x = kernelSize.x(); + configuration.blockSize.y = kernelSize.y(); + + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration configuration; + + setup< Index, Real >( configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + TNL::Cuda::launchKernel< true >( kernel, + 0, + configuration, + 
kernelSize.x(), + kernelSize.y(), + dimensions.x(), + dimensions.y(), + fetchData, + fetchBoundary, + fetchKernel, + convolve, + store ); + }; +}; + +template<> +struct Convolution< 3, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 3, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + + for( Index i = 0; i < kernelSize.getSize(); i++ ) + kernelElementCount *= kernelSize[ i ]; + + configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); + + configuration.blockSize.x = kernelSize.x(); + configuration.blockSize.y = kernelSize.y(); + configuration.blockSize.z = kernelSize.z(); + + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); + configuration.gridSize.z = // grid depth comes from the z extent + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration configuration; + + setup< Index, Real >( configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + TNL::Cuda::launchKernel< true >( kernel, + 0, + configuration, + kernelSize.x(), + 
kernelSize.y(), + kernelSize.z(), + dimensions.x(), + dimensions.y(), + dimensions.z(), + fetchData, + fetchBoundary, + fetchKernel, + convolve, + store ); + }; +}; + #endif diff --git a/src/Benchmarks/Convolution/support/DummyTask.h b/src/Benchmarks/Convolution/support/DummyTask.h index f7db47e34..e850b64c0 100644 --- a/src/Benchmarks/Convolution/support/DummyTask.h +++ b/src/Benchmarks/Convolution/support/DummyTask.h @@ -1,7 +1,28 @@ #pragma once -#include "Launcher.h" +template< int Dimension, typename Device > +struct Convolution +{ + template< typename Index > + using Vector = TNL::Containers::StaticVector< Dimension, Index >; + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ); +}; template< typename Index, typename Real, int Dimension, typename Device > struct DummyTask; @@ -14,7 +35,7 @@ public: using Device = TNL::Devices::Cuda; using Vector = TNL::Containers::StaticVector< Dimension, Index >; using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType; - using Launcher = Launcher< Dimension, Device >; + using ConvolutionLauncher = Convolution< Dimension, Device >; static void exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel ) @@ -44,13 +65,13 @@ public: result[ i ] = resultValue; }; - Launcher::exec< Index, Real >( dimensions, - kernelSize, - std::forward< decltype( fetchData ) >( fetchData ), - std::forward< decltype( fetchBoundary ) >( fetchBoundary ), - std::forward< decltype( fetchKernel ) >( fetchKernel ), - std::forward< decltype( convolve ) >( convolve ), - std::forward< decltype( store ) >( store ) ); + ConvolutionLauncher::execute< 
Index, Real >( dimensions, + kernelSize, + std::forward< decltype( fetchData ) >( fetchData ), + std::forward< decltype( fetchBoundary ) >( fetchBoundary ), + std::forward< decltype( fetchKernel ) >( fetchKernel ), + std::forward< decltype( convolve ) >( convolve ), + std::forward< decltype( store ) >( store ) ); } }; @@ -62,7 +83,7 @@ public: using Device = TNL::Devices::Cuda; using Vector = TNL::Containers::StaticVector< Dimension, Index >; using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType; - using Launcher = Launcher< Dimension, Device >; + using ConvolutionLauncher = Convolution< Dimension, Device >; static void exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel ) @@ -98,13 +119,13 @@ public: result[ index ] = resultValue; }; - Launcher::exec< Index, Real >( dimensions, - kernelSize, - std::forward< decltype( fetchData ) >( fetchData ), - std::forward< decltype( fetchBoundary ) >( fetchBoundary ), - std::forward< decltype( fetchKernel ) >( fetchKernel ), - std::forward< decltype( convolve ) >( convolve ), - std::forward< decltype( store ) >( store ) ); + ConvolutionLauncher::execute< Index, Real >( dimensions, + kernelSize, + std::forward< decltype( fetchData ) >( fetchData ), + std::forward< decltype( fetchBoundary ) >( fetchBoundary ), + std::forward< decltype( fetchKernel ) >( fetchKernel ), + std::forward< decltype( convolve ) >( convolve ), + std::forward< decltype( store ) >( store ) ); } }; @@ -116,7 +137,7 @@ public: using Device = TNL::Devices::Cuda; using Vector = TNL::Containers::StaticVector< Dimension, Index >; using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType; - using Launcher = Launcher< Dimension, Device >; + using ConvolutionLauncher = Convolution< Dimension, Device >; static void exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel ) @@ -125,7 +146,7 @@ 
public: { auto index = i + j * dimensions.x() + k * dimensions.x() * dimensions.y(); - return input[index]; + return input[ index ]; }; auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j, Index k ) @@ -152,12 +173,12 @@ public: result[ index ] = resultValue; }; - Launcher::exec< Index, Real >( dimensions, - kernelSize, - std::forward< decltype( fetchData ) >( fetchData ), - std::forward< decltype( fetchBoundary ) >( fetchBoundary ), - std::forward< decltype( fetchKernel ) >( fetchKernel ), - std::forward< decltype( convolve ) >( convolve ), - std::forward< decltype( store ) >( store ) ); + ConvolutionLauncher::execute< Index, Real >( dimensions, + kernelSize, + std::forward< decltype( fetchData ) >( fetchData ), + std::forward< decltype( fetchBoundary ) >( fetchBoundary ), + std::forward< decltype( fetchKernel ) >( fetchKernel ), + std::forward< decltype( convolve ) >( convolve ), + std::forward< decltype( store ) >( store ) ); } }; diff --git a/src/Benchmarks/Convolution/support/Launcher.h b/src/Benchmarks/Convolution/support/Launcher.h deleted file mode 100644 index c336dc8e5..000000000 --- a/src/Benchmarks/Convolution/support/Launcher.h +++ /dev/null @@ -1,136 +0,0 @@ - -#pragma once - -#include -#include - -template< int Dimension, typename Device > -struct Convolution { - template< typename Index > - using Vector = TNL::Containers::StaticVector< 1, Index >; - - template< typename Index, typename Real > - static void - setup(TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize); -}; - -template< int Dimension, typename Device > -struct Launcher; - -template<> -struct Launcher< 1, TNL::Devices::Cuda > -{ -public: - using Vector = TNL::Containers::StaticVector< 1, int >; - using ConvolutionKernel = Convolution< 1, TNL::Devices::Cuda >; - - template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > - static 
inline void - exec( const Vector& dimensions, - const Vector& kernelSize, - FetchData&& fetchData, - FetchBoundary&& fetchBoundary, - FetchKernel&& fetchKernel, - Convolve&& convolve, - Store&& store ) - { - TNL::Cuda::LaunchConfiguration launchConfig; - - ConvolutionKernel::setup(launchConfig, dimensions, kernelSize); - - constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; - - TNL::Cuda::launchKernel< true >( kernel, - 0, - launchConfig, - kernelSize.x(), - dimensions.x(), - fetchData, - fetchBoundary, - fetchKernel, - convolve, - store ); - } -}; - -template<> -struct Launcher< 2, TNL::Devices::Cuda > -{ -public: - using Vector = TNL::Containers::StaticVector< 2, int >; - using ConvolutionKernel = Convolution< 2, TNL::Devices::Cuda >; - - template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > - static inline void - exec( const Vector& dimensions, - const Vector& kernelSize, - FetchData&& fetchData, - FetchBoundary&& fetchBoundary, - FetchKernel&& fetchKernel, - Convolve&& convolve, - Store&& store ) - { - TNL::Cuda::LaunchConfiguration launchConfig; - - ConvolutionKernel::setup(launchConfig, dimensions, kernelSize); - - constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; - - TNL::Cuda::launchKernel< true >( kernel, - 0, - launchConfig, - kernelSize.x(), - kernelSize.y(), - dimensions.x(), - dimensions.y(), - fetchData, - fetchBoundary, - fetchKernel, - convolve, - store ); - } -}; - -template<> -struct Launcher< 3, TNL::Devices::Cuda > -{ -public: - using Vector = TNL::Containers::StaticVector< 3, int >; - using ConvolutionKernel = Convolution< 3, TNL::Devices::Cuda >; - - template< typename Index, typename Real, typename FetchData, typename FetchBoundary, typename FetchKernel, typename Convolve, typename Store > - static inline void - exec( const Vector& 
dimensions, - const Vector& kernelSize, - FetchData&& fetchData, - FetchBoundary&& fetchBoundary, - FetchKernel&& fetchKernel, - Convolve&& convolve, - Store&& store ) - { - const Index sizeX = dimensions.x(); - const Index sizeY = dimensions.y(); - const Index sizeZ = dimensions.z(); - - TNL::Cuda::LaunchConfiguration launchConfig; - - ConvolutionKernel::setup(launchConfig, dimensions, kernelSize); - - constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; - - TNL::Cuda::launchKernel< true >( kernel, - 0, - launchConfig, - kernelSize.x(), - kernelSize.y(), - kernelSize.z(), - dimensions.x(), - dimensions.y(), - dimensions.z(), - fetchData, - fetchBoundary, - fetchKernel, - convolve, - store ); - } -}; diff --git a/src/Benchmarks/Convolution/support/Solver.h b/src/Benchmarks/Convolution/support/Solver.h index a6b1d2c91..c0c0b31e2 100644 --- a/src/Benchmarks/Convolution/support/Solver.h +++ b/src/Benchmarks/Convolution/support/Solver.h @@ -6,8 +6,6 @@ #include #include -#include "Launcher.h" - template< int Dimension, typename Device > class Solver { -- GitLab From 5d0dac3964d679ee48f2ed5c47b9c91bed52ed60 Mon Sep 17 00:00:00 2001 From: hayeuyur Date: Sun, 3 Apr 2022 15:04:57 +0200 Subject: [PATCH 10/19] Implement shared data and kernel kernel --- src/Benchmarks/Convolution/CMakeLists.txt | 8 + .../Convolution/kernels/sharedData.h | 4 +- .../Convolution/kernels/sharedDataAndKernel.h | 577 ++++++++++++++++++ .../Convolution/support/Benchmark.h | 13 +- .../Convolution/support/DummyBenchmark.h | 70 +-- src/Benchmarks/Convolution/support/Solver.h | 11 +- 6 files changed, 610 insertions(+), 73 deletions(-) create mode 100644 src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt index b51e7de7d..0569e1013 100644 --- a/src/Benchmarks/Convolution/CMakeLists.txt +++ b/src/Benchmarks/Convolution/CMakeLists.txt @@ -46,3 
+46,11 @@ GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/shar GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedData.h") GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedData.h") GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedData.h") + +GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_solver.h" "kernels/sharedDataAndKernel.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_solver.h" "kernels/sharedDataAndKernel.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/sharedDataAndKernel.h") + +GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h") +GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h") diff --git a/src/Benchmarks/Convolution/kernels/sharedData.h b/src/Benchmarks/Convolution/kernels/sharedData.h index 0c0e0a2a6..dc173703e 100644 --- a/src/Benchmarks/Convolution/kernels/sharedData.h +++ b/src/Benchmarks/Convolution/kernels/sharedData.h @@ -162,7 +162,7 @@ convolution2D( Index kernelWidth, Real result = 0; for( Index j = 0; j <= radiusY; j++ ) { - Index align = ( j + threadIdx.y ) * blockDim.y; + Index align = ( j + threadIdx.y ) * blockDim.x; for( Index i = 0; i <= radiusX; i++ ) { Index index = i + threadIdx.x + align; @@ -330,7 +330,7 @@ convolution3D( Index kernelWidth, Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x; for( Index j = 0; j <= radiusY; j++ ) { - Index xAlign = ( j + threadIdx.y ) * blockDim.y; + Index xAlign = ( j + threadIdx.y ) * blockDim.x; for( Index i = 0; i <= radiusX; i++ ) { Index index = i + threadIdx.x + xAlign + xyAlign; diff --git a/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h 
b/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h new file mode 100644 index 000000000..8cbc26aa4 --- /dev/null +++ b/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h @@ -0,0 +1,577 @@ +#pragma once + +#ifdef HAVE_CUDA + + #include + #include + #include + #include + +/** + * This method stores the kernel and the data in shared memory to reduce the number of loads. + * + * We can calculate the size of the shared memory needed as follows: + * 1. We need to store in shared memory: + * * for 1D -> (2 * kernelWidth) - 1 < 2 * kernelWidth + * * for 2D -> ( (2 * kernelWidth) - 1 ) * ( (2 * kernelHeight) - 1 ) < 4 * kernelWidth * kernelHeight + * * for 3D -> ( (2 * kernelWidth) - 1 ) * ( (2 * kernelHeight) - 1 ) * ( (2 * kernelDepth) - 1 ) < 8 * kernelWidth * + * kernelHeight * kernelDepth + * 2. We take into account that the maximal block size is 1024, so the maximum volume of the kernel is 1024. + * Then the maximal amount of shared memory is: + * * for 1D -> 2 * 1024 -> 2048 elements (Note that even if we take long double (16B) we can still fit in the shared + * memory) + * * for 2D -> 4 * 1024 -> 4096 elements + * * for 3D -> 8 * 1024 -> 8192 elements (Note that if double takes 8 bytes, then we can't fit the tile into shared memory, + * because we have 64 KB of data) + * 3. The last thing is that even if we take the 1D and 2D cases we have enough space to store 1024 kernel elements. 
+ * Then the maximal amount of shared memory is: + * * for 1D -> 3 * 1024 -> can use long double, double, float + * * for 2D -> 5 * 1024 -> can use double, float + * * for 3D -> 9 * 1024 -> can use float + */ + +template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > +__global__ +static void +convolution1D( Index kernelWidth, + Index endX, + FetchData fetchData, + FetchBoundary fetchBoundary, + FetchKernel fetchKernel, + Convolve convolve, + Store store ) +{ + Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + // No early return for ix >= endX: every thread of the block has to fill its tile slots + // and reach __syncthreads(); threads past the end only skip the final store. + + Index kernelOffset = 2 * kernelWidth; // the data tile spans 2 * blockDim.x slots, which assumes blockDim.x == kernelWidth + + Real* data = TNL::Cuda::getSharedMemory< Real >(); + Real* kernel = data + kernelOffset; + + Index radius = kernelWidth >> 1; + + // Left half of the tile: slot threadIdx.x holds the element radius places to the left of ix + Index lhs = ix - radius; + + if( lhs < 0 || lhs >= endX ) { + data[ threadIdx.x ] = fetchBoundary( lhs ); + } + else { + data[ threadIdx.x ] = fetchData( lhs ); + } + + // Right half: exactly one block width further, so the tile is one contiguous window + Index rhs = lhs + Index( blockDim.x ); + + if( rhs >= endX ) { + data[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs ); + } + else { + data[ threadIdx.x + blockDim.x ] = fetchData( rhs ); + } + + kernel[ threadIdx.x ] = fetchKernel( threadIdx.x ); + + __syncthreads(); + + Real result = 0; + + #pragma unroll + for( Index i = 0; i < kernelWidth; i++ ) { + Index elementIndex = i + threadIdx.x; + + result = convolve( result, data[ elementIndex ], kernel[ i ] ); + } + + if( ix < endX ) store( ix, result ); +} + +template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > +__global__ +static void +convolution2D( Index kernelWidth, + Index kernelHeight, + Index endX, + Index endY, + FetchData fetchData, + FetchBoundary fetchBoundary, + FetchKernel fetchKernel, + Convolve convolve, + Store store ) +{ + Index iy = threadIdx.y + blockIdx.y * blockDim.y; + Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + if( ix >= endX || iy >= 
endY ) + return; + + Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ); + + Real* data = TNL::Cuda::getSharedMemory< Real >(); + Real* kernel = data + kernelOffset; + + Index radiusY = kernelHeight >> 1; + Index radiusX = kernelWidth >> 1; + + Index x, y, index; + + // Top Left + x = ix - radiusX; + y = iy - radiusY; + + index = threadIdx.x + threadIdx.y * blockDim.x; + + kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y ); + + if( x < 0 || y < 0 ) { + data[ index ] = fetchBoundary( x, y ); + } + else { + data[ index ] = fetchData( x, y ); + } + + // Top right + x = ix + radiusX; + y = iy - radiusY; + + index = radiusX + threadIdx.x + threadIdx.y * blockDim.x; + + if( x >= endX || y < 0 ) { + data[ index ] = fetchBoundary( x, y ); + } + else { + data[ index ] = fetchData( x, y ); + } + + // Bottom Left + x = ix - radiusX; + y = iy + radiusY; + + index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x; + + if( x < 0 || y >= endY ) { + data[ index ] = fetchBoundary( x, y ); + } + else { + data[ index ] = fetchData( x, y ); + } + + // Bottom Right + x = ix + radiusX; + y = iy + radiusY; + + index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x; + + if( x >= endX || y >= endY ) { + data[ index ] = fetchBoundary( x, y ); + } + else { + data[ index ] = fetchData( x, y ); + } + + __syncthreads(); + + Real result = 0; + + #pragma unroll + for( Index j = 0; j <= radiusY; j++ ) { + Index elementAlign = ( j + threadIdx.y ) * blockDim.x; + Index kernelAlign = j * blockDim.x; + + #pragma unroll + for( Index i = 0; i <= radiusX; i++ ) { + Index elementIndex = i + threadIdx.x + elementAlign; + Index kernelIndex = i + kernelAlign; + + result = convolve( result, data[ elementIndex ], kernel[ kernelIndex ] ); + } + } + + store( ix, iy, result ); +} + +template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > +__global__ +static void 
+convolution3D( Index kernelWidth, + Index kernelHeight, + Index kernelDepth, + Index endX, + Index endY, + Index endZ, + FetchData fetchData, + FetchBoundary fetchBoundary, + FetchKernel fetchKernel, + Convolve convolve, + Store store ) +{ + Index iz = threadIdx.z + blockIdx.z * blockDim.z; + Index iy = threadIdx.y + blockIdx.y * blockDim.y; + Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + if( ix >= endX || iy >= endY || iz >= endZ ) + return; + + Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ) * ( 2 * kernelDepth - 1 ); + + Real* data = TNL::Cuda::getSharedMemory< Real >(); + Real* kernel = data + kernelOffset; + + Index radiusZ = kernelDepth >> 1; + Index radiusY = kernelHeight >> 1; + Index radiusX = kernelWidth >> 1; + + Index x, y, z, index; + + // Z: 0 Y: 0 X: 0 + x = ix - radiusX; + y = iy - radiusY; + z = iz - radiusZ; + + index = threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; + + kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z ); + + if( x < 0 || y < 0 || z < 0 ) { + data[ index ] = fetchBoundary( x, y, z ); + } + else { + data[ index ] = fetchData( x, y, z ); + } + + // Z: 0 Y: 0 X: 1 + x = ix + radiusX; + y = iy - radiusY; + z = iz - radiusZ; + + index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; + + if( x >= endX || y < 0 || z < 0 ) { + data[ index ] = fetchBoundary( x, y, z ); + } + else { + data[ index ] = fetchData( x, y, z ); + } + + // Z: 0 Y: 1 X: 0 + x = ix - radiusX; + y = iy + radiusY; + z = iz - radiusZ; + + index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; + + if( x < 0 || y >= endY || z < 0 ) { + data[ index ] = fetchBoundary( x, y, z ); + } + else { + data[ index ] = fetchData( x, y, z ); + } + + // Z: 1 Y: 0 X: 0 + x = ix - radiusX; + y = iy - radiusY; + z = iz + radiusZ; + + index = threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z 
) * blockDim.x * blockDim.y; + + if( x < 0 || y < 0 || z >= endZ ) { + data[ index ] = fetchBoundary( x, y, z ); + } + else { + data[ index ] = fetchData( x, y, z ); + } + + // Z: 0 Y: 1 X: 1 + x = ix + radiusX; + y = iy + radiusY; + z = iz - radiusZ; + + index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; + + if( x >= endX || y >= endY || z < 0 ) { + data[ index ] = fetchBoundary( x, y, z ); + } + else { + data[ index ] = fetchData( x, y, z ); + } + + // Z: 1 Y: 0 X: 1 + x = ix + radiusX; + y = iy - radiusY; + z = iz + radiusZ; + + index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; + + if( x >= endX || y < 0 || z >= endZ ) { + data[ index ] = fetchBoundary( x, y, z ); + } + else { + data[ index ] = fetchData( x, y, z ); + } + + // Z: 1 Y: 1 X: 0 + x = ix - radiusX; + y = iy + radiusY; + z = iz + radiusZ; + + index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; + + if( x < 0 || y >= endY || z >= endZ ) { + data[ index ] = fetchBoundary( x, y, z ); + } + else { + data[ index ] = fetchData( x, y, z ); + } + + // Z: 1 Y: 1 X: 1 + x = ix + radiusX; + y = iy + radiusY; + z = iz + radiusZ; + + index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; + + if( x >= endX || y >= endY || z >= endZ ) { + data[ index ] = fetchBoundary( x, y, z ); + } + else { + data[ index ] = fetchData( x, y, z ); + } + + __syncthreads(); + + Real result = 0; + + for( Index k = 0; k <= radiusZ; k++ ) { + Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x; + Index xyKernelAlign = k * blockDim.x * blockDim.y; + + for( Index j = 0; j <= radiusY; j++ ) { + Index xAlign = ( j + threadIdx.y ) * blockDim.x; + Index xKernelAlign = j * blockDim.x; + + for( Index i = 0; i <= radiusX; i++ ) { + Index elementIndex = i + threadIdx.x + xAlign + 
xyAlign; + Index kernelIndex = i + xKernelAlign + xyKernelAlign; + + result = convolve( result, data[ elementIndex ], kernel[ kernelIndex ] ); + } + } + } + + store( ix, iy, iz, result ); +} + +template< int Dimension, typename Device > +struct Convolution; + +template<> +struct Convolution< 1, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 1, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + + for( Index i = 0; i < kernelSize.getSize(); i++ ) + kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1; + + configuration.dynamicSharedMemorySize = ( kernelSize.x() + kernelElementCount + 1 ) * sizeof( Real ); // convolution1D stores the coefficients at offset 2 * kernelWidth, so 3 * kernelWidth elements are needed + + configuration.blockSize.x = kernelSize.x(); + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration configuration; + + setup< Index, Real >( configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + TNL::Cuda::launchKernel< true >( + kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store ); + }; +}; + +template<> +struct Convolution< 2, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 2, Index >; + + template< 
typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + Index kernelVolume = 1; + + for( Index i = 0; i < kernelSize.getSize(); i++ ) { + kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1; + kernelVolume *= kernelSize[ i ]; + } + + configuration.dynamicSharedMemorySize = ( kernelVolume + kernelElementCount ) * sizeof( Real ); + + configuration.blockSize.x = kernelSize.x(); + configuration.blockSize.y = kernelSize.y(); + + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration configuration; + + setup< Index, Real >( configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + TNL::Cuda::launchKernel< true >( kernel, + 0, + configuration, + kernelSize.x(), + kernelSize.y(), + dimensions.x(), + dimensions.y(), + fetchData, + fetchBoundary, + fetchKernel, + convolve, + store ); + }; +}; + +template<> +struct Convolution< 3, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 3, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< 
Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + Index kernelVolume = 1; + + for( Index i = 0; i < kernelSize.getSize(); i++ ) { + kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1; + kernelVolume *= kernelSize[ i ]; + } + + configuration.dynamicSharedMemorySize = ( kernelVolume + kernelElementCount ) * sizeof( Real ); + + configuration.blockSize.x = kernelSize.x(); + configuration.blockSize.y = kernelSize.y(); + configuration.blockSize.z = kernelSize.z(); + + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename FetchKernel, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + FetchKernel&& fetchKernel, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration configuration; + + setup< Index, Real >( configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + + TNL::Cuda::launchKernel< true >( kernel, + 0, + configuration, + kernelSize.x(), + kernelSize.y(), + kernelSize.z(), + dimensions.x(), + dimensions.y(), + dimensions.z(), + fetchData, + fetchBoundary, + fetchKernel, + convolve, + store ); + }; +}; + +#endif diff --git a/src/Benchmarks/Convolution/support/Benchmark.h b/src/Benchmarks/Convolution/support/Benchmark.h index ce1b91b23..b489000d4 100644 --- 
a/src/Benchmarks/Convolution/support/Benchmark.h +++ b/src/Benchmarks/Convolution/support/Benchmark.h @@ -19,12 +19,12 @@ public: void run( const TNL::Config::ParameterContainer& parameters ) const { - if( ! TNL::Devices::Host::setup( parameters ) || ! TNL::Devices::Cuda::setup( parameters ) ) + if( ! TNL::Devices::Cuda::setup( parameters ) ) return; const TNL::String logFileName = parameters.getParameter< TNL::String >( "log-file" ); const TNL::String outputMode = parameters.getParameter< TNL::String >( "output-mode" ); - const TNL::String device = parameters.getParameter< TNL::String >( "device" ); + const int verbose = parameters.getParameter< int >( "verbose" ); const int loops = parameters.getParameter< int >( "loops" ); @@ -58,19 +58,10 @@ public: config.addEntryEnum( "append" ); config.addEntryEnum( "overwrite" ); - config.addEntry< TNL::String >( "device", "Device the computation will run on.", "cuda" ); - config.addEntryEnum< TNL::String >( "all" ); - config.addEntryEnum< TNL::String >( "host" ); - -#ifdef HAVE_CUDA - config.addEntryEnum< TNL::String >( "cuda" ); -#endif - config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); config.addEntry< int >( "verbose", "Verbose mode.", 1 ); config.addDelimiter( "Device settings:" ); - TNL::Devices::Host::configSetup( config ); #ifdef HAVE_CUDA TNL::Devices::Cuda::configSetup( config ); diff --git a/src/Benchmarks/Convolution/support/DummyBenchmark.h b/src/Benchmarks/Convolution/support/DummyBenchmark.h index 005096d12..9b44d53dd 100644 --- a/src/Benchmarks/Convolution/support/DummyBenchmark.h +++ b/src/Benchmarks/Convolution/support/DummyBenchmark.h @@ -23,77 +23,52 @@ public: virtual void start( TNLBenchmark& benchmark, const TNL::Config::ParameterContainer& parameters ) const override { - Vector start; - Vector end; + Vector dimension; Vector minKernelSize; Vector maxKernelSize; for( int i = 0; i < Dimension; i++ ) { - start[ i ] = parameters.getParameter< int >( minDimensionIds[ i 
] ); - end[ i ] = parameters.getParameter< int >( maxDimensionIds[ i ] ); + dimension[ i ] = parameters.getParameter< int >( dimensionIds[ i ] ); minKernelSize[ i ] = parameters.getParameter< int >( minKernelSizeIds[ i ] ); maxKernelSize[ i ] = parameters.getParameter< int >( maxKernelSizeIds[ i ] ); - TNL_ASSERT_GT( start[ i ], 1, "Start dimension must be positive integer" ); - TNL_ASSERT_GT( end[ i ], start[ i ], "End dimension must be greater than start dimension" ); - TNL_ASSERT_GE( minKernelSize[ i ], 1, "Minimal kernel size must be a positive number" ); TNL_ASSERT_EQ( minKernelSize[ i ] % 2, 1, "Minimal kernel size must be odd" ); - TNL_ASSERT_GT( maxKernelSize[ i ], minKernelSize[ i ], "End dimension must be greater than start dimension" ); - TNL_ASSERT_GT( end[ i ], start[ i ], "End kernel size must be greater than start kernel size" ); + TNL_ASSERT_GT( maxKernelSize[ i ], minKernelSize[ i ], "End kernel size must be greater than start kernel size" ); } - int dimensionStep = parameters.getParameter< int >( "dimension-step" ); int kernelStep = parameters.getParameter< int >( "kernel-step" ); - TNL_ASSERT_GT( dimensionStep, 1, "Dimension step must be a positive number" ); TNL_ASSERT_GT( kernelStep, 0, "Kernel step must be a positive number" ); TNL_ASSERT_EQ( kernelStep % 2, 0, "Kernel step must be even" ); - TNL::String id = parameters.getParameter("id"); + TNL::String id = parameters.getParameter< TNL::String >( "id" ); - time( id, benchmark, start, end, dimensionStep, minKernelSize, maxKernelSize, kernelStep ); + time( id, benchmark, dimension, minKernelSize, maxKernelSize, kernelStep ); } virtual void time( const TNL::String& id, TNLBenchmark& benchmark, - const Vector& minDimension, - const Vector& maxDimension, - const int dimensionStep, + const Vector& dimension, const Vector& minKernelSize, const Vector& maxKernelSize, const int kernelStep ) const { - Vector currentDimension = minDimension; - Vector currentKernelSize; + Vector currentKernelSize = 
minKernelSize; do { - currentKernelSize = minKernelSize; - - do { - timeConvolution( id, benchmark, currentDimension, currentKernelSize ); - - currentKernelSize[ 0 ] += kernelStep; - - for( size_t i = 0; i < currentKernelSize.getSize() - 1; i++ ) { - if( currentKernelSize[ i ] >= maxKernelSize[ i ] ) { - currentKernelSize[ i ] = minKernelSize[ i ]; - currentKernelSize[ i + 1 ] += kernelStep; - } - } - } while( currentKernelSize < maxKernelSize ); + timeConvolution( id, benchmark, dimension, currentKernelSize ); - currentDimension[ 0 ] *= dimensionStep; + currentKernelSize[ 0 ] += kernelStep; - for( size_t i = 0; i < currentDimension.getSize() - 1; i++ ) { - if( currentDimension[ i ] >= maxDimension[ i ] ) { - currentDimension[ i ] = minDimension[ i ]; - currentDimension[ i + 1 ] *= dimensionStep; + for( size_t i = 0; i < currentKernelSize.getSize() - 1; i++ ) { + if( currentKernelSize[ i ] >= maxKernelSize[ i ] ) { + currentKernelSize[ i ] = minKernelSize[ i ]; + currentKernelSize[ i + 1 ] += kernelStep; } } - - } while( currentDimension < maxDimension ); + } while( currentKernelSize < maxKernelSize ); } void @@ -101,7 +76,7 @@ public: { auto device = TNL::getType< Device >(); - typename TNLBenchmark::MetadataColumns columns = {{ "id", id }}; + typename TNLBenchmark::MetadataColumns columns = { { "id", id } }; size_t elementsCount = 1; size_t kernelElementsCount = 1; @@ -110,8 +85,8 @@ public: elementsCount *= dimension[ i ]; kernelElementsCount *= kernelSize[ i ]; - columns.push_back( { dimensionIds[ i ], TNL::convertToString(dimension[ i ]) } ); - columns.push_back( { kernelSizeIds[ i ], TNL::convertToString(kernelSize[ i ]) } ); + columns.push_back( { dimensionIds[ i ], TNL::convertToString( dimension[ i ] ) } ); + columns.push_back( { kernelSizeIds[ i ], TNL::convertToString( kernelSize[ i ] ) } ); } benchmark.setDatasetSize( ( elementsCount * 4 ) / 1.e9, 1.0 ); @@ -134,10 +109,10 @@ public: auto measure = [ & ]() { - DummyTask::exec(dimension, kernelSize, 
inputView, resultView, kernelView); + DummyTask< int, float, Dimension, Device >::exec( dimension, kernelSize, inputView, resultView, kernelView ); }; - benchmark.template time( device, measure ); + benchmark.template time< Device >( device, measure ); } TNL::Config::ConfigDescription @@ -148,12 +123,7 @@ public: config.addDelimiter( "Grid dimension settings:" ); for( int i = 0; i < Dimension; i++ ) - config.addEntry< int >( minDimensionIds[ i ], minDimensionIds[ i ], 16 ); - - for( int i = 0; i < Dimension; i++ ) - config.addEntry< int >( maxDimensionIds[ i ], maxDimensionIds[ i ], 128 ); - - config.addEntry< int >( "dimension-step", "Step of kernel increase by which dimension is multiplied (must be even)", 2 ); + config.addEntry< int >( dimensionIds[ i ], dimensionIds[ i ], 16 ); config.addDelimiter( "Kernel settings:" ); diff --git a/src/Benchmarks/Convolution/support/Solver.h b/src/Benchmarks/Convolution/support/Solver.h index c0c0b31e2..3fd56fb02 100644 --- a/src/Benchmarks/Convolution/support/Solver.h +++ b/src/Benchmarks/Convolution/support/Solver.h @@ -13,7 +13,7 @@ public: void solve( const TNL::Config::ParameterContainer& parameters ) const { - if( ! TNL::Devices::Host::setup( parameters ) || ! TNL::Devices::Cuda::setup( parameters ) ) + if( ! 
TNL::Devices::Cuda::setup( parameters ) ) return; start( parameters ); @@ -30,16 +30,7 @@ public: { TNL::Config::ConfigDescription config; - config.addEntry< TNL::String >( "device", "Device the computation will run on.", "cuda" ); - config.addEntryEnum< TNL::String >( "all" ); - config.addEntryEnum< TNL::String >( "host" ); - -#ifdef HAVE_CUDA - config.addEntryEnum< TNL::String >( "cuda" ); -#endif - config.addDelimiter( "Device settings:" ); - TNL::Devices::Host::configSetup( config ); #ifdef HAVE_CUDA TNL::Devices::Cuda::configSetup( config ); -- GitLab From 6860696cdb39ab8290a3300f65e683362b7b5e6a Mon Sep 17 00:00:00 2001 From: hayeuyur Date: Mon, 4 Apr 2022 08:40:08 +0200 Subject: [PATCH 11/19] Fix convolution kernel execution --- src/Benchmarks/Convolution/kernels/naive.h | 2 +- .../Convolution/kernels/sharedData.h | 85 +++++++++--------- .../Convolution/kernels/sharedDataAndKernel.h | 87 ++++++++++--------- .../Convolution/kernels/sharedKernel.h | 28 +++--- .../Convolution/support/DummySolver.h | 9 +- .../Convolution/support/DummyTask.h | 8 +- 6 files changed, 116 insertions(+), 103 deletions(-) diff --git a/src/Benchmarks/Convolution/kernels/naive.h b/src/Benchmarks/Convolution/kernels/naive.h index 4326e0d74..5705deb04 100644 --- a/src/Benchmarks/Convolution/kernels/naive.h +++ b/src/Benchmarks/Convolution/kernels/naive.h @@ -305,7 +305,7 @@ public: TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); - configuration.gridSize.y = + configuration.gridSize.z = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); } diff --git a/src/Benchmarks/Convolution/kernels/sharedData.h b/src/Benchmarks/Convolution/kernels/sharedData.h index dc173703e..8abc82fda 100644 --- 
a/src/Benchmarks/Convolution/kernels/sharedData.h +++ b/src/Benchmarks/Convolution/kernels/sharedData.h @@ -33,16 +33,13 @@ convolution1D( Index kernelWidth, { Index ix = threadIdx.x + blockIdx.x * blockDim.x; - if( ix >= endX ) - return; - Real* shared = TNL::Cuda::getSharedMemory< Real >(); Index radius = kernelWidth >> 1; // Left Index lhs = ix - radius; - if( lhs < 0 ) { + if( lhs < 0 || lhs >= endX ) { shared[ threadIdx.x ] = fetchBoundary( lhs ); } else { @@ -52,7 +49,7 @@ convolution1D( Index kernelWidth, // Right Index rhs = ix + radius; - if( rhs >= endX ) { + if( rhs < 0 || rhs >= endX ) { shared[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs ); } else { @@ -61,6 +58,9 @@ convolution1D( Index kernelWidth, __syncthreads(); + if( ix >= endX ) + return; + Real result = 0; #pragma unroll @@ -95,9 +95,6 @@ convolution2D( Index kernelWidth, Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; - if( ix >= endX || iy >= endY ) - return; - Real* shared = TNL::Cuda::getSharedMemory< Real >(); Index radiusY = kernelHeight >> 1; @@ -105,13 +102,16 @@ convolution2D( Index kernelWidth, Index x, y, index; + Index kernelHorizontalPadding = kernelWidth == 1 ? 0 : kernelWidth; + Index kernelVerticalPadding = kernelHeight == 1 ? 
0 : kernelHeight; + // Top Left x = ix - radiusX; y = iy - radiusY; index = threadIdx.x + threadIdx.y * blockDim.x; - if( x < 0 || y < 0 ) { + if( x < 0 || y < 0 || x >= endX || y >= endY ) { shared[ index ] = fetchBoundary( x, y ); } else { @@ -122,9 +122,9 @@ convolution2D( Index kernelWidth, x = ix + radiusX; y = iy - radiusY; - index = radiusX + threadIdx.x + threadIdx.y * blockDim.x; + index = kernelHorizontalPadding + threadIdx.x + threadIdx.y * blockDim.x; - if( x >= endX || y < 0 ) { + if( x < 0 || y < 0 || x >= endX || y >= endY ) { shared[ index ] = fetchBoundary( x, y ); } else { @@ -135,9 +135,9 @@ convolution2D( Index kernelWidth, x = ix - radiusX; y = iy + radiusY; - index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x; + index = threadIdx.x + ( kernelVerticalPadding + threadIdx.y ) * blockDim.x; - if( x < 0 || y >= endY ) { + if(x < 0 || y < 0 || x >= endX || y >= endY ) { shared[ index ] = fetchBoundary( x, y ); } else { @@ -148,9 +148,9 @@ convolution2D( Index kernelWidth, x = ix + radiusX; y = iy + radiusY; - index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x; + index = kernelHorizontalPadding + threadIdx.x + ( kernelVerticalPadding + threadIdx.y ) * blockDim.x; - if( x >= endX || y >= endY ) { + if( x < 0 || y < 0 || x >= endX || y >= endY ) { shared[ index ] = fetchBoundary( x, y ); } else { @@ -159,12 +159,15 @@ convolution2D( Index kernelWidth, __syncthreads(); + if( ix >= endX || iy >= endY ) + return; + Real result = 0; - for( Index j = 0; j <= radiusY; j++ ) { + for( Index j = 0; j < kernelHeight; j++ ) { Index align = ( j + threadIdx.y ) * blockDim.x; - for( Index i = 0; i <= radiusX; i++ ) { + for( Index i = 0; i < kernelWidth; i++ ) { Index index = i + threadIdx.x + align; result = convolve( result, shared[ index ], fetchKernel( i, j ) ); @@ -199,9 +202,6 @@ convolution3D( Index kernelWidth, Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; - if( ix >= endX || 
iy >= endY || iz >= endZ ) - return; - Real* shared = TNL::Cuda::getSharedMemory< Real >(); Index radiusZ = kernelDepth >> 1; @@ -215,9 +215,9 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz - radiusZ; - index = threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; + index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; - if( x < 0 || y < 0 || z < 0 ) { + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); } else { @@ -229,9 +229,9 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz - radiusZ; - index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; + index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; - if( x >= endX || y < 0 || z < 0 ) { + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); } else { @@ -243,9 +243,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz - radiusZ; - index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; + index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; - if( x < 0 || y >= endY || z < 0 ) { + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); } else { @@ -257,9 +257,9 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz + radiusZ; - index = threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; + index = threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; - if( x < 0 || y < 0 || z >= endZ ) { + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); } else { @@ -271,9 +271,9 @@ 
convolution3D( Index kernelWidth, y = iy + radiusY; z = iz - radiusZ; - index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; + index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; - if( x >= endX || y >= endY || z < 0 ) { + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); } else { @@ -285,9 +285,9 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz + radiusZ; - index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; + index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; - if( x >= endX || y < 0 || z >= endZ ) { + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); } else { @@ -299,9 +299,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz + radiusZ; - index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; + index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; - if( x < 0 || y >= endY || z >= endZ ) { + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); } else { @@ -313,9 +313,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz + radiusZ; - index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; + index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; - if( x >= endX || y >= endY || z >= endZ ) { + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { shared[ index ] = fetchBoundary( x, y, z ); } 
else { @@ -324,15 +324,18 @@ convolution3D( Index kernelWidth, __syncthreads(); + if( ix >= endX || iy >= endY || iz >= endZ ) + return; + Real result = 0; - for( Index k = 0; k <= radiusZ; k++ ) { + for( Index k = 0; k < kernelDepth; k++ ) { Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x; - for( Index j = 0; j <= radiusY; j++ ) { + for( Index j = 0; j < kernelHeight; j++ ) { Index xAlign = ( j + threadIdx.y ) * blockDim.x; - for( Index i = 0; i <= radiusX; i++ ) { + for( Index i = 0; i < kernelWidth; i++ ) { Index index = i + threadIdx.x + xAlign + xyAlign; result = convolve( result, shared[ index ], fetchKernel( i, j, k ) ); @@ -360,7 +363,7 @@ public: Index kernelElementCount = 1; for( Index i = 0; i < kernelSize.getSize(); i++ ) - kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1; + kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1 ; configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); @@ -486,7 +489,7 @@ public: TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); - configuration.gridSize.y = + configuration.gridSize.z = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); } diff --git a/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h b/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h index 8cbc26aa4..70e1d58a9 100644 --- a/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h +++ b/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h @@ -49,9 +49,6 @@ convolution1D( Index kernelWidth, { Index ix = threadIdx.x + blockIdx.x * blockDim.x; - if( ix >= endX ) - return; - Index kernelOffset = 2 * kernelWidth; Real* data = TNL::Cuda::getSharedMemory< Real >(); @@ -62,7 +59,7 @@ convolution1D( Index kernelWidth, // Left Index lhs = ix - 
radius; - if( lhs < 0 ) { + if( lhs < 0 || lhs >= endX ) { data[ threadIdx.x ] = fetchBoundary( lhs ); } else { @@ -72,7 +69,7 @@ convolution1D( Index kernelWidth, // Right Index rhs = ix + radius; - if( rhs >= endX ) { + if( rhs < 0 || rhs >= endX ) { data[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs ); } else { @@ -83,6 +80,9 @@ convolution1D( Index kernelWidth, __syncthreads(); + if( ix >= endX ) + return; + Real result = 0; #pragma unroll @@ -117,9 +117,6 @@ convolution2D( Index kernelWidth, Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; - if( ix >= endX || iy >= endY ) - return; - Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ); Real* data = TNL::Cuda::getSharedMemory< Real >(); @@ -138,7 +135,7 @@ convolution2D( Index kernelWidth, kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y ); - if( x < 0 || y < 0 ) { + if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { @@ -149,9 +146,9 @@ convolution2D( Index kernelWidth, x = ix + radiusX; y = iy - radiusY; - index = radiusX + threadIdx.x + threadIdx.y * blockDim.x; + index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x; - if( x >= endX || y < 0 ) { + if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { @@ -162,9 +159,9 @@ convolution2D( Index kernelWidth, x = ix - radiusX; y = iy + radiusY; - index = threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x; + index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x; - if( x < 0 || y >= endY ) { + if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { @@ -175,9 +172,9 @@ convolution2D( Index kernelWidth, x = ix + radiusX; y = iy + radiusY; - index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.x; + index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x; - if( x >= endX || y >= endY ) { + if( x < 0 || 
y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { @@ -186,15 +183,18 @@ convolution2D( Index kernelWidth, __syncthreads(); + if( ix >= endX || iy >= endY ) + return; + Real result = 0; #pragma unroll - for( Index j = 0; j <= radiusY; j++ ) { + for( Index j = 0; j < kernelHeight; j++ ) { Index elementAlign = ( j + threadIdx.y ) * blockDim.x; Index kernelAlign = j * blockDim.x; #pragma unroll - for( Index i = 0; i <= radiusX; i++ ) { + for( Index i = 0; i < kernelWidth; i++ ) { Index elementIndex = i + threadIdx.x + elementAlign; Index kernelIndex = i + kernelAlign; @@ -230,9 +230,6 @@ convolution3D( Index kernelWidth, Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; - if( ix >= endX || iy >= endY || iz >= endZ ) - return; - Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ) * ( 2 * kernelDepth - 1 ); Real* data = TNL::Cuda::getSharedMemory< Real >(); @@ -249,11 +246,11 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz - radiusZ; - index = threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; + index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z ); - if( x < 0 || y < 0 || z < 0 ) { + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { @@ -265,9 +262,9 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz - radiusZ; - index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; + index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; - if( x >= endX || y < 0 || z < 0 ) { + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { @@ -279,9 +276,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z 
= iz - radiusZ; - index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; + index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; - if( x < 0 || y >= endY || z < 0 ) { + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { @@ -293,9 +290,9 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz + radiusZ; - index = threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; + index = threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; - if( x < 0 || y < 0 || z >= endZ ) { + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { @@ -307,9 +304,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz - radiusZ; - index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + threadIdx.z * blockDim.x * blockDim.y; + index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; - if( x >= endX || y >= endY || z < 0 ) { + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ) { data[ index ] = fetchBoundary( x, y, z ); } else { @@ -321,9 +318,9 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz + radiusZ; - index = radiusX + threadIdx.x + threadIdx.y * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; + index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; - if( x >= endX || y < 0 || z >= endZ ) { + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { @@ -335,9 +332,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz + radiusZ; - index = threadIdx.x + ( radiusY + 
threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; + index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; - if( x < 0 || y >= endY || z >= endZ ) { + if(x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { @@ -349,9 +346,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz + radiusZ; - index = radiusX + threadIdx.x + ( radiusY + threadIdx.y ) * blockDim.y + ( radiusZ + threadIdx.z ) * blockDim.x * blockDim.y; + index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; - if( x >= endX || y >= endY || z >= endZ ) { + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { @@ -360,21 +357,25 @@ convolution3D( Index kernelWidth, __syncthreads(); + if( ix >= endX || iy >= endY || iz >= endZ ) + return; + Real result = 0; - for( Index k = 0; k <= radiusZ; k++ ) { + #pragma unroll + for( Index k = 0; k < kernelDepth; k++ ) { Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x; Index xyKernelAlign = k * blockDim.x * blockDim.y; - - for( Index j = 0; j <= radiusY; j++ ) { + #pragma unroll + for( Index j = 0; j < kernelHeight; j++ ) { Index xAlign = ( j + threadIdx.y ) * blockDim.x; Index xKernelAlign = j * blockDim.x; - - for( Index i = 0; i <= radiusX; i++ ) { + #pragma unroll + for( Index i = 0; i < kernelWidth; i++ ) { Index elementIndex = i + threadIdx.x + xAlign + xyAlign; Index kernelIndex = i + xKernelAlign + xyKernelAlign; - result = convolve( result, data[ index ], kernel[ kernelIndex ] ); + result = convolve( result, data[ elementIndex ], kernel[ kernelIndex ] ); } } } @@ -531,7 +532,7 @@ public: TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); configuration.gridSize.y 
= TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); - configuration.gridSize.y = + configuration.gridSize.z = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); } diff --git a/src/Benchmarks/Convolution/kernels/sharedKernel.h b/src/Benchmarks/Convolution/kernels/sharedKernel.h index d3e1e4da3..c217cfb34 100644 --- a/src/Benchmarks/Convolution/kernels/sharedKernel.h +++ b/src/Benchmarks/Convolution/kernels/sharedKernel.h @@ -34,9 +34,6 @@ convolution1D( Index kernelWidth, { Index ix = threadIdx.x + blockIdx.x * blockDim.x; - if( ix >= endX ) - return; - Real* shared = TNL::Cuda::getSharedMemory< Real >(); Index radius = kernelWidth >> 1; @@ -46,8 +43,12 @@ convolution1D( Index kernelWidth, __syncthreads(); + if( ix >= endX ) + return; + Real result = 0; + #pragma unroll for( Index i = -radius; i <= radius; i++ ) { Index elementIndex = i + ix; Index kernelIndex = i + radius; @@ -85,9 +86,6 @@ convolution2D( Index kernelWidth, Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; - if( ix >= endX || iy >= endY ) - return; - Real* shared = TNL::Cuda::getSharedMemory< Real >(); Index radiusY = kernelHeight >> 1; @@ -100,12 +98,17 @@ convolution2D( Index kernelWidth, __syncthreads(); + if( ix >= endX || iy >= endY ) + return; + Real result = 0; + #pragma unroll for( Index j = -radiusY; j <= radiusY; j++ ) { Index elementIndexY = j + iy; Index kernelIndexY = j + radiusY; + #pragma unroll for( Index i = -radiusX; i <= radiusX; i++ ) { Index elementIndexX = i + ix; Index kernelIndexX = i + radiusX; @@ -149,9 +152,6 @@ convolution3D( Index kernelWidth, Index iy = threadIdx.y + blockIdx.y * blockDim.y; Index ix = threadIdx.x + blockIdx.x * blockDim.x; - if( ix >= endX || iy >= endY || iz >= endZ ) - return; - Real* shared = TNL::Cuda::getSharedMemory< Real >(); Index radiusZ = kernelDepth >> 1; @@ 
-160,23 +160,27 @@ convolution3D( Index kernelWidth, Index threadIndex = threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z; - printf( "%d\n", threadIndex ); - // The size of the block is equal to the kernel size shared[ threadIndex ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z ); __syncthreads(); + if( ix >= endX || iy >= endY || iz >= endZ ) + return; + Real result = 0; + #pragma unroll for( Index k = -radiusZ; k <= radiusZ; k++ ) { Index elementIndexZ = k + iz; Index kernelIndexZ = k + radiusZ; + #pragma unroll for( Index j = -radiusY; j <= radiusY; j++ ) { Index elementIndexY = j + iy; Index kernelIndexY = j + radiusY; + #pragma unroll for( Index i = -radiusX; i <= radiusX; i++ ) { Index elementIndexX = i + ix; Index kernelIndexX = i + radiusX; @@ -338,7 +342,7 @@ public: TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); configuration.gridSize.y = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); - configuration.gridSize.y = + configuration.gridSize.z = TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.z(), configuration.blockSize.z ) ); } diff --git a/src/Benchmarks/Convolution/support/DummySolver.h b/src/Benchmarks/Convolution/support/DummySolver.h index a871c7f3f..82a8f6ad4 100644 --- a/src/Benchmarks/Convolution/support/DummySolver.h +++ b/src/Benchmarks/Convolution/support/DummySolver.h @@ -61,6 +61,11 @@ public: DummyTask::exec(dimension, kernelSize, inputView, resultView, kernelView); + TNL::Containers::Array< float, TNL::Devices::Host, int > host(result); + + for (int i = 0; i < host.getSize(); i++) + TNL_ASSERT_EQ(host[i], kernelElementsCount, "Dummy task always sets volume of kernel"); + std::cout << "Everything is fine" << std::endl; } @@ -72,12 +77,12 @@ public: config.addDelimiter( "Grid dimension settings:" ); for( int i = 0; i < Dimension; i++ ) - 
config.addEntry< int >( dimensionIds[ i ], dimensionIds[ i ], 512 ); + config.addEntry< int >( dimensionIds[ i ], dimensionIds[ i ], 64 ); config.addDelimiter( "Kernel settings:" ); for( int i = 0; i < Dimension; i++ ) - config.addEntry< int >( kernelSizeIds[ i ], kernelSizeIds[ i ] + " (odd) :", 11 ); + config.addEntry< int >( kernelSizeIds[ i ], kernelSizeIds[ i ] + " (odd) :", 9 ); return config; } diff --git a/src/Benchmarks/Convolution/support/DummyTask.h b/src/Benchmarks/Convolution/support/DummyTask.h index e850b64c0..026575c38 100644 --- a/src/Benchmarks/Convolution/support/DummyTask.h +++ b/src/Benchmarks/Convolution/support/DummyTask.h @@ -55,7 +55,7 @@ public: return kernel[ i ]; }; - auto convolve = [ = ] __cuda_callable__( Real result, Index data, Index kernel ) + auto convolve = [ = ] __cuda_callable__( Real result, Real data, Real kernel ) { return result + data * kernel; }; @@ -97,7 +97,7 @@ public: auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j ) { - return -1; + return 1; }; auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j ) @@ -107,7 +107,7 @@ public: return kernel[ index ]; }; - auto convolve = [ = ] __cuda_callable__( Real result, Index data, Index kernel ) + auto convolve = [ = ] __cuda_callable__( Real result, Real data, Real kernel ) { return result + data * kernel; }; @@ -161,7 +161,7 @@ public: return kernel[ index ]; }; - auto convolve = [ = ] __cuda_callable__( float result, Index data, Index kernel ) + auto convolve = [ = ] __cuda_callable__( Real result, Real data, Real kernel ) { return result + data * kernel; }; -- GitLab From 47036e5e6ab038b5ada97d91312d3a309b44e198 Mon Sep 17 00:00:00 2001 From: hayeuyur Date: Mon, 4 Apr 2022 09:44:18 +0200 Subject: [PATCH 12/19] Add convolution directory in CMake --- src/Benchmarks/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt index 50e467762..e3b14d851 100644 --- 
a/src/Benchmarks/CMakeLists.txt +++ b/src/Benchmarks/CMakeLists.txt @@ -1,3 +1,4 @@ +add_subdirectory( Convolution ) add_subdirectory( HeatEquation ) add_subdirectory( BLAS ) add_subdirectory( NDArray ) -- GitLab From 12827c2dae50b8eca0c3b11b4319a0a80f88acb6 Mon Sep 17 00:00:00 2001 From: hayeuyur Date: Mon, 2 May 2022 18:55:39 +0200 Subject: [PATCH 13/19] Add prefer of the shared memory --- src/Benchmarks/Convolution/kernels/sharedData.h | 2 +- src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h | 6 ++++++ src/Benchmarks/Convolution/kernels/sharedKernel.h | 6 ++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/Benchmarks/Convolution/kernels/sharedData.h b/src/Benchmarks/Convolution/kernels/sharedData.h index 8abc82fda..97f252b11 100644 --- a/src/Benchmarks/Convolution/kernels/sharedData.h +++ b/src/Benchmarks/Convolution/kernels/sharedData.h @@ -6,7 +6,7 @@ * This method stores image tile into shared memory * and then calculates convolution. * - * Thanks for the idea https://www.evl.uic.edu/sjames/cs525/final.html + * Thanks for the idea https://www.evl.uic.edu/sjames/cs525/final.html */ #include diff --git a/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h b/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h index 70e1d58a9..62276c3cf 100644 --- a/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h +++ b/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h @@ -431,6 +431,8 @@ public: constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared); + TNL::Cuda::launchKernel< true >( kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store ); }; @@ -488,6 +490,8 @@ public: constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared); + 
TNL::Cuda::launchKernel< true >( kernel, 0, configuration, @@ -558,6 +562,8 @@ public: constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared); + TNL::Cuda::launchKernel< true >( kernel, 0, configuration, diff --git a/src/Benchmarks/Convolution/kernels/sharedKernel.h b/src/Benchmarks/Convolution/kernels/sharedKernel.h index c217cfb34..ba98efe73 100644 --- a/src/Benchmarks/Convolution/kernels/sharedKernel.h +++ b/src/Benchmarks/Convolution/kernels/sharedKernel.h @@ -247,6 +247,8 @@ public: constexpr auto kernel = convolution1D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared); + TNL::Cuda::launchKernel< true >( kernel, 0, configuration, kernelSize.x(), dimensions.x(), fetchData, fetchBoundary, fetchKernel, convolve, store ); }; @@ -301,6 +303,8 @@ public: constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared); + TNL::Cuda::launchKernel< true >( kernel, 0, configuration, @@ -368,6 +372,8 @@ public: constexpr auto kernel = convolution3D< Index, Real, FetchData, FetchBoundary, FetchKernel, Convolve, Store >; + cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared); + TNL::Cuda::launchKernel< true >( kernel, 0, configuration, -- GitLab From 231c69bf9fff0feeb7f2c1a80773fec2731dcdf6 Mon Sep 17 00:00:00 2001 From: hayeuyur Date: Tue, 3 May 2022 13:05:58 +0200 Subject: [PATCH 14/19] Add image convolution solver --- src/Benchmarks/Convolution/CMakeLists.txt | 9 + .../Convolution/kernels/sharedData.h | 141 +++++++------- .../Convolution/kernels/sharedDataAndKernel.h | 100 ++++++---- .../Convolution/support/DummyBenchmark.h | 6 +- .../Convolution/support/DummySolver.h | 6 +- .../Convolution/support/DummyTask.h | 17 +- .../Convolution/support/ImageSolver.h | 184 
++++++++++++++++++ src/Benchmarks/Convolution/support/Solver.h | 2 +- .../Convolution/templates/main_image_solver.h | 26 +++ src/TNL/Images/PNGImage_impl.h | 2 + 10 files changed, 371 insertions(+), 122 deletions(-) create mode 100644 src/Benchmarks/Convolution/support/ImageSolver.h create mode 100644 src/Benchmarks/Convolution/templates/main_image_solver.h diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt index 0569e1013..d34518dcc 100644 --- a/src/Benchmarks/Convolution/CMakeLists.txt +++ b/src/Benchmarks/Convolution/CMakeLists.txt @@ -17,6 +17,10 @@ if (${BUILD_CUDA}) SET(EXECUTABLE_NAME "${PREFIX}_${DIMENSION}_${MODULE_NAME}_${TEMPLATE_NAME}") CUDA_ADD_EXECUTABLE(${EXECUTABLE_NAME} ${SOURCE_FILE}) + + if( PNG_FOUND ) + target_link_libraries( ${EXECUTABLE_NAME} ${PNG_LIBRARIES} ) + endif() else() MESSAGE(WARNING "Convolutions are not supported on CPU") endif() @@ -54,3 +58,8 @@ GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_solver.h" "kernels/shar GENERATE_CUDA_EXECUTABLE("Convolution" 1 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 2 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h") GENERATE_CUDA_EXECUTABLE("Convolution" 3 "templates/main_benchmark.h" "kernels/sharedDataAndKernel.h") + +GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/naive.h") +GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedData.h") +GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedKernel.h") +GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedDataAndKernel.h") diff --git a/src/Benchmarks/Convolution/kernels/sharedData.h b/src/Benchmarks/Convolution/kernels/sharedData.h index 97f252b11..dcaa5236e 100644 --- a/src/Benchmarks/Convolution/kernels/sharedData.h +++ 
b/src/Benchmarks/Convolution/kernels/sharedData.h @@ -33,27 +33,27 @@ convolution1D( Index kernelWidth, { Index ix = threadIdx.x + blockIdx.x * blockDim.x; - Real* shared = TNL::Cuda::getSharedMemory< Real >(); + Real* data = TNL::Cuda::getSharedMemory< Real >(); Index radius = kernelWidth >> 1; // Left Index lhs = ix - radius; if( lhs < 0 || lhs >= endX ) { - shared[ threadIdx.x ] = fetchBoundary( lhs ); + data[ threadIdx.x ] = fetchBoundary( lhs ); } else { - shared[ threadIdx.x ] = fetchData( lhs ); + data[ threadIdx.x ] = fetchData( lhs ); } // Right Index rhs = ix + radius; if( rhs < 0 || rhs >= endX ) { - shared[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs ); + data[ threadIdx.x + blockDim.x ] = fetchBoundary( rhs ); } else { - shared[ threadIdx.x + blockDim.x ] = fetchData( rhs ); + data[ threadIdx.x + blockDim.x ] = fetchData( rhs ); } __syncthreads(); @@ -67,7 +67,7 @@ convolution1D( Index kernelWidth, for( Index i = 0; i < kernelWidth; i++ ) { Index elementIndex = i + threadIdx.x; - result = convolve( result, shared[ elementIndex ], fetchKernel( i ) ); + result = convolve( result, data[ elementIndex ], fetchKernel( i ) ); } store( ix, result ); @@ -92,69 +92,68 @@ convolution2D( Index kernelWidth, Convolve convolve, Store store ) { - Index iy = threadIdx.y + blockIdx.y * blockDim.y; - Index ix = threadIdx.x + blockIdx.x * blockDim.x; + Real* data = TNL::Cuda::getSharedMemory< Real >(); - Real* shared = TNL::Cuda::getSharedMemory< Real >(); + const Index iy = threadIdx.y + blockIdx.y * blockDim.y; + const Index ix = threadIdx.x + blockIdx.x * blockDim.x; - Index radiusY = kernelHeight >> 1; - Index radiusX = kernelWidth >> 1; + const Index radiusY = kernelHeight >> 1; + const Index radiusX = kernelWidth >> 1; - Index x, y, index; + const Index dataBlockWidth = 2 * kernelWidth - 1; + const Index dataBlockHeight = 2 * kernelHeight - 1; + + const Index dataBlockRadiusX = dataBlockWidth >> 1; + const Index dataBlockRadiusY = dataBlockHeight >> 1; - Index 
kernelHorizontalPadding = kernelWidth == 1 ? 0 : kernelWidth; - Index kernelVerticalPadding = kernelHeight == 1 ? 0 : kernelHeight; + Index x, y, index; // Top Left x = ix - radiusX; y = iy - radiusY; - - index = threadIdx.x + threadIdx.y * blockDim.x; + index = threadIdx.x + threadIdx.y * dataBlockWidth; if( x < 0 || y < 0 || x >= endX || y >= endY ) { - shared[ index ] = fetchBoundary( x, y ); + data[ index ] = fetchBoundary( x, y ); } else { - shared[ index ] = fetchData( x, y ); + data[ index ] = fetchData( x, y ); } // Top right x = ix + radiusX; y = iy - radiusY; - - index = kernelHorizontalPadding + threadIdx.x + threadIdx.y * blockDim.x; + index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth; if( x < 0 || y < 0 || x >= endX || y >= endY ) { - shared[ index ] = fetchBoundary( x, y ); + data[ index ] = fetchBoundary( x, y ); } else { - shared[ index ] = fetchData( x, y ); + data[ index ] = fetchData( x, y ); } // Bottom Left x = ix - radiusX; y = iy + radiusY; - - index = threadIdx.x + ( kernelVerticalPadding + threadIdx.y ) * blockDim.x; + index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth; if(x < 0 || y < 0 || x >= endX || y >= endY ) { - shared[ index ] = fetchBoundary( x, y ); + data[ index ] = fetchBoundary( x, y ); } else { - shared[ index ] = fetchData( x, y ); + data[ index ] = fetchData( x, y ); } // Bottom Right x = ix + radiusX; y = iy + radiusY; - - index = kernelHorizontalPadding + threadIdx.x + ( kernelVerticalPadding + threadIdx.y ) * blockDim.x; + index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth; if( x < 0 || y < 0 || x >= endX || y >= endY ) { - shared[ index ] = fetchBoundary( x, y ); + data[ index ] = fetchBoundary( x, y ); } else { - shared[ index ] = fetchData( x, y ); + data[ index ] = fetchData( x, y ); } __syncthreads(); @@ -165,12 +164,12 @@ convolution2D( Index kernelWidth, Real result = 0; for( Index j = 0; j < kernelHeight; j++ ) { - Index align = ( 
j + threadIdx.y ) * blockDim.x; + Index align = ( j + threadIdx.y ) * dataBlockWidth; for( Index i = 0; i < kernelWidth; i++ ) { Index index = i + threadIdx.x + align; - result = convolve( result, shared[ index ], fetchKernel( i, j ) ); + result = convolve( result, data[ index ], fetchKernel( i, j ) ); } } @@ -198,15 +197,25 @@ convolution3D( Index kernelWidth, Convolve convolve, Store store ) { - Index iz = threadIdx.z + blockIdx.z * blockDim.z; - Index iy = threadIdx.y + blockIdx.y * blockDim.y; - Index ix = threadIdx.x + blockIdx.x * blockDim.x; + Real* data = TNL::Cuda::getSharedMemory< Real >(); + + const Index ix = threadIdx.x + blockIdx.x * blockDim.x; + const Index iy = threadIdx.y + blockIdx.y * blockDim.y; + const Index iz = threadIdx.z + blockIdx.z * blockDim.z; + + const Index radiusX = kernelWidth >> 1; + const Index radiusY = kernelHeight >> 1; + const Index radiusZ = kernelDepth >> 1; + + const Index dataBlockWidth = 2 * kernelWidth - 1; + const Index dataBlockHeight = 2 * kernelHeight - 1; + const Index dataBlockDepth = 2 * kernelDepth - 1; - Real* shared = TNL::Cuda::getSharedMemory< Real >(); + const Index dataBlockXYVolume = dataBlockWidth * dataBlockHeight; - Index radiusZ = kernelDepth >> 1; - Index radiusY = kernelHeight >> 1; - Index radiusX = kernelWidth >> 1; + const Index dataBlockRadiusX = dataBlockWidth >> 1; + const Index dataBlockRadiusY = dataBlockHeight >> 1; + const Index dataBlockRadiusZ = dataBlockDepth >> 1; Index x, y, z, index; @@ -215,13 +224,13 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz - radiusZ; - index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; + index = threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { - shared[ index ] = fetchBoundary( x, y, z ); + data[ index ] = fetchBoundary( x, y, z ); } else { - shared[ index ] = fetchData( x, y, z ); + data[ index ] = 
fetchData( x, y, z ); } // Z: 0 Y: 0 X: 1 @@ -229,13 +238,13 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz - radiusZ; - index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; + index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { - shared[ index ] = fetchBoundary( x, y, z ); + data[ index ] = fetchBoundary( x, y, z ); } else { - shared[ index ] = fetchData( x, y, z ); + data[ index ] = fetchData( x, y, z ); } // Z: 0 Y: 1 X: 0 @@ -243,13 +252,13 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz - radiusZ; - index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; + index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { - shared[ index ] = fetchBoundary( x, y, z ); + data[ index ] = fetchBoundary( x, y, z ); } else { - shared[ index ] = fetchData( x, y, z ); + data[ index ] = fetchData( x, y, z ); } // Z: 1 Y: 0 X: 0 @@ -257,13 +266,13 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz + radiusZ; - index = threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; + index = threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { - shared[ index ] = fetchBoundary( x, y, z ); + data[ index ] = fetchBoundary( x, y, z ); } else { - shared[ index ] = fetchData( x, y, z ); + data[ index ] = fetchData( x, y, z ); } // Z: 0 Y: 1 X: 1 @@ -271,13 +280,13 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz - radiusZ; - index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + 
threadIdx.z * blockDim.x * blockDim.y; + index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { - shared[ index ] = fetchBoundary( x, y, z ); + data[ index ] = fetchBoundary( x, y, z ); } else { - shared[ index ] = fetchData( x, y, z ); + data[ index ] = fetchData( x, y, z ); } // Z: 1 Y: 0 X: 1 @@ -285,13 +294,13 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz + radiusZ; - index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; + index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { - shared[ index ] = fetchBoundary( x, y, z ); + data[ index ] = fetchBoundary( x, y, z ); } else { - shared[ index ] = fetchData( x, y, z ); + data[ index ] = fetchData( x, y, z ); } // Z: 1 Y: 1 X: 0 @@ -299,13 +308,13 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz + radiusZ; - index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; + index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { - shared[ index ] = fetchBoundary( x, y, z ); + data[ index ] = fetchBoundary( x, y, z ); } else { - shared[ index ] = fetchData( x, y, z ); + data[ index ] = fetchData( x, y, z ); } // Z: 1 Y: 1 X: 1 @@ -313,13 +322,13 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz + radiusZ; - index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; + index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * 
dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { - shared[ index ] = fetchBoundary( x, y, z ); + data[ index ] = fetchBoundary( x, y, z ); } else { - shared[ index ] = fetchData( x, y, z ); + data[ index ] = fetchData( x, y, z ); } __syncthreads(); @@ -330,15 +339,15 @@ convolution3D( Index kernelWidth, Real result = 0; for( Index k = 0; k < kernelDepth; k++ ) { - Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x; + Index xyAlign = ( k + threadIdx.z ) * dataBlockXYVolume; for( Index j = 0; j < kernelHeight; j++ ) { - Index xAlign = ( j + threadIdx.y ) * blockDim.x; + Index xAlign = ( j + threadIdx.y ) * dataBlockWidth; for( Index i = 0; i < kernelWidth; i++ ) { Index index = i + threadIdx.x + xAlign + xyAlign; - result = convolve( result, shared[ index ], fetchKernel( i, j, k ) ); + result = convolve( result, data[ index ], fetchKernel( i, j, k ) ); } } } diff --git a/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h b/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h index 62276c3cf..b9d094203 100644 --- a/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h +++ b/src/Benchmarks/Convolution/kernels/sharedDataAndKernel.h @@ -8,7 +8,7 @@ #include /** - * This method stores kernel and data in the shared memory to reduce amount of loads. + * This method stores kernel and data in the data memory to reduce amount of loads. * * We can calculate the size of shared memory needed the next way: * 1. 
We need to store in shared memory: @@ -49,7 +49,7 @@ convolution1D( Index kernelWidth, { Index ix = threadIdx.x + blockIdx.x * blockDim.x; - Index kernelOffset = 2 * kernelWidth; + Index kernelOffset = 2 * kernelWidth - 1; Real* data = TNL::Cuda::getSharedMemory< Real >(); Real* kernel = data + kernelOffset; @@ -114,26 +114,29 @@ convolution2D( Index kernelWidth, Convolve convolve, Store store ) { - Index iy = threadIdx.y + blockIdx.y * blockDim.y; - Index ix = threadIdx.x + blockIdx.x * blockDim.x; + const Index iy = threadIdx.y + blockIdx.y * blockDim.y; + const Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + const Index radiusY = kernelHeight >> 1; + const Index radiusX = kernelWidth >> 1; + + const Index dataBlockWidth = 2 * kernelWidth - 1; + const Index dataBlockHeight = 2 * kernelHeight - 1; + + const Index dataBlockRadiusX = dataBlockWidth >> 1; + const Index dataBlockRadiusY = dataBlockHeight >> 1; - Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ); + const Index kernelOffset = dataBlockWidth * dataBlockHeight; Real* data = TNL::Cuda::getSharedMemory< Real >(); Real* kernel = data + kernelOffset; - Index radiusY = kernelHeight >> 1; - Index radiusX = kernelWidth >> 1; - Index x, y, index; // Top Left x = ix - radiusX; y = iy - radiusY; - - index = threadIdx.x + threadIdx.y * blockDim.x; - - kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y ); + index = threadIdx.x + threadIdx.y * dataBlockWidth; if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); @@ -145,8 +148,7 @@ convolution2D( Index kernelWidth, // Top right x = ix + radiusX; y = iy - radiusY; - - index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x; + index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth; if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); @@ -158,10 +160,9 @@ convolution2D( Index kernelWidth, // Bottom Left x = ix - radiusX; y = iy + radiusY; + index = 
threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth; - index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x; - - if( x < 0 || y < 0 || x >= endX || y >= endY ) { + if(x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); } else { @@ -171,8 +172,7 @@ convolution2D( Index kernelWidth, // Bottom Right x = ix + radiusX; y = iy + radiusY; - - index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x; + index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth; if( x < 0 || y < 0 || x >= endX || y >= endY ) { data[ index ] = fetchBoundary( x, y ); @@ -181,6 +181,10 @@ convolution2D( Index kernelWidth, data[ index ] = fetchData( x, y ); } + index = threadIdx.x + threadIdx.y * blockDim.x; + + kernel[index] = fetchKernel( threadIdx.x, threadIdx.y ); + __syncthreads(); if( ix >= endX || iy >= endY ) @@ -190,7 +194,7 @@ convolution2D( Index kernelWidth, #pragma unroll for( Index j = 0; j < kernelHeight; j++ ) { - Index elementAlign = ( j + threadIdx.y ) * blockDim.x; + Index elementAlign = ( j + threadIdx.y ) * dataBlockWidth; Index kernelAlign = j * blockDim.x; #pragma unroll @@ -226,19 +230,29 @@ convolution3D( Index kernelWidth, Convolve convolve, Store store ) { - Index iz = threadIdx.z + blockIdx.z * blockDim.z; - Index iy = threadIdx.y + blockIdx.y * blockDim.y; - Index ix = threadIdx.x + blockIdx.x * blockDim.x; + const Index ix = threadIdx.x + blockIdx.x * blockDim.x; + const Index iy = threadIdx.y + blockIdx.y * blockDim.y; + const Index iz = threadIdx.z + blockIdx.z * blockDim.z; + + const Index radiusX = kernelWidth >> 1; + const Index radiusY = kernelHeight >> 1; + const Index radiusZ = kernelDepth >> 1; + + const Index dataBlockWidth = 2 * kernelWidth - 1; + const Index dataBlockHeight = 2 * kernelHeight - 1; + const Index dataBlockDepth = 2 * kernelDepth - 1; - Index kernelOffset = ( 2 * kernelWidth - 1 ) * ( 2 * kernelHeight - 1 ) * ( 2 * 
kernelDepth - 1 ); + const Index dataBlockXYVolume = dataBlockWidth * dataBlockHeight; + + const Index dataBlockRadiusX = dataBlockWidth >> 1; + const Index dataBlockRadiusY = dataBlockHeight >> 1; + const Index dataBlockRadiusZ = dataBlockDepth >> 1; + + const Index kernelOffset = dataBlockWidth * dataBlockHeight * dataBlockDepth; Real* data = TNL::Cuda::getSharedMemory< Real >(); Real* kernel = data + kernelOffset; - Index radiusZ = kernelDepth >> 1; - Index radiusY = kernelHeight >> 1; - Index radiusX = kernelWidth >> 1; - Index x, y, z, index; // Z: 0 Y: 0 X: 0 @@ -246,9 +260,7 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz - radiusZ; - index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; - - kernel[ index ] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z ); + index = threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); @@ -262,7 +274,7 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz - radiusZ; - index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; + index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + threadIdx.z * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); @@ -276,7 +288,7 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz - radiusZ; - index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; + index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); @@ -290,7 +302,7 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz + radiusZ; - index = 
threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; + index = threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); @@ -304,9 +316,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz - radiusZ; - index = kernelWidth + threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; + index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + threadIdx.z * dataBlockXYVolume; - if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ) { + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { @@ -318,7 +330,7 @@ convolution3D( Index kernelWidth, y = iy - radiusY; z = iz + radiusZ; - index = kernelWidth + threadIdx.x + threadIdx.y * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; + index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); @@ -332,9 +344,9 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz + radiusZ; - index = threadIdx.x + ( kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; + index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; - if(x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { + if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); } else { @@ -346,7 +358,7 @@ convolution3D( Index kernelWidth, y = iy + radiusY; z = iz + radiusZ; - index = kernelWidth + threadIdx.x + ( 
kernelHeight + threadIdx.y ) * blockDim.x + ( kernelDepth + threadIdx.z ) * blockDim.x * blockDim.y; + index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth + ( dataBlockRadiusZ + threadIdx.z ) * dataBlockXYVolume; if( x < 0 || y < 0 || z < 0 || x >= endX || y >= endY || z >= endZ ) { data[ index ] = fetchBoundary( x, y, z ); @@ -355,6 +367,10 @@ convolution3D( Index kernelWidth, data[ index ] = fetchData( x, y, z ); } + index = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; + + kernel[index] = fetchKernel( threadIdx.x, threadIdx.y, threadIdx.z ); + __syncthreads(); if( ix >= endX || iy >= endY || iz >= endZ ) @@ -364,11 +380,11 @@ convolution3D( Index kernelWidth, #pragma unroll for( Index k = 0; k < kernelDepth; k++ ) { - Index xyAlign = ( k + threadIdx.z ) * blockDim.y * blockDim.x; + Index xyAlign = ( k + threadIdx.z ) * dataBlockXYVolume; Index xyKernelAlign = k * blockDim.x * blockDim.y; #pragma unroll for( Index j = 0; j < kernelHeight; j++ ) { - Index xAlign = ( j + threadIdx.y ) * blockDim.x; + Index xAlign = ( j + threadIdx.y ) * dataBlockWidth; Index xKernelAlign = j * blockDim.x; #pragma unroll for( Index i = 0; i < kernelWidth; i++ ) { diff --git a/src/Benchmarks/Convolution/support/DummyBenchmark.h b/src/Benchmarks/Convolution/support/DummyBenchmark.h index 9b44d53dd..f8e2d4ae8 100644 --- a/src/Benchmarks/Convolution/support/DummyBenchmark.h +++ b/src/Benchmarks/Convolution/support/DummyBenchmark.h @@ -16,7 +16,7 @@ class DummyBenchmark : public Benchmark< Dimension, Device > { public: using Vector = TNL::Containers::StaticVector< Dimension, int >; - using DataStore = TNL::Containers::Array< float, Device, int >; + using DataStore = TNL::Containers::Vector< float, Device, int >; using Base = Benchmark< Dimension, Device >; using TNLBenchmark = typename Base::TNLBenchmark; @@ -103,9 +103,9 @@ public: result = 1; kernel = 1; - auto inputView = input.getView(); + auto inputView = 
input.getConstView(); + auto kernelView = kernel.getConstView(); auto resultView = result.getView(); - auto kernelView = kernel.getView(); auto measure = [ & ]() { diff --git a/src/Benchmarks/Convolution/support/DummySolver.h b/src/Benchmarks/Convolution/support/DummySolver.h index 82a8f6ad4..2b1e60041 100644 --- a/src/Benchmarks/Convolution/support/DummySolver.h +++ b/src/Benchmarks/Convolution/support/DummySolver.h @@ -13,7 +13,7 @@ class DummySolver : public Solver< Dimension, Device > public: using Base = Solver< Dimension, Device >; using Vector = TNL::Containers::StaticVector< Dimension, int >; - using DataStore = TNL::Containers::Array< float, Device, int >; + using DataStore = TNL::Containers::Vector< float, Device, int >; virtual void start( const TNL::Config::ParameterContainer& parameters ) const override @@ -55,9 +55,9 @@ public: result = 1; kernel = 1; - auto inputView = input.getView(); + auto inputView = input.getConstView(); + auto kernelView = kernel.getConstView(); auto resultView = result.getView(); - auto kernelView = kernel.getView(); DummyTask::exec(dimension, kernelSize, inputView, resultView, kernelView); diff --git a/src/Benchmarks/Convolution/support/DummyTask.h b/src/Benchmarks/Convolution/support/DummyTask.h index 026575c38..d8e904896 100644 --- a/src/Benchmarks/Convolution/support/DummyTask.h +++ b/src/Benchmarks/Convolution/support/DummyTask.h @@ -34,11 +34,12 @@ public: static constexpr int Dimension = 1; using Device = TNL::Devices::Cuda; using Vector = TNL::Containers::StaticVector< Dimension, Index >; - using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType; + using ConstDataStore = typename TNL::Containers::Vector< Real, Device, Index >::ConstViewType; + using DataStore = typename TNL::Containers::Vector< Real, Device, Index >::ViewType; using ConvolutionLauncher = Convolution< Dimension, Device >; static void - exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& 
result, DataStore& kernel ) + exec( const Vector& dimensions, const Vector& kernelSize, ConstDataStore& input, DataStore& result, ConstDataStore& kernel ) { auto fetchData = [ = ] __cuda_callable__( Index i ) { @@ -82,11 +83,12 @@ public: static constexpr int Dimension = 2; using Device = TNL::Devices::Cuda; using Vector = TNL::Containers::StaticVector< Dimension, Index >; - using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType; + using ConstDataStore = typename TNL::Containers::Vector< Real, Device, Index >::ConstViewType; + using DataStore = typename TNL::Containers::Vector< Real, Device, Index >::ViewType; using ConvolutionLauncher = Convolution< Dimension, Device >; static void - exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel ) + exec( const Vector& dimensions, const Vector& kernelSize, ConstDataStore& input, DataStore& result, ConstDataStore& kernel ) { auto fetchData = [ = ] __cuda_callable__( Index i, Index j ) { @@ -116,7 +118,7 @@ public: { auto index = i + j * dimensions.x(); - result[ index ] = resultValue; + result[ index ] = TNL::max(TNL::min(resultValue, 1.), 0.); }; ConvolutionLauncher::execute< Index, Real >( dimensions, @@ -136,11 +138,12 @@ public: static constexpr int Dimension = 3; using Device = TNL::Devices::Cuda; using Vector = TNL::Containers::StaticVector< Dimension, Index >; - using DataStore = typename TNL::Containers::Array< Real, Device, Index >::ViewType; + using ConstDataStore = typename TNL::Containers::Vector< Real, Device, Index >::ConstViewType; + using DataStore = typename TNL::Containers::Vector< Real, Device, Index >::ViewType; using ConvolutionLauncher = Convolution< Dimension, Device >; static void - exec( const Vector& dimensions, const Vector& kernelSize, DataStore& input, DataStore& result, DataStore& kernel ) + exec( const Vector& dimensions, const Vector& kernelSize, ConstDataStore& input, DataStore& result, ConstDataStore& 
kernel ) { auto fetchData = [ = ] __cuda_callable__( Index i, Index j, Index k ) { diff --git a/src/Benchmarks/Convolution/support/ImageSolver.h b/src/Benchmarks/Convolution/support/ImageSolver.h new file mode 100644 index 000000000..1e50ab535 --- /dev/null +++ b/src/Benchmarks/Convolution/support/ImageSolver.h @@ -0,0 +1,184 @@ + +#pragma once + +#include "Solver.h" +#include "DummyTask.h" + +#include +#include +#include +#include +#include +#include +#include + +static std::vector< TNL::String > dimensionIds = { "x-dimension", "y-dimension", "z-dimension" }; +static std::vector< TNL::String > kernelSizeIds = { "x-kernel-size", "y-kernel-size", "z-kernel-size" }; + +class ImageSolver : public Solver< 2, TNL::Devices::Cuda > +{ +public: + constexpr static int Dimension = 2; + using Device = TNL::Devices::Cuda; + + using Base = Solver< Dimension, Device >; + using Vector = TNL::Containers::StaticVector< Dimension, int >; + using DataStore = TNL::Containers::Vector< float, Device, int >; + using HostDataStore = TNL::Containers::Vector< float, TNL::Devices::Host, int >; + + using GridType = TNL::Meshes::Grid< 2, float, Device, int >; + using GridPointer = TNL::Pointers::SharedPointer< GridType >; + using MeshFunctionType = TNL::Functions::MeshFunction< GridType >; + + virtual void + start( const TNL::Config::ParameterContainer& parameters ) const override + { + GridPointer grid; + MeshFunctionType meshFunction; + TNL::Images::PNGImage< int > image; + TNL::Images::RegionOfInterest< int > roi; + + meshFunction.setMesh( grid ); + + auto output = parameters.getParameter< TNL::String >( "output" ); + + if (!this -> readImage(parameters, grid, meshFunction, image, roi) || + !this -> convolve(parameters, meshFunction) || + !this -> write(parameters, image, meshFunction)) + return; + } + + template + bool readImage(const TNL::Config::ParameterContainer& parameters, + GridPointer & grid, + MeshFunctionType& meshFunction, + Image& image, + TNL::Images::RegionOfInterest< int >& 
roi) const { + auto input = parameters.getParameter< TNL::String >( "input" ); + + if( image.openForRead( input ) ) { + if( ! roi.setup( parameters, &image ) ) { + std::cout << "Invalid image roi."; + image.close(); + return false; + } + + std::cout << image.getWidth() << " " << image.getHeight() << std::endl; + + auto meshPointer = meshFunction.getMeshPointer(); + + meshPointer -> setDimensions(image.getWidth(), image.getHeight()); + + meshFunction.setMesh(meshPointer); + + if( ! image.read( roi, meshFunction ) ) { + std::cout << "Invalid image size" << std::endl;; + image.close(); + return false; + } + + image.close(); + + std::cout << "Image read was successful: " << meshFunction.getData().getSize() << " elements count" << std::endl; + return true; + } + + std::cout << "Image open for read failed. Please check file path" << std::endl;; + + return false; + } + + bool convolve(const TNL::Config::ParameterContainer& parameters, MeshFunctionType& meshFunction) const { + auto imageData = meshFunction.getData().getConstView(); + + Vector kernelSize; + DataStore kernel; + + kernel = getKernel(parameters, kernelSize); + + DataStore result; + + result.setLike( imageData ); + result = 0; + + TNL::Timer timer; + + timer.start(); + + std::cout << imageData.getSize() << " " << result.getSize() << std::endl; + + launchConvolution( imageData, + kernel.getConstView(), + result.getView(), + meshFunction.getMeshPointer() -> getDimensions(), + kernelSize ); + + timer.stop(); + + meshFunction.getData() = result; + + std::cout << "Image convolution was successful. Time: " << timer.getRealTime() << " sec" << std::endl; + + return true; + } + + template + bool write(const TNL::Config::ParameterContainer& parameters, Image& image, MeshFunctionType& meshFunction) const { + auto output = parameters.getParameter< TNL::String >( "output" ); + GridType grid = meshFunction.getMesh(); + + if( image.openForWrite( output, grid ) ) { + if( ! 
image.write( meshFunction ) ) { + std::cout << "Image write failed" << std::endl;; + image.close(); + return false; + } + + image.close(); + + return true; + } + + std::cout << "Image open for write failed. Please check file path" << std::endl; + + return false; + } + + HostDataStore getKernel( const TNL::Config::ParameterContainer& parameters, Vector& kernelDimension ) const { + kernelDimension = {3, 3}; + + return {-1, -1, -1, + -1, 8, -1, + -1, -1, -1}; + } + + void + launchConvolution( DataStore::ConstViewType image, + DataStore::ConstViewType kernel, + DataStore::ViewType result, + const GridType::CoordinatesType& imageDimension, + const GridType::CoordinatesType& kernelDimension) const + { + DummyTask::exec(imageDimension, kernelDimension, image, result, kernel); + } + + virtual TNL::Config::ConfigDescription + makeInputConfig() const override + { + TNL::Config::ConfigDescription config = Base::makeInputConfig(); + + config.addDelimiter( "Image settings:" ); + + config.addEntry< TNL::String >( "input", "PNG image" ); + config.addEntry< TNL::String >( "output", "PNG image" ); + + config.addDelimiter( "Roi settings:" ); + + config.addEntry< int >( "roi-top", "Top (smaller number) line of the region of interest.", -1 ); + config.addEntry< int >( "roi-bottom", "Bottom (larger number) line of the region of interest.", -1 ); + config.addEntry< int >( "roi-left", "Left (smaller number) column of the region of interest.", -1 ); + config.addEntry< int >( "roi-right", "Right (larger number) column of the region of interest.", -1 ); + + return config; + } +}; diff --git a/src/Benchmarks/Convolution/support/Solver.h b/src/Benchmarks/Convolution/support/Solver.h index 3fd56fb02..d80373f0d 100644 --- a/src/Benchmarks/Convolution/support/Solver.h +++ b/src/Benchmarks/Convolution/support/Solver.h @@ -4,7 +4,7 @@ #include #include -#include +#include template< int Dimension, typename Device > class Solver diff --git a/src/Benchmarks/Convolution/templates/main_image_solver.h 
b/src/Benchmarks/Convolution/templates/main_image_solver.h new file mode 100644 index 000000000..c2b9c1440 --- /dev/null +++ b/src/Benchmarks/Convolution/templates/main_image_solver.h @@ -0,0 +1,26 @@ + +#define KERNEL KERNEL_VALUE +#define DIMENSION DIMENSION_VALUE + +#include KERNEL +#include "../support/ImageSolver.h" + +#include + +using TaskSolver = ImageSolver; + +int main(int argc, char* argv[]) +{ + TaskSolver solver; + + auto config = solver.makeInputConfig(); + + TNL::Config::ParameterContainer parameters; + + if( ! parseCommandLine( argc, argv, config, parameters ) ) + return EXIT_FAILURE; + + solver.solve( parameters ); + + return 0; +} diff --git a/src/TNL/Images/PNGImage_impl.h b/src/TNL/Images/PNGImage_impl.h index 3b946ff82..5efd66c84 100644 --- a/src/TNL/Images/PNGImage_impl.h +++ b/src/TNL/Images/PNGImage_impl.h @@ -266,6 +266,7 @@ PNGImage< Index >::write( const Meshes::Grid< 2, Real, Device, Index >& grid, Ve for( j = 0; j < grid.getDimensions().x(); j++ ) { cell.getCoordinates().x() = j; cell.getCoordinates().y() = grid.getDimensions().y() - 1 - i; + cell.refresh(); // Index cellIndex = grid.getCellIndex( CoordinatesType( j, // grid.getDimensions().y() - 1 - i ) ); @@ -305,6 +306,7 @@ PNGImage< Index >::write( const Functions::MeshFunction< Meshes::Grid< 2, MeshRe for( j = 0; j < grid.getDimensions().x(); j++ ) { cell.getCoordinates().x() = j; cell.getCoordinates().y() = grid.getDimensions().y() - 1 - i; + cell.refresh(); // Index cellIndex = grid.getCellIndex( CoordinatesType( j, // grid.getDimensions().y() - 1 - i ) ); -- GitLab From 8a23431f452248210c76567b8151e01f60529aba Mon Sep 17 00:00:00 2001 From: hayeuyur Date: Wed, 4 May 2022 16:52:39 +0200 Subject: [PATCH 15/19] Add custom kernels to image solver --- .../Convolution/support/DummyTask.h | 2 +- .../Convolution/support/ImageSolver.h | 135 ++++++++++++++---- 2 files changed, 106 insertions(+), 31 deletions(-) diff --git a/src/Benchmarks/Convolution/support/DummyTask.h 
b/src/Benchmarks/Convolution/support/DummyTask.h index d8e904896..60d37f923 100644 --- a/src/Benchmarks/Convolution/support/DummyTask.h +++ b/src/Benchmarks/Convolution/support/DummyTask.h @@ -118,7 +118,7 @@ public: { auto index = i + j * dimensions.x(); - result[ index ] = TNL::max(TNL::min(resultValue, 1.), 0.); + result[ index ] = resultValue; }; ConvolutionLauncher::execute< Index, Real >( dimensions, diff --git a/src/Benchmarks/Convolution/support/ImageSolver.h b/src/Benchmarks/Convolution/support/ImageSolver.h index 1e50ab535..6d3d6b79d 100644 --- a/src/Benchmarks/Convolution/support/ImageSolver.h +++ b/src/Benchmarks/Convolution/support/ImageSolver.h @@ -14,6 +14,8 @@ static std::vector< TNL::String > dimensionIds = { "x-dimension", "y-dimension", "z-dimension" }; static std::vector< TNL::String > kernelSizeIds = { "x-kernel-size", "y-kernel-size", "z-kernel-size" }; +static std::vector< TNL::String > kernels = { "identity", "gauss3x3", "gauss5x5", + "sobelHorizontal", "sobelVertical", "edgeDetection" }; class ImageSolver : public Solver< 2, TNL::Devices::Cuda > { @@ -42,18 +44,19 @@ public: auto output = parameters.getParameter< TNL::String >( "output" ); - if (!this -> readImage(parameters, grid, meshFunction, image, roi) || - !this -> convolve(parameters, meshFunction) || - !this -> write(parameters, image, meshFunction)) + if( ! this->readImage( parameters, grid, meshFunction, image, roi ) || ! this->convolve( parameters, meshFunction ) + || ! 
this->write( parameters, image, meshFunction ) ) return; } - template - bool readImage(const TNL::Config::ParameterContainer& parameters, - GridPointer & grid, - MeshFunctionType& meshFunction, - Image& image, - TNL::Images::RegionOfInterest< int >& roi) const { + template< typename Image > + bool + readImage( const TNL::Config::ParameterContainer& parameters, + GridPointer& grid, + MeshFunctionType& meshFunction, + Image& image, + TNL::Images::RegionOfInterest< int >& roi ) const + { auto input = parameters.getParameter< TNL::String >( "input" ); if( image.openForRead( input ) ) { @@ -67,12 +70,13 @@ public: auto meshPointer = meshFunction.getMeshPointer(); - meshPointer -> setDimensions(image.getWidth(), image.getHeight()); + meshPointer->setDimensions( image.getWidth(), image.getHeight() ); - meshFunction.setMesh(meshPointer); + meshFunction.setMesh( meshPointer ); if( ! image.read( roi, meshFunction ) ) { - std::cout << "Invalid image size" << std::endl;; + std::cout << "Invalid image size" << std::endl; + image.close(); return false; } @@ -83,18 +87,20 @@ public: return true; } - std::cout << "Image open for read failed. Please check file path" << std::endl;; + std::cout << "Image open for read failed. 
Please check file path" << std::endl; return false; } - bool convolve(const TNL::Config::ParameterContainer& parameters, MeshFunctionType& meshFunction) const { + bool + convolve( const TNL::Config::ParameterContainer& parameters, MeshFunctionType& meshFunction ) const + { auto imageData = meshFunction.getData().getConstView(); Vector kernelSize; DataStore kernel; - kernel = getKernel(parameters, kernelSize); + kernel = getKernel( parameters, kernelSize ); DataStore result; @@ -107,14 +113,17 @@ public: std::cout << imageData.getSize() << " " << result.getSize() << std::endl; - launchConvolution( imageData, - kernel.getConstView(), - result.getView(), - meshFunction.getMeshPointer() -> getDimensions(), - kernelSize ); + launchConvolution( + imageData, kernel.getConstView(), result.getView(), meshFunction.getMeshPointer()->getDimensions(), kernelSize ); timer.stop(); + result.forAllElements( + [] __cuda_callable__( int i, float& value ) + { + value = TNL::max( TNL::min( value, 1.0 ), 0.0 ); + } ); + meshFunction.getData() = result; std::cout << "Image convolution was successful. Time: " << timer.getRealTime() << " sec" << std::endl; @@ -122,14 +131,17 @@ public: return true; } - template - bool write(const TNL::Config::ParameterContainer& parameters, Image& image, MeshFunctionType& meshFunction) const { + template< typename Image > + bool + write( const TNL::Config::ParameterContainer& parameters, Image& image, MeshFunctionType& meshFunction ) const + { auto output = parameters.getParameter< TNL::String >( "output" ); GridType grid = meshFunction.getMesh(); if( image.openForWrite( output, grid ) ) { if( ! 
image.write( meshFunction ) ) { - std::cout << "Image write failed" << std::endl;; + std::cout << "Image write failed" << std::endl; + image.close(); return false; } @@ -144,12 +156,71 @@ public: return false; } - HostDataStore getKernel( const TNL::Config::ParameterContainer& parameters, Vector& kernelDimension ) const { - kernelDimension = {3, 3}; + HostDataStore + getKernel( const TNL::Config::ParameterContainer& parameters, Vector& kernelDimension ) const + { + auto kernel = parameters.getParameter< TNL::String >( "kernel" ); + + if( kernel == "identity" ) { + kernelDimension = { 3, 3 }; + + return { 0, 0, 0, + 0, 1, 0, + 0, 0, 0 }; + } + + if( kernel == "gauss3x3" ) { + kernelDimension = { 3, 3 }; + + HostDataStore kernel = { 1, 2, 1, + 2, 4, 2, + 1, 2, 1 }; + + kernel /= 16; + + return kernel; + } + + if( kernel == "gauss5x5" ) { + kernelDimension = { 5, 5 }; + + HostDataStore kernel = { 1, 4, 7, 4, 1, + 4, 16, 26, 16, 4, + 7, 26, 41, 26, 7, + 4, 16, 26, 16, 4, + 1, 4, 7, 4, 1 }; + + kernel /= 273; + + return kernel; + } + + if( kernel == "sobelHorizontal" ) { + kernelDimension = { 3, 3 }; + + return { 1, 2, 1, + 0, 0, 0, + -1, -2, -1 }; + } - return {-1, -1, -1, - -1, 8, -1, - -1, -1, -1}; + if( kernel == "sobelVertical" ) { + kernelDimension = { 3, 3 }; + + return { 1, 0, -1, + 2, 0, -2, + 1, 0, -1 }; + } + + if( kernel == "edgeDetection" ) { + kernelDimension = { 3, 3 }; + + return { -1, -1, -1, + -1, 8, -1, + -1, -1, -1 }; + } + + std::cout << "Unknown kernel " << kernel << ". 
Exit" << std::endl; + exit(1); } void @@ -157,9 +228,9 @@ public: DataStore::ConstViewType kernel, DataStore::ViewType result, const GridType::CoordinatesType& imageDimension, - const GridType::CoordinatesType& kernelDimension) const + const GridType::CoordinatesType& kernelDimension ) const { - DummyTask::exec(imageDimension, kernelDimension, image, result, kernel); + DummyTask< int, float, Dimension, Device >::exec( imageDimension, kernelDimension, image, result, kernel ); } virtual TNL::Config::ConfigDescription @@ -171,6 +242,10 @@ public: config.addEntry< TNL::String >( "input", "PNG image" ); config.addEntry< TNL::String >( "output", "PNG image" ); + config.addEntry< TNL::String >( "kernel", "A kernel to apply", kernels[ 0 ] ); + + for( const auto& kernel : kernels ) + config.addEntryEnum( kernel); config.addDelimiter( "Roi settings:" ); -- GitLab From 053bc398a0dadcfc3db603f305967ee2e8ff518a Mon Sep 17 00:00:00 2001 From: hayeuyur Date: Thu, 5 May 2022 19:21:02 +0200 Subject: [PATCH 16/19] Implement heat equation solver --- src/Benchmarks/Convolution/CMakeLists.txt | 9 +- .../Convolution/support/DummyTask.h | 12 +- .../Convolution/support/HeatEquationSolver.h | 189 ++++++++++++++++++ .../Convolution/support/ImageSolver.h | 78 +++----- .../templates/main_heat_equation_solver.h | 26 +++ 5 files changed, 261 insertions(+), 53 deletions(-) create mode 100644 src/Benchmarks/Convolution/support/HeatEquationSolver.h create mode 100644 src/Benchmarks/Convolution/templates/main_heat_equation_solver.h diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt index d34518dcc..6ac9ed64f 100644 --- a/src/Benchmarks/Convolution/CMakeLists.txt +++ b/src/Benchmarks/Convolution/CMakeLists.txt @@ -12,7 +12,11 @@ if (${BUILD_CUDA}) STRING(REGEX REPLACE "DIMENSION_VALUE" ${DIMENSION} TEMPLATE_CONTENT "${TEMPLATE_CONTENT}") STRING(REGEX REPLACE "KERNEL_VALUE" "\"../${KERNEL_HEADER}\"" TEMPLATE_CONTENT "${TEMPLATE_CONTENT}") - FILE(WRITE 
${SOURCE_FILE} "${TEMPLATE_CONTENT}") + FILE(READ ${SOURCE_FILE} SOURCE_FILE_CONTENT) + + if ( NOT "${SOURCE_FILE_CONTENT}" STREQUAL "${TEMPLATE_CONTENT}" ) + FILE(WRITE ${SOURCE_FILE} "${TEMPLATE_CONTENT}") + endif() SET(EXECUTABLE_NAME "${PREFIX}_${DIMENSION}_${MODULE_NAME}_${TEMPLATE_NAME}") @@ -63,3 +67,6 @@ GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "k GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedData.h") GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedKernel.h") GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedDataAndKernel.h") + +GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/naive.h") +GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/sharedDataAndKernel.h") diff --git a/src/Benchmarks/Convolution/support/DummyTask.h b/src/Benchmarks/Convolution/support/DummyTask.h index 60d37f923..07a58e98d 100644 --- a/src/Benchmarks/Convolution/support/DummyTask.h +++ b/src/Benchmarks/Convolution/support/DummyTask.h @@ -39,7 +39,7 @@ public: using ConvolutionLauncher = Convolution< Dimension, Device >; static void - exec( const Vector& dimensions, const Vector& kernelSize, ConstDataStore& input, DataStore& result, ConstDataStore& kernel ) + exec( const Vector& dimensions, const Vector& kernelSize, ConstDataStore& input, DataStore& result, ConstDataStore& kernel, int boundaryValue = 1 ) { auto fetchData = [ = ] __cuda_callable__( Index i ) { @@ -48,7 +48,7 @@ public: auto fetchBoundary = [ = ] __cuda_callable__( Index i ) { - return 1; + return boundaryValue; }; auto fetchKernel = [ = ] __cuda_callable__( Index i ) @@ -88,7 +88,7 @@ public: using ConvolutionLauncher = Convolution< Dimension, Device >; static void - exec( const Vector& dimensions, const Vector& kernelSize, ConstDataStore& input, DataStore& result, 
ConstDataStore& kernel ) + exec( const Vector& dimensions, const Vector& kernelSize, ConstDataStore& input, DataStore& result, ConstDataStore& kernel, int boundaryValue = 1 ) { auto fetchData = [ = ] __cuda_callable__( Index i, Index j ) { @@ -99,7 +99,7 @@ public: auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j ) { - return 1; + return boundaryValue; }; auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j ) @@ -143,7 +143,7 @@ public: using ConvolutionLauncher = Convolution< Dimension, Device >; static void - exec( const Vector& dimensions, const Vector& kernelSize, ConstDataStore& input, DataStore& result, ConstDataStore& kernel ) + exec( const Vector& dimensions, const Vector& kernelSize, ConstDataStore& input, DataStore& result, ConstDataStore& kernel, int boundaryValue = 1 ) { auto fetchData = [ = ] __cuda_callable__( Index i, Index j, Index k ) { @@ -154,7 +154,7 @@ public: auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j, Index k ) { - return 1; + return boundaryValue; }; auto fetchKernel = [ = ] __cuda_callable__( Index i, Index j, Index k ) diff --git a/src/Benchmarks/Convolution/support/HeatEquationSolver.h b/src/Benchmarks/Convolution/support/HeatEquationSolver.h new file mode 100644 index 000000000..6ce6d5c09 --- /dev/null +++ b/src/Benchmarks/Convolution/support/HeatEquationSolver.h @@ -0,0 +1,189 @@ + +#pragma once + +#include "Solver.h" +#include "DummyTask.h" + +#include +#include + +static std::vector< TNL::String > dimensionIds = { "grid-x-size", "grid-y-size" }; +static std::vector< TNL::String > kernelSizeIds = { "kernel-x-size", "kernel-y-size" }; +static std::vector< TNL::String > domainIds = { "domain-x-size", "domain-y-size" }; +static std::string sigmaKey = "sigma"; +static std::string timestepKey = "timeStep"; +static std::string finalTimeKey = "finalTime"; +static std::string outputFilenamePrefix = "outputFilenamePrefix"; + +template< typename Real = double > +class HeatEquationSolver : public Solver< 
2, TNL::Devices::Cuda > +{ +public: + constexpr static int Dimension = 2; + using Device = TNL::Devices::Cuda; + + using Base = Solver< Dimension, Device >; + using Vector = TNL::Containers::StaticVector< Dimension, int >; + using Point = TNL::Containers::StaticVector< Dimension, Real >; + using DataStore = TNL::Containers::Vector< Real, Device, int >; + using HostDataStore = TNL::Containers::Vector< Real, TNL::Devices::Host, int >; + + virtual void + start( const TNL::Config::ParameterContainer& parameters ) const override + { + int gridXSize = parameters.getParameter< int >( dimensionIds[ 0 ] ); + int gridYSize = parameters.getParameter< int >( dimensionIds[ 1 ] ); + + int kernelXSize = parameters.getParameter< int >( kernelSizeIds[ 0 ] ); + int kernelYSize = parameters.getParameter< int >( kernelSizeIds[ 1 ] ); + + Real xDomainSize = parameters.getParameter< Real >( domainIds[ 0 ] ); + Real yDomainSize = parameters.getParameter< Real >( domainIds[ 1 ] ); + + Real hx = xDomainSize / (Real) gridXSize; + Real hy = yDomainSize / (Real) gridYSize; + + Point domain = { xDomainSize, yDomainSize }; + Point spaceSteps = { hx, hy }; + + Vector dimensions = { gridXSize, gridYSize }; + Vector kernelSize = { kernelXSize, kernelYSize }; + + DataStore function = prepareFunction( parameters, dimensions, domain, spaceSteps ); + + auto filenamePrefix = parameters.getParameter< TNL::String >( outputFilenamePrefix ); + auto initialFilename = filenamePrefix + "_initial.txt"; + + if( ! writeGNUPlot( initialFilename, dimensions, spaceSteps, domain, function.getConstView() ) ) { + std::cout << "Did fail during file write"; + return; + } + + DataStore result; + + result.setLike( function ); + result = 0; + + auto finalTime = parameters.getParameter< Real >( finalTimeKey ); + + convolve( dimensions, domain, spaceSteps, kernelSize, function.getConstView(), result.getView(), finalTime ); + + auto finalFilename = filenamePrefix + "_final.txt"; + + if( ! 
writeGNUPlot( finalFilename, dimensions, spaceSteps, domain, result.getConstView() ) ) { + std::cout << "Did fail during file write"; + return; + } + } + + virtual TNL::Config::ConfigDescription + makeInputConfig() const override + { + TNL::Config::ConfigDescription config = Base::makeInputConfig(); + + config.addDelimiter( "Grid settings:" ); + config.addEntry< int >( dimensionIds[ 0 ], "Grid size along x-axis.", 100 ); + config.addEntry< int >( dimensionIds[ 1 ], "Grid size along y-axis.", 100 ); + + config.addDelimiter( "Kernel settings:" ); + config.addEntry< int >( kernelSizeIds[ 0 ], "Kernel size along x-axis.", 3 ); + config.addEntry< int >( kernelSizeIds[ 1 ], "Kernel size along y-axis.", 3 ); + + config.addDelimiter( "Problem settings:" ); + config.addEntry< TNL::String >( outputFilenamePrefix, "The prefix in name of the output file", "data" ); + + config.addEntry< Real >( domainIds[ 0 ], "Domain size along x-axis.", 4.0 ); + config.addEntry< Real >( domainIds[ 1 ], "Domain size along y-axis.", 4.0 ); + + config.addEntry< Real >( sigmaKey, "Sigma in exponential initial condition.", 0.5); + + config.addEntry< Real >( finalTimeKey, "Final time of the simulation.", 0.12); + + return config; + } + + DataStore + prepareFunction( const TNL::Config::ParameterContainer& parameters, + const Vector& dimensions, + const Point& domain, + const Point& spaceSteps ) const + { + DataStore function; + + function.resize( dimensions.x() * dimensions.y() ); + + auto functionView = function.getView(); + + auto xDomainSize = parameters.getParameter< Real >( domainIds[ 0 ] ); + auto yDomainSize = parameters.getParameter< Real >( domainIds[ 1 ] ); + auto sigma = parameters.getParameter< Real >( sigmaKey ); + + auto init = [ = ] __cuda_callable__( int i, int j ) mutable + { + auto index = j * dimensions.x() + i; + + auto x = i * spaceSteps.x() - domain.x() / 2.; + auto y = j * spaceSteps.y() - domain.y() / 2.; + + functionView[ index ] = exp( sigma * ( x * x + y * y ) ); + }; + + 
TNL::Algorithms::ParallelFor2D< Device >::exec( 0, 0, dimensions.x(), dimensions.y(), init ); + + return function; + } + + void + convolve( const Vector& dimensions, + const Point& domain, + const Point& spaceSteps, + const Vector& kernelSize, + typename DataStore::ConstViewType input, + typename DataStore::ViewType result, + const Real time ) const + { + HostDataStore kernel; + + kernel.resize( kernelSize.x() * kernelSize.y() ); + + for( int j = 0; j < kernelSize.y(); j++ ) { + for( int i = 0; i < kernelSize.x(); i++ ) { + int index = i + j * kernelSize.x(); + + auto x = i * spaceSteps.x() - domain.x() / 2.; + auto y = j * spaceSteps.y() - domain.y() / 2.; + + kernel[ index ] = ( 1. / ( 4. * M_PI * time ) ) * exp( -( x * x + y * y ) / ( 4. * time ) ); + } + } + + std::cout << kernel << std::endl; + + DataStore kernelDevice( kernel ); + + auto kernelView = kernelDevice.getConstView(); + + DummyTask< int, Real, Dimension, Device >::exec( dimensions, kernelSize, input, result, kernelView, 0); + } + + bool + writeGNUPlot( const std::string& filename, + const Vector& dimensions, + const Point& spaceSteps, + const Point& domain, + const typename DataStore::ConstViewType& map ) const + { + std::ofstream out( filename, std::ios::out ); + + if( ! out.is_open() ) + return false; + + for( int j = 0; j < dimensions.y(); j++ ) + for( int i = 0; i < dimensions.x(); i++ ) + out << i * spaceSteps.x() - domain.x() / 2. << " " + << j * spaceSteps.y() - domain.y() / 2. 
<< " " + << map.getElement( j * dimensions.x() + i ) << std::endl; + + return out.good(); + } +}; diff --git a/src/Benchmarks/Convolution/support/ImageSolver.h b/src/Benchmarks/Convolution/support/ImageSolver.h index 6d3d6b79d..069553573 100644 --- a/src/Benchmarks/Convolution/support/ImageSolver.h +++ b/src/Benchmarks/Convolution/support/ImageSolver.h @@ -13,7 +13,7 @@ #include static std::vector< TNL::String > dimensionIds = { "x-dimension", "y-dimension", "z-dimension" }; -static std::vector< TNL::String > kernelSizeIds = { "x-kernel-size", "y-kernel-size", "z-kernel-size" }; +static std::vector< TNL::String > kernelSizeIds = { "kernel-x-size", "kernel-y-size", "kernel-z-size" }; static std::vector< TNL::String > kernels = { "identity", "gauss3x3", "gauss5x5", "sobelHorizontal", "sobelVertical", "edgeDetection" }; @@ -49,6 +49,30 @@ public: return; } + virtual TNL::Config::ConfigDescription + makeInputConfig() const override + { + TNL::Config::ConfigDescription config = Base::makeInputConfig(); + + config.addDelimiter( "Image settings:" ); + + config.addEntry< TNL::String >( "input", "PNG image" ); + config.addEntry< TNL::String >( "output", "PNG image" ); + config.addEntry< TNL::String >( "kernel", "A kernel to apply", kernels[ 0 ] ); + + for( const auto& kernel : kernels ) + config.addEntryEnum( kernel ); + + config.addDelimiter( "Roi settings:" ); + + config.addEntry< int >( "roi-top", "Top (smaller number) line of the region of interest.", -1 ); + config.addEntry< int >( "roi-bottom", "Bottom (larger number) line of the region of interest.", -1 ); + config.addEntry< int >( "roi-left", "Left (smaller number) column of the region of interest.", -1 ); + config.addEntry< int >( "roi-right", "Right (larger number) column of the region of interest.", -1 ); + + return config; + } + template< typename Image > bool readImage( const TNL::Config::ParameterContainer& parameters, @@ -164,17 +188,13 @@ public: if( kernel == "identity" ) { kernelDimension = { 3, 3 }; - 
return { 0, 0, 0, - 0, 1, 0, - 0, 0, 0 }; + return { 0, 0, 0, 0, 1, 0, 0, 0, 0 }; } if( kernel == "gauss3x3" ) { kernelDimension = { 3, 3 }; - HostDataStore kernel = { 1, 2, 1, - 2, 4, 2, - 1, 2, 1 }; + HostDataStore kernel = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; kernel /= 16; @@ -184,11 +204,7 @@ public: if( kernel == "gauss5x5" ) { kernelDimension = { 5, 5 }; - HostDataStore kernel = { 1, 4, 7, 4, 1, - 4, 16, 26, 16, 4, - 7, 26, 41, 26, 7, - 4, 16, 26, 16, 4, - 1, 4, 7, 4, 1 }; + HostDataStore kernel = { 1, 4, 7, 4, 1, 4, 16, 26, 16, 4, 7, 26, 41, 26, 7, 4, 16, 26, 16, 4, 1, 4, 7, 4, 1 }; kernel /= 273; @@ -198,29 +214,23 @@ public: if( kernel == "sobelHorizontal" ) { kernelDimension = { 3, 3 }; - return { 1, 2, 1, - 0, 0, 0, - -1, -2, -1 }; + return { 1, 2, 1, 0, 0, 0, -1, -2, -1 }; } if( kernel == "sobelVertical" ) { kernelDimension = { 3, 3 }; - return { 1, 0, -1, - 2, 0, -2, - 1, 0, -1 }; + return { 1, 0, -1, 2, 0, -2, 1, 0, -1 }; } if( kernel == "edgeDetection" ) { kernelDimension = { 3, 3 }; - return { -1, -1, -1, - -1, 8, -1, - -1, -1, -1 }; + return { -1, -1, -1, -1, 8, -1, -1, -1, -1 }; } std::cout << "Unknown kernel " << kernel << ". 
Exit" << std::endl; - exit(1); + exit( 1 ); } void @@ -232,28 +242,4 @@ public: { DummyTask< int, float, Dimension, Device >::exec( imageDimension, kernelDimension, image, result, kernel ); } - - virtual TNL::Config::ConfigDescription - makeInputConfig() const override - { - TNL::Config::ConfigDescription config = Base::makeInputConfig(); - - config.addDelimiter( "Image settings:" ); - - config.addEntry< TNL::String >( "input", "PNG image" ); - config.addEntry< TNL::String >( "output", "PNG image" ); - config.addEntry< TNL::String >( "kernel", "A kernel to apply", kernels[ 0 ] ); - - for( const auto& kernel : kernels ) - config.addEntryEnum( kernel); - - config.addDelimiter( "Roi settings:" ); - - config.addEntry< int >( "roi-top", "Top (smaller number) line of the region of interest.", -1 ); - config.addEntry< int >( "roi-bottom", "Bottom (larger number) line of the region of interest.", -1 ); - config.addEntry< int >( "roi-left", "Left (smaller number) column of the region of interest.", -1 ); - config.addEntry< int >( "roi-right", "Right (larger number) column of the region of interest.", -1 ); - - return config; - } }; diff --git a/src/Benchmarks/Convolution/templates/main_heat_equation_solver.h b/src/Benchmarks/Convolution/templates/main_heat_equation_solver.h new file mode 100644 index 000000000..c258f0740 --- /dev/null +++ b/src/Benchmarks/Convolution/templates/main_heat_equation_solver.h @@ -0,0 +1,26 @@ + +#define KERNEL KERNEL_VALUE +#define DIMENSION DIMENSION_VALUE + +#include KERNEL +#include "../support/HeatEquationSolver.h" + +#include + +using TaskSolver = HeatEquationSolver<>; + +int main(int argc, char* argv[]) +{ + TaskSolver solver; + + auto config = solver.makeInputConfig(); + + TNL::Config::ParameterContainer parameters; + + if( ! 
parseCommandLine( argc, argv, config, parameters ) ) + return EXIT_FAILURE; + + solver.solve( parameters ); + + return 0; +} -- GitLab From 405faef731a05d8e3dfa3decb4e568d293c108aa Mon Sep 17 00:00:00 2001 From: hayeuyur Date: Fri, 6 May 2022 19:12:16 +0200 Subject: [PATCH 17/19] Update the heat equation solver --- src/Benchmarks/Convolution/CMakeLists.txt | 3 +- .../kernels/heatEquationSharedData.h | 182 ++++++++++++++++++ .../Convolution/kernels/sharedData.h | 8 +- .../Convolution/support/HeatEquationSolver.h | 62 +++--- .../Convolution/support/HeatEquationTask.h | 94 +++++++++ 5 files changed, 308 insertions(+), 41 deletions(-) create mode 100644 src/Benchmarks/Convolution/kernels/heatEquationSharedData.h create mode 100644 src/Benchmarks/Convolution/support/HeatEquationTask.h diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt index 6ac9ed64f..8695a4048 100644 --- a/src/Benchmarks/Convolution/CMakeLists.txt +++ b/src/Benchmarks/Convolution/CMakeLists.txt @@ -68,5 +68,4 @@ GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "k GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedKernel.h") GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedDataAndKernel.h") -GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/naive.h") -GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/sharedDataAndKernel.h") +GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/heatEquationSharedData.h") diff --git a/src/Benchmarks/Convolution/kernels/heatEquationSharedData.h b/src/Benchmarks/Convolution/kernels/heatEquationSharedData.h new file mode 100644 index 000000000..bf6fdccf5 --- /dev/null +++ b/src/Benchmarks/Convolution/kernels/heatEquationSharedData.h @@ -0,0 +1,182 @@ + +#ifdef HAVE_CUDA + +/** + * This method 
stores image tile into shared memory + * and then calculates convolution. + * + * Thanks for the idea https://www.evl.uic.edu/sjames/cs525/final.html + */ + +#include +#include +#include +#include + +template< int Dimension, typename Device > +struct Convolution; + +template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename Convolve, + typename Store > +__global__ +static void +convolution2D( Index kernelWidth, + Index kernelHeight, + Index endX, + Index endY, + FetchData fetchData, + FetchBoundary fetchBoundary, + Convolve convolve, + Store store ) +{ + Real* data = TNL::Cuda::getSharedMemory< Real >(); + + const Index iy = threadIdx.y + blockIdx.y * blockDim.y; + const Index ix = threadIdx.x + blockIdx.x * blockDim.x; + + const Index radiusY = kernelHeight >> 1; + const Index radiusX = kernelWidth >> 1; + + const Index dataBlockWidth = 2 * kernelWidth - 1; + const Index dataBlockHeight = 2 * kernelHeight - 1; + + const Index dataBlockRadiusX = dataBlockWidth >> 1; + const Index dataBlockRadiusY = dataBlockHeight >> 1; + + Index x, y, index; + + // Top Left + x = ix - radiusX; + y = iy - radiusY; + index = threadIdx.x + threadIdx.y * dataBlockWidth; + + if( x < 0 || y < 0 || x >= endX || y >= endY ) { + data[ index ] = fetchBoundary( x, y ); + } + else { + data[ index ] = fetchData( x, y ); + } + + // Top right + x = ix + radiusX; + y = iy - radiusY; + index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth; + + if( x < 0 || y < 0 || x >= endX || y >= endY ) { + data[ index ] = fetchBoundary( x, y ); + } + else { + data[ index ] = fetchData( x, y ); + } + + // Bottom Left + x = ix - radiusX; + y = iy + radiusY; + index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth; + + if(x < 0 || y < 0 || x >= endX || y >= endY ) { + data[ index ] = fetchBoundary( x, y ); + } + else { + data[ index ] = fetchData( x, y ); + } + + // Bottom Right + x = ix + radiusX; + y = iy + radiusY; + index = 
dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth; + + if( x < 0 || y < 0 || x >= endX || y >= endY ) { + data[ index ] = fetchBoundary( x, y ); + } + else { + data[ index ] = fetchData( x, y ); + } + + __syncthreads(); + + if( ix >= endX || iy >= endY ) + return; + + Real result = 0; + + for( Index j = 0; j < kernelHeight; j++ ) { + Index align = ( j + threadIdx.y ) * dataBlockWidth; + + for( Index i = 0; i < kernelWidth; i++ ) { + Index index = i + threadIdx.x + align; + + result = convolve( result, ix, iy, i, j, data[ index ]); + } + } + + store( ix, iy, result ); +} + + +template<> +struct Convolution< 2, TNL::Devices::Cuda > +{ +public: + template< typename Index > + using Vector = TNL::Containers::StaticVector< 2, Index >; + + template< typename Index, typename Real > + static void + setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) + { + Index kernelElementCount = 1; + + for( Index i = 0; i < kernelSize.getSize(); i++ ) + kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1; + + configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); + + configuration.blockSize.x = kernelSize.x(); + configuration.blockSize.y = kernelSize.y(); + + configuration.gridSize.x = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); + configuration.gridSize.y = + TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); + } + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + Convolve&& convolve, + Store&& store ) + { + TNL::Cuda::LaunchConfiguration configuration; + + setup< Index, Real >( 
configuration, dimensions, kernelSize ); + + constexpr auto kernel = convolution2D< Index, Real, FetchData, FetchBoundary, Convolve, Store >; + + TNL::Cuda::launchKernel< true >( kernel, + 0, + configuration, + kernelSize.x(), + kernelSize.y(), + dimensions.x(), + dimensions.y(), + fetchData, + fetchBoundary, + convolve, + store ); + }; +}; + +#endif diff --git a/src/Benchmarks/Convolution/kernels/sharedData.h b/src/Benchmarks/Convolution/kernels/sharedData.h index dcaa5236e..f1dfb9008 100644 --- a/src/Benchmarks/Convolution/kernels/sharedData.h +++ b/src/Benchmarks/Convolution/kernels/sharedData.h @@ -9,10 +9,10 @@ * Thanks for the idea https://www.evl.uic.edu/sjames/cs525/final.html */ - #include - #include - #include - #include +#include +#include +#include +#include template< typename Index, typename Real, diff --git a/src/Benchmarks/Convolution/support/HeatEquationSolver.h b/src/Benchmarks/Convolution/support/HeatEquationSolver.h index 6ce6d5c09..539268d9b 100644 --- a/src/Benchmarks/Convolution/support/HeatEquationSolver.h +++ b/src/Benchmarks/Convolution/support/HeatEquationSolver.h @@ -2,7 +2,7 @@ #pragma once #include "Solver.h" -#include "DummyTask.h" +#include "HeatEquationTask.h" #include #include @@ -11,8 +11,8 @@ static std::vector< TNL::String > dimensionIds = { "grid-x-size", "grid-y-size" static std::vector< TNL::String > kernelSizeIds = { "kernel-x-size", "kernel-y-size" }; static std::vector< TNL::String > domainIds = { "domain-x-size", "domain-y-size" }; static std::string sigmaKey = "sigma"; -static std::string timestepKey = "timeStep"; -static std::string finalTimeKey = "finalTime"; +static std::string timeStepKey = "timeStep"; +static std::string timeKey = "time"; static std::string outputFilenamePrefix = "outputFilenamePrefix"; template< typename Real = double > @@ -64,15 +64,28 @@ public: result.setLike( function ); result = 0; - auto finalTime = parameters.getParameter< Real >( finalTimeKey ); + auto timeStep = parameters.getParameter< 
double >( timeStepKey ); + auto finalTime = parameters.getParameter< double >( timeKey ); - convolve( dimensions, domain, spaceSteps, kernelSize, function.getConstView(), result.getView(), finalTime ); + int iterationsCount = finalTime / timeStep; - auto finalFilename = filenamePrefix + "_final.txt"; + double time = timeStep; - if( ! writeGNUPlot( finalFilename, dimensions, spaceSteps, domain, result.getConstView() ) ) { - std::cout << "Did fail during file write"; - return; + for (int i = 1; i <= iterationsCount; i++) { + printf("Time: %lf\n", time); + + convolve( dimensions, domain, kernelSize, function.getConstView(), result.getView(), time ); + + auto filename = TNL::String("data_") + TNL::convertToString(i) + ".txt"; + + if( ! writeGNUPlot( filename, dimensions, spaceSteps, domain, result.getConstView() ) ) { + std::cout << "Did fail during file write"; + return; + } + + result = 0; + + time += timeStep; } } @@ -82,8 +95,8 @@ public: TNL::Config::ConfigDescription config = Base::makeInputConfig(); config.addDelimiter( "Grid settings:" ); - config.addEntry< int >( dimensionIds[ 0 ], "Grid size along x-axis.", 100 ); - config.addEntry< int >( dimensionIds[ 1 ], "Grid size along y-axis.", 100 ); + config.addEntry< int >( dimensionIds[ 0 ], "Grid size along x-axis.", 200 ); + config.addEntry< int >( dimensionIds[ 1 ], "Grid size along y-axis.", 200 ); config.addDelimiter( "Kernel settings:" ); config.addEntry< int >( kernelSizeIds[ 0 ], "Kernel size along x-axis.", 3 ); @@ -97,7 +110,8 @@ public: config.addEntry< Real >( sigmaKey, "Sigma in exponential initial condition.", 0.5); - config.addEntry< Real >( finalTimeKey, "Final time of the simulation.", 0.12); + config.addEntry< Real >( timeStepKey, "Time step of the simulation.", 0.005); + config.addEntry< Real >( timeKey, "Final time of the simulation.", 0.36); return config; } @@ -136,34 +150,12 @@ public: void convolve( const Vector& dimensions, const Point& domain, - const Point& spaceSteps, const Vector& 
kernelSize, typename DataStore::ConstViewType input, typename DataStore::ViewType result, const Real time ) const { - HostDataStore kernel; - - kernel.resize( kernelSize.x() * kernelSize.y() ); - - for( int j = 0; j < kernelSize.y(); j++ ) { - for( int i = 0; i < kernelSize.x(); i++ ) { - int index = i + j * kernelSize.x(); - - auto x = i * spaceSteps.x() - domain.x() / 2.; - auto y = j * spaceSteps.y() - domain.y() / 2.; - - kernel[ index ] = ( 1. / ( 4. * M_PI * time ) ) * exp( -( x * x + y * y ) / ( 4. * time ) ); - } - } - - std::cout << kernel << std::endl; - - DataStore kernelDevice( kernel ); - - auto kernelView = kernelDevice.getConstView(); - - DummyTask< int, Real, Dimension, Device >::exec( dimensions, kernelSize, input, result, kernelView, 0); + HeatEquationTask< int, Real, Dimension, Device >::exec( dimensions, kernelSize, domain, { 3., 3. }, time, input, result); } bool diff --git a/src/Benchmarks/Convolution/support/HeatEquationTask.h b/src/Benchmarks/Convolution/support/HeatEquationTask.h new file mode 100644 index 000000000..c4b4f5546 --- /dev/null +++ b/src/Benchmarks/Convolution/support/HeatEquationTask.h @@ -0,0 +1,94 @@ + +#pragma once + +template< int Dimension, typename Device > +struct Convolution +{ + template< typename Index > + using Vector = TNL::Containers::StaticVector< Dimension, Index >; + + template< typename Index, + typename Real, + typename FetchData, + typename FetchBoundary, + typename Convolve, + typename Store > + static void + execute( const Vector< Index >& dimensions, + const Vector< Index >& kernelSize, + FetchData&& fetchData, + FetchBoundary&& fetchBoundary, + Convolve&& convolve, + Store&& store ); +}; + +template< typename Index, typename Real, int Dimension, typename Device > +struct HeatEquationTask; + +template< typename Index, typename Real > +struct HeatEquationTask< Index, Real, 2, TNL::Devices::Cuda > +{ +public: + static constexpr int Dimension = 2; + using Device = TNL::Devices::Cuda; + using Vector = 
TNL::Containers::StaticVector< Dimension, Index >; + using Point = TNL::Containers::StaticVector< Dimension, Real >; + using ConstDataStore = typename TNL::Containers::Vector< Real, Device, Index >::ConstViewType; + using DataStore = typename TNL::Containers::Vector< Real, Device, Index >::ViewType; + using ConvolutionLauncher = Convolution< Dimension, Device >; + + static void + exec( const Vector& dimensions, + const Vector& kernelSize, + const Point& functionDomain, + const Point& kernelDomain, + const Real time, + ConstDataStore& input, + DataStore& result) + { + auto functionSpaceSteps = Point(functionDomain.x() / dimensions.x(), functionDomain.y() / dimensions.y()); + auto kernelSpaceSteps = Point(kernelDomain.x() / kernelSize.x(), kernelDomain.y() / kernelSize.y()); + + auto fetchData = [ = ] __cuda_callable__( Index i, Index j ) + { + auto index = i + j * dimensions.x(); + + return input[ index ]; + }; + + auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j ) + { + return 0; + }; + + auto convolve = [ = ] __cuda_callable__( Real result, Index dataX, Index dataY, Index kernelX, Index kernelY, Real data ) + { + auto functionXPos = dataX * functionSpaceSteps.x() - (functionDomain.x() / 2), + functionYPos = dataY * functionSpaceSteps.y() - (functionDomain.y() / 2); + + auto kernelXPos = (kernelX - kernelSize.x() / 2) * kernelSpaceSteps.x(), + kernelYPos = (kernelY - kernelSize.y() / 2) * kernelSpaceSteps.y(); + + auto deltaXPos = kernelXPos - functionXPos, + deltaYPos = kernelYPos - functionYPos; + + auto kernel = kernelSpaceSteps.x() * kernelSpaceSteps.y() * ( (Real)1 / ( (Real)4 * M_PI * time ) ) * exp( - ( pow(deltaXPos, 2.) + pow(deltaYPos, 2.) 
) / ( (Real)4 * time ) ); + + return result + data * kernel; + }; + + auto store = [ = ] __cuda_callable__( Index i, Index j, Real resultValue ) mutable + { + auto index = i + j * dimensions.x(); + + result[ index ] = resultValue; + }; + + ConvolutionLauncher::execute< Index, Real >( dimensions, + kernelSize, + std::forward< decltype( fetchData ) >( fetchData ), + std::forward< decltype( fetchBoundary ) >( fetchBoundary ), + std::forward< decltype( convolve ) >( convolve ), + std::forward< decltype( store ) >( store ) ); + } +}; -- GitLab From 82c7ee07cbf521637e3da170c99185626c7caff8 Mon Sep 17 00:00:00 2001 From: hayeuyur Date: Sun, 8 May 2022 11:57:38 +0200 Subject: [PATCH 18/19] Add possibility to specify domain and start time of the iteration --- src/Benchmarks/Convolution/CMakeLists.txt | 6 ++++ .../Convolution/support/HeatEquationSolver.h | 29 ++++++++++++++----- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt index 8695a4048..65e7fe897 100644 --- a/src/Benchmarks/Convolution/CMakeLists.txt +++ b/src/Benchmarks/Convolution/CMakeLists.txt @@ -12,6 +12,12 @@ if (${BUILD_CUDA}) STRING(REGEX REPLACE "DIMENSION_VALUE" ${DIMENSION} TEMPLATE_CONTENT "${TEMPLATE_CONTENT}") STRING(REGEX REPLACE "KERNEL_VALUE" "\"../${KERNEL_HEADER}\"" TEMPLATE_CONTENT "${TEMPLATE_CONTENT}") + get_filename_component(ABSOLUTE_SUPPORT_PATH ${SOURCE_FILE} ABSOLUTE) + + if(NOT EXISTS ${ABSOLUTE_SUPPORT_PATH}) + FILE(WRITE ${ABSOLUTE_SUPPORT_PATH} "") + endif() + FILE(READ ${SOURCE_FILE} SOURCE_FILE_CONTENT) if ( NOT "${SOURCE_FILE_CONTENT}" STREQUAL "${TEMPLATE_CONTENT}" ) diff --git a/src/Benchmarks/Convolution/support/HeatEquationSolver.h b/src/Benchmarks/Convolution/support/HeatEquationSolver.h index 539268d9b..57ecd21a0 100644 --- a/src/Benchmarks/Convolution/support/HeatEquationSolver.h +++ b/src/Benchmarks/Convolution/support/HeatEquationSolver.h @@ -10,9 +10,11 @@ static std::vector< 
TNL::String > dimensionIds = { "grid-x-size", "grid-y-size" }; static std::vector< TNL::String > kernelSizeIds = { "kernel-x-size", "kernel-y-size" }; static std::vector< TNL::String > domainIds = { "domain-x-size", "domain-y-size" }; +static std::vector< TNL::String > kernelDomainIds = { "kernel-domain-x-size", "kernel-domain-y-size" }; static std::string sigmaKey = "sigma"; static std::string timeStepKey = "timeStep"; -static std::string timeKey = "time"; +static std::string startTimeKey = "startTime"; +static std::string finalTimeKey = "finalTime"; static std::string outputFilenamePrefix = "outputFilenamePrefix"; template< typename Real = double > @@ -40,10 +42,14 @@ public: Real xDomainSize = parameters.getParameter< Real >( domainIds[ 0 ] ); Real yDomainSize = parameters.getParameter< Real >( domainIds[ 1 ] ); + Real kernelXDomainSize = parameters.getParameter< Real >( kernelDomainIds[ 0 ] ); + Real kernelYDomainSize = parameters.getParameter< Real >( kernelDomainIds[ 1 ] ); + Real hx = xDomainSize / (Real) gridXSize; Real hy = yDomainSize / (Real) gridYSize; Point domain = { xDomainSize, yDomainSize }; + Point kernelDomain = { kernelXDomainSize, kernelYDomainSize }; Point spaceSteps = { hx, hy }; Vector dimensions = { gridXSize, gridYSize }; @@ -65,16 +71,18 @@ public: result = 0; auto timeStep = parameters.getParameter< double >( timeStepKey ); - auto finalTime = parameters.getParameter< double >( timeKey ); + auto startTime = parameters.getParameter< double >( startTimeKey ); + auto finalTime = parameters.getParameter< double >( finalTimeKey ); - int iterationsCount = finalTime / timeStep; + int iteration = (startTime / timeStep) + 1; + int finalIteration = finalTime / timeStep; - double time = timeStep; + double time = iteration * timeStep; - for (int i = 1; i <= iterationsCount; i++) { + for (int i = iteration; i <= finalIteration; i++) { printf("Time: %lf\n", time); - convolve( dimensions, domain, kernelSize, function.getConstView(), result.getView(), 
time ); + convolve( dimensions, domain, kernelSize, kernelDomain, function.getConstView(), result.getView(), time ); auto filename = TNL::String("data_") + TNL::convertToString(i) + ".txt"; @@ -108,10 +116,14 @@ public: config.addEntry< Real >( domainIds[ 0 ], "Domain size along x-axis.", 4.0 ); config.addEntry< Real >( domainIds[ 1 ], "Domain size along y-axis.", 4.0 ); + config.addEntry< Real >( kernelDomainIds[ 0 ], "Kernel domain size along x-axis.", 3.0 ); + config.addEntry< Real >( kernelDomainIds[ 1 ], "Kernel domain size along y-axis.", 3.0 ); + config.addEntry< Real >( sigmaKey, "Sigma in exponential initial condition.", 0.5); + config.addEntry< Real >( startTimeKey, "Final time of the simulation.", 0.0); config.addEntry< Real >( timeStepKey, "Time step of the simulation.", 0.005); - config.addEntry< Real >( timeKey, "Final time of the simulation.", 0.36); + config.addEntry< Real >( finalTimeKey, "Final time of the simulation.", 0.36); return config; } @@ -151,11 +163,12 @@ public: convolve( const Vector& dimensions, const Point& domain, const Vector& kernelSize, + const Point& kernelDomain, typename DataStore::ConstViewType input, typename DataStore::ViewType result, const Real time ) const { - HeatEquationTask< int, Real, Dimension, Device >::exec( dimensions, kernelSize, domain, { 3., 3. 
}, time, input, result); + HeatEquationTask< int, Real, Dimension, Device >::exec( dimensions, kernelSize, domain, kernelDomain, time, input, result); } bool -- GitLab From 6cc6bffe2ef5cb0ebb5fbad1e3ce2dd15ceb2d51 Mon Sep 17 00:00:00 2001 From: hayeuyur Date: Mon, 9 May 2022 14:27:08 +0200 Subject: [PATCH 19/19] Fix heat equation solver --- src/Benchmarks/Convolution/CMakeLists.txt | 2 +- .../kernels/heatEquationSharedData.h | 182 ------------------ .../Convolution/support/HeatEquationSolver.h | 62 +++++- .../Convolution/support/HeatEquationTask.h | 94 --------- 4 files changed, 55 insertions(+), 285 deletions(-) delete mode 100644 src/Benchmarks/Convolution/kernels/heatEquationSharedData.h delete mode 100644 src/Benchmarks/Convolution/support/HeatEquationTask.h diff --git a/src/Benchmarks/Convolution/CMakeLists.txt b/src/Benchmarks/Convolution/CMakeLists.txt index 65e7fe897..31d46cbf7 100644 --- a/src/Benchmarks/Convolution/CMakeLists.txt +++ b/src/Benchmarks/Convolution/CMakeLists.txt @@ -74,4 +74,4 @@ GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "k GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedKernel.h") GENERATE_CUDA_EXECUTABLE("ImageConvolution" 2 "templates/main_image_solver.h" "kernels/sharedDataAndKernel.h") -GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/heatEquationSharedData.h") +GENERATE_CUDA_EXECUTABLE("HeatEquation" 2 "templates/main_heat_equation_solver.h" "kernels/sharedDataAndKernel.h") diff --git a/src/Benchmarks/Convolution/kernels/heatEquationSharedData.h b/src/Benchmarks/Convolution/kernels/heatEquationSharedData.h deleted file mode 100644 index bf6fdccf5..000000000 --- a/src/Benchmarks/Convolution/kernels/heatEquationSharedData.h +++ /dev/null @@ -1,182 +0,0 @@ - -#ifdef HAVE_CUDA - -/** - * This method stores image tile into shared memory - * and then calculates convolution. 
- * - * Thanks for the idea https://www.evl.uic.edu/sjames/cs525/final.html - */ - -#include -#include -#include -#include - -template< int Dimension, typename Device > -struct Convolution; - -template< typename Index, - typename Real, - typename FetchData, - typename FetchBoundary, - typename Convolve, - typename Store > -__global__ -static void -convolution2D( Index kernelWidth, - Index kernelHeight, - Index endX, - Index endY, - FetchData fetchData, - FetchBoundary fetchBoundary, - Convolve convolve, - Store store ) -{ - Real* data = TNL::Cuda::getSharedMemory< Real >(); - - const Index iy = threadIdx.y + blockIdx.y * blockDim.y; - const Index ix = threadIdx.x + blockIdx.x * blockDim.x; - - const Index radiusY = kernelHeight >> 1; - const Index radiusX = kernelWidth >> 1; - - const Index dataBlockWidth = 2 * kernelWidth - 1; - const Index dataBlockHeight = 2 * kernelHeight - 1; - - const Index dataBlockRadiusX = dataBlockWidth >> 1; - const Index dataBlockRadiusY = dataBlockHeight >> 1; - - Index x, y, index; - - // Top Left - x = ix - radiusX; - y = iy - radiusY; - index = threadIdx.x + threadIdx.y * dataBlockWidth; - - if( x < 0 || y < 0 || x >= endX || y >= endY ) { - data[ index ] = fetchBoundary( x, y ); - } - else { - data[ index ] = fetchData( x, y ); - } - - // Top right - x = ix + radiusX; - y = iy - radiusY; - index = dataBlockRadiusX + threadIdx.x + threadIdx.y * dataBlockWidth; - - if( x < 0 || y < 0 || x >= endX || y >= endY ) { - data[ index ] = fetchBoundary( x, y ); - } - else { - data[ index ] = fetchData( x, y ); - } - - // Bottom Left - x = ix - radiusX; - y = iy + radiusY; - index = threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * dataBlockWidth; - - if(x < 0 || y < 0 || x >= endX || y >= endY ) { - data[ index ] = fetchBoundary( x, y ); - } - else { - data[ index ] = fetchData( x, y ); - } - - // Bottom Right - x = ix + radiusX; - y = iy + radiusY; - index = dataBlockRadiusX + threadIdx.x + ( dataBlockRadiusY + threadIdx.y ) * 
dataBlockWidth; - - if( x < 0 || y < 0 || x >= endX || y >= endY ) { - data[ index ] = fetchBoundary( x, y ); - } - else { - data[ index ] = fetchData( x, y ); - } - - __syncthreads(); - - if( ix >= endX || iy >= endY ) - return; - - Real result = 0; - - for( Index j = 0; j < kernelHeight; j++ ) { - Index align = ( j + threadIdx.y ) * dataBlockWidth; - - for( Index i = 0; i < kernelWidth; i++ ) { - Index index = i + threadIdx.x + align; - - result = convolve( result, ix, iy, i, j, data[ index ]); - } - } - - store( ix, iy, result ); -} - - -template<> -struct Convolution< 2, TNL::Devices::Cuda > -{ -public: - template< typename Index > - using Vector = TNL::Containers::StaticVector< 2, Index >; - - template< typename Index, typename Real > - static void - setup( TNL::Cuda::LaunchConfiguration& configuration, const Vector< Index >& dimensions, const Vector< Index >& kernelSize ) - { - Index kernelElementCount = 1; - - for( Index i = 0; i < kernelSize.getSize(); i++ ) - kernelElementCount *= ( 2 * kernelSize[ i ] ) - 1; - - configuration.dynamicSharedMemorySize = kernelElementCount * sizeof( Real ); - - configuration.blockSize.x = kernelSize.x(); - configuration.blockSize.y = kernelSize.y(); - - configuration.gridSize.x = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.x(), configuration.blockSize.x ) ); - configuration.gridSize.y = - TNL::min( TNL::Cuda::getMaxGridSize(), TNL::Cuda::getNumberOfBlocks( dimensions.y(), configuration.blockSize.y ) ); - } - - template< typename Index, - typename Real, - typename FetchData, - typename FetchBoundary, - typename Convolve, - typename Store > - static void - execute( const Vector< Index >& dimensions, - const Vector< Index >& kernelSize, - FetchData&& fetchData, - FetchBoundary&& fetchBoundary, - Convolve&& convolve, - Store&& store ) - { - TNL::Cuda::LaunchConfiguration configuration; - - setup< Index, Real >( configuration, dimensions, kernelSize ); - - constexpr auto kernel = 
convolution2D< Index, Real, FetchData, FetchBoundary, Convolve, Store >; - - TNL::Cuda::launchKernel< true >( kernel, - 0, - configuration, - kernelSize.x(), - kernelSize.y(), - dimensions.x(), - dimensions.y(), - fetchData, - fetchBoundary, - convolve, - store ); - }; -}; - -#endif diff --git a/src/Benchmarks/Convolution/support/HeatEquationSolver.h b/src/Benchmarks/Convolution/support/HeatEquationSolver.h index 57ecd21a0..eebe89c4b 100644 --- a/src/Benchmarks/Convolution/support/HeatEquationSolver.h +++ b/src/Benchmarks/Convolution/support/HeatEquationSolver.h @@ -2,7 +2,7 @@ #pragma once #include "Solver.h" -#include "HeatEquationTask.h" +#include "DummyTask.h" #include #include @@ -11,7 +11,11 @@ static std::vector< TNL::String > dimensionIds = { "grid-x-size", "grid-y-size" static std::vector< TNL::String > kernelSizeIds = { "kernel-x-size", "kernel-y-size" }; static std::vector< TNL::String > domainIds = { "domain-x-size", "domain-y-size" }; static std::vector< TNL::String > kernelDomainIds = { "kernel-domain-x-size", "kernel-domain-y-size" }; -static std::string sigmaKey = "sigma"; + +static std::string alphaKey = "alpha"; +static std::string betaKey = "beta"; +static std::string gammaKey = "gamma"; + static std::string timeStepKey = "timeStep"; static std::string startTimeKey = "startTime"; static std::string finalTimeKey = "finalTime"; @@ -116,11 +120,15 @@ public: config.addEntry< Real >( domainIds[ 0 ], "Domain size along x-axis.", 4.0 ); config.addEntry< Real >( domainIds[ 1 ], "Domain size along y-axis.", 4.0 ); - config.addEntry< Real >( kernelDomainIds[ 0 ], "Kernel domain size along x-axis.", 3.0 ); - config.addEntry< Real >( kernelDomainIds[ 1 ], "Kernel domain size along y-axis.", 3.0 ); + config.addEntry< Real >( kernelDomainIds[ 0 ], "Kernel domain size along x-axis.", 4.0 ); + config.addEntry< Real >( kernelDomainIds[ 1 ], "Kernel domain size along y-axis.", 4.0 ); - config.addEntry< Real >( sigmaKey, "Sigma in exponential initial condition.", 
0.5); + config.addDelimiter( "Initial condition settings ( (x^2/alpha + y^2/beta) + gamma)):" ); + config.addEntry< Real >( alphaKey, "Alpha value in initial condition", -0.05 ); + config.addEntry< Real >( betaKey, "Beta value in initial condition", -0.05 ); + config.addEntry< Real >( gammaKey, "Gamma key in initial condition", 15 ); + config.addDelimiter( "Time settings:" ); config.addEntry< Real >( startTimeKey, "Final time of the simulation.", 0.0); config.addEntry< Real >( timeStepKey, "Time step of the simulation.", 0.005); config.addEntry< Real >( finalTimeKey, "Final time of the simulation.", 0.36); @@ -142,7 +150,10 @@ public: auto xDomainSize = parameters.getParameter< Real >( domainIds[ 0 ] ); auto yDomainSize = parameters.getParameter< Real >( domainIds[ 1 ] ); - auto sigma = parameters.getParameter< Real >( sigmaKey ); + + auto alpha = parameters.getParameter< Real >( alphaKey ); + auto beta = parameters.getParameter< Real >( betaKey ); + auto gamma = parameters.getParameter< Real >( gammaKey ); auto init = [ = ] __cuda_callable__( int i, int j ) mutable { @@ -151,7 +162,7 @@ public: auto x = i * spaceSteps.x() - domain.x() / 2.; auto y = j * spaceSteps.y() - domain.y() / 2.; - functionView[ index ] = exp( sigma * ( x * x + y * y ) ); + functionView[ index ] = TNL::max((x * x / alpha) + (y * y / beta) + gamma, 0); }; TNL::Algorithms::ParallelFor2D< Device >::exec( 0, 0, dimensions.x(), dimensions.y(), init ); @@ -168,7 +179,42 @@ public: typename DataStore::ViewType result, const Real time ) const { - HeatEquationTask< int, Real, Dimension, Device >::exec( dimensions, kernelSize, domain, kernelDomain, time, input, result); + DataStore kernel; + kernel.resize(kernelSize.x() * kernelSize.y()); + + auto kernelView = kernel.getView(); + auto domainSpaceSteps = Point(domain.x() / dimensions.x(), domain.y() / dimensions.y()); + auto kernelSpaceSteps = Point(kernelDomain.x() / (kernelSize.x() - 1), kernelDomain.y() / (kernelSize.y() - 1)); + + auto init = [ = 
] __cuda_callable__( int i, int j ) mutable { + auto index = j * kernelSize.x() + i; + + auto x = i * kernelSpaceSteps.x() - kernelDomain.x() / 2.; + auto y = j * kernelSpaceSteps.y() - kernelDomain.y() / 2.; + + // The space step is given by the function domain + // However, because the kernel is limited to 31x31 size + // The user can specify it custom kernel domain from which values are taken + kernelView[ index ] = domainSpaceSteps.x() * domainSpaceSteps.y() * ( (Real)1 / ( (Real)4 * M_PI * time ) ) * exp( - ( pow(x, 2.) + pow(y, 2.) ) / ( (Real)4 * time ) ); + }; + + TNL::Algorithms::ParallelFor2D< Device >::exec( 0, 0, kernelSize.x(), kernelSize.y(), init ); + + // std::cout << std::endl << std::endl << std::endl; + + for (int i = 0; i < kernelSize.x(); i++) { + for (int j = 0; j < kernelSize.y(); j++) { + auto index = j * kernelSize.x() + i; + + printf("%lf ", kernelView.getElement(index)); + } + + printf("\n"); + } + + auto kernelConstView = kernel.getConstView(); + + DummyTask::exec(dimensions, kernelSize, input, result, kernelConstView, 0); } bool diff --git a/src/Benchmarks/Convolution/support/HeatEquationTask.h b/src/Benchmarks/Convolution/support/HeatEquationTask.h deleted file mode 100644 index c4b4f5546..000000000 --- a/src/Benchmarks/Convolution/support/HeatEquationTask.h +++ /dev/null @@ -1,94 +0,0 @@ - -#pragma once - -template< int Dimension, typename Device > -struct Convolution -{ - template< typename Index > - using Vector = TNL::Containers::StaticVector< Dimension, Index >; - - template< typename Index, - typename Real, - typename FetchData, - typename FetchBoundary, - typename Convolve, - typename Store > - static void - execute( const Vector< Index >& dimensions, - const Vector< Index >& kernelSize, - FetchData&& fetchData, - FetchBoundary&& fetchBoundary, - Convolve&& convolve, - Store&& store ); -}; - -template< typename Index, typename Real, int Dimension, typename Device > -struct HeatEquationTask; - -template< typename Index, typename 
Real > -struct HeatEquationTask< Index, Real, 2, TNL::Devices::Cuda > -{ -public: - static constexpr int Dimension = 2; - using Device = TNL::Devices::Cuda; - using Vector = TNL::Containers::StaticVector< Dimension, Index >; - using Point = TNL::Containers::StaticVector< Dimension, Real >; - using ConstDataStore = typename TNL::Containers::Vector< Real, Device, Index >::ConstViewType; - using DataStore = typename TNL::Containers::Vector< Real, Device, Index >::ViewType; - using ConvolutionLauncher = Convolution< Dimension, Device >; - - static void - exec( const Vector& dimensions, - const Vector& kernelSize, - const Point& functionDomain, - const Point& kernelDomain, - const Real time, - ConstDataStore& input, - DataStore& result) - { - auto functionSpaceSteps = Point(functionDomain.x() / dimensions.x(), functionDomain.y() / dimensions.y()); - auto kernelSpaceSteps = Point(kernelDomain.x() / kernelSize.x(), kernelDomain.y() / kernelSize.y()); - - auto fetchData = [ = ] __cuda_callable__( Index i, Index j ) - { - auto index = i + j * dimensions.x(); - - return input[ index ]; - }; - - auto fetchBoundary = [ = ] __cuda_callable__( Index i, Index j ) - { - return 0; - }; - - auto convolve = [ = ] __cuda_callable__( Real result, Index dataX, Index dataY, Index kernelX, Index kernelY, Real data ) - { - auto functionXPos = dataX * functionSpaceSteps.x() - (functionDomain.x() / 2), - functionYPos = dataY * functionSpaceSteps.y() - (functionDomain.y() / 2); - - auto kernelXPos = (kernelX - kernelSize.x() / 2) * kernelSpaceSteps.x(), - kernelYPos = (kernelY - kernelSize.y() / 2) * kernelSpaceSteps.y(); - - auto deltaXPos = kernelXPos - functionXPos, - deltaYPos = kernelYPos - functionYPos; - - auto kernel = kernelSpaceSteps.x() * kernelSpaceSteps.y() * ( (Real)1 / ( (Real)4 * M_PI * time ) ) * exp( - ( pow(deltaXPos, 2.) + pow(deltaYPos, 2.) 
) / ( (Real)4 * time ) ); - - return result + data * kernel; - }; - - auto store = [ = ] __cuda_callable__( Index i, Index j, Real resultValue ) mutable - { - auto index = i + j * dimensions.x(); - - result[ index ] = resultValue; - }; - - ConvolutionLauncher::execute< Index, Real >( dimensions, - kernelSize, - std::forward< decltype( fetchData ) >( fetchData ), - std::forward< decltype( fetchBoundary ) >( fetchBoundary ), - std::forward< decltype( convolve ) >( convolve ), - std::forward< decltype( store ) >( store ) ); - } -}; -- GitLab